# To determine what features will best predict cpu/event

## only taking numerical features used

good reference to learn about different algorithms:<br>
http://blog.datadive.net/selecting-good-features-part-i-univariate-selection/ <br>
http://blog.datadive.net/selecting-good-features-part-ii-linear-models-and-regularization/


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression, mutual_info_regression

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import cross_val_score

### Get datasets

In [2]:
jobs = pd.read_csv('job.csv').set_index('pandaid')
del jobs['cputime']
del jobs['walltime']
del jobs['nevents']
del jobs['cpueff']
del jobs['wallPerEvent']
jobs.head()

Unnamed: 0_level_0,hs06,processingtype,cpuPerEvent
pandaid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3326375042,96,reprocessing,12.623182
3327803396,77,merge,0.3582
3330689156,90,merge,0.337855
3335362910,10,reprocessing,0.257667
3334380752,10,reprocessing,0.302045


In [3]:
benchmarks = pd.read_csv('benchmark.csv').set_index('pandaid')
del benchmarks['ip']
del benchmarks['site']
del benchmarks['cpuname']
benchmarks.head()

Unnamed: 0_level_0,bogomips,mpnum,meminfo,fastBmk,whetstone
pandaid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3322806223,5333.52,1,2044632.0,9.223674,3195.99
3322633663,5000.0,1,16330708.0,12.12938,2749.34
3322488785,4988.06,1,65840380.0,23.731048,3599.7
3322806277,5333.16,1,37140112.0,12.240734,3426.18
3322875248,5332.58,1,32878000.0,12.349914,2911.33


### join dataframes and filter out bad rows
We have to remove rows with mpnum > 1 as I don't know how hs06 is calculated for these.

In [4]:
benchmarks = benchmarks.join(jobs,how='inner')
benchmarks = benchmarks[benchmarks.hs06 > 0]
benchmarks = benchmarks[benchmarks.cpuPerEvent > 0]
benchmarks = benchmarks[benchmarks.mpnum == 1]
benchmarks.head()

Unnamed: 0_level_0,bogomips,mpnum,meminfo,fastBmk,whetstone,hs06,processingtype,cpuPerEvent
pandaid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3323688554,5199.3,1,65931600.0,15.404365,3616.23,12,evgen,112.09
3323737831,5199.28,1,65936052.0,14.863749,3612.79,10,simul,73.644
3323782011,5199.25,1,65936052.0,14.869888,3628.96,10,evgen,154.11
3323782017,4189.98,1,65853344.0,18.13602,3306.22,10,evgen,159.28
3323788151,5199.24,1,65855220.0,20.642202,3495.29,10,evgen,167.21


### Split over different processing types 

In [5]:
UniqueProcessingTypes = benchmarks.processingtype.unique()
print(UniqueProcessingTypes)

#create a dict for all the dataframes to be filled later
ProcessingType = {elem : pd.DataFrame for elem in UniqueProcessingTypes}

#filling up data frames
for key in ProcessingType.keys():
    ProcessingType[key] = benchmarks[:][benchmarks.processingtype == key]

['evgen' 'simul' 'reprocessing' 'merge' 'recon' 'pmerge' 'pile']


### split on features and target

In [6]:
data = ProcessingType['recon']
del data['mpnum'] # so it does not create problem as it has 0 variance

#target
Y = data['cpuPerEvent'].values

features=data.copy()
del features['cpuPerEvent']
del features['processingtype']
fn=list(features.columns.values)
features.head()

### take one processing type and feature ranking

In [7]:
# F-test captures only linear dependency
f_test, _ = f_regression(features, Y)
f_test /= np.max(f_test)

scs=[]
for i in range(len(fn)):
    scs.append((f_test[i],fn[i]))
sorted(scs, reverse=True)

[(2.8110535833284858, 'hs06'),
 (5.4599827201603244, 'meminfo'),
 (30.786980998364918, 'bogomips'),
 (66.476865646252548, 'fastBmk'),
 (102.75179169909188, 'whetstone')]

In [8]:
mi = mutual_info_regression(features, Y)
mi /= np.max(mi)

scs=[]
for i in range(len(fn)):
    scs.append((mi[i],fn[i]))
sorted(scs, reverse=True)

[(0.0, 'hs06'),
 (0.32649761063774396, 'meminfo'),
 (0.78518174009027031, 'whetstone'),
 (0.79097083159779791, 'bogomips'),
 (1.0, 'fastBmk')]

### RandomForestRegressor

In [9]:
rf = RandomForestRegressor(n_estimators=100, max_depth=4)
scores = cross_val_score(rf, features, Y, n_jobs=-1, cv=9, scoring="r2")
print(scores, '\nmean:', scores.mean())

rf.fit(features, Y)
scs=[]
for i in range(len(fn)):
    scs.append((rf.feature_importances_[i],fn[i]))

sorted(scs, reverse=True)

[ 0.78942212  0.79517551  0.42300441  0.51696794  0.59249003  0.72747613
  0.60887989 -0.06827818  0.61683765] mean: 0.555775054452


[(0.0058864526191476666, 'hs06'),
 (0.011268017543765628, 'meminfo'),
 (0.12978414420355888, 'bogomips'),
 (0.26620549921222103, 'whetstone'),
 (0.58685588642130693, 'fastBmk')]

### ExtraTreesRegressor

In [10]:
et = ExtraTreesRegressor(n_estimators=100, max_depth=4)
scores = cross_val_score(et, features, Y, n_jobs=-1, cv=9, scoring="r2")
print(scores, '\nmean:', scores.mean())

rf.fit(features, Y)
scs=[]
for i in range(len(fn)):
    scs.append((rf.feature_importances_[i],fn[i]))

sorted(scs, reverse=True)


[ 0.72329187  0.80760367  0.44145423  0.57187582  0.49230026  0.7049618
  0.63399341 -0.27923388  0.67044084] mean: 0.529632001491


[(0.0031377298414133003, 'hs06'),
 (0.0092685071658521814, 'meminfo'),
 (0.14346829332947897, 'bogomips'),
 (0.32748186562298515, 'whetstone'),
 (0.5166436040402701, 'fastBmk')]

#### to notice: linear models won't work as variables are strongly correlated. The only linear that could work is L2 regularization / Ridge regression

In [15]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import r2_score

In [21]:
# no good as our features are strongly correlated
lr = LinearRegression()
lr.fit(features, Y)
print ("Linear model:")

scs=[]
for i in range(len(fn)):
    scs.append((lr.coef_[i],fn[i]))
sorted(scs, reverse=True) 

Linear model:


[(-2.688077981199226e-08, 'meminfo'),
 (-0.0019606517912314643, 'whetstone'),
 (-0.0055934046896382594, 'bogomips'),
 (-0.71246612548817079, 'hs06'),
 (-0.86139057837546318, 'fastBmk')]

In [23]:
# everything is anti-correlated ?! hard to believe
ridge = Ridge(alpha=50)
ridge.fit(features, Y)
print ("Ridge model:")
scs=[]
for i in range(len(fn)):
    scs.append((ridge.coef_[i],fn[i]))
sorted(scs, reverse=True) 

Ridge model:


[(-2.6324216448886319e-08, 'meminfo'),
 (-0.002336868566149999, 'whetstone'),
 (-0.0053811367593338405, 'bogomips'),
 (-0.54561071581442433, 'hs06'),
 (-0.79549305607797993, 'fastBmk')]

In [11]:
# too slow
# estimator = SVR(kernel="linear")
# #selector = RFE(estimator, 5, step=1)
# selector = RFECV(estimator, cv=5, step=1)
# selector = selector.fit(features, Y)
#print( selector.support_ , selector.ranking_)