# To determine what features will best predict cpu/event
## all features used


good reference to learn about different algorithms:<br>
http://blog.datadive.net/selecting-good-features-part-i-univariate-selection/ <br>
http://blog.datadive.net/selecting-good-features-part-ii-linear-models-and-regularization/


In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression, mutual_info_regression

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.feature_selection import RFE, RFECV
from sklearn.svm import SVR
from sklearn.linear_model import RandomizedLasso

from sklearn.model_selection import cross_val_score

### Get datasets

In [2]:
jobs = pd.read_csv('job.csv').set_index('pandaid')

del jobs['cputime']
del jobs['walltime']
del jobs['nevents']
del jobs['cpueff']
del jobs['wallPerEvent']

jobs.head()

Unnamed: 0_level_0,hs06,processingtype,cpuPerEvent
pandaid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3326375042,96,reprocessing,12.623182
3327803396,77,merge,0.3582
3330689156,90,merge,0.337855
3335362910,10,reprocessing,0.257667
3334380752,10,reprocessing,0.302045


In [9]:
benchmarks = pd.read_csv('benchmark.csv').set_index('pandaid')

del benchmarks['ip']

le = {}
le['cpuname'] = preprocessing.LabelEncoder()
#le['ip'] = preprocessing.LabelEncoder()
le['site'] = preprocessing.LabelEncoder()

benchmarks['cpuname'] = le['cpuname'].fit_transform(benchmarks['cpuname'])
#benchmarks['ip'] = le['ip'].fit_transform(benchmarks['ip'])
benchmarks['site'] = le['site'].fit_transform(benchmarks['site'])
benchmarks.head()

Unnamed: 0_level_0,bogomips,site,mpnum,cpuname,meminfo,fastBmk,whetstone
pandaid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3322806223,5333.52,245,1,138,2044632.0,9.223674,3195.99
3322633663,5000.0,252,1,49,16330708.0,12.12938,2749.34
3322488785,4988.06,190,1,121,65840380.0,23.731048,3599.7
3322806277,5333.16,65,1,76,37140112.0,12.240734,3426.18
3322875248,5332.58,207,1,50,32878000.0,12.349914,2911.33


### join dataframes and filter out bad rows

In [4]:
benchmarks = benchmarks.join(jobs,how='inner')
benchmarks = benchmarks[benchmarks.hs06 > 0]
benchmarks = benchmarks[benchmarks.cpuPerEvent > 0]
benchmarks = benchmarks[benchmarks.mpnum == 1]
benchmarks.head()

Unnamed: 0_level_0,bogomips,ip,site,mpnum,cpuname,meminfo,fastBmk,whetstone,hs06,processingtype,cpuPerEvent
pandaid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
3323688554,5199.3,14668,269,1,108,65931600.0,15.404365,3616.23,12,evgen,112.09
3323737831,5199.28,17141,232,1,108,65936052.0,14.863749,3612.79,10,simul,73.644
3323782011,5199.25,17123,232,1,108,65936052.0,14.869888,3628.96,10,evgen,154.11
3323782017,4189.98,17071,232,1,95,65853344.0,18.13602,3306.22,10,evgen,159.28
3323788151,5199.24,17021,232,1,105,65855220.0,20.642202,3495.29,10,evgen,167.21


### split over different processing types 

In [5]:
UniqueProcessingTypes = benchmarks.processingtype.unique()
print(UniqueProcessingTypes)

#create a dict for all the dataframes to be filled later
ProcessingType = {elem : pd.DataFrame for elem in UniqueProcessingTypes}

#filling up data frames
for key in ProcessingType.keys():
    ProcessingType[key] = benchmarks[:][benchmarks.processingtype == key]
    

['evgen' 'simul' 'reprocessing' 'merge' 'recon' 'pmerge' 'pile']


### split on features and target

In [6]:
data = ProcessingType['recon']
del data['mpnum']

#target
Y = data['cpuPerEvent'].values

#cpuPerEvent.head()
features=data.copy()
del features['cpuPerEvent']
del features['processingtype']
fn=list(features.columns.values)
features.head()


Unnamed: 0_level_0,bogomips,ip,site,cpuname,meminfo,fastBmk,whetstone,hs06
pandaid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3326206400,5599.17,7925,86,120,132096860.0,10.827068,3284.36,10
3326206645,5599.19,8034,86,120,132064092.0,12.456747,3261.11,10
3326207224,5599.18,7893,86,120,132096860.0,9.123163,3230.11,10
3326207237,5599.18,7886,86,120,132096860.0,13.677812,3519.79,10
3326207628,5599.18,7909,86,120,132096860.0,10.183876,3260.96,10


### RandomForestRegressor

In [7]:
rf = RandomForestRegressor(n_estimators=100, max_features=6, n_jobs=-1) # max_depth=4,
scores = cross_val_score(rf, features, Y, n_jobs=-1, cv=9, scoring="r2")
print(scores, 'mean:', scores.mean())

rf.fit(features, Y)
scs=[]
for i in range(len(fn)):
    scs.append((rf.feature_importances_[i],fn[i]))

sorted(scs)

[ 0.5988292   0.86726677  0.36458478  0.43124893  0.69970144  0.73192526
  0.66231406  0.07413274  0.50517098] mean: 0.548352684127


[(0.003044710948738824, 'hs06'),
 (0.013842856515421236, 'meminfo'),
 (0.021625406530735861, 'site'),
 (0.043619363210513476, 'ip'),
 (0.08603498909019533, 'bogomips'),
 (0.11143471349217662, 'cpuname'),
 (0.25463755354821499, 'whetstone'),
 (0.46576040666400353, 'fastBmk')]

### ExtraTreesRegressor

In [8]:
et = ExtraTreesRegressor(n_estimators=100, max_depth=4)
scores = cross_val_score(et, features, Y, n_jobs=-1, cv=9, scoring="r2")
print(scores, 'mean:', scores.mean())

rf.fit(features, Y)
scs=[]
for i in range(len(fn)):
    scs.append((rf.feature_importances_[i],fn[i]))

sorted(scs)


[ 0.56591116  0.85045268  0.41338584  0.34578846  0.37310933  0.7281929
  0.54801666 -0.58922442  0.59709743] mean: 0.425858892946


[(0.0044705256728213164, 'hs06'),
 (0.015729017444658721, 'meminfo'),
 (0.021550663288864316, 'site'),
 (0.047385066444223467, 'ip'),
 (0.10616786596575604, 'bogomips'),
 (0.11475006749333659, 'cpuname'),
 (0.26334747861017899, 'whetstone'),
 (0.42659931508016041, 'fastBmk')]

In [12]:
rlasso = RandomizedLasso(alpha=0.025)
rlasso.fit(features, Y)
 
scs=[]
for i in range(len(fn)):
    scs.append((rlasso.scores_[i],fn[i]))

sorted(scs)    

[(0.23000000000000001, 'ip'),
 (0.5, 'meminfo'),
 (0.51000000000000001, 'cpuname'),
 (0.66500000000000004, 'whetstone'),
 (0.92500000000000004, 'site'),
 (0.98499999999999999, 'hs06'),
 (1.0, 'bogomips'),
 (1.0, 'fastBmk')]