# Find the best regression model to predict cpu/event times

#### based on: processing type, processor name, (fastBmk, hs06, site?)

In [12]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.dummy import DummyRegressor

from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score

import pickle

### Get datasets

In [2]:
# Load the per-job records, index them by the `pandaid` column, and discard
# the timing/efficiency columns in a single drop; the remaining columns are
# kept unchanged.
jobs = pd.read_csv('job.csv')
jobs = jobs.set_index('pandaid')
jobs = jobs.drop(
    ['cputime', 'walltime', 'nevents', 'cpueff', 'wallPerEvent'],
    axis=1,
)
jobs.head()

Unnamed: 0_level_0,hs06,processingtype,cpuPerEvent
pandaid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3326375042,96,reprocessing,12.623182
3327803396,77,merge,0.3582
3330689156,90,merge,0.337855
3335362910,10,reprocessing,0.257667
3334380752,10,reprocessing,0.302045


In [3]:
# `le` maps a column name to the LabelEncoder fitted on that column.
le = {'processingtype': preprocessing.LabelEncoder()}

# Replace the processing-type strings with the integer codes assigned by the
# encoder stored under the same key.
processingtype_codes = le['processingtype'].fit_transform(jobs['processingtype'])
jobs['processingtype'] = processingtype_codes
jobs.head()

Unnamed: 0_level_0,hs06,processingtype,cpuPerEvent
pandaid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3326375042,96,7,12.623182
3327803396,77,2,0.3582
3330689156,90,2,0.337855
3335362910,10,7,0.257667
3334380752,10,7,0.302045


In [6]:
# Load the benchmark measurements and index them by the `pandaid` column.
benchmarks = pd.read_csv('benchmark.csv')
benchmarks = benchmarks.set_index('pandaid')

# Encode each categorical column as integer codes, storing the fitted
# encoder in `le` under the column's name.
for column in ('cpuname', 'ip', 'site'):
    le[column] = preprocessing.LabelEncoder()
    benchmarks[column] = le[column].fit_transform(benchmarks[column])

benchmarks.head()

Unnamed: 0_level_0,bogomips,ip,site,mpnum,cpuname,meminfo,fastBmk,whetstone
pandaid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3322806223,5333.52,10653,245,1,138,2044632.0,9.223674,3195.99
3322633663,5000.0,6131,252,1,49,16330708.0,12.12938,2749.34
3322488785,4988.06,10198,190,1,121,65840380.0,23.731048,3599.7
3322806277,5333.16,8534,65,1,76,37140112.0,12.240734,3426.18
3322875248,5332.58,4605,207,1,50,32878000.0,12.349914,2911.33


### join dataframes and filter out bad rows

In [7]:
# Attach the job columns to the benchmark rows; the inner join keeps only
# index values present in both frames.
benchmarks = benchmarks.join(jobs, how='inner')

# Keep rows with a positive hs06, a positive cpuPerEvent and mpnum == 1.
# mpnum is constant after the filter, so the column is dropped.
keep = (
    (benchmarks.hs06 > 0)
    & (benchmarks.cpuPerEvent > 0)
    & (benchmarks.mpnum == 1)
)
benchmarks = benchmarks[keep].drop('mpnum', axis=1)
benchmarks.head()


Unnamed: 0_level_0,bogomips,ip,site,cpuname,meminfo,fastBmk,whetstone,hs06,processingtype,cpuPerEvent
pandaid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
3323688554,5199.3,14668,269,108,65931600.0,15.404365,3616.23,12,1,112.09
3323737831,5199.28,17141,232,108,65936052.0,14.863749,3612.79,10,8,73.644
3323782011,5199.25,17123,232,108,65936052.0,14.869888,3628.96,10,1,154.11
3323782017,4189.98,17071,232,95,65853344.0,18.13602,3306.22,10,1,159.28
3323788151,5199.24,17021,232,105,65855220.0,20.642202,3495.29,10,1,167.21


### split into features and target

In [55]:
# Target: the cpuPerEvent column as a NumPy array.
y = benchmarks['cpuPerEvent'].values

# Features: every remaining column except the ones excluded here.
# fastBmk, ip, site and meminfo are currently retained; add a name to the
# list below to exclude it as well.
X = benchmarks.drop(
    ['cpuPerEvent', 'bogomips', 'cpuname', 'whetstone', 'hs06'],
    axis=1,
)

X.head()

Unnamed: 0_level_0,ip,site,meminfo,fastBmk,processingtype
pandaid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3323688554,14668,269,65931600.0,15.404365,1
3323737831,17141,232,65936052.0,14.863749,8
3323782011,17123,232,65936052.0,14.869888,1
3323782017,17071,232,65853344.0,18.13602,1
3323788151,17021,232,65855220.0,20.642202,1


#### split into train and test sets

In [56]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split the
# same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=41,
)

#### create estimators, fit and test
http://scikit-learn.org/stable/modules/model_evaluation.html#explained-variance-score

In [57]:
# Candidate regressors keyed by a fixed-width display name (padded so the
# printed columns line up). DummyRegressor serves as the baseline.
ESTIMATORS = {
    "Dummy      ": DummyRegressor(),
    # random_state is pinned so the forest's scores are reproducible between
    # runs, consistent with the seeded train/test split and Extra trees.
    "Random for.": RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=0),
    "Extra trees": ExtraTreesRegressor(n_estimators=10, random_state=0),
    "K-nn       ": KNeighborsRegressor(),
    "Linear reg.": LinearRegression(),
    "RidgeCV    ": RidgeCV(),
}

# Fit each estimator on the training split and score it on the held-out
# split. Printed columns, in order: name, explained variance score,
# mean absolute error, mean squared error, R^2.
for name, estimator in ESTIMATORS.items():
    estimator.fit(X_train, y_train)
    y_test_predict = estimator.predict(X_test)
    evs = explained_variance_score(y_test, y_test_predict)
    mae = mean_absolute_error(y_test, y_test_predict)
    mse = mean_squared_error(y_test, y_test_predict)
    r2 = r2_score(y_test, y_test_predict)
    print(name,  "%2.4f" % evs, "{:>10.4f}".format(mae), "{:>10.4f}".format(mse), "{:>10.4f}".format(r2))

Extra trees 0.3863    82.7240 26364.0168     0.3863
K-nn        0.2116   113.2791 33897.6011     0.2110
RidgeCV     -1257.2292  5563.0523 54136971.0317 -1259.1176
Dummy       0.0000   139.3849 42963.5735    -0.0000
Random for. 0.4992    78.6822 21519.9257     0.4991
Linear reg. 0.1011   129.8576 38616.6058     0.1011


### All features included
            EVS    mean abs err   mean sq err   R2
            
Extra trees 0.3177    79.8326 22726.6598     0.3164
K-nn        -0.1319   121.8788 37715.6020   -0.1345
Dummy       0.0000   136.0715 33333.2251    -0.0027
Random for. 0.2959    77.9410 23497.6861     0.2932
Linear reg. 0.1353   122.2244 28852.2256     0.1321


### only bogomips

Extra trees 0.1892    83.8061 26969.4907     0.1887
K-nn        -0.0689   114.3790 35633.9537   -0.0719
Dummy       0.0000   136.0715 33333.2251    -0.0027
Random for. 0.3052    81.3823 23108.9871     0.3049
Linear reg. 0.1143   125.1685 29547.4762     0.1112

### only CPU name
Extra trees 0.1448    86.5517 28465.8947     0.1437
K-nn        -0.1027   114.2076 36760.8306   -0.1058
Dummy       0.0000   136.0715 33333.2251    -0.0027
Random for. 0.2833    82.0722 23840.2957     0.2829
Linear reg. 0.1213   124.4278 29320.6519     0.1180

### only fastBMK
Extra trees 0.2792    81.7456 23993.9843     0.2782
K-nn        0.0069   113.1865 33069.6916     0.0052
Dummy       0.0000   136.0715 33333.2251    -0.0027
Random for. 0.3339    78.7366 22199.4186     0.3322
Linear reg. 0.1416   121.8495 28643.2254     0.1384

### only Whetstone
Extra trees 0.2411    83.4537 25250.6360     0.2404
K-nn        -0.1332   122.0277 37766.9293   -0.1361
Dummy       0.0000   136.0715 33333.2251    -0.0027
Random for. 0.3432    79.1615 21868.4357     0.3422
Linear reg. 0.1360   122.7514 28841.9530     0.1324

### only HS06
Extra trees 0.1388    87.1799 28666.4859     0.1377
K-nn        -0.1021   114.1199 36740.4781   -0.1052
Dummy       0.0000   136.0715 33333.2251    -0.0027
Random for. 0.2776    82.5629 24030.8529     0.2771
Linear reg. 0.1120   125.1009 29621.5737     0.1089
