# Online learning to predict performance 

In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


# incremental training models

# Classif
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import Perceptron, SGDClassifier, PassiveAggressiveClassifier

# Regression
from sklearn.linear_model import SGDRegressor, PassiveAggressiveRegressor
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_error


from sklearn.preprocessing import StandardScaler

## Read the data

In [22]:
df = pd.read_pickle('100experiments.pkl')


# Set the acc to the final accuracy
df['acc'] = df.accuracy.map(lambda a: a[-1])

# Set the time to the sum of the epoch durations
df['time'] = df.epoch_duration.map(sum)

# Set the parallelism to the first since it is constant
df.parallelism = df.parallelism.map(lambda l:l[0])

# change -1 to inf so the order is right in the plot


# ks = []
# for p, batch, k in zip(df.parallelism, df.batch_size, df.k):
#     ks.append(int((60000/p)/batch) if k == -1 else k)
    


In [8]:
df.columns

Index(['id', 'model_type', 'batch_size', 'epochs', 'dataset', 'lr',
       'function_name', 'default_parallelism', 'static_parallelism',
       'validate_every', 'k', 'goal_accuracy', 'validation_loss', 'accuracy',
       'train_loss', 'parallelism', 'epoch_duration', 'acc', 'time'],
      dtype='object')

## Try to fit the classifiers and compare performance

In [23]:
# divide into train and test
x_train, x_test = train_test_split(df, test_size=0.2, random_state=42)
s = StandardScaler()

y_train_acc, y_train_time = x_train['acc'], x_train['time']
y_test_acc, y_test_time = x_test['acc'], x_test['time']

x_train = x_train[['batch_size', 'lr', 'default_parallelism', 'k']]
x_test = x_test[['batch_size', 'lr', 'default_parallelism', 'k']]

x_train = s.fit_transform(x_train)
x_test = s.transform(x_test)


array([[ 1.18578398, -1.        ,  1.5709298 ,  1.33154276],
       [ 1.18578398, -1.        , -1.06071329, -0.54828231],
       [-1.13063124, -1.        , -0.68476427, -0.86158649],
       [-0.35849283, -1.        ,  0.06713375, -0.86158649],
       [ 1.18578398, -1.        , -0.68476427, -0.86158649],
       [-0.35849283, -1.        ,  1.5709298 ,  1.33154276],
       [-0.35849283, -1.        ,  1.5709298 , -0.86158649],
       [-0.35849283, -1.        , -1.06071329, -0.86158649],
       [ 1.18578398, -1.        , -0.68476427,  1.33154276],
       [-0.35849283, -1.        , -1.06071329,  1.33154276],
       [-1.13063124, -1.        , -0.68476427, -0.54828231],
       [ 1.18578398, -1.        ,  1.5709298 , -0.86158649],
       [ 1.18578398, -1.        , -1.06071329, -0.86158649],
       [-1.13063124, -1.        ,  1.5709298 ,  1.33154276],
       [ 1.18578398, -1.        ,  0.06713375, -0.54828231],
       [-0.35849283, -1.        ,  1.5709298 , -0.54828231],
       [-1.13063124, -1.

In [25]:
r = RandomForestRegressor(random_state=42)
params_rf = {
    'n_estimators': [50, 100, 150, 200, 500, 1000, 2000],
    'max_features': ['auto', 'log2', 'sqrt'],
    'max_depth': range(4,10),
    'criterion': ['mse', 'mae']
    
}

cv = GridSearchCV(estimator=r, param_grid=params_rf, n_jobs=8, cv=5, verbose=3)
cv.fit(x_train, y_train_time)

r = cv.best_estimator_
print(r)


y_pred = r.predict(x_test)
y_pred
mean_squared_error(y_pred, y_test_time)

Fitting 5 folds for each of 252 candidates, totalling 1260 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    4.9s
[Parallel(n_jobs=8)]: Done 112 tasks      | elapsed:   15.4s
[Parallel(n_jobs=8)]: Done 272 tasks      | elapsed:   40.6s
[Parallel(n_jobs=8)]: Done 496 tasks      | elapsed:  1.5min
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:  2.6min
[Parallel(n_jobs=8)]: Done 1136 tasks      | elapsed:  4.2min
[Parallel(n_jobs=8)]: Done 1260 out of 1260 | elapsed:  4.8min finished


RandomForestRegressor(criterion='mae', max_depth=4, max_features='log2',
                      n_estimators=2000, random_state=42)


694.2374873011186

In [27]:
# gradient bossting regressor
reg_time = GradientBoostingRegressor(random_state=42)

params = {
    'n_estimators': [50, 100, 150, 200, 500, 1000, 2000],
    'loss': ['ls', 'lad', 'huber'],
    'max_depth': range(3,10),
}


# First cross validation
print('Fitting the time...')
cv_time = GridSearchCV(estimator=reg_time, param_grid=params, n_jobs=8, cv=5, verbose=3)
cv_time.fit(x_train, y_train_time)

Fitting the time...
Fitting 5 folds for each of 147 candidates, totalling 735 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 272 tasks      | elapsed:   13.1s
[Parallel(n_jobs=8)]: Done 432 tasks      | elapsed:  1.5min
[Parallel(n_jobs=8)]: Done 656 tasks      | elapsed:  4.4min
[Parallel(n_jobs=8)]: Done 735 out of 735 | elapsed:  6.3min finished


GridSearchCV(cv=5, estimator=GradientBoostingRegressor(random_state=42),
             n_jobs=8,
             param_grid={'loss': ['ls', 'lad', 'huber'],
                         'max_depth': range(3, 10),
                         'n_estimators': [50, 100, 150, 200, 500, 1000, 2000]},
             verbose=3)

In [58]:
# fit the passive aggressive
from sklearn.metrics import mean_squared_error

pa = PassiveAggressiveRegressor()
sg = SGDRegressor()

for clf in [pa, sg]:
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print(mean_squared_error(y_pred, y_test))

594.164592719591
459.39315124290204


In [104]:
# online training
sg = PassiveAggressiveRegressor()
for feature, label in zip(x_train, y_train):
#     print(feature.reshape(-1,), np.array([label]))
    sg.partial_fit(feature.reshape(1, -1), np.array([label]))

In [100]:
y_pred = sg.predict(x_test)
print(mean_squared_error(y_pred, y_test))

818.1059058685958


In [105]:
d = s.transform(np.array([128, 0.01, 8, 16]).reshape(1,-1))
d
sg.predict(d)

array([82.50197023])

In [126]:
[[samples.batch_size, samples.lr, samples.default_parallelism, k] for k in Ks]

[[256.0, 0.01, 1.0, 2],
 [256.0, 0.01, 1.0, 8],
 [256.0, 0.01, 1.0, 16],
 [256.0, 0.01, 1.0, 64],
 [256.0, 0.01, 1.0, -1]]

In [123]:
Ks = [2,8, 16, 64, -1]

samples = x.iloc[2]
# [[[s.batch_size, s.lr, s.default_parallelism, K] for s in] for K in Ks]

## Create the class for evaluating possible K levels

In [167]:
class KOptimizer:
    
    Ks = [2,8, 16, 64, -1]
    
    def __init__(self, X, y_acc, y_time):
        self.scaler = StandardScaler()
        data = self.scaler.fit_transform(X)
        self.time_reg = PassiveAggressiveRegressor(random_state=42)
        self.acc_reg = PassiveAggressiveRegressor(random_state=42)
        
        # fit both regressors
        self.time_reg.fit(data, y_time)
        self.acc_reg.fit(data, y_acc)
        
    def __call__(self, X, y):
        _x = np.array([[X.batch_size, X.lr, X.default_parallelism, k] for k in self.Ks])
        stdata = self.scaler.transform(_x)
        preds_acc, preds_time = self.acc_reg.predict(stdata), self.time_reg.predict(stdata)
        print(_x, preds_acc, preds_time)
        
    def update(self, x: np.ndarray, time: float, acc: float):
        _x = self.scaler.transform(x.reshape(1, -1))
        self.time_reg.partial_fit(_x, np.array([time]))
        self.acc_reg.partial_fit(_x, np.array([acc]))

In [173]:
# op = KOptimizer(x_train, y_train_acc, y_train_time)
preds = op(df.iloc[15], df.iloc[15])
preds

[[ 6.4e+01  1.0e-02  8.0e+00  2.0e+00]
 [ 6.4e+01  1.0e-02  8.0e+00  8.0e+00]
 [ 6.4e+01  1.0e-02  8.0e+00  1.6e+01]
 [ 6.4e+01  1.0e-02  8.0e+00  6.4e+01]
 [ 6.4e+01  1.0e-02  8.0e+00 -1.0e+00]] [46.42098938 46.29519273 46.12746386 45.12109066 46.4838877 ] [ 0.38979026 -0.17957509 -0.93872889 -5.49365172  0.67447294]


In [174]:
for f, t, a in zip(s.transform(x_test), y_test_time, y_test_acc):
    print(t)
    op.update(f, t, a)

42.964120499
43.45463995
48.608870431
49.348767040000006
125.67063405100001
157.845865048
146.150401646
77.11826426100001
111.183158916
91.873677077
193.47928402899998
30.571016984000003
74.792759029
127.452963605
33.157986481
25.206731129
49.644195835000005
129.857463547
44.282958637
242.788765065
