# Online learning to predict performance 

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


# incremental training models

# Classification
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import Perceptron, SGDClassifier, PassiveAggressiveClassifier

# Regression
from sklearn.linear_model import SGDRegressor, PassiveAggressiveRegressor
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_error


from sklearn.preprocessing import StandardScaler

## Read the data

In [None]:
# Load the experiment results. NOTE: unpickling can execute arbitrary code,
# so only load this file when it comes from a trusted source.
df = pd.read_pickle('100experiments.pkl')

# Final accuracy of each run: the last entry of its accuracy history.
df['acc'] = df['accuracy'].map(lambda history: history[-1])

# Total run time: the sum of the per-epoch durations.
df['time'] = df['epoch_duration'].map(sum)

# The parallelism is constant within a run, so keep only its first value.
df['parallelism'] = df['parallelism'].map(lambda levels: levels[0])

# TODO: change -1 to inf so the order is right in the plot.
# That step is not implemented in this cell; the disabled sketch below
# instead replaces k == -1 with (60000 / p) / batch.
# ks = []
# for p, batch, k in zip(df.parallelism, df.batch_size, df.k):
#     ks.append(int((60000/p)/batch) if k == -1 else k)
    


In [None]:
# Display the column labels of the loaded experiment table.
df.columns

## Try to fit the classifiers and compare performance

In [None]:
# Hold out 20% of the runs for testing; the fixed seed keeps the split stable.
x_train, x_test = train_test_split(df, random_state=42, test_size=0.2)

# Regression targets: final accuracy and total run time.
y_train_acc = x_train['acc']
y_train_time = x_train['time']
y_test_acc = x_test['acc']
y_test_time = x_test['time']

# Keep only the four columns used as model inputs.
x_train = x_train.loc[:, ['batch_size', 'lr', 'default_parallelism', 'k']]
x_test = x_test.loc[:, ['batch_size', 'lr', 'default_parallelism', 'k']]

# Standardize the inputs; the scaler statistics come from the training
# split only and are then applied unchanged to the test split.
s = StandardScaler()
x_train = s.fit_transform(x_train)
x_test = s.transform(x_test)


In [None]:
# Tune a random-forest regressor for the run time with a 5-fold grid search.
r = RandomForestRegressor(random_state=42)
params_rf = {
    'n_estimators': [50, 100, 150, 200, 500, 1000, 2000],
    # None considers every feature at each split, which is what the removed
    # 'auto' alias meant for regressors.
    'max_features': [None, 'log2', 'sqrt'],
    'max_depth': range(4, 10),
    # 'mse' / 'mae' were renamed in scikit-learn 1.0 and later removed.
    'criterion': ['squared_error', 'absolute_error'],
}

cv = GridSearchCV(estimator=r, param_grid=params_rf, n_jobs=8, cv=5, verbose=3)
cv.fit(x_train, y_train_time)

# Keep the best model found by the search.
r = cv.best_estimator_
print(r)

# Test-set error of the tuned model; the signature is (y_true, y_pred).
y_pred = r.predict(x_test)
mean_squared_error(y_test_time, y_pred)

In [None]:
# Gradient boosting regressor for the run time.
reg_time = GradientBoostingRegressor(random_state=42)

params = {
    'n_estimators': [50, 100, 150, 200, 500, 1000, 2000],
    # 'ls' / 'lad' were renamed in scikit-learn 1.0 and later removed.
    'loss': ['squared_error', 'absolute_error', 'huber'],
    'max_depth': range(3, 10),
}


# 5-fold cross-validated grid search on the time target.
print('Fitting the time...')
cv_time = GridSearchCV(estimator=reg_time, param_grid=params, n_jobs=8, cv=5, verbose=3)
cv_time.fit(x_train, y_train_time)

In [None]:
# Fit the passive-aggressive and SGD regressors in batch mode and print
# the test MSE of each.
from sklearn.metrics import mean_squared_error

pa = PassiveAggressiveRegressor()
sg = SGDRegressor()

# The run time is used as the target here, matching the grid searches on
# y_train_time; swap in y_train_acc / y_test_acc to model the accuracy.
for clf in [pa, sg]:
    clf.fit(x_train, y_train_time)
    y_pred = clf.predict(x_test)
    # The signature is (y_true, y_pred).
    print(mean_squared_error(y_test_time, y_pred))

In [None]:
# Online training: feed the training samples one at a time via partial_fit.
# The run time is the target, matching the grid searches on y_train_time.
sg = PassiveAggressiveRegressor()
for feature, label in zip(x_train, y_train_time):
    # Reshape each sample to a (1, n_features) matrix and wrap its label
    # in a length-1 array before the incremental update.
    sg.partial_fit(feature.reshape(1, -1), np.array([label]))

In [None]:
# Test MSE of the current `sg` model against the run-time target;
# the signature is (y_true, y_pred).
y_pred = sg.predict(x_test)
print(mean_squared_error(y_test_time, y_pred))

In [None]:
# Predict for one hand-written configuration, given in the column order
# the scaler was fitted on: [batch_size, lr, default_parallelism, k].
d = s.transform(np.array([[128, 0.01, 8, 16]]))
sg.predict(d)

In [None]:
# One candidate feature row per K level, reusing the sample's other settings.
# NOTE(review): `samples` and `Ks` are assigned in the following cell, so
# that cell has to be executed before this one.
[
    [samples.batch_size, samples.lr, samples.default_parallelism, level]
    for level in Ks
]

In [None]:
# Candidate K levels to evaluate.
Ks = [2, 8, 16, 64, -1]

# Pick one experiment row as a template. `x` was never defined in this
# notebook; `df` rows carry the batch_size / lr / default_parallelism
# attributes that the candidate-row cell reads.
samples = df.iloc[2]
# [[[s.batch_size, s.lr, s.default_parallelism, K] for s in] for K in Ks]

## Create the class for evaluating possible K levels

In [None]:
class KOptimizer:
    """Predict accuracy and run time for every candidate K of a configuration.

    Two passive-aggressive regressors (one for accuracy, one for time) are
    fitted on standardized features and can later be refined one sample at
    a time through :meth:`update`.
    """

    # Candidate K levels evaluated on every call.
    Ks = [2, 8, 16, 64, -1]

    def __init__(self, X, y_acc, y_time):
        """Fit the scaler and both regressors on an initial batch.

        Args:
            X: Feature matrix in raw (unscaled) units; the scaler is fitted
                on it, so later inputs must use the same units and the
                column order [batch_size, lr, default_parallelism, k].
            y_acc: Accuracy target, one value per row of ``X``.
            y_time: Time target, one value per row of ``X``.
        """
        self.scaler = StandardScaler()
        data = self.scaler.fit_transform(X)
        self.time_reg = PassiveAggressiveRegressor(random_state=42)
        self.acc_reg = PassiveAggressiveRegressor(random_state=42)

        # Fit both regressors on the same standardized features.
        self.time_reg.fit(data, y_time)
        self.acc_reg.fit(data, y_acc)

    def __call__(self, X, y=None):
        """Predict accuracy and time for each K level in ``Ks``.

        Args:
            X: Object exposing ``batch_size``, ``lr`` and
                ``default_parallelism`` attributes (e.g. a DataFrame row).
            y: Unused; accepted only so existing two-argument calls work.

        Returns:
            Tuple ``(preds_acc, preds_time)`` with one prediction per entry
            of ``Ks``, in the same order.
        """
        # One candidate row per K, sharing the other settings of X.
        candidates = np.array(
            [[X.batch_size, X.lr, X.default_parallelism, k] for k in self.Ks]
        )
        stdata = self.scaler.transform(candidates)
        preds_acc = self.acc_reg.predict(stdata)
        preds_time = self.time_reg.predict(stdata)
        print(candidates, preds_acc, preds_time)
        return preds_acc, preds_time

    def update(self, x: np.ndarray, time: float, acc: float):
        """Refine both regressors with a single observed sample.

        Args:
            x: 1-D raw feature vector; it is standardized here with the
                scaler fitted in ``__init__``.
            time: Observed time for this sample.
            acc: Observed accuracy for this sample.
        """
        _x = self.scaler.transform(x.reshape(1, -1))
        self.time_reg.partial_fit(_x, np.array([time]))
        self.acc_reg.partial_fit(_x, np.array([acc]))

In [None]:
# op = KOptimizer(x_train, y_train_acc, y_train_time)
preds = op(df.iloc[15], df.iloc[15])
preds

In [None]:
for f, t, a in zip(s.transform(x_test), y_test_time, y_test_acc):
    print(t)
    op.update(f, t, a)