## Part 1

In [14]:
import numpy as np
import pandas as pd
import optuna
from matplotlib import pyplot as plt
import seaborn as sns
import sklearn

from sklearn.model_selection import KFold, ShuffleSplit, cross_val_score

from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import ElasticNet

from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from scipy.stats.mstats import winsorize

pd.set_option('display.max_columns', None)

In [2]:
df = pd.read_csv('data/cosmote.csv', parse_dates=True)

In [3]:
df = df.fillna(method='ffill')
df['energy_mean_base'] = df['energy_mean'].copy()
df['energy_mean'] = winsorize(df['energy_mean'], limits=[0.05, 0.05])

In [4]:
df = df.drop(columns=['TCH_CONGESTION', 'TCH_BLOCKING', 'AVG_UL_MAC_UE_TPUT'])  # ? -> df.corr().energy_mean.sort_values(ascending=False)

In [5]:
random_split = list(ShuffleSplit(n_splits=1, test_size=0.1, random_state=1).split(df))[0]
train_index = random_split[0]
test_index = random_split[1]

In [6]:
trainDf = df.loc[train_index, ~df.columns.isin(['ID', 'PERIOD_START_TIME', 'energy_mean', 'energy_mean_base'])]

random_split = list(ShuffleSplit(n_splits=1, test_size=0.2, random_state=1).split(trainDf))[0]
train_index = random_split[0]
val_index = random_split[1]

trainDf = df.loc[train_index, ~df.columns.isin(['ID', 'PERIOD_START_TIME', 'energy_mean', 'energy_mean_base'])]
y_train = df.loc[train_index, 'energy_mean']
train_ndarray = np.array(trainDf)
y_train_ndarray = np.array(y_train)

valDf = df.loc[val_index, ~df.columns.isin(['ID', 'PERIOD_START_TIME', 'energy_mean', 'energy_mean_base'])]
y_val = df.loc[val_index, 'energy_mean']
val_ndarray = np.array(valDf)
y_val_ndarray = np.array(y_val)

testDf = df.loc[test_index, ~df.columns.isin(['ID', 'PERIOD_START_TIME', 'energy_mean', 'energy_mean_base'])]
y_test = df.loc[test_index, 'energy_mean']
test_ndarray = np.array(testDf)
y_test_ndarray = np.array(y_test)

In [7]:
def objective(trial):
    global train_ndarray
    global y_train_ndarray
    global y_val_ndarray

    classifier_name = trial.suggest_categorical("classifier", ["SVR", "RandomForest", "ElasticNet", "KNeighbors"])
    if classifier_name == "SVR":
        kernel = trial.suggest_categorical('kernel', ["rbf"])
        tol_svr = trial.suggest_float('tol_svr', 1e-3, 10, log=True)
        c = trial.suggest_float("c", 1e-1, 1e4, log=True)
        classifier_obj = sklearn.svm.SVR(C=c, kernel=kernel, tol=tol_svr)
    elif classifier_name == "RandomForest":
        max_depth = trial.suggest_int("max_depth", 2, 32, log=True)
        criterion = trial.suggest_categorical("criterion", ['squared_error', 'friedman_mse'])
        max_features = trial.suggest_categorical("max_features", ['sqrt', 'log2', None])
        bootstrap = trial.suggest_categorical("bootstrap", [True, False])
        n_estimators = trial.suggest_int("n_estimators", 10, 960, 50)
        min_samples_leaf = trial.suggest_int("min_samples_leaf", 2, 32, log=True)
        min_samples_split = trial.suggest_int("min_samples_split", 5, 50, 5)
        
        classifier_obj = RandomForestRegressor(max_depth=max_depth, n_estimators=n_estimators, criterion=criterion, max_features=max_features, bootstrap=bootstrap, min_samples_leaf=min_samples_leaf, min_samples_split=min_samples_split)
    elif classifier_name == "KNeighbors":
        n_neighbors = trial.suggest_int("n_neighbors", 3, 19, 2)
        weights = trial.suggest_categorical("weights", ['uniform', 'distance'])
        
        classifier_obj = KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights)
    elif classifier_name == "ElasticNet":
        alpha = trial.suggest_float("alpha", 0.1, 1)
        l1_ratio = trial.suggest_float("l1_ratio", 0, 1)
        tol_en = trial.suggest_float("tol_en", 1e-10, 1)
        selection = trial.suggest_categorical("selection", ['cyclic', 'random'])
        
        classifier_obj = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, tol=tol_en, selection=selection, precompute=True)
    
    model = classifier_obj.fit(train_ndarray, y_train_ndarray)
    preds = model.predict(val_ndarray)
    train_preds = model.predict(train_ndarray)
    
    train_preds = model.predict(train_ndarray)
    train_rmse = mean_squared_error(y_train_ndarray, train_preds, squared=False)
    train_mape = mean_absolute_percentage_error(y_train_ndarray, train_preds)
    
    val_rmse = mean_squared_error(y_val_ndarray, preds, squared=False)
    val_mape = mean_absolute_percentage_error(y_val_ndarray, preds)
    
    return val_rmse, val_mape, train_rmse, train_mape

In [8]:
study = optuna.create_study(directions=['minimize', 'minimize', 'minimize', 'minimize'])
study.optimize(objective, n_trials=1000, timeout=3600, n_jobs=-1)

[I 2023-11-05 12:12:07,578] A new study created in memory with name: no-name-d3249c9a-b4a4-4cae-b37a-55f63ac27452
[I 2023-11-05 12:12:08,190] Trial 1 finished with values: [466.3607091219861, 0.08336629576230259, 416.45159384556194, 0.08118375534870377] and parameters: {'classifier': 'ElasticNet', 'alpha': 0.1649873061530471, 'l1_ratio': 0.7290313349722478, 'tol_en': 0.1674265677380557, 'selection': 'random'}. 
  model = cd_fast.enet_coordinate_descent_gram(
[I 2023-11-05 12:12:08,334] Trial 5 finished with values: [441.0214589529545, 0.08196748879072828, 396.43815414756256, 0.08018800367715655] and parameters: {'classifier': 'ElasticNet', 'alpha': 0.4499687270064092, 'l1_ratio': 0.09301077482008757, 'tol_en': 0.11071462285322087, 'selection': 'random'}. 
[I 2023-11-05 12:12:28,237] Trial 6 finished with values: [395.7981469233278, 0.058516693512927924, 230.64177928335172, 0.03920606451554938] and parameters: {'classifier': 'KNeighbors', 'n_neighbors': 3, 'weights': 'uniform'}. 
[I 202

KeyboardInterrupt: 

In [None]:
trials = study.trials_dataframe()
trials.to_csv('out/models_energy_regression.csv')

## 1.1 Conclusion from results

In [27]:
df = pd.read_csv('out/models_energy_regression.csv')
df2 = pd.read_csv('out/models_SVR_energy_regression.csv')
df = pd.concat([df, df2])
df = df.drop(columns=['number', 'system_attrs_nsga2:generation', 'state'])
df = df.rename(columns={'values_0':'val_rmse', 'values_1':'val_mape', 'values_2':'train_rmse', 'values_3':'train_mape'})
df = df.sort_values('val_mape', ascending=True).reset_index(drop=True)

In [44]:
df.loc[df.params_classifier == 'KNeighbors'].head(200)

Unnamed: 0.1,Unnamed: 0,val_rmse,val_mape,train_rmse,train_mape,datetime_start,datetime_complete,duration,params_alpha,params_bootstrap,params_classifier,params_criterion,params_l1_ratio,params_max_depth,params_max_features,params_min_samples_leaf,params_min_samples_split,params_n_estimators,params_n_neighbors,params_selection,params_tol_en,params_weights,params_c,params_kernel,params_tol_svr
182,43,378.170242,0.056237,0.0,0.0,2023-11-05 14:34:30.366030,2023-11-05 14:34:38.635589,0 days 00:00:08.269559,,,KNeighbors,,,,,,,,7.0,,,distance,,,
183,63,378.170242,0.056237,0.0,0.0,2023-11-05 14:35:13.157599,2023-11-05 14:35:21.910349,0 days 00:00:08.752750,,,KNeighbors,,,,,,,,7.0,,,distance,,,
184,638,378.170242,0.056237,0.0,0.0,2023-11-05 14:48:25.466424,2023-11-05 14:48:34.588319,0 days 00:00:09.121895,,,KNeighbors,,,,,,,,7.0,,,distance,,,
185,752,378.170242,0.056237,0.0,0.0,2023-11-05 14:52:10.456256,2023-11-05 14:52:19.740395,0 days 00:00:09.284139,,,KNeighbors,,,,,,,,7.0,,,distance,,,
186,261,378.170242,0.056237,0.0,0.0,2023-11-05 14:39:28.255883,2023-11-05 14:39:38.052915,0 days 00:00:09.797032,,,KNeighbors,,,,,,,,7.0,,,distance,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,605,409.212917,0.057957,0.0,0.0,2023-11-05 14:47:21.419452,2023-11-05 14:47:30.068949,0 days 00:00:08.649497,,,KNeighbors,,,,,,,,17.0,,,distance,,,
400,948,409.212917,0.057957,0.0,0.0,2023-11-05 14:57:17.889357,2023-11-05 14:57:26.841086,0 days 00:00:08.951729,,,KNeighbors,,,,,,,,17.0,,,distance,,,
401,338,409.212917,0.057957,0.0,0.0,2023-11-05 14:41:04.633870,2023-11-05 14:41:14.243584,0 days 00:00:09.609714,,,KNeighbors,,,,,,,,17.0,,,distance,,,
402,293,409.212917,0.057957,0.0,0.0,2023-11-05 14:40:09.001691,2023-11-05 14:40:18.172831,0 days 00:00:09.171140,,,KNeighbors,,,,,,,,17.0,,,distance,,,
