# Importing libraries

In [40]:
# Mount Google Drive (Colab only) so the CSV files under /content/drive/MyDrive can be read below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [41]:
# General Libraries
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from plotly import graph_objs as go

# Algorithms (Regression)
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as XGBoost

# Preprocessing / Feature Selection / Model Selection / Metrics
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

from multiprocessing import Pool, Process

import warnings
# Suppress every warning for the rest of the notebook.
# NOTE(review): this presumably also hides solver warnings raised by the model fits below
# (the Lasso/ElasticNet grids go down to alpha=1e-10) — consider a narrower filter.
warnings.filterwarnings("ignore")
%matplotlib inline

# Show up to 200 columns when a DataFrame is displayed.
pd.options.display.max_columns = 200

# Functions

In [42]:
def quality_metrics(y_true: "pd.Series | np.ndarray", y_pred: "pd.Series | np.ndarray") -> dict:
    """Compute regression error metrics for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values (a pandas Series in every call in this notebook).
    y_pred : array-like
        Predicted target values, aligned element-wise with ``y_true``.

    Returns
    -------
    dict
        Keys ``'MSE'``, ``'MAE'`` and ``'RMSE'``, inserted in that order, mapped to
        the corresponding error values.
    """
    mse = mean_squared_error(y_true, y_pred)
    return {
        'MSE': mse,
        'MAE': mean_absolute_error(y_true, y_pred),
        # RMSE is derived from the MSE computed above rather than recomputed.
        'RMSE': np.sqrt(mse),
    }

# Importing datasets

In [43]:
# Real WP6++ dataset; the "Unnamed: 0" column is discarded right after loading
real_df = pd.read_csv(
    "/content/drive/MyDrive/Files For Project/AtomPairsData.csv"
).drop(columns=["Unnamed: 0"])

# Synthetic WP6++ dataset, loaded the same way
gen_df = pd.read_csv(
    "/content/drive/MyDrive/Files For Project/GeneratedData.csv"
).drop(columns=["Unnamed: 0"])

print(f"Length of real dataset: {len(real_df)}")
print(f"Length of synthetic dataset: {len(gen_df)}")

Length of real dataset: 21
Length of synthetic dataset: 31


There will be 3 datasets:

* 7 synthetic samples + 21 real samples (synthetic data is 25% of the combined set)  
* 14 synthetic + 21 real (labelled 50% below; the actual synthetic share is 14/35 = 40%)
* 31 synthetic + 21 real (60%)



# Ratio of 7:21 (25%)

In [44]:
# Append the first 7 synthetic rows after the real rows (real rows come first in df).
# NOTE: `adding` and `df` are overwritten by the later ratio sections, so re-running the
# split cell of this section after a later section would silently use that section's data.
adding = gen_df.head(7)
df = pd.concat([real_df, adding], ignore_index=True)
df.shape

(28, 13)

In [45]:
# Separate the target column (Ka_mean) from the feature columns.
X1 = df.drop(['Ka_mean'], axis=1)
y1 = df["Ka_mean"]
print(f'General quantity of samples: {X1.shape[0]}')
print(f'Quantity of features: {X1.shape[1]}')

# Hold out the last 10% of rows as the test set (shuffle=False keeps the row order).
# NOTE(review): df is the real rows followed by the synthetic rows, so this unshuffled
# tail split puts only synthetic samples into the test set; the test metrics below
# therefore do not measure performance on real data. random_state is ignored by
# train_test_split when shuffle=False.
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.1, shuffle=False, random_state=42)
print(f"Length of training data: {len(X_train1)}")
print(f"Length of testing data: {len(X_test1)}")

General quantity of samples: 28
Quantity of features: 12
Length of training data: 25
Length of testing data: 3


## Building machine learning models

### Linear Regression

In [46]:
# Creating and fitting model
# The parameter grid is empty, so GridSearchCV only cross-validates (5 folds) the single
# LinearRegression candidate and exposes the refitted model as best_estimator_.
Lin_regressor1 = LinearRegression(n_jobs=-1)
grid_search_cv_linear1 = GridSearchCV(Lin_regressor1, {}, cv=5)
grid_search_cv_linear1.fit(X_train1, y_train1.to_numpy())

# Model predictions
y_pred_Linear_train1 = grid_search_cv_linear1.best_estimator_.predict(X_train1)
y_pred_Linear_test1 = grid_search_cv_linear1.best_estimator_.predict(X_test1)

# Calculating metrics
# Train-set errors are only printed; test-set errors are also kept in lin_reg_metrics1.
for name_metric, error in quality_metrics(y_true=y_train1, y_pred=y_pred_Linear_train1).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
# NOTE: lin_reg_metrics1 is not included in the model_metrics list of the Evaluation section.
lin_reg_metrics1 = quality_metrics(y_true=y_test1, y_pred=y_pred_Linear_test1)
for name_metric, error in lin_reg_metrics1.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.02666
Train MAE: 0.09439
Train RMSE: 0.16328

Test MSE: 0.03082
Test MAE: 0.13329
Test RMSE: 0.17556


### Ridge Regression

In [47]:
# Creating and fitting model
Ridge_regressor1 = Ridge()
# Search 20 log-spaced regularisation strengths between 1e-10 and 1e1 with 5-fold CV.
ridge_reg_parameters1 = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_ridge1 = GridSearchCV(Ridge_regressor1, ridge_reg_parameters1, cv=5)

grid_search_cv_ridge1.fit(X_train1, y_train1)
# Not the last expression of the cell, so the selected alpha is not displayed.
grid_search_cv_ridge1.best_params_

# Model predictions
# best_estimator_ is the model refitted with the selected alpha.
Ridge_best_reg1 = grid_search_cv_ridge1.best_estimator_
y_pred_Ridge_train1 = Ridge_best_reg1.predict(X_train1)
y_pred_Ridge_test1 = Ridge_best_reg1.predict(X_test1)

# Calculating metrics
# Train-set errors are only printed; test-set errors are kept in ridge_metrics1 for the Evaluation section.
for name_metric, error in quality_metrics(y_true=y_train1, y_pred=y_pred_Ridge_train1).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
ridge_metrics1 = quality_metrics(y_true=y_test1, y_pred=y_pred_Ridge_test1)
for name_metric, error in ridge_metrics1.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.03965
Train MAE: 0.12373
Train RMSE: 0.19913

Test MSE: 0.00493
Test MAE: 0.04928
Test RMSE: 0.07024


### Lasso Regression

In [48]:
# Creating and fitting model
Lasso_regressor1 = Lasso()
# Search 20 log-spaced regularisation strengths between 1e-10 and 1e1 with 5-fold CV.
lasso_reg_parameters1 = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_lasso1 = GridSearchCV(Lasso_regressor1, lasso_reg_parameters1, cv=5)

grid_search_cv_lasso1.fit(X_train1, y_train1)
# Not the last expression of the cell, so the selected alpha is not displayed.
grid_search_cv_lasso1.best_params_

# Model predictions
# best_estimator_ is the model refitted with the selected alpha.
Lasso_best_reg1 = grid_search_cv_lasso1.best_estimator_
y_pred_Lasso_train1 = Lasso_best_reg1.predict(X_train1)
y_pred_Lasso_test1 = Lasso_best_reg1.predict(X_test1)

# Calculating metrics
# Train-set errors are only printed; test-set errors are kept in lasso_metrics1 for the Evaluation section.
for name_metric, error in quality_metrics(y_true=y_train1, y_pred=y_pred_Lasso_train1).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
lasso_metrics1 = quality_metrics(y_true=y_test1, y_pred=y_pred_Lasso_test1)
for name_metric, error in lasso_metrics1.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.06744
Train MAE: 0.16594
Train RMSE: 0.25969

Test MSE: 0.00535
Test MAE: 0.06892
Test RMSE: 0.07315


### ElasticNet Regression

In [49]:
# Creating and fitting model
ElasticNet_regressor1 = ElasticNet()
# Only alpha is tuned (20 log-spaced values between 1e-10 and 1e1, 5-fold CV);
# every other ElasticNet setting keeps its default.
elasticnet_reg_parameters1 = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_elasticnet1 = GridSearchCV(ElasticNet_regressor1, elasticnet_reg_parameters1, cv=5)

grid_search_cv_elasticnet1.fit(X_train1, y_train1)
# Not the last expression of the cell, so the selected alpha is not displayed.
grid_search_cv_elasticnet1.best_params_

# Model predictions
# best_estimator_ is the model refitted with the selected alpha.
Elasticnet_best_reg1 = grid_search_cv_elasticnet1.best_estimator_
y_pred_Elasticnet_train1 = Elasticnet_best_reg1.predict(X_train1)
y_pred_Elasticnet_test1 = Elasticnet_best_reg1.predict(X_test1)

# Calculating metrics
# Train-set errors are only printed; test-set errors are kept in elasticnet_metrics1 for the Evaluation section.
for name_metric, error in quality_metrics(y_true=y_train1, y_pred=y_pred_Elasticnet_train1).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
elasticnet_metrics1 = quality_metrics(y_true=y_test1, y_pred=y_pred_Elasticnet_test1)
for name_metric, error in elasticnet_metrics1.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.04243
Train MAE: 0.12839
Train RMSE: 0.20600

Test MSE: 0.00398
Test MAE: 0.05391
Test RMSE: 0.06309


### Random Forest

In [50]:
# Creating and fitting model
# 50 trees with a fixed seed; tree depth is the only tuned hyper-parameter.
Forest_regressor1 = RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42)
# Candidate depths 4, 6, ..., 18, compared with 3-fold CV (the other models use 5 folds).
forest_reg_parameters1 = {'max_depth': np.arange(4, 20, 2)}
grid_search_cv_forest1 = GridSearchCV(Forest_regressor1, forest_reg_parameters1, cv=3)
grid_search_cv_forest1.fit(X_train1, y_train1)
# Not the last expression of the cell, so the selected depth is not displayed.
grid_search_cv_forest1.best_params_

# Model predictions
Forest_best_reg1 = grid_search_cv_forest1.best_estimator_
y_pred_Forest_train1 = Forest_best_reg1.predict(X_train1)
y_pred_Forest_test1 = Forest_best_reg1.predict(X_test1)

# Calculating metrics
# Train-set errors are only printed; test-set errors are kept in rforest_metrics1 for the Evaluation section.
for name_metric, error in quality_metrics(y_true=y_train1, y_pred=y_pred_Forest_train1).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
rforest_metrics1 = quality_metrics(y_true=y_test1, y_pred=y_pred_Forest_test1)
for name_metric, error in rforest_metrics1.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.02380
Train MAE: 0.07971
Train RMSE: 0.15426

Test MSE: 0.00126
Test MAE: 0.03053
Test RMSE: 0.03552


### k-NN Regression

In [51]:
# Creating and fitting model
# NOTE(review): the features are passed unscaled although k-NN is distance-based
# (StandardScaler/MinMaxScaler are imported but not applied here) — confirm this is intended.
KNeighbors_regressor1 = KNeighborsRegressor()
# Candidate neighbour counts 4, 6, ..., 18, compared with 5-fold CV.
kNN_reg_parameters1 = {'n_neighbors': np.arange(4, 20, 2)}                  # <- np.arange(2, 6, 1)
grid_search_cv_kNN1 = GridSearchCV(KNeighbors_regressor1, kNN_reg_parameters1, cv=5)
grid_search_cv_kNN1.fit(X_train1, y_train1)
# Not the last expression of the cell, so the selected n_neighbors is not displayed.
grid_search_cv_kNN1.best_params_

# Model predictions
kNN_best_reg1 = grid_search_cv_kNN1.best_estimator_
y_pred_kNN_train1 = kNN_best_reg1.predict(X_train1)
y_pred_kNN_test1 = kNN_best_reg1.predict(X_test1)

# Calculating metrics
# Train-set errors are only printed; test-set errors are kept in knn_metrics1 for the Evaluation section.
for name_metric, error in quality_metrics(y_true=y_train1, y_pred=y_pred_kNN_train1).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
knn_metrics1 = quality_metrics(y_true=y_test1, y_pred=y_pred_kNN_test1)
for name_metric, error in knn_metrics1.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.04723
Train MAE: 0.12069
Train RMSE: 0.21732

Test MSE: 0.00150
Test MAE: 0.03507
Test RMSE: 0.03870


### Boosting

In [52]:
# Creating and fitting model
# random_state is fixed so that repeated runs build the same model, in line with the
# seeded RandomForestRegressor used in the Random Forest section.
GBoosting_regressor1 = GradientBoostingRegressor(n_estimators=50, max_depth=18, random_state=42)
# Empty grid: GridSearchCV only cross-validates (5 folds) this single configuration.
gboost_reg_parameters1 = {}
grid_search_cv_gboost1 = GridSearchCV(GBoosting_regressor1, gboost_reg_parameters1, cv=5)
grid_search_cv_gboost1.fit(X_train1, y_train1)

# Model predictions
GBoost_best_reg1 = grid_search_cv_gboost1.best_estimator_
y_pred_gboost_train1 = GBoost_best_reg1.predict(X_train1)
y_pred_gboost_test1 = GBoost_best_reg1.predict(X_test1)

# Calculating metrics
# Train-set errors are only printed; test-set errors are kept in gboost_metrics1 for the Evaluation section.
for name_metric, error in quality_metrics(y_true=y_train1, y_pred=y_pred_gboost_train1).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
gboost_metrics1 = quality_metrics(y_true=y_test1, y_pred=y_pred_gboost_test1)
for name_metric, error in gboost_metrics1.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.01938
Train MAE: 0.04003
Train RMSE: 0.13922

Test MSE: 0.00191
Test MAE: 0.03462
Test RMSE: 0.04369


## Evaluation

In [53]:
# Grouped bar chart of the test-set MSE / MAE / RMSE of each tuned model (25% dataset).
fig = go.Figure()

# NOTE: model_name and model_metrics are overwritten by every Evaluation section and are
# read by the "Table of metrics" cell that follows. Linear Regression is not included.
model_name = ['Ridge', "Lasso","ElasticNet", 'RForest', 'kNN', 'Gradient Boosting']
model_metrics = [ridge_metrics1,lasso_metrics1, elasticnet_metrics1, rforest_metrics1, knn_metrics1, gboost_metrics1]

# One trace per model. Each metric name occurs once per trace, so histfunc='max' simply
# plots the metric value itself; go.Histogram is used here as a bar chart.
for name, metrics in zip(model_name, model_metrics):
    fig.add_trace(go.Histogram(histfunc= 'max',
                            x = list(metrics.keys()),
                                y = list(metrics.values()),
                                name = name,
                                ))

fig.update_layout(font_size = 25,
                    font_color='black',
                    title = "Model Evaluation",
                    plot_bgcolor = 'rgba(250,250,250,1)',
                    width = 1100,
                    height = 550,
                    # LEGEND
                    legend_title = "Models",
                    legend_font_size = 25,
                    legend_x = 1.02,
                    legend_y = 1,
                    #legend_bordercolor = 'black',
                    #legend_borderwidth = 1,
                    legend_itemsizing = 'trace',
                    legend_itemwidth=100,
                    # X-axis
                    xaxis_title = "WP6 + generated data (25%)",
                    xaxis_nticks = 7,
                    xaxis_ticklen = 16,
                    xaxis_tickwidth = 3,
                    xaxis_ticks = 'outside',
                    # Y-axis
                    yaxis_title = "Error",
                    yaxis_nticks = 10,
                    yaxis_ticklen = 16,
                    yaxis_tickwidth = 3,
                    yaxis_ticks = 'outside'
                    )

# Rectangle spanning the whole plotting area in paper coordinates: a thin black frame.
fig.add_shape(type="rect",
                xref="paper",
                yref="paper",
                x0=0,
                y0=0,
                x1=1.0,
                y1=1.0,
        line=dict(
            color="black",
                width=1,))
fig.layout.font.family = 'sans-serif'

fig.show()

> The metrics are already **much** better than those obtained with the standard WP6 dataset alone!

### Table of metrics

In [54]:
# One row per metric, in the key order of the metric dicts (MSE, MAE, RMSE);
# one column per model, in the order of model_name / model_metrics.
data = [
    [list(model_metrics[col].values())[pos] for col in range(6)]
    for pos in range(3)
]

WP6_25_metrics = pd.DataFrame(data, columns=model_name[:len(data[0])])

WP6_25_metrics

Unnamed: 0,Ridge,Lasso,ElasticNet,RForest,kNN,Gradient Boosting
0,0.004933,0.005351,0.003981,0.001261,0.001498,0.001909
1,0.049281,0.068924,0.053907,0.030526,0.035074,0.034623
2,0.070236,0.073148,0.063093,0.035517,0.038702,0.04369


# Ratio of 14:21 (50%)

In [55]:
# Append the first 14 synthetic rows after the real rows (real rows come first in df).
# NOTE: this overwrites the `adding` and `df` built in the previous ratio section.
adding = gen_df.head(14)
df = pd.concat([real_df, adding], ignore_index=True)
df.shape

(35, 13)

In [56]:
# Separate the target column (Ka_mean) from the feature columns.
X2 = df.drop(['Ka_mean'], axis=1)
y2 = df["Ka_mean"]
print(f'General quantity of samples: {X2.shape[0]}')
print(f'Quantity of features: {X2.shape[1]}')

# Hold out the last 10% of rows as the test set (shuffle=False keeps the row order).
# NOTE(review): df is the real rows followed by the synthetic rows, so this unshuffled
# tail split puts only synthetic samples into the test set; the test metrics below
# therefore do not measure performance on real data. random_state is ignored by
# train_test_split when shuffle=False.
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.1, shuffle=False, random_state=42)
print(f"Length of training data: {len(X_train2)}")
print(f"Length of testing data: {len(X_test2)}")

General quantity of samples: 35
Quantity of features: 12
Length of training data: 31
Length of testing data: 4


## Building machine learning models

### Linear Regression

In [57]:
# Creating and fitting model
# The parameter grid is empty, so GridSearchCV only cross-validates (5 folds) the single
# LinearRegression candidate and exposes the refitted model as best_estimator_.
Lin_regressor2 = LinearRegression(n_jobs=-1)
grid_search_cv_linear2 = GridSearchCV(Lin_regressor2, {}, cv=5)
grid_search_cv_linear2.fit(X_train2, y_train2.to_numpy())

# Model predictions
y_pred_Linear_train2 = grid_search_cv_linear2.best_estimator_.predict(X_train2)
y_pred_Linear_test2 = grid_search_cv_linear2.best_estimator_.predict(X_test2)

# Calculating metrics
# Train-set errors are only printed; test-set errors are also kept in lin_reg_metrics2.
for name_metric, error in quality_metrics(y_true=y_train2, y_pred=y_pred_Linear_train2).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
# NOTE: lin_reg_metrics2 is not included in the model_metrics list of the Evaluation section.
lin_reg_metrics2 = quality_metrics(y_true=y_test2, y_pred=y_pred_Linear_test2)
for name_metric, error in lin_reg_metrics2.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.02358
Train MAE: 0.09390
Train RMSE: 0.15356

Test MSE: 0.05829
Test MAE: 0.19480
Test RMSE: 0.24144


### Ridge Regression

In [58]:
# Creating and fitting model
Ridge_regressor2 = Ridge()
# Search 20 log-spaced regularisation strengths between 1e-10 and 1e1 with 5-fold CV.
ridge_reg_parameters2 = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_ridge2 = GridSearchCV(Ridge_regressor2, ridge_reg_parameters2, cv=5)

grid_search_cv_ridge2.fit(X_train2, y_train2)
# Not the last expression of the cell, so the selected alpha is not displayed.
grid_search_cv_ridge2.best_params_

# Model predictions
# best_estimator_ is the model refitted with the selected alpha.
Ridge_best_reg2 = grid_search_cv_ridge2.best_estimator_
y_pred_Ridge_train2 = Ridge_best_reg2.predict(X_train2)
y_pred_Ridge_test2 = Ridge_best_reg2.predict(X_test2)

# Calculating metrics
# Train-set errors are only printed; test-set errors are kept in ridge_metrics2 for the Evaluation section.
for name_metric, error in quality_metrics(y_true=y_train2, y_pred=y_pred_Ridge_train2).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
ridge_metrics2 = quality_metrics(y_true=y_test2, y_pred=y_pred_Ridge_test2)
for name_metric, error in ridge_metrics2.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.02359
Train MAE: 0.09394
Train RMSE: 0.15357

Test MSE: 0.05782
Test MAE: 0.19289
Test RMSE: 0.24045


### Lasso Regression

In [59]:
# Creating and fitting model
Lasso_regressor2 = Lasso()
# Search 20 log-spaced regularisation strengths between 1e-10 and 1e1 with 5-fold CV.
lasso_reg_parameters2 = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_lasso2 = GridSearchCV(Lasso_regressor2, lasso_reg_parameters2, cv=5)

grid_search_cv_lasso2.fit(X_train2, y_train2)
# Not the last expression of the cell, so the selected alpha is not displayed.
grid_search_cv_lasso2.best_params_

# Model predictions
# best_estimator_ is the model refitted with the selected alpha.
Lasso_best_reg2 = grid_search_cv_lasso2.best_estimator_
y_pred_Lasso_train2 = Lasso_best_reg2.predict(X_train2)
y_pred_Lasso_test2 = Lasso_best_reg2.predict(X_test2)

# Calculating metrics
# Train-set errors are only printed; test-set errors are kept in lasso_metrics2 for the Evaluation section.
for name_metric, error in quality_metrics(y_true=y_train2, y_pred=y_pred_Lasso_train2).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
lasso_metrics2 = quality_metrics(y_true=y_test2, y_pred=y_pred_Lasso_test2)
for name_metric, error in lasso_metrics2.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.02358
Train MAE: 0.09391
Train RMSE: 0.15356

Test MSE: 0.05824
Test MAE: 0.19457
Test RMSE: 0.24132


### ElasticNet Regression

In [60]:
# Creating and fitting model
ElasticNet_regressor2 = ElasticNet()
# Only alpha is tuned (20 log-spaced values between 1e-10 and 1e1, 5-fold CV);
# every other ElasticNet setting keeps its default.
elasticnet_reg_parameters2 = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_elasticnet2 = GridSearchCV(ElasticNet_regressor2, elasticnet_reg_parameters2, cv=5)

grid_search_cv_elasticnet2.fit(X_train2, y_train2)
# Not the last expression of the cell, so the selected alpha is not displayed.
grid_search_cv_elasticnet2.best_params_

# Model predictions
# best_estimator_ is the model refitted with the selected alpha.
Elasticnet_best_reg2 = grid_search_cv_elasticnet2.best_estimator_
y_pred_Elasticnet_train2 = Elasticnet_best_reg2.predict(X_train2)
y_pred_Elasticnet_test2 = Elasticnet_best_reg2.predict(X_test2)

# Calculating metrics
# Train-set errors are only printed; test-set errors are kept in elasticnet_metrics2 for the Evaluation section.
for name_metric, error in quality_metrics(y_true=y_train2, y_pred=y_pred_Elasticnet_train2).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
elasticnet_metrics2 = quality_metrics(y_true=y_test2, y_pred=y_pred_Elasticnet_test2)
for name_metric, error in elasticnet_metrics2.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.02359
Train MAE: 0.09396
Train RMSE: 0.15357

Test MSE: 0.05777
Test MAE: 0.19266
Test RMSE: 0.24036


### Random Forest

In [None]:
# Creating and fitting model
# 50 trees with a fixed seed; tree depth is the only tuned hyper-parameter.
Forest_regressor2 = RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42)
# Candidate depths 4, 6, ..., 18, compared with 3-fold CV (the other models use 5 folds).
forest_reg_parameters2 = {'max_depth': np.arange(4, 20, 2)}
grid_search_cv_forest2 = GridSearchCV(Forest_regressor2, forest_reg_parameters2, cv=3)
grid_search_cv_forest2.fit(X_train2, y_train2)
# Not the last expression of the cell, so the selected depth is not displayed.
grid_search_cv_forest2.best_params_

# Model predictions
Forest_best_reg2 = grid_search_cv_forest2.best_estimator_
y_pred_Forest_train2 = Forest_best_reg2.predict(X_train2)
y_pred_Forest_test2 = Forest_best_reg2.predict(X_test2)

# Calculating metrics
# Train-set errors are only printed; test-set errors are kept in rforest_metrics2 for the Evaluation section.
for name_metric, error in quality_metrics(y_true=y_train2, y_pred=y_pred_Forest_train2).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
rforest_metrics2 = quality_metrics(y_true=y_test2, y_pred=y_pred_Forest_test2)
for name_metric, error in rforest_metrics2.items():
    print(f'Test {name_metric}: {error:.5f}')

### k-NN Regression

In [None]:
# Creating and fitting model
# NOTE(review): the features are passed unscaled although k-NN is distance-based
# (StandardScaler/MinMaxScaler are imported but not applied here) — confirm this is intended.
KNeighbors_regressor2 = KNeighborsRegressor()
# Candidate neighbour counts 4, 6, ..., 18, compared with 5-fold CV.
kNN_reg_parameters2 = {'n_neighbors': np.arange(4, 20, 2)}                  # <- np.arange(2, 6, 1)
grid_search_cv_kNN2 = GridSearchCV(KNeighbors_regressor2, kNN_reg_parameters2, cv=5)
grid_search_cv_kNN2.fit(X_train2, y_train2)
# Not the last expression of the cell, so the selected n_neighbors is not displayed.
grid_search_cv_kNN2.best_params_

# Model predictions
kNN_best_reg2 = grid_search_cv_kNN2.best_estimator_
y_pred_kNN_train2 = kNN_best_reg2.predict(X_train2)
y_pred_kNN_test2 = kNN_best_reg2.predict(X_test2)

# Calculating metrics
# Train-set errors are only printed; test-set errors are kept in knn_metrics2 for the Evaluation section.
for name_metric, error in quality_metrics(y_true=y_train2, y_pred=y_pred_kNN_train2).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
knn_metrics2 = quality_metrics(y_true=y_test2, y_pred=y_pred_kNN_test2)
for name_metric, error in knn_metrics2.items():
    print(f'Test {name_metric}: {error:.5f}')

### Boosting

In [None]:
# Creating and fitting model
# random_state is fixed so that repeated runs build the same model, in line with the
# seeded RandomForestRegressor used in the Random Forest section.
GBoosting_regressor2 = GradientBoostingRegressor(n_estimators=50, max_depth=18, random_state=42)
# Empty grid: GridSearchCV only cross-validates (5 folds) this single configuration.
gboost_reg_parameters2 = {}
grid_search_cv_gboost2 = GridSearchCV(GBoosting_regressor2, gboost_reg_parameters2, cv=5)
grid_search_cv_gboost2.fit(X_train2, y_train2)

# Model predictions
GBoost_best_reg2 = grid_search_cv_gboost2.best_estimator_
y_pred_gboost_train2 = GBoost_best_reg2.predict(X_train2)
y_pred_gboost_test2 = GBoost_best_reg2.predict(X_test2)

# Calculating metrics
# Train-set errors are only printed; test-set errors are kept in gboost_metrics2 for the Evaluation section.
for name_metric, error in quality_metrics(y_true=y_train2, y_pred=y_pred_gboost_train2).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
gboost_metrics2 = quality_metrics(y_true=y_test2, y_pred=y_pred_gboost_test2)
for name_metric, error in gboost_metrics2.items():
    print(f'Test {name_metric}: {error:.5f}')

## Evaluation

In [None]:
# Grouped bar chart of the test-set MSE / MAE / RMSE of each tuned model (14 synthetic + 21 real rows).
fig = go.Figure()

# NOTE: model_name and model_metrics are overwritten by every Evaluation section and are
# read by the "Table of metrics" cell that follows. Linear Regression is not included.
model_name = ['Ridge', "Lasso","ElasticNet", 'RForest', 'kNN', 'Gradient Boosting']
model_metrics = [ridge_metrics2,lasso_metrics2, elasticnet_metrics2, rforest_metrics2, knn_metrics2, gboost_metrics2]

# One trace per model. Each metric name occurs once per trace, so histfunc='max' simply
# plots the metric value itself; go.Histogram is used here as a bar chart.
for name, metrics in zip(model_name, model_metrics):
    fig.add_trace(go.Histogram(histfunc= 'max',
                            x = list(metrics.keys()),
                                y = list(metrics.values()),
                                name = name,
                                ))

fig.update_layout(font_size = 25,
                    font_color='black',
                    title = "Model Evaluation",
                    plot_bgcolor = 'rgba(250,250,250,1)',
                    width = 1100,
                    height = 550,
                    # LEGEND
                    legend_title = "Models",
                    legend_font_size = 25,
                    legend_x = 1.02,
                    legend_y = 1,
                    #legend_bordercolor = 'black',
                    #legend_borderwidth = 1,
                    legend_itemsizing = 'trace',
                    legend_itemwidth=100,
                    # X-axis
                    # NOTE(review): 14 of the 35 rows are synthetic (40%), so the "(50%)" label
                    # looks inaccurate — confirm the intended wording.
                    xaxis_title = "WP6 + generated data (50%)",
                    xaxis_nticks = 7,
                    xaxis_ticklen = 16,
                    xaxis_tickwidth = 3,
                    xaxis_ticks = 'outside',
                    # Y-axis
                    yaxis_title = "Error",
                    yaxis_nticks = 10,
                    yaxis_ticklen = 16,
                    yaxis_tickwidth = 3,
                    yaxis_ticks = 'outside'
                    )

# Rectangle spanning the whole plotting area in paper coordinates: a thin black frame.
fig.add_shape(type="rect",
                xref="paper",
                yref="paper",
                x0=0,
                y0=0,
                x1=1.0,
                y1=1.0,
        line=dict(
            color="black",
                width=1,))
fig.layout.font.family = 'sans-serif'

fig.show()

> Better than with the original WP6 dataset, but worse than the previously obtained metrics.

### Table of metrics

In [None]:
# One row per metric, in the key order of the metric dicts (MSE, MAE, RMSE);
# one column per model, in the order of model_name / model_metrics.
data = [
    [list(model_metrics[col].values())[pos] for col in range(6)]
    for pos in range(3)
]

WP6_50_metrics = pd.DataFrame(data, columns=model_name[:len(data[0])])

WP6_50_metrics

# Ratio of 31:21 (60%)

In [None]:
# Use every synthetic row: all of gen_df is appended after the real rows.
# NOTE: this overwrites the `df` built in the previous ratio sections.
df = pd.concat([real_df, gen_df], ignore_index=True)
df.shape

In [None]:
# Separate the target column (Ka_mean) from the feature columns.
X3 = df.drop(['Ka_mean'], axis=1)
y3 = df["Ka_mean"]
print(f'General quantity of samples: {X3.shape[0]}')
print(f'Quantity of features: {X3.shape[1]}')

# Hold out the last 10% of rows as the test set (shuffle=False keeps the row order).
# NOTE(review): df is the real rows followed by the synthetic rows, so this unshuffled
# tail split puts only synthetic samples into the test set; the test metrics below
# therefore do not measure performance on real data. random_state is ignored by
# train_test_split when shuffle=False.
X_train3, X_test3, y_train3, y_test3 = train_test_split(X3, y3, test_size=0.1, shuffle=False, random_state=42)
print(f"Length of training data: {len(X_train3)}")
print(f"Length of testing data: {len(X_test3)}")

## Building machine learning models

### Linear Regression

In [None]:
# Creating and fitting model
# The parameter grid is empty, so GridSearchCV only cross-validates (5 folds) the single
# LinearRegression candidate and exposes the refitted model as best_estimator_.
Lin_regressor3 = LinearRegression(n_jobs=-1)
grid_search_cv_linear3 = GridSearchCV(Lin_regressor3, {}, cv=5)
grid_search_cv_linear3.fit(X_train3, y_train3.to_numpy())

# Model predictions
y_pred_Linear_train3 = grid_search_cv_linear3.best_estimator_.predict(X_train3)
y_pred_Linear_test3 = grid_search_cv_linear3.best_estimator_.predict(X_test3)

# Calculating metrics
# Train-set errors are only printed; test-set errors are also kept in lin_reg_metrics3.
for name_metric, error in quality_metrics(y_true=y_train3, y_pred=y_pred_Linear_train3).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
# NOTE: lin_reg_metrics3 is not included in the model_metrics list of the Evaluation section.
lin_reg_metrics3 = quality_metrics(y_true=y_test3, y_pred=y_pred_Linear_test3)
for name_metric, error in lin_reg_metrics3.items():
    print(f'Test {name_metric}: {error:.5f}')

### Ridge Regression

In [None]:
# Creating and fitting model
Ridge_regressor3 = Ridge()
# Search 20 log-spaced regularisation strengths between 1e-10 and 1e1 with 5-fold CV.
ridge_reg_parameters3 = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_ridge3 = GridSearchCV(Ridge_regressor3, ridge_reg_parameters3, cv=5)

grid_search_cv_ridge3.fit(X_train3, y_train3)
# Not the last expression of the cell, so the selected alpha is not displayed.
grid_search_cv_ridge3.best_params_

# Model predictions
# best_estimator_ is the model refitted with the selected alpha.
Ridge_best_reg3 = grid_search_cv_ridge3.best_estimator_
y_pred_Ridge_train3 = Ridge_best_reg3.predict(X_train3)
y_pred_Ridge_test3 = Ridge_best_reg3.predict(X_test3)

# Calculating metrics
# Train-set errors are only printed; test-set errors are kept in ridge_metrics3 for the Evaluation section.
for name_metric, error in quality_metrics(y_true=y_train3, y_pred=y_pred_Ridge_train3).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
ridge_metrics3 = quality_metrics(y_true=y_test3, y_pred=y_pred_Ridge_test3)
for name_metric, error in ridge_metrics3.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.03112
Train MAE: 0.10912
Train RMSE: 0.17640

Test MSE: 0.00651
Test MAE: 0.06636
Test RMSE: 0.08071


### Lasso Regression

In [None]:
# Creating and fitting model
Lasso_regressor3 = Lasso()
# Search 20 log-spaced regularisation strengths between 1e-10 and 1e1 with 5-fold CV.
lasso_reg_parameters3 = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_lasso3 = GridSearchCV(Lasso_regressor3, lasso_reg_parameters3, cv=5)

grid_search_cv_lasso3.fit(X_train3, y_train3)
# Not the last expression of the cell, so the selected alpha is not displayed.
grid_search_cv_lasso3.best_params_

# Model predictions
# best_estimator_ is the model refitted with the selected alpha.
Lasso_best_reg3 = grid_search_cv_lasso3.best_estimator_
y_pred_Lasso_train3 = Lasso_best_reg3.predict(X_train3)
y_pred_Lasso_test3 = Lasso_best_reg3.predict(X_test3)

# Calculating metrics
# Train-set errors are only printed; test-set errors are kept in lasso_metrics3 for the Evaluation section.
for name_metric, error in quality_metrics(y_true=y_train3, y_pred=y_pred_Lasso_train3).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
lasso_metrics3 = quality_metrics(y_true=y_test3, y_pred=y_pred_Lasso_test3)
for name_metric, error in lasso_metrics3.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.04659
Train MAE: 0.12281
Train RMSE: 0.21585

Test MSE: 0.00321
Test MAE: 0.04814
Test RMSE: 0.05667


### ElasticNet Regression

In [None]:
# Creating and fitting model
ElasticNet_regressor3 = ElasticNet()
# Only alpha is tuned (20 log-spaced values between 1e-10 and 1e1, 5-fold CV);
# every other ElasticNet setting keeps its default.
elasticnet_reg_parameters3 = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_elasticnet3 = GridSearchCV(ElasticNet_regressor3, elasticnet_reg_parameters3, cv=5)

grid_search_cv_elasticnet3.fit(X_train3, y_train3)
# Not the last expression of the cell, so the selected alpha is not displayed.
grid_search_cv_elasticnet3.best_params_

# Model predictions
# best_estimator_ is the model refitted with the selected alpha.
Elasticnet_best_reg3 = grid_search_cv_elasticnet3.best_estimator_
y_pred_Elasticnet_train3 = Elasticnet_best_reg3.predict(X_train3)
y_pred_Elasticnet_test3 = Elasticnet_best_reg3.predict(X_test3)

# Calculating metrics
# Train-set errors are only printed; test-set errors are kept in elasticnet_metrics3 for the Evaluation section.
for name_metric, error in quality_metrics(y_true=y_train3, y_pred=y_pred_Elasticnet_train3).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
elasticnet_metrics3 = quality_metrics(y_true=y_test3, y_pred=y_pred_Elasticnet_test3)
for name_metric, error in elasticnet_metrics3.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.04659
Train MAE: 0.12281
Train RMSE: 0.21585

Test MSE: 0.00321
Test MAE: 0.04814
Test RMSE: 0.05667


### Random Forest

In [None]:
# Creating and fitting model
# 50 trees with a fixed seed; tree depth is the only tuned hyper-parameter.
Forest_regressor3 = RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42)
# Candidate depths 4, 6, ..., 18, compared with 3-fold CV (the other models use 5 folds).
forest_reg_parameters3 = {'max_depth': np.arange(4, 20, 2)}
grid_search_cv_forest3 = GridSearchCV(Forest_regressor3, forest_reg_parameters3, cv=3)
grid_search_cv_forest3.fit(X_train3, y_train3)
# Not the last expression of the cell, so the selected depth is not displayed.
grid_search_cv_forest3.best_params_

# Model predictions
Forest_best_reg3 = grid_search_cv_forest3.best_estimator_
y_pred_Forest_train3 = Forest_best_reg3.predict(X_train3)
y_pred_Forest_test3 = Forest_best_reg3.predict(X_test3)

# Calculating metrics
# Train-set errors are only printed; test-set errors are kept in rforest_metrics3 for the Evaluation section.
for name_metric, error in quality_metrics(y_true=y_train3, y_pred=y_pred_Forest_train3).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
rforest_metrics3 = quality_metrics(y_true=y_test3, y_pred=y_pred_Forest_test3)
for name_metric, error in rforest_metrics3.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.01416
Train MAE: 0.05724
Train RMSE: 0.11900

Test MSE: 0.02456
Test MAE: 0.09548
Test RMSE: 0.15672


### k-NN Regression

In [None]:
# Creating and fitting model
# NOTE(review): the features are passed unscaled although k-NN is distance-based
# (StandardScaler/MinMaxScaler are imported but not applied here) — confirm this is intended.
KNeighbors_regressor3 = KNeighborsRegressor()
# Candidate neighbour counts 4, 6, ..., 18, compared with 5-fold CV.
kNN_reg_parameters3 = {'n_neighbors': np.arange(4, 20, 2)}                  # <- np.arange(2, 6, 1)
grid_search_cv_kNN3 = GridSearchCV(KNeighbors_regressor3, kNN_reg_parameters3, cv=5)
grid_search_cv_kNN3.fit(X_train3, y_train3)
# Not the last expression of the cell, so the selected n_neighbors is not displayed.
grid_search_cv_kNN3.best_params_

# Model predictions
kNN_best_reg3 = grid_search_cv_kNN3.best_estimator_
y_pred_kNN_train3 = kNN_best_reg3.predict(X_train3)
y_pred_kNN_test3 = kNN_best_reg3.predict(X_test3)

# Calculating metrics
# Train-set errors are only printed; test-set errors are kept in knn_metrics3 for the Evaluation section.
for name_metric, error in quality_metrics(y_true=y_train3, y_pred=y_pred_kNN_train3).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
knn_metrics3 = quality_metrics(y_true=y_test3, y_pred=y_pred_kNN_test3)
for name_metric, error in knn_metrics3.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.03686
Train MAE: 0.11120
Train RMSE: 0.19198

Test MSE: 0.00838
Test MAE: 0.08044
Test RMSE: 0.09156


### Boosting

In [None]:
# Creating and fitting model
# random_state is fixed so that repeated runs build the same model, in line with the
# seeded RandomForestRegressor used in the Random Forest section.
GBoosting_regressor3 = GradientBoostingRegressor(n_estimators=50, max_depth=18, random_state=42)
# Empty grid: GridSearchCV only cross-validates (5 folds) this single configuration.
gboost_reg_parameters3 = {}
grid_search_cv_gboost3 = GridSearchCV(GBoosting_regressor3, gboost_reg_parameters3, cv=5)
grid_search_cv_gboost3.fit(X_train3, y_train3)

# Model predictions
GBoost_best_reg3 = grid_search_cv_gboost3.best_estimator_
y_pred_gboost_train3 = GBoost_best_reg3.predict(X_train3)
y_pred_gboost_test3 = GBoost_best_reg3.predict(X_test3)

# Calculating metrics
# Train-set errors are only printed; test-set errors are kept in gboost_metrics3 for the Evaluation section.
for name_metric, error in quality_metrics(y_true=y_train3, y_pred=y_pred_gboost_train3).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
gboost_metrics3 = quality_metrics(y_true=y_test3, y_pred=y_pred_gboost_test3)
for name_metric, error in gboost_metrics3.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.01053
Train MAE: 0.02192
Train RMSE: 0.10264

Test MSE: 0.02720
Test MAE: 0.09256
Test RMSE: 0.16493


## Evaluation

In [None]:
# Grouped bar chart of the test-set MSE / MAE / RMSE of each tuned model (60% dataset).
fig = go.Figure()

# NOTE: model_name and model_metrics are overwritten by every Evaluation section and are
# read by the "Table of metrics" cell that follows. Linear Regression is not included.
model_name = ['Ridge', "Lasso","ElasticNet", 'RForest', 'kNN', 'Gradient Boosting']
model_metrics = [ridge_metrics3,lasso_metrics3, elasticnet_metrics3, rforest_metrics3, knn_metrics3, gboost_metrics3]

# One trace per model. Each metric name occurs once per trace, so histfunc='max' simply
# plots the metric value itself; go.Histogram is used here as a bar chart.
for name, metrics in zip(model_name, model_metrics):
    fig.add_trace(go.Histogram(histfunc= 'max',
                            x = list(metrics.keys()),
                                y = list(metrics.values()),
                                name = name,
                                ))

fig.update_layout(font_size = 25,
                    font_color='black',
                    title = "Model Evaluation",
                    plot_bgcolor = 'rgba(250,250,250,1)',
                    width = 1100,
                    height = 550,
                    # LEGEND
                    legend_title = "Models",
                    legend_font_size = 25,
                    legend_x = 1.02,
                    legend_y = 1,
                    #legend_bordercolor = 'black',
                    #legend_borderwidth = 1,
                    legend_itemsizing = 'trace',
                    legend_itemwidth=100,
                    # X-axis
                    xaxis_title = "WP6 + generated data (60%)",
                    xaxis_nticks = 7,
                    xaxis_ticklen = 16,
                    xaxis_tickwidth = 3,
                    xaxis_ticks = 'outside',
                    # Y-axis
                    yaxis_title = "Error",
                    yaxis_nticks = 10,
                    yaxis_ticklen = 16,
                    yaxis_tickwidth = 3,
                    yaxis_ticks = 'outside'
                    )

# Rectangle spanning the whole plotting area in paper coordinates: a thin black frame.
fig.add_shape(type="rect",
                xref="paper",
                yref="paper",
                x0=0,
                y0=0,
                x1=1.0,
                y1=1.0,
        line=dict(
            color="black",
                width=1,))
fig.layout.font.family = 'sans-serif'

fig.show()

> The metrics are good, but there is also a risk that the synthetic samples dilute the real data, since the number of synthetic samples is greater than the number of original samples.

### Table of metrics

In [None]:
# One row per metric, in the key order of the metric dicts (MSE, MAE, RMSE);
# one column per model, in the order of model_name / model_metrics.
data = [
    [list(model_metrics[col].values())[pos] for col in range(6)]
    for pos in range(3)
]

WP6_60_metrics = pd.DataFrame(data, columns=model_name[:len(data[0])])

WP6_60_metrics

Unnamed: 0,Ridge,Lasso,ElasticNet,RForest,kNN,Gradient Boosting
0,0.006514,0.003211,0.003211,0.024561,0.008383,0.027203
1,0.066358,0.048143,0.048143,0.095477,0.080436,0.092559
2,0.08071,0.056667,0.056667,0.156718,0.091558,0.164932


# General Table

In [None]:
# Stack the three per-dataset tables and save them next to the notebook.
# ignore_index=True renumbers the rows 0-8, so the metric and dataset labels are lost:
# rows 0-2 are MSE/MAE/RMSE for the 25% set, rows 3-5 for the 50% set, rows 6-8 for the 60% set.
General_WP6_gen = pd.concat([WP6_25_metrics, WP6_50_metrics, WP6_60_metrics], ignore_index=True)
General_WP6_gen.to_csv("METRICS_WP6_gen.csv")
General_WP6_gen

Unnamed: 0,Ridge,Lasso,ElasticNet,RForest,kNN,Gradient Boosting
0,0.004933,0.005351,0.003981,0.001261,0.001498,0.001909
1,0.049281,0.068924,0.053907,0.030526,0.035074,0.034623
2,0.070236,0.073148,0.063093,0.035517,0.038702,0.04369
3,0.057818,0.058236,0.057774,0.008543,0.009653,0.034264
4,0.192894,0.194575,0.192661,0.074378,0.086137,0.126986
5,0.240453,0.241321,0.240361,0.092429,0.098251,0.185105
6,0.006514,0.003211,0.003211,0.024561,0.008383,0.027203
7,0.066358,0.048143,0.048143,0.095477,0.080436,0.092559
8,0.08071,0.056667,0.056667,0.156718,0.091558,0.164932
