# Importing libraries

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# General Libraries
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from plotly import graph_objs as go

# Algorithms (Regression)
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as XGBoost

# Preprocessing / Feature Selection / Model Selection / Metrics
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

from multiprocessing import Pool, Process

import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

pd.options.display.max_columns = 200

# Functions

In [None]:
def quality_metrics(y_true: "pd.Series | np.ndarray", y_pred: "pd.Series | np.ndarray") -> dict:
    """Compute regression error metrics for a set of predictions.

    Args:
        y_true: Observed target values (one entry per sample).
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        A dict with the keys ``'MSE'``, ``'MAE'`` and ``'RMSE'``, inserted in
        exactly that order (the metric-table cells read the values by position).
    """
    mse = mean_squared_error(y_true, y_pred)
    return {
        'MSE': mse,
        'MAE': mean_absolute_error(y_true, y_pred),
        # RMSE is derived from the MSE computed above rather than recomputed
        'RMSE': np.sqrt(mse),
    }

# Importing datasets

In [None]:
# Real WP6++ dataset; the 'Unnamed: 0' column (presumably a saved index) is discarded
real_df = pd.read_csv(
    "/content/drive/MyDrive/Files For Project/AtomPairsData++.csv"
).drop(columns=["Unnamed: 0"])

# Synthetic WP6++ dataset, cleaned the same way
gen_df = pd.read_csv(
    "/content/drive/MyDrive/Files For Project/GeneratedData2++.csv"
).drop(columns=["Unnamed: 0"])

print(f"Length of real dataset: {len(real_df)}")
print(f"Length of synthetic dataset: {len(gen_df)}")

Length of real dataset: 33
Length of synthetic dataset: 30


There will be 3 datasets:


* 11 synthetic samples + 33 real samples (25% of synthetic data)  

* 22 synthetic + 33 real (40%)

* 30 synthetic + 33 real (48%)



# Ratio of 11:33 (25%)

In [None]:
# Append the first 11 generated rows below the real rows (real rows keep the top positions)
adding = gen_df.iloc[:11]
df = pd.concat([real_df, adding], axis=0, ignore_index=True)
df.shape

(44, 13)

In [None]:
# Separate the target column from the descriptors
X1 = df.drop(columns=['Ka_mean'])
y1 = df["Ka_mean"]
print(f'General quantity of samples: {X1.shape[0]}')
print(f'Quantity of features: {X1.shape[1]}')

# Ordered (unshuffled) 90/10 split. random_state is left out on purpose: it only
# seeds the shuffling step, so it has no effect when shuffle=False.
# NOTE(review): df lists the real rows first and the generated rows last, so an
# unshuffled tail hold-out is drawn from the generated samples whenever it is
# smaller than the generated block -- confirm that scoring on synthetic rows is intended.
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.1, shuffle=False)
print(f"Length of training data: {len(X_train1)}")
print(f"Length of testing data: {len(X_test1)}")

General quantity of samples: 44
Quantity of features: 12
Length of training data: 39
Length of testing data: 5


## Building machine learning models

### Linear Regression

In [None]:
# Ordinary least squares; with an empty grid GridSearchCV only runs 5-fold CV and refits
Lin_regressor1 = LinearRegression(n_jobs=-1)
grid_search_cv_linear1 = GridSearchCV(estimator=Lin_regressor1, param_grid={}, cv=5)
grid_search_cv_linear1.fit(X_train1, y_train1.to_numpy())

# Model predictions
Linear_best_reg1 = grid_search_cv_linear1.best_estimator_
y_pred_Linear_train1 = Linear_best_reg1.predict(X_train1)
y_pred_Linear_test1 = Linear_best_reg1.predict(X_test1)

# Calculating metrics
linear_train_metrics1 = quality_metrics(y_true=y_train1, y_pred=y_pred_Linear_train1)
lin_reg_metrics1 = quality_metrics(y_true=y_test1, y_pred=y_pred_Linear_test1)

for name_metric, error in linear_train_metrics1.items():
    print(f'Train {name_metric}: {error:.5f}')
print()
for name_metric, error in lin_reg_metrics1.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.02726
Train MAE: 0.11869
Train RMSE: 0.16509

Test MSE: 0.03013
Test MAE: 0.13994
Test RMSE: 0.17357


### Ridge Regression

In [None]:
# Ridge regression: tune the penalty strength alpha over a log-spaced grid with 5-fold CV
Ridge_regressor1 = Ridge()
ridge_reg_parameters1 = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_ridge1 = GridSearchCV(Ridge_regressor1, ridge_reg_parameters1, cv=5)
grid_search_cv_ridge1.fit(X_train1, y_train1)

# Only the last expression of a cell is displayed, so the selected
# hyper-parameters have to be printed explicitly to be visible.
print(f'Best params: {grid_search_cv_ridge1.best_params_}')
print()

# Model predictions
Ridge_best_reg1 = grid_search_cv_ridge1.best_estimator_
y_pred_Ridge_train1 = Ridge_best_reg1.predict(X_train1)
y_pred_Ridge_test1 = Ridge_best_reg1.predict(X_test1)

# Calculating metrics
for name_metric, error in quality_metrics(y_true=y_train1, y_pred=y_pred_Ridge_train1).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
ridge_metrics1 = quality_metrics(y_true=y_test1, y_pred=y_pred_Ridge_test1)
for name_metric, error in ridge_metrics1.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.04375
Train MAE: 0.12769
Train RMSE: 0.20917

Test MSE: 0.04217
Test MAE: 0.12982
Test RMSE: 0.20534


### Lasso Regression

In [None]:
# Lasso regression: tune the penalty strength alpha over a log-spaced grid with 5-fold CV
Lasso_regressor1 = Lasso()
lasso_reg_parameters1 = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_lasso1 = GridSearchCV(Lasso_regressor1, lasso_reg_parameters1, cv=5)
grid_search_cv_lasso1.fit(X_train1, y_train1)

# Only the last expression of a cell is displayed, so the selected
# hyper-parameters have to be printed explicitly to be visible.
print(f'Best params: {grid_search_cv_lasso1.best_params_}')
print()

# Model predictions
Lasso_best_reg1 = grid_search_cv_lasso1.best_estimator_
y_pred_Lasso_train1 = Lasso_best_reg1.predict(X_train1)
y_pred_Lasso_test1 = Lasso_best_reg1.predict(X_test1)

# Calculating metrics
for name_metric, error in quality_metrics(y_true=y_train1, y_pred=y_pred_Lasso_train1).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
lasso_metrics1 = quality_metrics(y_true=y_test1, y_pred=y_pred_Lasso_test1)
for name_metric, error in lasso_metrics1.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.06775
Train MAE: 0.16751
Train RMSE: 0.26029

Test MSE: 0.06557
Test MAE: 0.15929
Test RMSE: 0.25607


### ElasticNet Regression

In [None]:
# ElasticNet: tune the penalty strength alpha over a log-spaced grid with 5-fold CV
# (the L1/L2 mixing ratio is left at the estimator's default)
ElasticNet_regressor1 = ElasticNet()
elasticnet_reg_parameters1 = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_elasticnet1 = GridSearchCV(ElasticNet_regressor1, elasticnet_reg_parameters1, cv=5)
grid_search_cv_elasticnet1.fit(X_train1, y_train1)

# Only the last expression of a cell is displayed, so the selected
# hyper-parameters have to be printed explicitly to be visible.
print(f'Best params: {grid_search_cv_elasticnet1.best_params_}')
print()

# Model predictions
Elasticnet_best_reg1 = grid_search_cv_elasticnet1.best_estimator_
y_pred_Elasticnet_train1 = Elasticnet_best_reg1.predict(X_train1)
y_pred_Elasticnet_test1 = Elasticnet_best_reg1.predict(X_test1)

# Calculating metrics
for name_metric, error in quality_metrics(y_true=y_train1, y_pred=y_pred_Elasticnet_train1).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
elasticnet_metrics1 = quality_metrics(y_true=y_test1, y_pred=y_pred_Elasticnet_test1)
for name_metric, error in elasticnet_metrics1.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.05205
Train MAE: 0.14567
Train RMSE: 0.22815

Test MSE: 0.04665
Test MAE: 0.14001
Test RMSE: 0.21598


### Random Forest

In [None]:
# Random forest (50 trees, fixed seed): tune max_depth over 4, 6, ..., 18 with 3-fold CV
Forest_regressor1 = RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42)
forest_reg_parameters1 = {'max_depth': np.arange(4, 20, 2)}
grid_search_cv_forest1 = GridSearchCV(Forest_regressor1, forest_reg_parameters1, cv=3)
grid_search_cv_forest1.fit(X_train1, y_train1)

# Only the last expression of a cell is displayed, so the selected
# hyper-parameters have to be printed explicitly to be visible.
print(f'Best params: {grid_search_cv_forest1.best_params_}')
print()

# Model predictions
Forest_best_reg1 = grid_search_cv_forest1.best_estimator_
y_pred_Forest_train1 = Forest_best_reg1.predict(X_train1)
y_pred_Forest_test1 = Forest_best_reg1.predict(X_test1)

# Calculating metrics
for name_metric, error in quality_metrics(y_true=y_train1, y_pred=y_pred_Forest_train1).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
rforest_metrics1 = quality_metrics(y_true=y_test1, y_pred=y_pred_Forest_test1)
for name_metric, error in rforest_metrics1.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.01807
Train MAE: 0.06669
Train RMSE: 0.13443

Test MSE: 0.08392
Test MAE: 0.18830
Test RMSE: 0.28968


### k-NN Regression

In [None]:
# k-NN regression: tune n_neighbors over 4, 6, ..., 18 with 5-fold CV
# (the original note records np.arange(2, 6, 1) as an alternative grid)
KNeighbors_regressor1 = KNeighborsRegressor()
kNN_reg_parameters1 = {'n_neighbors': np.arange(4, 20, 2)}
grid_search_cv_kNN1 = GridSearchCV(KNeighbors_regressor1, kNN_reg_parameters1, cv=5)
grid_search_cv_kNN1.fit(X_train1, y_train1)

# Only the last expression of a cell is displayed, so the selected
# hyper-parameters have to be printed explicitly to be visible.
print(f'Best params: {grid_search_cv_kNN1.best_params_}')
print()

# Model predictions
kNN_best_reg1 = grid_search_cv_kNN1.best_estimator_
y_pred_kNN_train1 = kNN_best_reg1.predict(X_train1)
y_pred_kNN_test1 = kNN_best_reg1.predict(X_test1)

# Calculating metrics
for name_metric, error in quality_metrics(y_true=y_train1, y_pred=y_pred_kNN_train1).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
knn_metrics1 = quality_metrics(y_true=y_test1, y_pred=y_pred_kNN_test1)
for name_metric, error in knn_metrics1.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.04965
Train MAE: 0.10498
Train RMSE: 0.22283

Test MSE: 0.04644
Test MAE: 0.13575
Test RMSE: 0.21551


### Boosting

In [None]:
# Gradient boosting with fixed hyper-parameters; the empty grid means GridSearchCV
# only cross-validates this single configuration and refits it on the training split.
# random_state is pinned (same seed as the random forest) so repeated runs give the same fit.
# NOTE(review): max_depth=18 looks very deep for a training set this small -- consider tuning it.
GBoosting_regressor1 = GradientBoostingRegressor(n_estimators=50, max_depth=18, random_state=42)
gboost_reg_parameters1 = {}
grid_search_cv_gboost1 = GridSearchCV(GBoosting_regressor1, gboost_reg_parameters1, cv=5)
grid_search_cv_gboost1.fit(X_train1, y_train1)

# Model predictions
GBoost_best_reg1 = grid_search_cv_gboost1.best_estimator_
y_pred_gboost_train1 = GBoost_best_reg1.predict(X_train1)
y_pred_gboost_test1 = GBoost_best_reg1.predict(X_test1)

# Calculating metrics
for name_metric, error in quality_metrics(y_true=y_train1, y_pred=y_pred_gboost_train1).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
gboost_metrics1 = quality_metrics(y_true=y_test1, y_pred=y_pred_gboost_test1)
for name_metric, error in gboost_metrics1.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.01242
Train MAE: 0.02597
Train RMSE: 0.11146

Test MSE: 0.16538
Test MAE: 0.26971
Test RMSE: 0.40667


## Evaluation

In [None]:
# Test-set errors (MSE / MAE / RMSE) of the tuned models, one bar per model and metric
model_name = ['Ridge', "Lasso", "ElasticNet", 'RForest', 'kNN', 'Gradient Boosting']
model_metrics = [
    ridge_metrics1, lasso_metrics1, elasticnet_metrics1,
    rforest_metrics1, knn_metrics1, gboost_metrics1,
]

fig = go.Figure()
for name, metrics in zip(model_name, model_metrics):
    # Each metric name occurs once per trace, so histfunc='max' just draws its value
    metric_labels = list(metrics.keys())
    metric_values = list(metrics.values())
    fig.add_trace(go.Histogram(x=metric_labels, y=metric_values, histfunc='max', name=name))

general_layout = {
    'font_size': 25,
    'font_color': 'black',
    'title': "Model Evaluation",
    'plot_bgcolor': 'rgba(250,250,250,1)',
    'width': 1100,
    'height': 550,
}
legend_layout = {
    'legend_title': "Models",
    'legend_font_size': 25,
    'legend_x': 1.02,
    'legend_y': 1,
    'legend_itemsizing': 'trace',
    'legend_itemwidth': 100,
}
xaxis_layout = {
    'xaxis_title': "WP6++ + generated data (25%)",
    'xaxis_nticks': 7,
    'xaxis_ticklen': 16,
    'xaxis_tickwidth': 3,
    'xaxis_ticks': 'outside',
}
yaxis_layout = {
    'yaxis_title': "Error",
    'yaxis_nticks': 10,
    'yaxis_ticklen': 16,
    'yaxis_tickwidth': 3,
    'yaxis_ticks': 'outside',
}
fig.update_layout(**general_layout, **legend_layout, **xaxis_layout, **yaxis_layout)

# Thin black frame around the whole plotting area (paper coordinates 0..1)
fig.add_shape(
    type="rect",
    xref="paper", yref="paper",
    x0=0, y0=0, x1=1.0, y1=1.0,
    line=dict(color="black", width=1),
)
fig.layout.font.family = 'sans-serif'

fig.show()

> Metrics are worse than the WP6++ metrics obtained without synthetic data

### Table of metrics

In [None]:
# One row per metric (dict order: MSE, MAE, RMSE), one column per model
data = [
    [list(model_metrics[model_idx].values())[metric_idx] for model_idx in range(6)]
    for metric_idx in range(3)
]

WP6_add_25_metrics = pd.DataFrame(data, columns=model_name[:len(data[0])])

WP6_add_25_metrics

Unnamed: 0,Ridge,Lasso,ElasticNet,RForest,kNN,Gradient Boosting
0,0.042166,0.065572,0.046648,0.083917,0.046445,0.165381
1,0.129815,0.15929,0.140007,0.188302,0.135751,0.269709
2,0.205345,0.256071,0.215981,0.289684,0.21551,0.406671


# Ratio of 22:33 (40%)

In [None]:
# Append the first 22 generated rows below the real rows (real rows keep the top positions)
adding = gen_df.iloc[:22]
df = pd.concat([real_df, adding], axis=0, ignore_index=True)
df.shape

(55, 13)

In [None]:
# Separate the target column from the descriptors
X2 = df.drop(columns=['Ka_mean'])
y2 = df["Ka_mean"]
print(f'General quantity of samples: {X2.shape[0]}')
print(f'Quantity of features: {X2.shape[1]}')

# Ordered (unshuffled) 90/10 split. random_state is left out on purpose: it only
# seeds the shuffling step, so it has no effect when shuffle=False.
# NOTE(review): df lists the real rows first and the generated rows last, so an
# unshuffled tail hold-out is drawn from the generated samples whenever it is
# smaller than the generated block -- confirm that scoring on synthetic rows is intended.
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.1, shuffle=False)
print(f"Length of training data: {len(X_train2)}")
print(f"Length of testing data: {len(X_test2)}")

General quantity of samples: 55
Quantity of features: 12
Length of training data: 49
Length of testing data: 6


## Building machine learning models

### Linear Regression

In [None]:
# Ordinary least squares; with an empty grid GridSearchCV only runs 5-fold CV and refits
Lin_regressor2 = LinearRegression(n_jobs=-1)
grid_search_cv_linear2 = GridSearchCV(estimator=Lin_regressor2, param_grid={}, cv=5)
grid_search_cv_linear2.fit(X_train2, y_train2.to_numpy())

# Model predictions
Linear_best_reg2 = grid_search_cv_linear2.best_estimator_
y_pred_Linear_train2 = Linear_best_reg2.predict(X_train2)
y_pred_Linear_test2 = Linear_best_reg2.predict(X_test2)

# Calculating metrics
linear_train_metrics2 = quality_metrics(y_true=y_train2, y_pred=y_pred_Linear_train2)
lin_reg_metrics2 = quality_metrics(y_true=y_test2, y_pred=y_pred_Linear_test2)

for name_metric, error in linear_train_metrics2.items():
    print(f'Train {name_metric}: {error:.5f}')
print()
for name_metric, error in lin_reg_metrics2.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.02804
Train MAE: 0.11897
Train RMSE: 0.16745

Test MSE: 0.04563
Test MAE: 0.17968
Test RMSE: 0.21361


### Ridge Regression

In [None]:
# Ridge regression: tune the penalty strength alpha over a log-spaced grid with 5-fold CV
Ridge_regressor2 = Ridge()
ridge_reg_parameters2 = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_ridge2 = GridSearchCV(Ridge_regressor2, ridge_reg_parameters2, cv=5)
grid_search_cv_ridge2.fit(X_train2, y_train2)

# Only the last expression of a cell is displayed, so the selected
# hyper-parameters have to be printed explicitly to be visible.
print(f'Best params: {grid_search_cv_ridge2.best_params_}')
print()

# Model predictions
Ridge_best_reg2 = grid_search_cv_ridge2.best_estimator_
y_pred_Ridge_train2 = Ridge_best_reg2.predict(X_train2)
y_pred_Ridge_test2 = Ridge_best_reg2.predict(X_test2)

# Calculating metrics
for name_metric, error in quality_metrics(y_true=y_train2, y_pred=y_pred_Ridge_train2).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
ridge_metrics2 = quality_metrics(y_true=y_test2, y_pred=y_pred_Ridge_test2)
for name_metric, error in ridge_metrics2.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.03969
Train MAE: 0.12079
Train RMSE: 0.19922

Test MSE: 0.03460
Test MAE: 0.12590
Test RMSE: 0.18602


### Lasso Regression

In [None]:
# Lasso regression: tune the penalty strength alpha over a log-spaced grid with 5-fold CV
Lasso_regressor2 = Lasso()
lasso_reg_parameters2 = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_lasso2 = GridSearchCV(Lasso_regressor2, lasso_reg_parameters2, cv=5)
grid_search_cv_lasso2.fit(X_train2, y_train2)

# Only the last expression of a cell is displayed, so the selected
# hyper-parameters have to be printed explicitly to be visible.
print(f'Best params: {grid_search_cv_lasso2.best_params_}')
print()

# Model predictions
Lasso_best_reg2 = grid_search_cv_lasso2.best_estimator_
y_pred_Lasso_train2 = Lasso_best_reg2.predict(X_train2)
y_pred_Lasso_test2 = Lasso_best_reg2.predict(X_test2)

# Calculating metrics
for name_metric, error in quality_metrics(y_true=y_train2, y_pred=y_pred_Lasso_train2).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
lasso_metrics2 = quality_metrics(y_true=y_test2, y_pred=y_pred_Lasso_test2)
for name_metric, error in lasso_metrics2.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.06146
Train MAE: 0.15684
Train RMSE: 0.24792

Test MSE: 0.03752
Test MAE: 0.13519
Test RMSE: 0.19369


### ElasticNet Regression

In [None]:
# ElasticNet: tune the penalty strength alpha over a log-spaced grid with 5-fold CV
# (the L1/L2 mixing ratio is left at the estimator's default)
ElasticNet_regressor2 = ElasticNet()
elasticnet_reg_parameters2 = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_elasticnet2 = GridSearchCV(ElasticNet_regressor2, elasticnet_reg_parameters2, cv=5)
grid_search_cv_elasticnet2.fit(X_train2, y_train2)

# Only the last expression of a cell is displayed, so the selected
# hyper-parameters have to be printed explicitly to be visible.
print(f'Best params: {grid_search_cv_elasticnet2.best_params_}')
print()

# Model predictions
Elasticnet_best_reg2 = grid_search_cv_elasticnet2.best_estimator_
y_pred_Elasticnet_train2 = Elasticnet_best_reg2.predict(X_train2)
y_pred_Elasticnet_test2 = Elasticnet_best_reg2.predict(X_test2)

# Calculating metrics
for name_metric, error in quality_metrics(y_true=y_train2, y_pred=y_pred_Elasticnet_train2).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
elasticnet_metrics2 = quality_metrics(y_true=y_test2, y_pred=y_pred_Elasticnet_test2)
for name_metric, error in elasticnet_metrics2.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.06146
Train MAE: 0.15684
Train RMSE: 0.24792

Test MSE: 0.03752
Test MAE: 0.13519
Test RMSE: 0.19369


### Random Forest

In [None]:
# Random forest (50 trees, fixed seed): tune max_depth over 4, 6, ..., 18 with 3-fold CV
Forest_regressor2 = RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42)
forest_reg_parameters2 = {'max_depth': np.arange(4, 20, 2)}
grid_search_cv_forest2 = GridSearchCV(Forest_regressor2, forest_reg_parameters2, cv=3)
grid_search_cv_forest2.fit(X_train2, y_train2)

# Only the last expression of a cell is displayed, so the selected
# hyper-parameters have to be printed explicitly to be visible.
print(f'Best params: {grid_search_cv_forest2.best_params_}')
print()

# Model predictions
Forest_best_reg2 = grid_search_cv_forest2.best_estimator_
y_pred_Forest_train2 = Forest_best_reg2.predict(X_train2)
y_pred_Forest_test2 = Forest_best_reg2.predict(X_test2)

# Calculating metrics
for name_metric, error in quality_metrics(y_true=y_train2, y_pred=y_pred_Forest_train2).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
rforest_metrics2 = quality_metrics(y_true=y_test2, y_pred=y_pred_Forest_test2)
for name_metric, error in rforest_metrics2.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.01535
Train MAE: 0.06260
Train RMSE: 0.12390

Test MSE: 0.04840
Test MAE: 0.17616
Test RMSE: 0.22001


### k-NN Regression

In [None]:
# k-NN regression: tune n_neighbors over 4, 6, ..., 18 with 5-fold CV
# (the original note records np.arange(2, 6, 1) as an alternative grid)
KNeighbors_regressor2 = KNeighborsRegressor()
kNN_reg_parameters2 = {'n_neighbors': np.arange(4, 20, 2)}
grid_search_cv_kNN2 = GridSearchCV(KNeighbors_regressor2, kNN_reg_parameters2, cv=5)
grid_search_cv_kNN2.fit(X_train2, y_train2)

# Only the last expression of a cell is displayed, so the selected
# hyper-parameters have to be printed explicitly to be visible.
print(f'Best params: {grid_search_cv_kNN2.best_params_}')
print()

# Model predictions
kNN_best_reg2 = grid_search_cv_kNN2.best_estimator_
y_pred_kNN_train2 = kNN_best_reg2.predict(X_train2)
y_pred_kNN_test2 = kNN_best_reg2.predict(X_test2)

# Calculating metrics
for name_metric, error in quality_metrics(y_true=y_train2, y_pred=y_pred_kNN_train2).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
knn_metrics2 = quality_metrics(y_true=y_test2, y_pred=y_pred_kNN_test2)
for name_metric, error in knn_metrics2.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.04852
Train MAE: 0.10711
Train RMSE: 0.22027

Test MSE: 0.04158
Test MAE: 0.10778
Test RMSE: 0.20392


### Boosting

In [None]:
# Gradient boosting with fixed hyper-parameters; the empty grid means GridSearchCV
# only cross-validates this single configuration and refits it on the training split.
# random_state is pinned (same seed as the random forest) so repeated runs give the same fit.
# NOTE(review): max_depth=18 looks very deep for a training set this small -- consider tuning it.
GBoosting_regressor2 = GradientBoostingRegressor(n_estimators=50, max_depth=18, random_state=42)
gboost_reg_parameters2 = {}
grid_search_cv_gboost2 = GridSearchCV(GBoosting_regressor2, gboost_reg_parameters2, cv=5)
grid_search_cv_gboost2.fit(X_train2, y_train2)

# Model predictions
GBoost_best_reg2 = grid_search_cv_gboost2.best_estimator_
y_pred_gboost_train2 = GBoost_best_reg2.predict(X_train2)
y_pred_gboost_test2 = GBoost_best_reg2.predict(X_test2)

# Calculating metrics
for name_metric, error in quality_metrics(y_true=y_train2, y_pred=y_pred_gboost_train2).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
gboost_metrics2 = quality_metrics(y_true=y_test2, y_pred=y_pred_gboost_test2)
for name_metric, error in gboost_metrics2.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.00989
Train MAE: 0.02079
Train RMSE: 0.09944

Test MSE: 0.19317
Test MAE: 0.32993
Test RMSE: 0.43951


## Evaluation

In [None]:
# Test-set errors (MSE / MAE / RMSE) of the tuned models, one bar per model and metric
model_name = ['Ridge', "Lasso", "ElasticNet", 'RForest', 'kNN', 'Gradient Boosting']
model_metrics = [
    ridge_metrics2, lasso_metrics2, elasticnet_metrics2,
    rforest_metrics2, knn_metrics2, gboost_metrics2,
]

fig = go.Figure()
for name, metrics in zip(model_name, model_metrics):
    # Each metric name occurs once per trace, so histfunc='max' just draws its value
    metric_labels = list(metrics.keys())
    metric_values = list(metrics.values())
    fig.add_trace(go.Histogram(x=metric_labels, y=metric_values, histfunc='max', name=name))

general_layout = {
    'font_size': 25,
    'font_color': 'black',
    'title': "Model Evaluation",
    'plot_bgcolor': 'rgba(250,250,250,1)',
    'width': 1100,
    'height': 550,
}
legend_layout = {
    'legend_title': "Models",
    'legend_font_size': 25,
    'legend_x': 1.02,
    'legend_y': 1,
    'legend_itemsizing': 'trace',
    'legend_itemwidth': 100,
}
xaxis_layout = {
    'xaxis_title': "WP6++ + generated data (40%)",
    'xaxis_nticks': 7,
    'xaxis_ticklen': 16,
    'xaxis_tickwidth': 3,
    'xaxis_ticks': 'outside',
}
yaxis_layout = {
    'yaxis_title': "Error",
    'yaxis_nticks': 10,
    'yaxis_ticklen': 16,
    'yaxis_tickwidth': 3,
    'yaxis_ticks': 'outside',
}
fig.update_layout(**general_layout, **legend_layout, **xaxis_layout, **yaxis_layout)

# Thin black frame around the whole plotting area (paper coordinates 0..1)
fig.add_shape(
    type="rect",
    xref="paper", yref="paper",
    x0=0, y0=0, x1=1.0, y1=1.0,
    line=dict(color="black", width=1),
)
fig.layout.font.family = 'sans-serif'

fig.show()

> A bit better, but still worse than metrics without generated data

### Table of metrics

In [None]:
# One row per metric (dict order: MSE, MAE, RMSE), one column per model
data = [
    [list(model_metrics[model_idx].values())[metric_idx] for model_idx in range(6)]
    for metric_idx in range(3)
]

WP6_add_40_metrics = pd.DataFrame(data, columns=model_name[:len(data[0])])

WP6_add_40_metrics

Unnamed: 0,Ridge,Lasso,ElasticNet,RForest,kNN,Gradient Boosting
0,0.034602,0.037515,0.037515,0.048404,0.041582,0.193168
1,0.125902,0.135195,0.135195,0.176161,0.107783,0.329933
2,0.186016,0.193688,0.193688,0.220009,0.203917,0.439509


# Ratio of 30:33 (48%)

In [None]:
# Append every generated row below the real rows (real rows keep the top positions)
df = pd.concat([real_df, gen_df], axis=0, ignore_index=True)
df.shape

(63, 13)

In [None]:
# Separate the target column from the descriptors
X3 = df.drop(columns=['Ka_mean'])
y3 = df["Ka_mean"]
print(f'General quantity of samples: {X3.shape[0]}')
print(f'Quantity of features: {X3.shape[1]}')

# Ordered (unshuffled) 90/10 split. random_state is left out on purpose: it only
# seeds the shuffling step, so it has no effect when shuffle=False.
# NOTE(review): df lists the real rows first and the generated rows last, so an
# unshuffled tail hold-out is drawn from the generated samples whenever it is
# smaller than the generated block -- confirm that scoring on synthetic rows is intended.
X_train3, X_test3, y_train3, y_test3 = train_test_split(X3, y3, test_size=0.1, shuffle=False)
print(f"Length of training data: {len(X_train3)}")
print(f"Length of testing data: {len(X_test3)}")

General quantity of samples: 63
Quantity of features: 12
Length of training data: 56
Length of testing data: 7


## Building machine learning models

### Linear Regression

In [None]:
# Ordinary least squares; with an empty grid GridSearchCV only runs 5-fold CV and refits
Lin_regressor3 = LinearRegression(n_jobs=-1)
grid_search_cv_linear3 = GridSearchCV(estimator=Lin_regressor3, param_grid={}, cv=5)
grid_search_cv_linear3.fit(X_train3, y_train3.to_numpy())

# Model predictions
Linear_best_reg3 = grid_search_cv_linear3.best_estimator_
y_pred_Linear_train3 = Linear_best_reg3.predict(X_train3)
y_pred_Linear_test3 = Linear_best_reg3.predict(X_test3)

# Calculating metrics
linear_train_metrics3 = quality_metrics(y_true=y_train3, y_pred=y_pred_Linear_train3)
lin_reg_metrics3 = quality_metrics(y_true=y_test3, y_pred=y_pred_Linear_test3)

for name_metric, error in linear_train_metrics3.items():
    print(f'Train {name_metric}: {error:.5f}')
print()
for name_metric, error in lin_reg_metrics3.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.02972
Train MAE: 0.12757
Train RMSE: 0.17240

Test MSE: 0.04884
Test MAE: 0.15412
Test RMSE: 0.22100


### Ridge Regression

In [None]:
# Ridge regression: tune the penalty strength alpha over a log-spaced grid with 5-fold CV
Ridge_regressor3 = Ridge()
ridge_reg_parameters3 = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_ridge3 = GridSearchCV(Ridge_regressor3, ridge_reg_parameters3, cv=5)
grid_search_cv_ridge3.fit(X_train3, y_train3)

# Only the last expression of a cell is displayed, so the selected
# hyper-parameters have to be printed explicitly to be visible.
print(f'Best params: {grid_search_cv_ridge3.best_params_}')
print()

# Model predictions
Ridge_best_reg3 = grid_search_cv_ridge3.best_estimator_
y_pred_Ridge_train3 = Ridge_best_reg3.predict(X_train3)
y_pred_Ridge_test3 = Ridge_best_reg3.predict(X_test3)

# Calculating metrics
for name_metric, error in quality_metrics(y_true=y_train3, y_pred=y_pred_Ridge_train3).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
ridge_metrics3 = quality_metrics(y_true=y_test3, y_pred=y_pred_Ridge_test3)
for name_metric, error in ridge_metrics3.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.04150
Train MAE: 0.13108
Train RMSE: 0.20372

Test MSE: 0.01474
Test MAE: 0.08144
Test RMSE: 0.12139


### Lasso Regression

In [None]:
# Lasso regression: tune the penalty strength alpha over a log-spaced grid with 5-fold CV
Lasso_regressor3 = Lasso()
lasso_reg_parameters3 = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_lasso3 = GridSearchCV(Lasso_regressor3, lasso_reg_parameters3, cv=5)
grid_search_cv_lasso3.fit(X_train3, y_train3)

# Only the last expression of a cell is displayed, so the selected
# hyper-parameters have to be printed explicitly to be visible.
print(f'Best params: {grid_search_cv_lasso3.best_params_}')
print()

# Model predictions
Lasso_best_reg3 = grid_search_cv_lasso3.best_estimator_
y_pred_Lasso_train3 = Lasso_best_reg3.predict(X_train3)
y_pred_Lasso_test3 = Lasso_best_reg3.predict(X_test3)

# Calculating metrics
for name_metric, error in quality_metrics(y_true=y_train3, y_pred=y_pred_Lasso_train3).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
lasso_metrics3 = quality_metrics(y_true=y_test3, y_pred=y_pred_Lasso_test3)
for name_metric, error in lasso_metrics3.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.06419
Train MAE: 0.17081
Train RMSE: 0.25336

Test MSE: 0.00756
Test MAE: 0.08134
Test RMSE: 0.08697


### ElasticNet Regression

In [None]:
# ElasticNet: tune the penalty strength alpha over a log-spaced grid with 5-fold CV
# (the L1/L2 mixing ratio is left at the estimator's default)
ElasticNet_regressor3 = ElasticNet()
elasticnet_reg_parameters3 = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_elasticnet3 = GridSearchCV(ElasticNet_regressor3, elasticnet_reg_parameters3, cv=5)
grid_search_cv_elasticnet3.fit(X_train3, y_train3)

# Only the last expression of a cell is displayed, so the selected
# hyper-parameters have to be printed explicitly to be visible.
print(f'Best params: {grid_search_cv_elasticnet3.best_params_}')
print()

# Model predictions
Elasticnet_best_reg3 = grid_search_cv_elasticnet3.best_estimator_
y_pred_Elasticnet_train3 = Elasticnet_best_reg3.predict(X_train3)
y_pred_Elasticnet_test3 = Elasticnet_best_reg3.predict(X_test3)

# Calculating metrics
for name_metric, error in quality_metrics(y_true=y_train3, y_pred=y_pred_Elasticnet_train3).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
elasticnet_metrics3 = quality_metrics(y_true=y_test3, y_pred=y_pred_Elasticnet_test3)
for name_metric, error in elasticnet_metrics3.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.06419
Train MAE: 0.17081
Train RMSE: 0.25336

Test MSE: 0.00756
Test MAE: 0.08134
Test RMSE: 0.08697


### Random Forest

In [None]:
# Random forest (50 trees, fixed seed): tune max_depth over 4, 6, ..., 18 with 3-fold CV
Forest_regressor3 = RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42)
forest_reg_parameters3 = {'max_depth': np.arange(4, 20, 2)}
grid_search_cv_forest3 = GridSearchCV(Forest_regressor3, forest_reg_parameters3, cv=3)
grid_search_cv_forest3.fit(X_train3, y_train3)

# Only the last expression of a cell is displayed, so the selected
# hyper-parameters have to be printed explicitly to be visible.
print(f'Best params: {grid_search_cv_forest3.best_params_}')
print()

# Model predictions
Forest_best_reg3 = grid_search_cv_forest3.best_estimator_
y_pred_Forest_train3 = Forest_best_reg3.predict(X_train3)
y_pred_Forest_test3 = Forest_best_reg3.predict(X_test3)

# Calculating metrics
for name_metric, error in quality_metrics(y_true=y_train3, y_pred=y_pred_Forest_train3).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
rforest_metrics3 = quality_metrics(y_true=y_test3, y_pred=y_pred_Forest_test3)
for name_metric, error in rforest_metrics3.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.01491
Train MAE: 0.06312
Train RMSE: 0.12211

Test MSE: 0.00854
Test MAE: 0.07532
Test RMSE: 0.09240


### k-NN Regression

In [None]:
# k-NN regression: tune n_neighbors over 4, 6, ..., 18 with 5-fold CV
# (the original note records np.arange(2, 6, 1) as an alternative grid)
KNeighbors_regressor3 = KNeighborsRegressor()
kNN_reg_parameters3 = {'n_neighbors': np.arange(4, 20, 2)}
grid_search_cv_kNN3 = GridSearchCV(KNeighbors_regressor3, kNN_reg_parameters3, cv=5)
grid_search_cv_kNN3.fit(X_train3, y_train3)

# Only the last expression of a cell is displayed, so the selected
# hyper-parameters have to be printed explicitly to be visible.
print(f'Best params: {grid_search_cv_kNN3.best_params_}')
print()

# Model predictions
kNN_best_reg3 = grid_search_cv_kNN3.best_estimator_
y_pred_kNN_train3 = kNN_best_reg3.predict(X_train3)
y_pred_kNN_test3 = kNN_best_reg3.predict(X_test3)

# Calculating metrics
for name_metric, error in quality_metrics(y_true=y_train3, y_pred=y_pred_kNN_train3).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
knn_metrics3 = quality_metrics(y_true=y_test3, y_pred=y_pred_kNN_test3)
for name_metric, error in knn_metrics3.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.05150
Train MAE: 0.12308
Train RMSE: 0.22693

Test MSE: 0.01069
Test MAE: 0.06144
Test RMSE: 0.10338


### Boosting

In [None]:
# Gradient boosting with fixed hyper-parameters; the empty grid means GridSearchCV
# only cross-validates this single configuration and refits it on the training split.
# random_state is pinned (same seed as the random forest) so repeated runs give the same fit.
# NOTE(review): max_depth=18 looks very deep for a training set this small -- consider tuning it.
GBoosting_regressor3 = GradientBoostingRegressor(n_estimators=50, max_depth=18, random_state=42)
gboost_reg_parameters3 = {}
grid_search_cv_gboost3 = GridSearchCV(GBoosting_regressor3, gboost_reg_parameters3, cv=5)
grid_search_cv_gboost3.fit(X_train3, y_train3)

# Model predictions
GBoost_best_reg3 = grid_search_cv_gboost3.best_estimator_
y_pred_gboost_train3 = GBoost_best_reg3.predict(X_train3)
y_pred_gboost_test3 = GBoost_best_reg3.predict(X_test3)

# Calculating metrics
for name_metric, error in quality_metrics(y_true=y_train3, y_pred=y_pred_gboost_train3).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
gboost_metrics3 = quality_metrics(y_true=y_test3, y_pred=y_pred_gboost_test3)
for name_metric, error in gboost_metrics3.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.00865
Train MAE: 0.01837
Train RMSE: 0.09302

Test MSE: 0.00332
Test MAE: 0.05230
Test RMSE: 0.05760


## Evaluation

In [None]:
# Test-set errors (MSE / MAE / RMSE) of the tuned models, one bar per model and metric
model_name = ['Ridge', "Lasso", "ElasticNet", 'RForest', 'kNN', 'Gradient Boosting']
model_metrics = [
    ridge_metrics3, lasso_metrics3, elasticnet_metrics3,
    rforest_metrics3, knn_metrics3, gboost_metrics3,
]

fig = go.Figure()
for name, metrics in zip(model_name, model_metrics):
    # Each metric name occurs once per trace, so histfunc='max' just draws its value
    metric_labels = list(metrics.keys())
    metric_values = list(metrics.values())
    fig.add_trace(go.Histogram(x=metric_labels, y=metric_values, histfunc='max', name=name))

general_layout = {
    'font_size': 25,
    'font_color': 'black',
    'title': "Model Evaluation",
    'plot_bgcolor': 'rgba(250,250,250,1)',
    'width': 1100,
    'height': 550,
}
legend_layout = {
    'legend_title': "Models",
    'legend_font_size': 25,
    'legend_x': 1.02,
    'legend_y': 1,
    'legend_itemsizing': 'trace',
    'legend_itemwidth': 100,
}
xaxis_layout = {
    'xaxis_title': "WP6++ + generated data (48%)",
    'xaxis_nticks': 7,
    'xaxis_ticklen': 16,
    'xaxis_tickwidth': 3,
    'xaxis_ticks': 'outside',
}
yaxis_layout = {
    'yaxis_title': "Error",
    'yaxis_nticks': 10,
    'yaxis_ticklen': 16,
    'yaxis_tickwidth': 3,
    'yaxis_ticks': 'outside',
}
fig.update_layout(**general_layout, **legend_layout, **xaxis_layout, **yaxis_layout)

# Thin black frame around the whole plotting area (paper coordinates 0..1)
fig.add_shape(
    type="rect",
    xref="paper", yref="paper",
    x0=0, y0=0, x1=1.0, y1=1.0,
    line=dict(color="black", width=1),
)
fig.layout.font.family = 'sans-serif'

fig.show()

> Original WP6++ metrics still better :)

> Moreover, the picture here is ambiguous: the metrics look good, but it feels as though the data is already so made up and smeared that the target metric is already maximally inaccurate in validation

### Table of metrics

In [None]:
# One row per metric (dict order: MSE, MAE, RMSE), one column per model
data = [
    [list(model_metrics[model_idx].values())[metric_idx] for model_idx in range(6)]
    for metric_idx in range(3)
]

WP6_add_48_metrics = pd.DataFrame(data, columns=model_name[:len(data[0])])

WP6_add_48_metrics

Unnamed: 0,Ridge,Lasso,ElasticNet,RForest,kNN,Gradient Boosting
0,0.014735,0.007564,0.007564,0.008538,0.010688,0.003317
1,0.081436,0.081339,0.081339,0.075318,0.061435,0.0523
2,0.121389,0.08697,0.08697,0.092403,0.103383,0.057597


# General Table

In [None]:
# Stack the three per-ratio tables: rows 0-2 come from the 25 % table, 3-5 from the
# 40 % table and 6-8 from the 48 % table (the original row labels are discarded).
metric_tables = [WP6_add_25_metrics, WP6_add_40_metrics, WP6_add_48_metrics]
General_WP6_add_gen = pd.concat(metric_tables, ignore_index=True)
General_WP6_add_gen.to_csv("METRICS_WP6++_gen.csv")
General_WP6_add_gen

Unnamed: 0,Ridge,Lasso,ElasticNet,RForest,kNN,Gradient Boosting
0,0.042166,0.065572,0.046648,0.083917,0.046445,0.165381
1,0.129815,0.15929,0.140007,0.188302,0.135751,0.269709
2,0.205345,0.256071,0.215981,0.289684,0.21551,0.406671
3,0.034602,0.037515,0.037515,0.048404,0.041582,0.193168
4,0.125902,0.135195,0.135195,0.176161,0.107783,0.329933
5,0.186016,0.193688,0.193688,0.220009,0.203917,0.439509
6,0.014735,0.007564,0.007564,0.008538,0.010688,0.003317
7,0.081436,0.081339,0.081339,0.075318,0.061435,0.0523
8,0.121389,0.08697,0.08697,0.092403,0.103383,0.057597


So, in general, the metrics obtained here are worse than the metrics obtained by training the models on WP6++ only.