# Importing libraries

In [None]:
# Attach Google Drive to the Colab runtime; the dataset is read from this mount.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# General Libraries
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from plotly import graph_objs as go

# Algorithms (Regression)
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as XGBoost

# Preprocessing / Feature Selection / Model Selection / Metrics
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

from multiprocessing import Pool, Process
import random

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): a blanket "ignore" also hides any fit-failure or convergence
# warnings emitted during the grid searches below -- consider narrowing it.
warnings.filterwarnings("ignore")
# Render matplotlib figures inline in the cell output.
%matplotlib inline

# Raise the pandas display limit to 200 columns for wide frames.
pd.options.display.max_columns = 200

# Functions

In [None]:
def quality_metrics(y_true: "pd.Series | np.ndarray", y_pred: "pd.Series | np.ndarray") -> dict:
  """Compute the regression error metrics reported throughout this notebook.

  Parameters
  ----------
  y_true : array-like
      Ground-truth target values (the callers pass a pandas Series).
  y_pred : array-like
      Model predictions, one per entry of ``y_true``.

  Returns
  -------
  dict
      Keys ``'MSE'``, ``'MAE'`` and ``'RMSE'``, inserted in that order.
      Downstream cells rely on this key order when they build tables.
  """
  mse = mean_squared_error(y_true, y_pred)
  return {
      'MSE': mse,
      'MAE': mean_absolute_error(y_true, y_pred),
      # RMSE is derived from the MSE computed above rather than recomputed.
      'RMSE': np.sqrt(mse),
  }

# Importing WP6 Dataset

In [None]:
# Load the WP6 descriptor table from the mounted Drive and discard the
# "Unnamed: 0" column (presumably a row index saved by an earlier to_csv call).
df = (
    pd.read_csv("/content/drive/MyDrive/Files For Project/AtomPairsData.csv")
    .drop(columns=["Unnamed: 0"])
)
df

Unnamed: 0,304,209,SMR_VSA4,PEOE_VSA6,FpDensityMorgan2,MinEStateIndex,VSA_EState1,fr_NH0,MinAbsEStateIndex,FpDensityMorgan1,VSA_EState9,MaxAbsEStateIndex,Ka_mean
0,1,0,0.0,0.465948,0.09699,0.905499,0.0,0.0,0.585935,0.070404,0.048112,0.220982,0.00042
1,1,0,0.5,0.0,0.070652,0.961411,0.077888,1.0,0.83092,0.098517,0.666724,0.027884,0.441126
2,1,0,0.5,0.0,1.0,0.92014,0.0,0.0,0.650086,0.756356,0.048112,0.209927,0.006185
3,1,0,0.0,0.203617,0.608696,0.625769,0.0,0.0,0.580032,1.0,0.0,0.178277,0.004812
4,1,0,0.457509,0.143555,0.06087,0.859423,0.071555,1.0,0.384046,0.025424,1.0,0.049802,0.0
5,1,0,0.0,0.199692,0.391304,0.909123,0.0,0.0,0.601815,0.350282,0.048112,0.207721,0.006969
6,1,0,0.0,0.066564,0.664596,0.912327,0.0,0.0,0.615851,0.610169,0.048112,0.197049,0.001184
7,1,0,0.0,0.314447,0.815857,0.0,1.0,0.5,0.195552,0.81655,0.047135,0.880993,0.000138
8,1,0,0.5,0.0,0.73913,0.708749,0.0,0.0,0.186018,0.707627,0.048112,0.851055,0.000273
9,0,0,0.0,0.251558,0.119565,0.948884,0.060886,1.0,0.77603,0.17161,0.912318,0.014492,0.156787


In [None]:
# Separate the regression target (Ka_mean) from the feature columns.
X = df.drop(columns=['Ka_mean'])
y = df["Ka_mean"]
print(f'General quantity of samples: {X.shape[0]}')
print(f'Quantity of features: {X.shape[1]}')

# shuffle=False keeps the file order, so the trailing 10% of rows become the
# test set. `random_state` is intentionally not passed: scikit-learn only uses
# it to seed shuffling, so with shuffle=False it had no effect and merely
# suggested a randomised split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False)

# Sanity check: the split must partition the data without dropping rows.
assert len(X_train) + len(X_test) == len(X)
print(f"Length of training data: {len(X_train)}")
print(f"Length of testing data: {len(X_test)}")

General quantity of samples: 21
Quantity of features: 12
Length of training data: 18
Length of testing data: 3


# Building machine learning models

## Linear Regression

In [None]:
# Ordinary least squares has no hyperparameter to tune here; the empty grid
# means GridSearchCV evaluates a single candidate with 5-fold cross-validation.
Lin_regressor = LinearRegression(n_jobs=-1)
grid_search_cv_linear = GridSearchCV(Lin_regressor, {}, cv=5)
grid_search_cv_linear.fit(X_train, y_train.to_numpy())

# Predict on both splits with the estimator kept by the search.
linear_best_reg = grid_search_cv_linear.best_estimator_
y_pred_Linear_train = linear_best_reg.predict(X_train)
y_pred_Linear_test = linear_best_reg.predict(X_test)

# Report train metrics, a blank separator line, then test metrics.
linear_train_metrics = quality_metrics(y_true=y_train, y_pred=y_pred_Linear_train)
for metric_label, metric_value in linear_train_metrics.items():
    print(f'Train {metric_label}: {metric_value:.5f}')

print()
# The test metrics are kept in `lin_reg_metrics` for the summary table.
lin_reg_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Linear_test)
for metric_label, metric_value in lin_reg_metrics.items():
    print(f'Test {metric_label}: {metric_value:.5f}')

Train MSE: 0.00448
Train MAE: 0.05473
Train RMSE: 0.06696

Test MSE: 0.91353
Test MAE: 0.95570
Test RMSE: 0.95579


## Ridge Regression

In [None]:
# Search the L2 penalty strength over 20 log-spaced values in [1e-10, 1e1]
# using 5-fold cross-validation.
Ridge_regressor = Ridge()
ridge_reg_parameters = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_ridge = GridSearchCV(Ridge_regressor, ridge_reg_parameters, cv=5)
grid_search_cv_ridge.fit(X_train, y_train)

# Predictions from the best estimator found by the search.
Ridge_best_reg = grid_search_cv_ridge.best_estimator_
y_pred_Ridge_train = Ridge_best_reg.predict(X_train)
y_pred_Ridge_test = Ridge_best_reg.predict(X_test)

# The grid holds a single hyperparameter, so its first (only) item is the winner.
ridge_best_key, ridge_best_value = next(iter(grid_search_cv_ridge.best_params_.items()))
print(f'The best hyperparameters: {ridge_best_key} = {ridge_best_value}', end='\n\n')

# Report train metrics, a blank separator line, then test metrics.
ridge_train_metrics = quality_metrics(y_true=y_train, y_pred=y_pred_Ridge_train)
for metric_label, metric_value in ridge_train_metrics.items():
    print(f'Train {metric_label}: {metric_value:.5f}')

print()
# The test metrics are kept in `ridge_metrics` for the evaluation section.
ridge_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Ridge_test)
for metric_label, metric_value in ridge_metrics.items():
    print(f'Test {metric_label}: {metric_value:.5f}')

The best hyperparameters: alpha = 2.636650898730366

Train MSE: 0.02268
Train MAE: 0.08440
Train RMSE: 0.15060

Test MSE: 0.17028
Test MAE: 0.28076
Test RMSE: 0.41265


## Lasso Regression

In [None]:
# Search the L1 penalty strength over 20 log-spaced values in [1e-10, 1e1]
# using 5-fold cross-validation.
Lasso_regressor = Lasso()
lasso_reg_parameters = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_lasso = GridSearchCV(Lasso_regressor, lasso_reg_parameters, cv=5)
grid_search_cv_lasso.fit(X_train, y_train)

# Predictions from the best estimator found by the search.
Lasso_best_reg = grid_search_cv_lasso.best_estimator_
y_pred_Lasso_train = Lasso_best_reg.predict(X_train)
y_pred_Lasso_test = Lasso_best_reg.predict(X_test)

# The grid holds a single hyperparameter; unpack its (name, value) pair once
# instead of indexing separate key and value lists.
lasso_best_key, lasso_best_value = next(iter(grid_search_cv_lasso.best_params_.items()))
print(f'The best hyperparameters: {lasso_best_key} = {lasso_best_value}')
print()

# Report train metrics, a blank separator line, then test metrics.
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Lasso_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
# The test metrics are kept in `lasso_metrics` for the evaluation section.
lasso_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Lasso_test)
for name_metric, error in lasso_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

The best hyperparameters: alpha = 0.012742749857031322

Train MSE: 0.02152
Train MAE: 0.08332
Train RMSE: 0.14669

Test MSE: 0.20825
Test MAE: 0.30674
Test RMSE: 0.45634


## ElasticNet Regression

In [None]:
# Search the ElasticNet penalty strength over 20 log-spaced values in
# [1e-10, 1e1] using 5-fold cross-validation; only `alpha` is in the grid.
ElasticNet_regressor = ElasticNet()
elasticnet_reg_parameters = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_elasticnet = GridSearchCV(ElasticNet_regressor, elasticnet_reg_parameters, cv=5)
grid_search_cv_elasticnet.fit(X_train, y_train)

# Predictions from the best estimator found by the search.
Elasticnet_best_reg = grid_search_cv_elasticnet.best_estimator_
y_pred_Elasticnet_train = Elasticnet_best_reg.predict(X_train)
y_pred_Elasticnet_test = Elasticnet_best_reg.predict(X_test)

# The grid holds a single hyperparameter; unpack its (name, value) pair once
# instead of indexing separate key and value lists.
elasticnet_best_key, elasticnet_best_value = next(iter(grid_search_cv_elasticnet.best_params_.items()))
print(f'The best hyperparameters: {elasticnet_best_key} = {elasticnet_best_value}')
print()

# Report train metrics, a blank separator line, then test metrics.
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Elasticnet_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
# The test metrics are kept in `elasticnet_metrics` for the evaluation section.
elasticnet_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Elasticnet_test)
for name_metric, error in elasticnet_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

The best hyperparameters: alpha = 0.04832930238571752

Train MSE: 0.02903
Train MAE: 0.09811
Train RMSE: 0.17037

Test MSE: 0.24185
Test MAE: 0.33168
Test RMSE: 0.49179


## Random Forest

In [None]:
# 50-tree forest with a fixed seed; tree depth is searched over 4, 6, ..., 18
# with 3-fold cross-validation.
Forest_regressor = RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42)
forest_reg_parameters = {'max_depth': np.arange(4, 20, 2)}
grid_search_cv_forest = GridSearchCV(Forest_regressor, forest_reg_parameters, cv=3)
grid_search_cv_forest.fit(X_train, y_train)

# Predictions from the best estimator found by the search.
Forest_best_reg = grid_search_cv_forest.best_estimator_
y_pred_Forest_train = Forest_best_reg.predict(X_train)
y_pred_Forest_test = Forest_best_reg.predict(X_test)

# The grid holds a single hyperparameter; unpack its (name, value) pair once
# instead of indexing separate key and value lists.
forest_best_key, forest_best_value = next(iter(grid_search_cv_forest.best_params_.items()))
print(f'The best hyperparameters: {forest_best_key} = {forest_best_value}')
print()

# Report train metrics, a blank separator line, then test metrics.
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Forest_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
# The test metrics are kept in `rforest_metrics` for the evaluation section.
rforest_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Forest_test)
for name_metric, error in rforest_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

The best hyperparameters: max_depth = 8

Train MSE: 0.00801
Train MAE: 0.04799
Train RMSE: 0.08952

Test MSE: 0.25622
Test MAE: 0.33458
Test RMSE: 0.50619


## k-NN Regression

In [None]:
# Creating and fitting model
KNN_CV_FOLDS = 5
KNeighbors_regressor = KNeighborsRegressor()

# k-NN cannot query more neighbours than it was fitted on. During K-fold CV
# the model only sees len(X_train) minus one validation fold, so the grid is
# capped at the smallest such training fold. Without the cap the larger
# candidates fail at scoring time, and the notebook-wide warning filter
# hides those failures.
# (An earlier alternative grid noted in this cell was np.arange(2, 6, 1).)
largest_validation_fold = int(np.ceil(len(X_train) / KNN_CV_FOLDS))
max_valid_neighbors = len(X_train) - largest_validation_fold
kNN_reg_parameters = {'n_neighbors': np.arange(4, min(20, max_valid_neighbors + 1), 2)}

grid_search_cv_kNN = GridSearchCV(KNeighbors_regressor, kNN_reg_parameters, cv=KNN_CV_FOLDS)
grid_search_cv_kNN.fit(X_train, y_train)

# Predictions from the best estimator found by the search.
kNN_best_reg = grid_search_cv_kNN.best_estimator_
y_pred_kNN_train = kNN_best_reg.predict(X_train)
y_pred_kNN_test = kNN_best_reg.predict(X_test)

# The grid holds a single hyperparameter; unpack its (name, value) pair once
# instead of indexing separate key and value lists.
kNN_best_key, kNN_best_value = next(iter(grid_search_cv_kNN.best_params_.items()))
print(f'The best hyperparameters: {kNN_best_key} = {kNN_best_value}')
print()

# Report train metrics, a blank separator line, then test metrics.
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_kNN_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
# The test metrics are kept in `knn_metrics` for the evaluation section.
knn_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_kNN_test)
for name_metric, error in knn_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

The best hyperparameters: n_neighbors = 8

Train MSE: 0.03550
Train MAE: 0.08388
Train RMSE: 0.18842

Test MSE: 0.22040
Test MAE: 0.28022
Test RMSE: 0.46947


## Boosting

In [None]:
# Seed both global RNGs before fitting.
# NOTE(review): the estimator below is given no random_state, so it presumably
# relies on the NumPy global seed set here for repeatable results -- confirm.
random.seed(42)
np.random.seed(42)

# Fixed configuration (50 estimators, depth 18); the empty grid means the
# search only cross-validates this single candidate with 5 folds.
GBoosting_regressor = GradientBoostingRegressor(max_depth=18, n_estimators=50)
gboost_reg_parameters = {}
grid_search_cv_gboost = GridSearchCV(GBoosting_regressor, gboost_reg_parameters, cv=5)
grid_search_cv_gboost.fit(X_train, y_train)

# Predictions from the estimator kept by the search.
GBoost_best_reg = grid_search_cv_gboost.best_estimator_
y_pred_gboost_train = GBoost_best_reg.predict(X_train)
y_pred_gboost_test = GBoost_best_reg.predict(X_test)

# Report train metrics, a blank separator line, then test metrics.
gboost_train_metrics = quality_metrics(y_true=y_train, y_pred=y_pred_gboost_train)
for metric_label, metric_value in gboost_train_metrics.items():
    print(f'Train {metric_label}: {metric_value:.5f}')

print()
# The test metrics are kept in `gboost_metrics` for the evaluation section.
gboost_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_gboost_test)
for metric_label, metric_value in gboost_metrics.items():
    print(f'Test {metric_label}: {metric_value:.5f}')

Train MSE: 0.00000
Train MAE: 0.00069
Train RMSE: 0.00112

Test MSE: 0.32325
Test MAE: 0.34138
Test RMSE: 0.56855


# Evaluation

In [None]:
# Display names and test-set metric dicts, in matching order. Both lists are
# reused by the metrics table further down. Linear Regression is not included.
model_name = ['Ridge', "Lasso","ElasticNet", 'RForest', 'kNN', 'Gradient Boosting']
model_metrics = [ridge_metrics,lasso_metrics, elasticnet_metrics, rforest_metrics, knn_metrics, gboost_metrics]

# One trace per model: x holds the metric names, y the corresponding errors.
fig = go.Figure()
fig.add_traces([
    go.Histogram(
        histfunc='max',
        x=list(model_scores.keys()),
        y=list(model_scores.values()),
        name=model_label,
    )
    for model_label, model_scores in zip(model_name, model_metrics)
])

# Figure-wide styling, grouped by the layout component it configures.
layout_options = dict(
    title="Model Evaluation",
    width=1100,
    height=550,
    plot_bgcolor='rgba(250,250,250,1)',
    # Fonts
    font_size=25,
    font_color='black',
    font_family='sans-serif',
    # Legend
    legend_title="Models",
    legend_font_size=25,
    legend_x=1.02,
    legend_y=1,
    legend_itemsizing='trace',
    legend_itemwidth=100,
    # X-axis
    xaxis_title="Metrics WP6 Dataset",
    xaxis_nticks=7,
    xaxis_ticklen=16,
    xaxis_tickwidth=3,
    xaxis_ticks='outside',
    # Y-axis
    yaxis_title="Error",
    yaxis_nticks=10,
    yaxis_ticklen=16,
    yaxis_tickwidth=3,
    yaxis_ticks='outside',
)
fig.update_layout(**layout_options)

# Thin black rectangle spanning the whole plotting area (paper coordinates),
# acting as a frame around the plot.
fig.add_shape(
    type="rect",
    xref="paper",
    yref="paper",
    x0=0,
    y0=0,
    x1=1.0,
    y1=1.0,
    line={"color": "black", "width": 1},
)

fig.show()

# General Tables

In [None]:
# Best hyperparameters table
# The boosting row is labelled "Gradient Boosting" because the fitted model is
# scikit-learn's GradientBoostingRegressor, not XGBoost. Its settings are read
# back from the fitted estimator so the table cannot drift from the model cell.
gboost_fixed_settings = [GBoost_best_reg.n_estimators, GBoost_best_reg.max_depth]

hyperparams_dict = {
    "Name of ML model": [
        "Linear Regression", "Ridge", "Lasso", "ElasticNet",
        "Random Forest", "kNN Regression", "Gradient Boosting",
    ],
    "Tuning Hyperparameter": [
        "-", "Alpha", "Alpha", "Alpha",
        "Max Depth", "Number of Neighbors", "Number of Estimators, Max Depth",
    ],
    # Linear Regression has no tuned value, hence None.
    "Value": [
        None, ridge_best_value, lasso_best_value, elasticnet_best_value,
        forest_best_value, kNN_best_value, gboost_fixed_settings,
    ],
}

best_hyperparams = pd.DataFrame(hyperparams_dict)
best_hyperparams

Unnamed: 0,Name of ML model,Tuning Hyperparameter,Value
0,Linear Regression,-,
1,Ridge,Alpha,2.636651
2,Lasso,Alpha,0.012743
3,ElasticNet,Alpha,0.048329
4,Random Forest,Max Depth,8
5,kNN Regression,Number of Neighbors,8
6,XGBoost,"Number of Estimators, Max Depth","[50, 18]"


In [None]:
# Metrics table
# Rows are metrics and columns are models. The table is driven by the metric
# names and by `model_name` / `model_metrics`, not by hard-coded counts, so
# adding a model or a metric cannot silently truncate it.
metric_order = ['MSE', 'MAE', 'RMSE']
assert len(model_name) == len(model_metrics), "every metrics dict needs a display name"

data = [[scores[metric] for scores in model_metrics] for metric in metric_order]

WP6_metrics = pd.DataFrame(data, columns=model_name)
# Prepend the descriptive columns and the Linear Regression baseline so the
# final order is: Metric, Dataset, Linear, <tuned models...>.
WP6_metrics.insert(0, 'Linear', [lin_reg_metrics[metric] for metric in metric_order])
WP6_metrics.insert(0, 'Dataset', 'WP6')
WP6_metrics.insert(0, 'Metric', metric_order)

# NOTE(review): the row index is written too (pandas default), so the file
# gains an unnamed leading column when read back -- confirm this is intended.
WP6_metrics.to_csv("METRICS_WP6.csv")
WP6_metrics

Unnamed: 0,Metric,Dataset,Linear,Ridge,Lasso,ElasticNet,RForest,kNN,Gradient Boosting
0,MSE,WP6,0.913529,0.170277,0.208246,0.241854,0.256225,0.220398,0.323253
1,MAE,WP6,0.955701,0.280757,0.306738,0.331683,0.334577,0.280216,0.341377
2,RMSE,WP6,0.955787,0.412646,0.45634,0.491786,0.506187,0.469465,0.568553
