# Importing libraries

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# General Libraries
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from plotly import graph_objs as go

# Algorithms (Regression)
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as XGBoost

# Preprocessing / Feature Selection / Model Selection / Metrics
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

from multiprocessing import Pool, Process

import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

pd.options.display.max_columns = 200

# Function

In [3]:
def quality_metrics(y_true: float, y_pred: float) -> dict:
  dict_metrics = dict()
  dict_metrics['MSE'] = mean_squared_error(y_true, y_pred)
  dict_metrics['MAE'] = mean_absolute_error(y_true, y_pred)
  dict_metrics['RMSE'] = np.sqrt(dict_metrics['MSE'])
  return dict_metrics

# Importring WP6++ Dataset

In [4]:
df = pd.read_csv("/content/drive/MyDrive/Files For Project/AtomPairsData++.csv")
df = df.drop(["Unnamed: 0"], axis = 1)
df

Unnamed: 0,209,193,SMR_VSA4,PEOE_VSA9,VSA_EState9,SlogP_VSA2,FpDensityMorgan2,FpDensityMorgan3,PEOE_VSA6,MinEStateIndex,VSA_EState5,Chi0v,Ka_mean
0,0,0,0.0,0.111125,0.383244,0.095394,0.09699,0.243292,0.332359,0.90494,0.062805,0.189597,0.0005096569
1,0,0,0.5,1.0,0.784061,1.0,0.070652,0.123547,0.0,0.955733,0.542489,0.307724,0.4411764
2,0,0,0.5,0.111125,0.383244,0.095394,1.0,0.968605,0.0,0.91824,0.546644,0.020002,0.006274364
3,0,0,0.0,0.113446,0.352071,0.224863,0.608696,0.434884,0.145239,0.650821,0.062805,0.048899,0.004901814
4,0,0,0.457509,0.895845,1.0,0.984119,0.06087,0.114651,0.102398,0.863082,0.062805,0.461054,9.004903e-05
5,0,0,0.0,0.111125,0.383244,0.095394,0.391304,0.632558,0.14244,0.908232,0.062805,0.063199,0.007058678
6,0,0,0.0,0.111125,0.383244,0.095394,0.664596,0.689037,0.04748,0.911142,0.062805,0.0,0.001274363
7,0,0,0.0,0.565724,0.382611,0.692287,0.815857,1.0,0.224294,0.082344,0.0,0.281175,0.0002282843
8,0,0,0.5,0.204893,0.383244,0.256514,0.73913,0.85,0.0,0.726203,0.428996,0.122074,0.0003625981
9,0,0,0.0,0.77775,0.943188,0.809212,0.119565,0.123547,0.179435,0.944353,0.062805,0.303794,0.1568626


In [5]:
X = df.drop(['Ka_mean'], axis=1)
y = df["Ka_mean"]
print(f'General quantity of samples: {X.shape[0]}')
print(f'Quantity of features: {X.shape[1]}')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False, random_state=42)
print(f"Length of training data: {len(X_train)}")
print(f"Length of testing data: {len(X_test)}")

General quantity of samples: 33
Quantity of features: 12
Length of training data: 29
Length of testing data: 4


# Building machine learning models

## Linear Regression

In [6]:
# Creating and fitting model
Lin_regressor = LinearRegression(n_jobs=-1)
grid_search_cv_linear = GridSearchCV(Lin_regressor, {}, cv=5)
grid_search_cv_linear.fit(X_train, y_train.to_numpy())

# Model predictions
y_pred_Linear_train = grid_search_cv_linear.best_estimator_.predict(X_train)
y_pred_Linear_test = grid_search_cv_linear.best_estimator_.predict(X_test)

# Calculating metrics
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Linear_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
lin_reg_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Linear_test)
for name_metric, error in lin_reg_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.02390
Train MAE: 0.09616
Train RMSE: 0.15458

Test MSE: 0.05981
Test MAE: 0.15106
Test RMSE: 0.24455


## Ridge Regression

In [14]:
# Creating and fitting model
Ridge_regressor = Ridge()
ridge_reg_parameters = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_ridge = GridSearchCV(Ridge_regressor, ridge_reg_parameters, cv=5)

grid_search_cv_ridge.fit(X_train, y_train)
grid_search_cv_ridge.best_params_

# Model predictions
Ridge_best_reg = grid_search_cv_ridge.best_estimator_
y_pred_Ridge_train = Ridge_best_reg.predict(X_train)
y_pred_Ridge_test = Ridge_best_reg.predict(X_test)

# Calculating metrics
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Ridge_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
ridge_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Ridge_test)
for name_metric, error in ridge_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.03805
Train MAE: 0.10895
Train RMSE: 0.19506

Test MSE: 0.00454
Test MAE: 0.05871
Test RMSE: 0.06742


## Lasso Regresssion

In [8]:
# Creating and fitting model
Lasso_regressor = Lasso()
lasso_reg_parameters = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_lasso = GridSearchCV(Lasso_regressor, lasso_reg_parameters, cv=5)

grid_search_cv_lasso.fit(X_train, y_train)
grid_search_cv_lasso.best_params_

# Model predictions
Lasso_best_reg = grid_search_cv_lasso.best_estimator_
y_pred_Lasso_train = Lasso_best_reg.predict(X_train)
y_pred_Lasso_test = Lasso_best_reg.predict(X_test)

# Calculating metrics
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Lasso_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
lasso_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Lasso_test)
for name_metric, error in lasso_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.06057
Train MAE: 0.14599
Train RMSE: 0.24611

Test MSE: 0.00771
Test MAE: 0.08782
Test RMSE: 0.08783


## ElasticNet Regression

In [9]:
# Creating and fitting model
ElasticNet_regressor = ElasticNet()
elasticnet_reg_parameters = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_elasticnet = GridSearchCV(ElasticNet_regressor, elasticnet_reg_parameters, cv=5)

grid_search_cv_elasticnet.fit(X_train, y_train)
grid_search_cv_elasticnet.best_params_

# Model predictions
Elasticnet_best_reg = grid_search_cv_elasticnet.best_estimator_
y_pred_Elasticnet_train = Elasticnet_best_reg.predict(X_train)
y_pred_Elasticnet_test = Elasticnet_best_reg.predict(X_test)

# Calculating metrics
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Elasticnet_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
elasticnet_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Elasticnet_test)
for name_metric, error in elasticnet_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.06057
Train MAE: 0.14599
Train RMSE: 0.24611

Test MSE: 0.00771
Test MAE: 0.08782
Test RMSE: 0.08783


## Random Forest

In [10]:
# Creating and fitting model
Forest_regressor = RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42)
forest_reg_parameters = {'max_depth': np.arange(4, 20, 2)}
grid_search_cv_forest = GridSearchCV(Forest_regressor, forest_reg_parameters, cv=3)
grid_search_cv_forest.fit(X_train, y_train)
grid_search_cv_forest.best_params_

# Model predictions
Forest_best_reg = grid_search_cv_forest.best_estimator_
y_pred_Forest_train = Forest_best_reg.predict(X_train)
y_pred_Forest_test = Forest_best_reg.predict(X_test)

# Calculating metrics
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Forest_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
rforest_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Forest_test)
for name_metric, error in rforest_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.02020
Train MAE: 0.06741
Train RMSE: 0.14213

Test MSE: 0.00277
Test MAE: 0.04387
Test RMSE: 0.05261


## k-NN Regression

In [11]:
# Creating and fitting model
KNeighbors_regressor = KNeighborsRegressor()
kNN_reg_parameters = {'n_neighbors': np.arange(4, 20, 2)}                  # <- np.arange(2, 6, 1)
grid_search_cv_kNN = GridSearchCV(KNeighbors_regressor, kNN_reg_parameters, cv=5)
grid_search_cv_kNN.fit(X_train, y_train)
grid_search_cv_kNN.best_params_

# Model predictions
kNN_best_reg = grid_search_cv_kNN.best_estimator_
y_pred_kNN_train = kNN_best_reg.predict(X_train)
y_pred_kNN_test = kNN_best_reg.predict(X_test)

# Calculating metrics
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_kNN_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
knn_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_kNN_test)
for name_metric, error in knn_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.03370
Train MAE: 0.06642
Train RMSE: 0.18359

Test MSE: 0.00004
Test MAE: 0.00511
Test RMSE: 0.00593


## Boosting

In [12]:
# Creating and fitting model
GBoosting_regressor = GradientBoostingRegressor(n_estimators=50, max_depth=18)
gboost_reg_parameters = {}
grid_search_cv_gboost = GridSearchCV(GBoosting_regressor, gboost_reg_parameters, cv=5)
grid_search_cv_gboost.fit(X_train, y_train)
#print(grid_search_cv_gboost.best_params_)

# Model predictions
GBoost_best_reg = grid_search_cv_gboost.best_estimator_
y_pred_gboost_train = GBoost_best_reg.predict(X_train)
y_pred_gboost_test = GBoost_best_reg.predict(X_test)

# Calculating metrics
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_gboost_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
gboost_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_gboost_test)
for name_metric, error in gboost_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.01671
Train MAE: 0.03452
Train RMSE: 0.12925

Test MSE: 0.00030
Test MAE: 0.00954
Test RMSE: 0.01738


# Evaluation

In [15]:
fig = go.Figure()

model_name = ['Ridge', "Lasso","ElasticNet", 'RForest', 'kNN', 'Gradient Boosting']
model_metrics = [ridge_metrics,lasso_metrics, elasticnet_metrics, rforest_metrics, knn_metrics, gboost_metrics]

for name, metrics in zip(model_name, model_metrics):
    fig.add_trace(go.Histogram(histfunc= 'max',
                            x = list(metrics.keys()),
                                y = list(metrics.values()),
                                name = name,
                                ))

fig.update_layout(font_size = 25,
                    font_color='black',
                    title = "Model Evaluation",
                    plot_bgcolor = 'rgba(250,250,250,1)',
                    width = 1100,
                    height = 550,
                    # LEGEND
                    legend_title = "Models",
                    legend_font_size = 25,
                    legend_x = 1.02,
                    legend_y = 1,
                    #legend_bordercolor = 'black',
                    #legend_borderwidth = 1,
                    legend_itemsizing = 'trace',
                    legend_itemwidth=100,
                    # X-axis
                    xaxis_title = "Metrics WP6++ Dataset",
                    xaxis_nticks = 7,
                    xaxis_ticklen = 16,
                    xaxis_tickwidth = 3,
                    xaxis_ticks = 'outside',
                    # Y-axis
                    yaxis_title = "Error",
                    yaxis_nticks = 10,
                    yaxis_ticklen = 16,
                    yaxis_tickwidth = 3,
                    yaxis_ticks = 'outside'
                    )

fig.add_shape(type="rect",
                xref="paper",
                yref="paper",
                x0=0,
                y0=0,
                x1=1.0,
                y1=1.0,
        line=dict(
            color="black",
                width=1,))
fig.layout.font.family = 'sans-serif'

fig.show()

Thus, WP6++ Dataset showed better results than WP6 Dataset