# Импортируем необходимые библиотеки

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# General Libraries
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from plotly import graph_objs as go

# Algorithms (Regression)
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as XGBoost

# Preprocessing / Feature Selection / Model Selection / Metrics
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

from multiprocessing import Pool, Process

import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

pd.options.display.max_columns = 200

# Функции

In [3]:
def quality_metrics(y_true: float, y_pred: float) -> dict:
  dict_metrics = dict()
  dict_metrics['MSE'] = mean_squared_error(y_true, y_pred)
  dict_metrics['MAE'] = mean_absolute_error(y_true, y_pred)
  dict_metrics['RMSE'] = np.sqrt(dict_metrics['MSE'])
  return dict_metrics

# Импорт датасета WP6

In [4]:
real_df = pd.read_csv("/content/drive/MyDrive/Files For Project/AtomPairsData++.csv")
real_df = real_df.drop(["Unnamed: 0"], axis = 1)

gen_df = pd.read_csv("/content/drive/MyDrive/Files For Project/GeneratedData2++.csv")
gen_df = gen_df.drop(["Unnamed: 0"], axis = 1)

print(f"Длина оригинального датасета: {real_df.shape[0]}")
print(f"Длина сгенерированного датасета: {gen_df.shape[0]}")

Длина оригинального датасета: 33
Длина сгенерированного датасета: 30


Я буду применять три варианта соотношения сгенерированных данных к оригинальным:



*   11 сгененированных + 33 оригинальных (т.е 25% от сгенерованных от всего количества)
*   22 сгенерированных (40% от всего датасета)
*   30 сгенерированных (48% от всего датасета)



# Соотношение 11:33 (25%)

In [5]:
adding = gen_df.head(11)
df = pd.concat([real_df, adding], ignore_index=True)
df.shape

(44, 13)

In [6]:
X = df.drop(['Ka_mean'], axis=1)
y = df["Ka_mean"]
print(f'Общее количество данных: {X.shape[0]}')
print(f'Количество признаков: {X.shape[1]}')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False, random_state=42)
print(f"Количество данных для обучения модели: {len(X_train)}")
print(f"Количество данных для тестирования модели: {len(X_test)}")

Общее количество данных: 44
Количество признаков: 12
Количество данных для обучения модели: 39
Количество данных для тестирования модели: 5


## Построение моделей МО

### Linear Regression

In [7]:
# Обучение модели
Lin_regressor = LinearRegression(n_jobs=-1)
grid_search_cv_linear = GridSearchCV(Lin_regressor, {}, cv=5)
grid_search_cv_linear.fit(X_train, y_train.to_numpy())

# Предсказание модели
y_pred_Linear_train = grid_search_cv_linear.best_estimator_.predict(X_train)
y_pred_Linear_test = grid_search_cv_linear.best_estimator_.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Linear_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
lin_reg_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Linear_test)
for name_metric, error in lin_reg_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.02726
Train MAE: 0.11869
Train RMSE: 0.16509

Test MSE: 0.03013
Test MAE: 0.13994
Test RMSE: 0.17357


### Ridge Regression

In [8]:
# Обучение модели
Ridge_regressor = Ridge()
ridge_reg_parameters = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_ridge = GridSearchCV(Ridge_regressor, ridge_reg_parameters, cv=5)

grid_search_cv_ridge.fit(X_train, y_train)
grid_search_cv_ridge.best_params_

# Предсказание модели
Ridge_best_reg = grid_search_cv_ridge.best_estimator_
y_pred_Ridge_train = Ridge_best_reg.predict(X_train)
y_pred_Ridge_test = Ridge_best_reg.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Ridge_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
ridge_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Ridge_test)
for name_metric, error in ridge_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.04375
Train MAE: 0.12769
Train RMSE: 0.20917

Test MSE: 0.04217
Test MAE: 0.12982
Test RMSE: 0.20534


### Lasso Regresssion

In [9]:
# Обучение модели
Lasso_regressor = Lasso()
lasso_reg_parameters = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_lasso = GridSearchCV(Lasso_regressor, lasso_reg_parameters, cv=5)

grid_search_cv_lasso.fit(X_train, y_train)
grid_search_cv_lasso.best_params_

# Предсказание модели
Lasso_best_reg = grid_search_cv_lasso.best_estimator_
y_pred_Lasso_train = Lasso_best_reg.predict(X_train)
y_pred_Lasso_test = Lasso_best_reg.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Lasso_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
lasso_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Lasso_test)
for name_metric, error in lasso_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.06775
Train MAE: 0.16751
Train RMSE: 0.26029

Test MSE: 0.06557
Test MAE: 0.15929
Test RMSE: 0.25607


### ElasticNet Regression

In [10]:
# Обучение модели
ElasticNet_regressor = ElasticNet()
elasticnet_reg_parameters = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_elasticnet = GridSearchCV(ElasticNet_regressor, elasticnet_reg_parameters, cv=5)

grid_search_cv_elasticnet.fit(X_train, y_train)
grid_search_cv_elasticnet.best_params_

# Предсказание модели
Elasticnet_best_reg = grid_search_cv_elasticnet.best_estimator_
y_pred_Elasticnet_train = Elasticnet_best_reg.predict(X_train)
y_pred_Elasticnet_test = Elasticnet_best_reg.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Elasticnet_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
elasticnet_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Elasticnet_test)
for name_metric, error in elasticnet_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.05205
Train MAE: 0.14567
Train RMSE: 0.22815

Test MSE: 0.04665
Test MAE: 0.14001
Test RMSE: 0.21598


### Random Forest

In [11]:
# Обучение модели
Forest_regressor = RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42)
forest_reg_parameters = {'max_depth': np.arange(4, 20, 2)}
grid_search_cv_forest = GridSearchCV(Forest_regressor, forest_reg_parameters, cv=3)
grid_search_cv_forest.fit(X_train, y_train)
grid_search_cv_forest.best_params_

# Предсказание модели
Forest_best_reg = grid_search_cv_forest.best_estimator_
y_pred_Forest_train = Forest_best_reg.predict(X_train)
y_pred_Forest_test = Forest_best_reg.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Forest_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
rforest_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Forest_test)
for name_metric, error in rforest_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.01807
Train MAE: 0.06669
Train RMSE: 0.13443

Test MSE: 0.08392
Test MAE: 0.18830
Test RMSE: 0.28968


### k-NN Regression

In [12]:
# Обучение модели
KNeighbors_regressor = KNeighborsRegressor()
kNN_reg_parameters = {'n_neighbors': np.arange(4, 20, 2)}                  # <- np.arange(2, 6, 1)
grid_search_cv_kNN = GridSearchCV(KNeighbors_regressor, kNN_reg_parameters, cv=5)
grid_search_cv_kNN.fit(X_train, y_train)
grid_search_cv_kNN.best_params_

# Предсказание модели
kNN_best_reg = grid_search_cv_kNN.best_estimator_
y_pred_kNN_train = kNN_best_reg.predict(X_train)
y_pred_kNN_test = kNN_best_reg.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_kNN_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
knn_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_kNN_test)
for name_metric, error in knn_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.04965
Train MAE: 0.10498
Train RMSE: 0.22283

Test MSE: 0.04644
Test MAE: 0.13575
Test RMSE: 0.21551


### Boosting

In [13]:
# Обучение модели
GBoosting_regressor = GradientBoostingRegressor(n_estimators=50, max_depth=18)
gboost_reg_parameters = {}
grid_search_cv_gboost = GridSearchCV(GBoosting_regressor, gboost_reg_parameters, cv=5)
grid_search_cv_gboost.fit(X_train, y_train)
#print(grid_search_cv_gboost.best_params_)

# Предсказание модели
GBoost_best_reg = grid_search_cv_gboost.best_estimator_
y_pred_gboost_train = GBoost_best_reg.predict(X_train)
y_pred_gboost_test = GBoost_best_reg.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_gboost_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
gboost_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_gboost_test)
for name_metric, error in gboost_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.01242
Train MAE: 0.02597
Train RMSE: 0.11146

Test MSE: 0.16086
Test MAE: 0.26652
Test RMSE: 0.40107


## Evaluation

In [28]:
fig = go.Figure()

model_name = ['Ridge', "Lasso","ElasticNet", 'RForest', 'kNN', 'Gradient Boosting']
model_metrics = [ridge_metrics,lasso_metrics, elasticnet_metrics, rforest_metrics, knn_metrics, gboost_metrics]

for name, metrics in zip(model_name, model_metrics):
    fig.add_trace(go.Histogram(histfunc= 'max',
                            x = list(metrics.keys()),
                                y = list(metrics.values()),
                                name = name,
                                ))

fig.update_layout(font_size = 25,
                    font_color='black',
                    title = "Model Evaluation",
                    plot_bgcolor = 'rgba(250,250,250,1)',
                    width = 1100,
                    height = 550,
                    # LEGEND
                    legend_title = "Models",
                    legend_font_size = 25,
                    legend_x = 1.02,
                    legend_y = 1,
                    #legend_bordercolor = 'black',
                    #legend_borderwidth = 1,
                    legend_itemsizing = 'trace',
                    legend_itemwidth=100,
                    # X-axis
                    xaxis_title = "WP6+ + generated data (25%)",
                    xaxis_nticks = 7,
                    xaxis_ticklen = 16,
                    xaxis_tickwidth = 3,
                    xaxis_ticks = 'outside',
                    # Y-axis
                    yaxis_title = "Error",
                    yaxis_nticks = 10,
                    yaxis_ticklen = 16,
                    yaxis_tickwidth = 3,
                    yaxis_ticks = 'outside'
                    )

fig.add_shape(type="rect",
                xref="paper",
                yref="paper",
                x0=0,
                y0=0,
                x1=1.0,
                y1=1.0,
        line=dict(
            color="black",
                width=1,))
fig.layout.font.family = 'sans-serif'

fig.show()


> Метрики намного хуже, чем при обучении модели с WP6++ без синтезированных данных



# Соотношение 22:33 (40%)

In [30]:
adding = gen_df.head(22)
df = pd.concat([real_df, adding], ignore_index=True)
df.shape

(55, 13)

In [31]:
X = df.drop(['Ka_mean'], axis=1)
y = df["Ka_mean"]
print(f'Общее количество данных: {X.shape[0]}')
print(f'Количество признаков: {X.shape[1]}')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False, random_state=42)
print(f"Количество данных для обучения модели: {len(X_train)}")
print(f"Количество данных для тестирования модели: {len(X_test)}")

Общее количество данных: 55
Количество признаков: 12
Количество данных для обучения модели: 49
Количество данных для тестирования модели: 6


## Построение моделей МО

### Linear Regression

In [32]:
# Обучение модели
Lin_regressor = LinearRegression(n_jobs=-1)
grid_search_cv_linear = GridSearchCV(Lin_regressor, {}, cv=5)
grid_search_cv_linear.fit(X_train, y_train.to_numpy())

# Предсказание модели
y_pred_Linear_train = grid_search_cv_linear.best_estimator_.predict(X_train)
y_pred_Linear_test = grid_search_cv_linear.best_estimator_.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Linear_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
lin_reg_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Linear_test)
for name_metric, error in lin_reg_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.02804
Train MAE: 0.11897
Train RMSE: 0.16745

Test MSE: 0.04563
Test MAE: 0.17968
Test RMSE: 0.21361


### Ridge Regression

In [33]:
# Обучение модели
Ridge_regressor = Ridge()
ridge_reg_parameters = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_ridge = GridSearchCV(Ridge_regressor, ridge_reg_parameters, cv=5)

grid_search_cv_ridge.fit(X_train, y_train)
grid_search_cv_ridge.best_params_

# Предсказание модели
Ridge_best_reg = grid_search_cv_ridge.best_estimator_
y_pred_Ridge_train = Ridge_best_reg.predict(X_train)
y_pred_Ridge_test = Ridge_best_reg.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Ridge_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
ridge_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Ridge_test)
for name_metric, error in ridge_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.03969
Train MAE: 0.12079
Train RMSE: 0.19922

Test MSE: 0.03460
Test MAE: 0.12590
Test RMSE: 0.18602


### Lasso Regresssion

In [34]:
# Обучение модели
Lasso_regressor = Lasso()
lasso_reg_parameters = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_lasso = GridSearchCV(Lasso_regressor, lasso_reg_parameters, cv=5)

grid_search_cv_lasso.fit(X_train, y_train)
grid_search_cv_lasso.best_params_

# Предсказание модели
Lasso_best_reg = grid_search_cv_lasso.best_estimator_
y_pred_Lasso_train = Lasso_best_reg.predict(X_train)
y_pred_Lasso_test = Lasso_best_reg.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Lasso_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
lasso_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Lasso_test)
for name_metric, error in lasso_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.06146
Train MAE: 0.15684
Train RMSE: 0.24792

Test MSE: 0.03752
Test MAE: 0.13519
Test RMSE: 0.19369


### ElasticNet Regression

In [35]:
# Обучение модели
ElasticNet_regressor = ElasticNet()
elasticnet_reg_parameters = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_elasticnet = GridSearchCV(ElasticNet_regressor, elasticnet_reg_parameters, cv=5)

grid_search_cv_elasticnet.fit(X_train, y_train)
grid_search_cv_elasticnet.best_params_

# Предсказание модели
Elasticnet_best_reg = grid_search_cv_elasticnet.best_estimator_
y_pred_Elasticnet_train = Elasticnet_best_reg.predict(X_train)
y_pred_Elasticnet_test = Elasticnet_best_reg.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Elasticnet_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
elasticnet_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Elasticnet_test)
for name_metric, error in elasticnet_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.06146
Train MAE: 0.15684
Train RMSE: 0.24792

Test MSE: 0.03752
Test MAE: 0.13519
Test RMSE: 0.19369


### Random Forest

In [36]:
# Обучение модели
Forest_regressor = RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42)
forest_reg_parameters = {'max_depth': np.arange(4, 20, 2)}
grid_search_cv_forest = GridSearchCV(Forest_regressor, forest_reg_parameters, cv=3)
grid_search_cv_forest.fit(X_train, y_train)
grid_search_cv_forest.best_params_

# Предсказание модели
Forest_best_reg = grid_search_cv_forest.best_estimator_
y_pred_Forest_train = Forest_best_reg.predict(X_train)
y_pred_Forest_test = Forest_best_reg.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Forest_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
rforest_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Forest_test)
for name_metric, error in rforest_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.01535
Train MAE: 0.06260
Train RMSE: 0.12390

Test MSE: 0.04840
Test MAE: 0.17616
Test RMSE: 0.22001


### k-NN Regression

In [37]:
# Обучение модели
KNeighbors_regressor = KNeighborsRegressor()
kNN_reg_parameters = {'n_neighbors': np.arange(4, 20, 2)}                  # <- np.arange(2, 6, 1)
grid_search_cv_kNN = GridSearchCV(KNeighbors_regressor, kNN_reg_parameters, cv=5)
grid_search_cv_kNN.fit(X_train, y_train)
grid_search_cv_kNN.best_params_

# Предсказание модели
kNN_best_reg = grid_search_cv_kNN.best_estimator_
y_pred_kNN_train = kNN_best_reg.predict(X_train)
y_pred_kNN_test = kNN_best_reg.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_kNN_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
knn_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_kNN_test)
for name_metric, error in knn_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.04852
Train MAE: 0.10711
Train RMSE: 0.22027

Test MSE: 0.04158
Test MAE: 0.10778
Test RMSE: 0.20392


### Boosting

In [38]:
# Обучение модели
GBoosting_regressor = GradientBoostingRegressor(n_estimators=50, max_depth=18)
gboost_reg_parameters = {}
grid_search_cv_gboost = GridSearchCV(GBoosting_regressor, gboost_reg_parameters, cv=5)
grid_search_cv_gboost.fit(X_train, y_train)
#print(grid_search_cv_gboost.best_params_)

# Предсказание модели
GBoost_best_reg = grid_search_cv_gboost.best_estimator_
y_pred_gboost_train = GBoost_best_reg.predict(X_train)
y_pred_gboost_test = GBoost_best_reg.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_gboost_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
gboost_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_gboost_test)
for name_metric, error in gboost_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.00989
Train MAE: 0.02079
Train RMSE: 0.09944

Test MSE: 0.19783
Test MAE: 0.34453
Test RMSE: 0.44478


## Evaluation

In [39]:
fig = go.Figure()

model_name = ['Ridge', "Lasso","ElasticNet", 'RForest', 'kNN', 'Gradient Boosting']
model_metrics = [ridge_metrics,lasso_metrics, elasticnet_metrics, rforest_metrics, knn_metrics, gboost_metrics]

for name, metrics in zip(model_name, model_metrics):
    fig.add_trace(go.Histogram(histfunc= 'max',
                            x = list(metrics.keys()),
                                y = list(metrics.values()),
                                name = name,
                                ))

fig.update_layout(font_size = 25,
                    font_color='black',
                    title = "Model Evaluation",
                    plot_bgcolor = 'rgba(250,250,250,1)',
                    width = 1100,
                    height = 550,
                    # LEGEND
                    legend_title = "Models",
                    legend_font_size = 25,
                    legend_x = 1.02,
                    legend_y = 1,
                    #legend_bordercolor = 'black',
                    #legend_borderwidth = 1,
                    legend_itemsizing = 'trace',
                    legend_itemwidth=100,
                    # X-axis
                    xaxis_title = "WP6+ + generated data (40%)",
                    xaxis_nticks = 7,
                    xaxis_ticklen = 16,
                    xaxis_tickwidth = 3,
                    xaxis_ticks = 'outside',
                    # Y-axis
                    yaxis_title = "Error",
                    yaxis_nticks = 10,
                    yaxis_ticklen = 16,
                    yaxis_tickwidth = 3,
                    yaxis_ticks = 'outside'
                    )

fig.add_shape(type="rect",
                xref="paper",
                yref="paper",
                x0=0,
                y0=0,
                x1=1.0,
                y1=1.0,
        line=dict(
            color="black",
                width=1,))
fig.layout.font.family = 'sans-serif'

fig.show()


> Здесь еще похуже


# Соотношение 30:33 (48%)

In [40]:
df = pd.concat([real_df, gen_df], ignore_index=True)
df.shape

(63, 13)

In [41]:
X = df.drop(['Ka_mean'], axis=1)
y = df["Ka_mean"]
print(f'Общее количество данных: {X.shape[0]}')
print(f'Количество признаков: {X.shape[1]}')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False, random_state=42)
print(f"Количество данных для обучения модели: {len(X_train)}")
print(f"Количество данных для тестирования модели: {len(X_test)}")

Общее количество данных: 63
Количество признаков: 12
Количество данных для обучения модели: 56
Количество данных для тестирования модели: 7


## Построение моделей МО

### Linear Regression

In [42]:
# Обучение модели
Lin_regressor = LinearRegression(n_jobs=-1)
grid_search_cv_linear = GridSearchCV(Lin_regressor, {}, cv=5)
grid_search_cv_linear.fit(X_train, y_train.to_numpy())

# Предсказание модели
y_pred_Linear_train = grid_search_cv_linear.best_estimator_.predict(X_train)
y_pred_Linear_test = grid_search_cv_linear.best_estimator_.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Linear_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
lin_reg_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Linear_test)
for name_metric, error in lin_reg_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.02972
Train MAE: 0.12757
Train RMSE: 0.17240

Test MSE: 0.04884
Test MAE: 0.15412
Test RMSE: 0.22100


### Ridge Regression

In [43]:
# Обучение модели
Ridge_regressor = Ridge()
ridge_reg_parameters = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_ridge = GridSearchCV(Ridge_regressor, ridge_reg_parameters, cv=5)

grid_search_cv_ridge.fit(X_train, y_train)
grid_search_cv_ridge.best_params_

# Предсказание модели
Ridge_best_reg = grid_search_cv_ridge.best_estimator_
y_pred_Ridge_train = Ridge_best_reg.predict(X_train)
y_pred_Ridge_test = Ridge_best_reg.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Ridge_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
ridge_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Ridge_test)
for name_metric, error in ridge_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.04150
Train MAE: 0.13108
Train RMSE: 0.20372

Test MSE: 0.01474
Test MAE: 0.08144
Test RMSE: 0.12139


### Lasso Regresssion

In [44]:
# Обучение модели
Lasso_regressor = Lasso()
lasso_reg_parameters = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_lasso = GridSearchCV(Lasso_regressor, lasso_reg_parameters, cv=5)

grid_search_cv_lasso.fit(X_train, y_train)
grid_search_cv_lasso.best_params_

# Предсказание модели
Lasso_best_reg = grid_search_cv_lasso.best_estimator_
y_pred_Lasso_train = Lasso_best_reg.predict(X_train)
y_pred_Lasso_test = Lasso_best_reg.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Lasso_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
lasso_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Lasso_test)
for name_metric, error in lasso_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.06419
Train MAE: 0.17081
Train RMSE: 0.25336

Test MSE: 0.00756
Test MAE: 0.08134
Test RMSE: 0.08697


### ElasticNet Regression

In [45]:
# Обучение модели
ElasticNet_regressor = ElasticNet()
elasticnet_reg_parameters = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_elasticnet = GridSearchCV(ElasticNet_regressor, elasticnet_reg_parameters, cv=5)

grid_search_cv_elasticnet.fit(X_train, y_train)
grid_search_cv_elasticnet.best_params_

# Предсказание модели
Elasticnet_best_reg = grid_search_cv_elasticnet.best_estimator_
y_pred_Elasticnet_train = Elasticnet_best_reg.predict(X_train)
y_pred_Elasticnet_test = Elasticnet_best_reg.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Elasticnet_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
elasticnet_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Elasticnet_test)
for name_metric, error in elasticnet_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.06419
Train MAE: 0.17081
Train RMSE: 0.25336

Test MSE: 0.00756
Test MAE: 0.08134
Test RMSE: 0.08697


### Random Forest

In [46]:
# Обучение модели
Forest_regressor = RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42)
forest_reg_parameters = {'max_depth': np.arange(4, 20, 2)}
grid_search_cv_forest = GridSearchCV(Forest_regressor, forest_reg_parameters, cv=3)
grid_search_cv_forest.fit(X_train, y_train)
grid_search_cv_forest.best_params_

# Предсказание модели
Forest_best_reg = grid_search_cv_forest.best_estimator_
y_pred_Forest_train = Forest_best_reg.predict(X_train)
y_pred_Forest_test = Forest_best_reg.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Forest_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
rforest_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Forest_test)
for name_metric, error in rforest_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.01491
Train MAE: 0.06312
Train RMSE: 0.12211

Test MSE: 0.00854
Test MAE: 0.07532
Test RMSE: 0.09240


### k-NN Regression

In [47]:
# Обучение модели
KNeighbors_regressor = KNeighborsRegressor()
kNN_reg_parameters = {'n_neighbors': np.arange(4, 20, 2)}                  # <- np.arange(2, 6, 1)
grid_search_cv_kNN = GridSearchCV(KNeighbors_regressor, kNN_reg_parameters, cv=5)
grid_search_cv_kNN.fit(X_train, y_train)
grid_search_cv_kNN.best_params_

# Предсказание модели
kNN_best_reg = grid_search_cv_kNN.best_estimator_
y_pred_kNN_train = kNN_best_reg.predict(X_train)
y_pred_kNN_test = kNN_best_reg.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_kNN_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
knn_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_kNN_test)
for name_metric, error in knn_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.05150
Train MAE: 0.12308
Train RMSE: 0.22693

Test MSE: 0.01069
Test MAE: 0.06144
Test RMSE: 0.10338


### Boosting

In [48]:
# Обучение модели
GBoosting_regressor = GradientBoostingRegressor(n_estimators=50, max_depth=18)
gboost_reg_parameters = {}
grid_search_cv_gboost = GridSearchCV(GBoosting_regressor, gboost_reg_parameters, cv=5)
grid_search_cv_gboost.fit(X_train, y_train)
#print(grid_search_cv_gboost.best_params_)

# Предсказание модели
GBoost_best_reg = grid_search_cv_gboost.best_estimator_
y_pred_gboost_train = GBoost_best_reg.predict(X_train)
y_pred_gboost_test = GBoost_best_reg.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_gboost_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
gboost_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_gboost_test)
for name_metric, error in gboost_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.00865
Train MAE: 0.01837
Train RMSE: 0.09302

Test MSE: 0.00304
Test MAE: 0.04918
Test RMSE: 0.05515


## Evaluation

In [50]:
fig = go.Figure()

model_name = ['Ridge', "Lasso","ElasticNet", 'RForest', 'kNN', 'Gradient Boosting']
model_metrics = [ridge_metrics,lasso_metrics, elasticnet_metrics, rforest_metrics, knn_metrics, gboost_metrics]

for name, metrics in zip(model_name, model_metrics):
    fig.add_trace(go.Histogram(histfunc= 'max',
                            x = list(metrics.keys()),
                                y = list(metrics.values()),
                                name = name,
                                ))

fig.update_layout(font_size = 25,
                    font_color='black',
                    title = "Model Evaluation",
                    plot_bgcolor = 'rgba(250,250,250,1)',
                    width = 1100,
                    height = 550,
                    # LEGEND
                    legend_title = "Models",
                    legend_font_size = 25,
                    legend_x = 1.02,
                    legend_y = 1,
                    #legend_bordercolor = 'black',
                    #legend_borderwidth = 1,
                    legend_itemsizing = 'trace',
                    legend_itemwidth=100,
                    # X-axis
                    xaxis_title = "WP6+ + generated data (48%)",
                    xaxis_nticks = 7,
                    xaxis_ticklen = 16,
                    xaxis_tickwidth = 3,
                    xaxis_ticks = 'outside',
                    # Y-axis
                    yaxis_title = "Error",
                    yaxis_nticks = 10,
                    yaxis_ticklen = 16,
                    yaxis_tickwidth = 3,
                    yaxis_ticks = 'outside'
                    )

fig.add_shape(type="rect",
                xref="paper",
                yref="paper",
                x0=0,
                y0=0,
                x1=1.0,
                y1=1.0,
        line=dict(
            color="black",
                width=1,))
fig.layout.font.family = 'sans-serif'

fig.show()


> Здесь выходит неодназначно: вроде метрики хорошие, однако есть ощущение, что дата уже настолько выдумана и размазана, что целевая метрика уже максимально неточна в валидации 🤔


# Выводы



> Думаю, применять похожие молекулы + генерировать данные по ним = потерять целевую метрику, поэтому эти методы в совокупности использовать не стоит. Однако по отдельности они приносят весьма неплохие результаты

