# Импортируем необходимые библиотеки

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# General Libraries
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from plotly import graph_objs as go

# Algorithms (Regression)
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as XGBoost

# Preprocessing / Feature Selection / Model Selection / Metrics
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

from multiprocessing import Pool, Process

import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

pd.options.display.max_columns = 200

# Функции

In [6]:
def quality_metrics(y_true: float, y_pred: float) -> dict:
  dict_metrics = dict()
  dict_metrics['MSE'] = mean_squared_error(y_true, y_pred)
  dict_metrics['MAE'] = mean_absolute_error(y_true, y_pred)
  dict_metrics['RMSE'] = np.sqrt(dict_metrics['MSE'])
  return dict_metrics

# Импорт датасета WP6

In [11]:
real_df = pd.read_csv("/content/drive/MyDrive/Files For Project/AtomPairsData.csv")
real_df = real_df.drop(["Unnamed: 0"], axis = 1)

gen_df = pd.read_csv("/content/drive/MyDrive/Files For Project/GeneratedData.csv")
gen_df = gen_df.drop(["Unnamed: 0"], axis = 1)

print(f"Длина оригинального датасета: {real_df.shape[0]}")
print(f"Длина сгенерированного датасета: {gen_df.shape[0]}")

Длина оригинального датасета: 21
Длина сгенерированного датасета: 31


Я буду применять три варианта соотношения сгенерированных данных к оригинальным:



*   7 сгененированных + 21 оригинальных (т.е 25% от сгенерованных от всего количества)
*   14 сгенерированных (50% от всего датасета)
*   31 сгенерированных (60% от всего датасета)



# Соотношение 7:21 (25%)

In [13]:
adding = gen_df.head(7)
df = pd.concat([real_df, adding], ignore_index=True)
df.shape

(28, 13)

In [14]:
X = df.drop(['Ka_mean'], axis=1)
y = df["Ka_mean"]
print(f'Общее количество данных: {X.shape[0]}')
print(f'Количество признаков: {X.shape[1]}')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False, random_state=42)
print(f"Количество данных для обучения модели: {len(X_train)}")
print(f"Количество данных для тестирования модели: {len(X_test)}")

Общее количество данных: 28
Количество признаков: 12
Количество данных для обучения модели: 25
Количество данных для тестирования модели: 3


## Построение моделей МО

### Linear Regression

In [15]:
# Обучение модели
Lin_regressor = LinearRegression(n_jobs=-1)
grid_search_cv_linear = GridSearchCV(Lin_regressor, {}, cv=5)
grid_search_cv_linear.fit(X_train, y_train.to_numpy())

# Предсказание модели
y_pred_Linear_train = grid_search_cv_linear.best_estimator_.predict(X_train)
y_pred_Linear_test = grid_search_cv_linear.best_estimator_.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Linear_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
lin_reg_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Linear_test)
for name_metric, error in lin_reg_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.02666
Train MAE: 0.09439
Train RMSE: 0.16328

Test MSE: 0.03082
Test MAE: 0.13329
Test RMSE: 0.17556


### Ridge Regression

In [16]:
# Обучение модели
Ridge_regressor = Ridge()
ridge_reg_parameters = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_ridge = GridSearchCV(Ridge_regressor, ridge_reg_parameters, cv=5)

grid_search_cv_ridge.fit(X_train, y_train)
grid_search_cv_ridge.best_params_

# Предсказание модели
Ridge_best_reg = grid_search_cv_ridge.best_estimator_
y_pred_Ridge_train = Ridge_best_reg.predict(X_train)
y_pred_Ridge_test = Ridge_best_reg.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Ridge_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
ridge_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Ridge_test)
for name_metric, error in ridge_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.03965
Train MAE: 0.12373
Train RMSE: 0.19913

Test MSE: 0.00493
Test MAE: 0.04928
Test RMSE: 0.07024


### Lasso Regresssion

In [17]:
# Обучение модели
Lasso_regressor = Lasso()
lasso_reg_parameters = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_lasso = GridSearchCV(Lasso_regressor, lasso_reg_parameters, cv=5)

grid_search_cv_lasso.fit(X_train, y_train)
grid_search_cv_lasso.best_params_

# Предсказание модели
Lasso_best_reg = grid_search_cv_lasso.best_estimator_
y_pred_Lasso_train = Lasso_best_reg.predict(X_train)
y_pred_Lasso_test = Lasso_best_reg.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Lasso_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
lasso_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Lasso_test)
for name_metric, error in lasso_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.06744
Train MAE: 0.16594
Train RMSE: 0.25969

Test MSE: 0.00535
Test MAE: 0.06892
Test RMSE: 0.07315


### ElasticNet Regression

In [18]:
# Обучение модели
ElasticNet_regressor = ElasticNet()
elasticnet_reg_parameters = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_elasticnet = GridSearchCV(ElasticNet_regressor, elasticnet_reg_parameters, cv=5)

grid_search_cv_elasticnet.fit(X_train, y_train)
grid_search_cv_elasticnet.best_params_

# Предсказание модели
Elasticnet_best_reg = grid_search_cv_elasticnet.best_estimator_
y_pred_Elasticnet_train = Elasticnet_best_reg.predict(X_train)
y_pred_Elasticnet_test = Elasticnet_best_reg.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Elasticnet_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
elasticnet_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Elasticnet_test)
for name_metric, error in elasticnet_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.04243
Train MAE: 0.12839
Train RMSE: 0.20600

Test MSE: 0.00398
Test MAE: 0.05391
Test RMSE: 0.06309


### Random Forest

In [19]:
# Обучение модели
Forest_regressor = RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42)
forest_reg_parameters = {'max_depth': np.arange(4, 20, 2)}
grid_search_cv_forest = GridSearchCV(Forest_regressor, forest_reg_parameters, cv=3)
grid_search_cv_forest.fit(X_train, y_train)
grid_search_cv_forest.best_params_

# Предсказание модели
Forest_best_reg = grid_search_cv_forest.best_estimator_
y_pred_Forest_train = Forest_best_reg.predict(X_train)
y_pred_Forest_test = Forest_best_reg.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Forest_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
rforest_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Forest_test)
for name_metric, error in rforest_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.02380
Train MAE: 0.07971
Train RMSE: 0.15426

Test MSE: 0.00126
Test MAE: 0.03053
Test RMSE: 0.03552


### k-NN Regression

In [20]:
# Обучение модели
KNeighbors_regressor = KNeighborsRegressor()
kNN_reg_parameters = {'n_neighbors': np.arange(4, 20, 2)}                  # <- np.arange(2, 6, 1)
grid_search_cv_kNN = GridSearchCV(KNeighbors_regressor, kNN_reg_parameters, cv=5)
grid_search_cv_kNN.fit(X_train, y_train)
grid_search_cv_kNN.best_params_

# Предсказание модели
kNN_best_reg = grid_search_cv_kNN.best_estimator_
y_pred_kNN_train = kNN_best_reg.predict(X_train)
y_pred_kNN_test = kNN_best_reg.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_kNN_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
knn_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_kNN_test)
for name_metric, error in knn_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.04723
Train MAE: 0.12069
Train RMSE: 0.21732

Test MSE: 0.00150
Test MAE: 0.03507
Test RMSE: 0.03870


### Boosting

In [21]:
# Обучение модели
GBoosting_regressor = GradientBoostingRegressor(n_estimators=50, max_depth=18)
gboost_reg_parameters = {}
grid_search_cv_gboost = GridSearchCV(GBoosting_regressor, gboost_reg_parameters, cv=5)
grid_search_cv_gboost.fit(X_train, y_train)
#print(grid_search_cv_gboost.best_params_)

# Предсказание модели
GBoost_best_reg = grid_search_cv_gboost.best_estimator_
y_pred_gboost_train = GBoost_best_reg.predict(X_train)
y_pred_gboost_test = GBoost_best_reg.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_gboost_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
gboost_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_gboost_test)
for name_metric, error in gboost_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.01938
Train MAE: 0.04003
Train RMSE: 0.13922

Test MSE: 0.00188
Test MAE: 0.03355
Test RMSE: 0.04331


## Evaluation

In [24]:
fig = go.Figure()

model_name = ['Ridge', "Lasso","ElasticNet", 'RForest', 'kNN', 'Gradient Boosting']
model_metrics = [ridge_metrics,lasso_metrics, elasticnet_metrics, rforest_metrics, knn_metrics, gboost_metrics]

for name, metrics in zip(model_name, model_metrics):
    fig.add_trace(go.Histogram(histfunc= 'max',
                            x = list(metrics.keys()),
                                y = list(metrics.values()),
                                name = name,
                                ))

fig.update_layout(font_size = 25,
                    font_color='black',
                    title = "Model Evaluation",
                    plot_bgcolor = 'rgba(250,250,250,1)',
                    width = 1100,
                    height = 550,
                    # LEGEND
                    legend_title = "Models",
                    legend_font_size = 25,
                    legend_x = 1.02,
                    legend_y = 1,
                    #legend_bordercolor = 'black',
                    #legend_borderwidth = 1,
                    legend_itemsizing = 'trace',
                    legend_itemwidth=100,
                    # X-axis
                    xaxis_title = "WP6 + generated data (25%)",
                    xaxis_nticks = 7,
                    xaxis_ticklen = 16,
                    xaxis_tickwidth = 3,
                    xaxis_ticks = 'outside',
                    # Y-axis
                    yaxis_title = "Error",
                    yaxis_nticks = 10,
                    yaxis_ticklen = 16,
                    yaxis_tickwidth = 3,
                    yaxis_ticks = 'outside'
                    )

fig.add_shape(type="rect",
                xref="paper",
                yref="paper",
                x0=0,
                y0=0,
                x1=1.0,
                y1=1.0,
        line=dict(
            color="black",
                width=1,))
fig.layout.font.family = 'sans-serif'

fig.show()


> Метрики уже **намного** лучше, чем при обычном использовании датасета!



# Соотношение 14:21 (50%)

In [25]:
adding = gen_df.head(14)
df = pd.concat([real_df, adding], ignore_index=True)
df.shape

(35, 13)

In [26]:
X = df.drop(['Ka_mean'], axis=1)
y = df["Ka_mean"]
print(f'Общее количество данных: {X.shape[0]}')
print(f'Количество признаков: {X.shape[1]}')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False, random_state=42)
print(f"Количество данных для обучения модели: {len(X_train)}")
print(f"Количество данных для тестирования модели: {len(X_test)}")

Общее количество данных: 35
Количество признаков: 12
Количество данных для обучения модели: 31
Количество данных для тестирования модели: 4


## Построение моделей МО

### Linear Regression

In [27]:
# Обучение модели
Lin_regressor = LinearRegression(n_jobs=-1)
grid_search_cv_linear = GridSearchCV(Lin_regressor, {}, cv=5)
grid_search_cv_linear.fit(X_train, y_train.to_numpy())

# Предсказание модели
y_pred_Linear_train = grid_search_cv_linear.best_estimator_.predict(X_train)
y_pred_Linear_test = grid_search_cv_linear.best_estimator_.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Linear_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
lin_reg_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Linear_test)
for name_metric, error in lin_reg_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.02358
Train MAE: 0.09390
Train RMSE: 0.15356

Test MSE: 0.05829
Test MAE: 0.19480
Test RMSE: 0.24144


### Ridge Regression

In [28]:
# Обучение модели
Ridge_regressor = Ridge()
ridge_reg_parameters = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_ridge = GridSearchCV(Ridge_regressor, ridge_reg_parameters, cv=5)

grid_search_cv_ridge.fit(X_train, y_train)
grid_search_cv_ridge.best_params_

# Предсказание модели
Ridge_best_reg = grid_search_cv_ridge.best_estimator_
y_pred_Ridge_train = Ridge_best_reg.predict(X_train)
y_pred_Ridge_test = Ridge_best_reg.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Ridge_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
ridge_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Ridge_test)
for name_metric, error in ridge_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.02359
Train MAE: 0.09394
Train RMSE: 0.15357

Test MSE: 0.05782
Test MAE: 0.19289
Test RMSE: 0.24045


### Lasso Regresssion

In [29]:
# Обучение модели
Lasso_regressor = Lasso()
lasso_reg_parameters = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_lasso = GridSearchCV(Lasso_regressor, lasso_reg_parameters, cv=5)

grid_search_cv_lasso.fit(X_train, y_train)
grid_search_cv_lasso.best_params_

# Предсказание модели
Lasso_best_reg = grid_search_cv_lasso.best_estimator_
y_pred_Lasso_train = Lasso_best_reg.predict(X_train)
y_pred_Lasso_test = Lasso_best_reg.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Lasso_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
lasso_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Lasso_test)
for name_metric, error in lasso_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.02358
Train MAE: 0.09391
Train RMSE: 0.15356

Test MSE: 0.05824
Test MAE: 0.19457
Test RMSE: 0.24132


### ElasticNet Regression

In [30]:
# Обучение модели
ElasticNet_regressor = ElasticNet()
elasticnet_reg_parameters = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_elasticnet = GridSearchCV(ElasticNet_regressor, elasticnet_reg_parameters, cv=5)

grid_search_cv_elasticnet.fit(X_train, y_train)
grid_search_cv_elasticnet.best_params_

# Предсказание модели
Elasticnet_best_reg = grid_search_cv_elasticnet.best_estimator_
y_pred_Elasticnet_train = Elasticnet_best_reg.predict(X_train)
y_pred_Elasticnet_test = Elasticnet_best_reg.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Elasticnet_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
elasticnet_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Elasticnet_test)
for name_metric, error in elasticnet_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.02359
Train MAE: 0.09396
Train RMSE: 0.15357

Test MSE: 0.05777
Test MAE: 0.19266
Test RMSE: 0.24036


### Random Forest

In [31]:
# Обучение модели
Forest_regressor = RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42)
forest_reg_parameters = {'max_depth': np.arange(4, 20, 2)}
grid_search_cv_forest = GridSearchCV(Forest_regressor, forest_reg_parameters, cv=3)
grid_search_cv_forest.fit(X_train, y_train)
grid_search_cv_forest.best_params_

# Предсказание модели
Forest_best_reg = grid_search_cv_forest.best_estimator_
y_pred_Forest_train = Forest_best_reg.predict(X_train)
y_pred_Forest_test = Forest_best_reg.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Forest_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
rforest_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Forest_test)
for name_metric, error in rforest_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.01921
Train MAE: 0.06691
Train RMSE: 0.13859

Test MSE: 0.00854
Test MAE: 0.07438
Test RMSE: 0.09243


### k-NN Regression

In [32]:
# Обучение модели
KNeighbors_regressor = KNeighborsRegressor()
kNN_reg_parameters = {'n_neighbors': np.arange(4, 20, 2)}                  # <- np.arange(2, 6, 1)
grid_search_cv_kNN = GridSearchCV(KNeighbors_regressor, kNN_reg_parameters, cv=5)
grid_search_cv_kNN.fit(X_train, y_train)
grid_search_cv_kNN.best_params_

# Предсказание модели
kNN_best_reg = grid_search_cv_kNN.best_estimator_
y_pred_kNN_train = kNN_best_reg.predict(X_train)
y_pred_kNN_test = kNN_best_reg.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_kNN_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
knn_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_kNN_test)
for name_metric, error in knn_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.04343
Train MAE: 0.11271
Train RMSE: 0.20840

Test MSE: 0.00965
Test MAE: 0.08614
Test RMSE: 0.09825


### Boosting

In [33]:
# Обучение модели
GBoosting_regressor = GradientBoostingRegressor(n_estimators=50, max_depth=18)
gboost_reg_parameters = {}
grid_search_cv_gboost = GridSearchCV(GBoosting_regressor, gboost_reg_parameters, cv=5)
grid_search_cv_gboost.fit(X_train, y_train)
#print(grid_search_cv_gboost.best_params_)

# Предсказание модели
GBoost_best_reg = grid_search_cv_gboost.best_estimator_
y_pred_gboost_train = GBoost_best_reg.predict(X_train)
y_pred_gboost_test = GBoost_best_reg.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_gboost_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
gboost_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_gboost_test)
for name_metric, error in gboost_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.01563
Train MAE: 0.03231
Train RMSE: 0.12502

Test MSE: 0.03129
Test MAE: 0.11591
Test RMSE: 0.17690


## Evaluation

In [35]:
fig = go.Figure()

model_name = ['Ridge', "Lasso","ElasticNet", 'RForest', 'kNN', 'Gradient Boosting']
model_metrics = [ridge_metrics,lasso_metrics, elasticnet_metrics, rforest_metrics, knn_metrics, gboost_metrics]

for name, metrics in zip(model_name, model_metrics):
    fig.add_trace(go.Histogram(histfunc= 'max',
                            x = list(metrics.keys()),
                                y = list(metrics.values()),
                                name = name,
                                ))

fig.update_layout(font_size = 25,
                    font_color='black',
                    title = "Model Evaluation",
                    plot_bgcolor = 'rgba(250,250,250,1)',
                    width = 1100,
                    height = 550,
                    # LEGEND
                    legend_title = "Models",
                    legend_font_size = 25,
                    legend_x = 1.02,
                    legend_y = 1,
                    #legend_bordercolor = 'black',
                    #legend_borderwidth = 1,
                    legend_itemsizing = 'trace',
                    legend_itemwidth=100,
                    # X-axis
                    xaxis_title = "WP6 + generated data (50%)",
                    xaxis_nticks = 7,
                    xaxis_ticklen = 16,
                    xaxis_tickwidth = 3,
                    xaxis_ticks = 'outside',
                    # Y-axis
                    yaxis_title = "Error",
                    yaxis_nticks = 10,
                    yaxis_ticklen = 16,
                    yaxis_tickwidth = 3,
                    yaxis_ticks = 'outside'
                    )

fig.add_shape(type="rect",
                xref="paper",
                yref="paper",
                x0=0,
                y0=0,
                x1=1.0,
                y1=1.0,
        line=dict(
            color="black",
                width=1,))
fig.layout.font.family = 'sans-serif'

fig.show()


> Тут уже похуже      :(



# Соотношение 31:21 (60%)

In [37]:
df = pd.concat([real_df, gen_df], ignore_index=True)
df.shape

(52, 13)

In [38]:
X = df.drop(['Ka_mean'], axis=1)
y = df["Ka_mean"]
print(f'Общее количество данных: {X.shape[0]}')
print(f'Количество признаков: {X.shape[1]}')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False, random_state=42)
print(f"Количество данных для обучения модели: {len(X_train)}")
print(f"Количество данных для тестирования модели: {len(X_test)}")

Общее количество данных: 52
Количество признаков: 12
Количество данных для обучения модели: 46
Количество данных для тестирования модели: 6


## Построение моделей МО

### Linear Regression

In [39]:
# Обучение модели
Lin_regressor = LinearRegression(n_jobs=-1)
grid_search_cv_linear = GridSearchCV(Lin_regressor, {}, cv=5)
grid_search_cv_linear.fit(X_train, y_train.to_numpy())

# Предсказание модели
y_pred_Linear_train = grid_search_cv_linear.best_estimator_.predict(X_train)
y_pred_Linear_test = grid_search_cv_linear.best_estimator_.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Linear_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
lin_reg_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Linear_test)
for name_metric, error in lin_reg_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.02538
Train MAE: 0.10486
Train RMSE: 0.15932

Test MSE: 0.01883
Test MAE: 0.11394
Test RMSE: 0.13723


### Ridge Regression

In [40]:
# Обучение модели
Ridge_regressor = Ridge()
ridge_reg_parameters = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_ridge = GridSearchCV(Ridge_regressor, ridge_reg_parameters, cv=5)

grid_search_cv_ridge.fit(X_train, y_train)
grid_search_cv_ridge.best_params_

# Предсказание модели
Ridge_best_reg = grid_search_cv_ridge.best_estimator_
y_pred_Ridge_train = Ridge_best_reg.predict(X_train)
y_pred_Ridge_test = Ridge_best_reg.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Ridge_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
ridge_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Ridge_test)
for name_metric, error in ridge_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.03112
Train MAE: 0.10912
Train RMSE: 0.17640

Test MSE: 0.00651
Test MAE: 0.06636
Test RMSE: 0.08071


### Lasso Regresssion

In [41]:
# Обучение модели
Lasso_regressor = Lasso()
lasso_reg_parameters = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_lasso = GridSearchCV(Lasso_regressor, lasso_reg_parameters, cv=5)

grid_search_cv_lasso.fit(X_train, y_train)
grid_search_cv_lasso.best_params_

# Предсказание модели
Lasso_best_reg = grid_search_cv_lasso.best_estimator_
y_pred_Lasso_train = Lasso_best_reg.predict(X_train)
y_pred_Lasso_test = Lasso_best_reg.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Lasso_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
lasso_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Lasso_test)
for name_metric, error in lasso_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.04659
Train MAE: 0.12281
Train RMSE: 0.21585

Test MSE: 0.00321
Test MAE: 0.04814
Test RMSE: 0.05667


### ElasticNet Regression

In [42]:
# Обучение модели
ElasticNet_regressor = ElasticNet()
elasticnet_reg_parameters = {'alpha': np.logspace(-10, 1, 20)}
grid_search_cv_elasticnet = GridSearchCV(ElasticNet_regressor, elasticnet_reg_parameters, cv=5)

grid_search_cv_elasticnet.fit(X_train, y_train)
grid_search_cv_elasticnet.best_params_

# Предсказание модели
Elasticnet_best_reg = grid_search_cv_elasticnet.best_estimator_
y_pred_Elasticnet_train = Elasticnet_best_reg.predict(X_train)
y_pred_Elasticnet_test = Elasticnet_best_reg.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Elasticnet_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
elasticnet_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Elasticnet_test)
for name_metric, error in elasticnet_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.04659
Train MAE: 0.12281
Train RMSE: 0.21585

Test MSE: 0.00321
Test MAE: 0.04814
Test RMSE: 0.05667


### Random Forest

In [43]:
# Обучение модели
Forest_regressor = RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42)
forest_reg_parameters = {'max_depth': np.arange(4, 20, 2)}
grid_search_cv_forest = GridSearchCV(Forest_regressor, forest_reg_parameters, cv=3)
grid_search_cv_forest.fit(X_train, y_train)
grid_search_cv_forest.best_params_

# Предсказание модели
Forest_best_reg = grid_search_cv_forest.best_estimator_
y_pred_Forest_train = Forest_best_reg.predict(X_train)
y_pred_Forest_test = Forest_best_reg.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_Forest_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
rforest_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_Forest_test)
for name_metric, error in rforest_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.01416
Train MAE: 0.05724
Train RMSE: 0.11900

Test MSE: 0.02456
Test MAE: 0.09548
Test RMSE: 0.15672


### k-NN Regression

In [44]:
# Обучение модели
KNeighbors_regressor = KNeighborsRegressor()
kNN_reg_parameters = {'n_neighbors': np.arange(4, 20, 2)}                  # <- np.arange(2, 6, 1)
grid_search_cv_kNN = GridSearchCV(KNeighbors_regressor, kNN_reg_parameters, cv=5)
grid_search_cv_kNN.fit(X_train, y_train)
grid_search_cv_kNN.best_params_

# Предсказание модели
kNN_best_reg = grid_search_cv_kNN.best_estimator_
y_pred_kNN_train = kNN_best_reg.predict(X_train)
y_pred_kNN_test = kNN_best_reg.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_kNN_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
knn_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_kNN_test)
for name_metric, error in knn_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.03686
Train MAE: 0.11120
Train RMSE: 0.19198

Test MSE: 0.00838
Test MAE: 0.08044
Test RMSE: 0.09156


### Boosting

In [45]:
# Обучение модели
GBoosting_regressor = GradientBoostingRegressor(n_estimators=50, max_depth=18)
gboost_reg_parameters = {}
grid_search_cv_gboost = GridSearchCV(GBoosting_regressor, gboost_reg_parameters, cv=5)
grid_search_cv_gboost.fit(X_train, y_train)
#print(grid_search_cv_gboost.best_params_)

# Предсказание модели
GBoost_best_reg = grid_search_cv_gboost.best_estimator_
y_pred_gboost_train = GBoost_best_reg.predict(X_train)
y_pred_gboost_test = GBoost_best_reg.predict(X_test)

# Посчет метрик качества модели
for name_metric, error in quality_metrics(y_true=y_train, y_pred=y_pred_gboost_train).items():
    print(f'Train {name_metric}: {error:.5f}')

print()
gboost_metrics = quality_metrics(y_true=y_test, y_pred=y_pred_gboost_test)
for name_metric, error in gboost_metrics.items():
    print(f'Test {name_metric}: {error:.5f}')

Train MSE: 0.01053
Train MAE: 0.02192
Train RMSE: 0.10264

Test MSE: 0.02303
Test MAE: 0.08948
Test RMSE: 0.15177


## Evaluation

In [47]:
fig = go.Figure()

model_name = ['Ridge', "Lasso","ElasticNet", 'RForest', 'kNN', 'Gradient Boosting']
model_metrics = [ridge_metrics,lasso_metrics, elasticnet_metrics, rforest_metrics, knn_metrics, gboost_metrics]

for name, metrics in zip(model_name, model_metrics):
    fig.add_trace(go.Histogram(histfunc= 'max',
                            x = list(metrics.keys()),
                                y = list(metrics.values()),
                                name = name,
                                ))

fig.update_layout(font_size = 25,
                    font_color='black',
                    title = "Model Evaluation",
                    plot_bgcolor = 'rgba(250,250,250,1)',
                    width = 1100,
                    height = 550,
                    # LEGEND
                    legend_title = "Models",
                    legend_font_size = 25,
                    legend_x = 1.02,
                    legend_y = 1,
                    #legend_bordercolor = 'black',
                    #legend_borderwidth = 1,
                    legend_itemsizing = 'trace',
                    legend_itemwidth=100,
                    # X-axis
                    xaxis_title = "WP6 + generated data (60%)",
                    xaxis_nticks = 7,
                    xaxis_ticklen = 16,
                    xaxis_tickwidth = 3,
                    xaxis_ticks = 'outside',
                    # Y-axis
                    yaxis_title = "Error",
                    yaxis_nticks = 10,
                    yaxis_ticklen = 16,
                    yaxis_tickwidth = 3,
                    yaxis_ticks = 'outside'
                    )

fig.add_shape(type="rect",
                xref="paper",
                yref="paper",
                x0=0,
                y0=0,
                x1=1.0,
                y1=1.0,
        line=dict(
            color="black",
                width=1,))
fig.layout.font.family = 'sans-serif'

fig.show()


> А здесь вышло неодназначно 🤔
