In [59]:
!pip install lightgbm



In [60]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
from matplotlib import rcParams
from statistics import mean
from tqdm import tqdm

In [61]:
from lightgbm import LGBMRegressor
import sklearn
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.model_selection import train_test_split, KFold, cross_validate, GridSearchCV
from sklearn.metrics import make_scorer, r2_score, mean_absolute_error, mean_squared_error

In [62]:
def root_mean_squared_error(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Reference target values, passed straight to ``mean_squared_error``.
    y_pred : array-like
        Predicted values, passed straight to ``mean_squared_error``.

    Returns
    -------
    The result of ``np.sqrt`` applied to ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [63]:
# Load the regression dataset (path is relative to the working directory)
# and preview its first rows.
csv_path = 'pubchem_regression_actual.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Class,MW,#HBA,#HBD,PSA,#ROTB,#ALERTS,MlogP,#MR,#HeavyAtoms,...,PUBCHEM_841,PUBCHEM_842,PUBCHEM_845,PUBCHEM_846,PUBCHEM_848,PUBCHEM_866,PUBCHEM_867,PUBCHEM_869,PUBCHEM_874,Activity
0,0.0,182.17,6.0,6.0,121.38,5.0,0.0,-3.59,38.2,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.6
1,0.0,182.17,6.0,6.0,121.38,5.0,0.0,-3.59,38.2,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.6
2,0.0,275.34,4.0,3.0,106.39,3.0,2.0,1.67,77.32,19.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.57
3,0.0,415.29,5.0,2.0,75.11,4.0,1.0,4.98,90.27,29.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.57
4,0.0,543.48,17.0,8.0,285.14,9.0,1.0,-5.6,101.05,34.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.54


In [64]:
# del df['Class']

### Обучение модели

In [65]:
# 5-fold shuffled splitter used by the cross-validation below; the fixed seed
# keeps the folds identical between runs.
split = KFold(n_splits=5, random_state=41, shuffle=True)

# Metrics collected by cross_validate (results are exposed as "test_<key>").
# NOTE: the "MSE" key is kept because a later cell reads cv_scores['test_MSE'],
# but the value is the *root* mean squared error.  It is computed with the
# notebook's root_mean_squared_error helper instead of
# mean_squared_error(squared=False): the `squared` argument was deprecated in
# scikit-learn 1.4 and removed in 1.6, so the old scorer no longer works there.
# make_scorer keeps its default sign convention, so the scores stay positive.
scores = {
    "Q2": "r2",
    "MSE": make_scorer(root_mean_squared_error),
}

In [66]:
# Positional split of the table: every column except the last one is a
# feature, the last column is the regression target.
# NOTE(review): 'Class' is still among the features (its removal is commented
# out in an earlier cell) — if it was derived from the target it leaks
# information into the model; confirm.
X = df.iloc[:, :-1].to_numpy()
y = df.iloc[:, -1].to_numpy()

In [67]:
# Hold out 20% of the rows for validation; the seed fixes the shuffled split.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=38,
)

In [68]:
# Standardise the features using statistics learned from the training rows
# only, then apply that same fitted transform to both subsets.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_val = scaler.transform(X_train), scaler.transform(X_val)

In [69]:
# LightGBM regressor with DART boosting.
# `n_estimators` and `num_iterations` are aliases of the same LightGBM
# parameter; the previous call passed both with different values (50 and 200),
# which left the actual number of boosting rounds to LightGBM's alias
# resolution.  A single explicit value is given now.
# NOTE(review): 200 is kept on the assumption that the main parameter name
# (`num_iterations`) was the one meant to take effect — confirm.
MLR_model = LGBMRegressor(
    random_state=102,
    boosting_type='dart',
    data_sample_strategy='bagging',
    learning_rate=0.1,
    n_estimators=200,
    num_leaves=21,
)

In [70]:
# Cross-validate the model on the training split with the shared splitter
# and scorers, then report mean ± standard deviation over the folds.
cv_scores = cross_validate(MLR_model, X_train, y_train, scoring=scores, cv=split)

print("On cross-validation:")
# (label shown in the report, key under which cross_validate stored the folds)
for label, key in (("RMSE", "test_MSE"), ("Q2", "test_Q2")):
    fold_values = cv_scores[key]
    print(f"Mean {label} score is {fold_values.mean().round(3)} ± {fold_values.std().round(3)}")



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002266 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3075
[LightGBM] [Info] Number of data points in the train set: 551, number of used features: 324
[LightGBM] [Info] Start training from score -0.093485
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001969 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3042
[LightGBM] [Info] Number of data points in the train set: 551, number of used features: 321
[LightGBM] [Info] Start training from score -0.053339








[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003123 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3053
[LightGBM] [Info] Number of data points in the train set: 551, number of used features: 320
[LightGBM] [Info] Start training from score -0.056171




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004897 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3075
[LightGBM] [Info] Number of data points in the train set: 551, number of used features: 324
[LightGBM] [Info] Start training from score -0.095717




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009092 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3078
[LightGBM] [Info] Number of data points in the train set: 552, number of used features: 325
[LightGBM] [Info] Start training from score -0.087844
On cross-validation:
Mean RMSE score is 0.321 ± 0.018
Mean Q2 score is 0.728 ± 0.04


In [71]:
# Fit on the whole training split and score on those same rows: this is a
# goodness-of-fit figure, not an estimate of performance on unseen data.
MLR_model.fit(X_train, y_train)
y_pred = MLR_model.predict(X_train)

# Built-in round() works whether r2_score returns a NumPy scalar or a plain
# Python float (plain floats have no .round() method).
print(f"R2: {round(r2_score(y_train, y_pred), 3)}")



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008002 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3604
[LightGBM] [Info] Number of data points in the train set: 689, number of used features: 347
[LightGBM] [Info] Start training from score -0.077315
R2: 0.954


In [72]:
# Fit on the training split and evaluate on the held-out validation rows.
MLR_model.fit(X_train, y_train)
y_pred = MLR_model.predict(X_val)

# Built-in round() is used instead of the .round() method so the report works
# whether the metric comes back as a NumPy scalar or a plain Python float.
print(f"R2_val: {round(r2_score(y_val, y_pred), 3)}")

print(f"RMSE_val: {round(root_mean_squared_error(y_val, y_pred), 3)}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003786 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3604
[LightGBM] [Info] Number of data points in the train set: 689, number of used features: 347
[LightGBM] [Info] Start training from score -0.077315




R2_val: 0.733
RMSE_val: 0.34


In [73]:
# The model was trained on StandardScaler-transformed features, so the fitted
# scaler is saved next to it; without it, new raw data cannot be transformed
# the same way before being passed to the model.
joblib.dump(scaler, "best_regression_scaler.joblib")
# Kept as the last expression so the cell still displays the model file name.
joblib.dump(MLR_model, "best_regression_model.joblib")

['best_regression_model.joblib']