In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams

from statistics import mean
from tqdm import tqdm

In [None]:
from xgboost import XGBRegressor, XGBClassifier
import sklearn
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.model_selection import train_test_split, KFold, cross_validate, GridSearchCV
from sklearn.metrics import make_scorer, r2_score, mean_absolute_error, mean_squared_error, roc_auc_score, f1_score

In [None]:
df = pd.read_csv('fragments_classification_actual.csv')
df.head()

Unnamed: 0,Activity,MW,#HBA,#HBD,PSA,#ROTB,#ALERTS,MlogP,#MR,#HeavyAtoms,...,c[X],E_1,E_2,E_3,E_4,E_5,E_6,E_7,E_8,E_9
0,1,252.73,0.0,0.0,0.0,0.0,1.0,2.45,30.57,4.0,...,0,0.270833,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,98.14,1.0,1.0,20.23,1.0,1.0,0.78,29.75,7.0,...,0,0.0,0.0,8.895833,-0.888889,2.246528,0.0,5.531389,3.465139,0.0
2,1,144.6,1.0,1.0,20.23,2.0,1.0,1.51,39.07,9.0,...,0,0.0,0.0,9.221389,0.074514,2.213333,0.0,6.856096,1.788441,0.0
3,1,175.19,2.0,1.0,52.32,2.0,1.0,1.46,48.68,13.0,...,0,4.688287,10.446373,0.0,5.589179,2.32662,9.010249,3.605958,0.0,0.0
4,1,360.47,4.0,1.0,35.94,6.0,1.0,2.66,101.66,26.0,...,1,19.006491,4.491719,10.364971,0.572907,2.614048,6.624001,10.376729,4.365801,0.0


### Обучение модели

In [None]:
split = KFold(n_splits=5, random_state=41, shuffle=True)
scores= {
         "F1":  make_scorer(f1_score),
         "AUC": make_scorer(roc_auc_score, needs_threshold=True)
         }

In [None]:
X = df.iloc[:, 1:].values
y = df.iloc[:, 0].values

In [None]:
X_train, X_val, y_train, y_val = X, X, y, y

In [None]:
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

In [None]:
MLR_model = RandomForestClassifier(random_state=102, bootstrap=True, class_weight='balanced', criterion='gini', max_depth=None, max_features='log2', n_estimators=200)

In [None]:
roc_auc_scorer = sklearn.metrics.make_scorer(roc_auc_score, greater_is_better=True,
                             needs_threshold=True)

In [None]:
grid_search = GridSearchCV(
    estimator=MLR_model,
    param_grid=parameters,
    # scoring='neg_mean_squared_error',
    scoring=roc_auc_scorer,
    verbose=3)


In [None]:
cv_scores = cross_validate(MLR_model, X_train, y_train, scoring=scores, cv=split)
print(f"On cross-validation:")
print(f"Mean ROC_AUC score is {cv_scores['test_AUC'].mean().round(3)} ± {cv_scores['test_AUC'].std().round(3)}")
print(f"Mean F1 score is {cv_scores['test_F1'].mean().round(3)} ± {cv_scores['test_F1'].std().round(3)}")

On cross-validation:
Mean ROC_AUC score is 0.963 ± 0.006
Mean F1 score is 0.911 ± 0.004


In [None]:
MLR_model.fit(X_train, y_train)
y_pred = MLR_model.predict(X_train)

print(f"F1: {f1_score(y_train, y_pred)}")
print(f"ROC_AUC: {roc_auc_score(y_train, y_pred)}")

F1: 0.9859473023839397
ROC_AUC: 0.9851365980616008


In [None]:
MLR_model.fit(X_train, y_train)
y_pred = MLR_model.predict(X_val)

print(f"F1: {f1_score(y_val, y_pred)}")
print(f"ROC_AUC: {roc_auc_score(y_val, y_pred)}")

F1: 0.9859473023839397
ROC_AUC: 0.9851365980616008


In [None]:
X_train.shape

(6748, 444)

In [None]:
import joblib

In [None]:
joblib.dump(MLR_model, "best_classification_model.joblib")

['best_classification_model_new.joblib']