In [4]:
!pip install xgboost



In [5]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams

# from cats2d.rd_cats2d import CATS2D
# from rdkit.Avalon import pyAvalonTools
# from rdkit import Chem
# from rdkit.Chem import Crippen,Descriptors, MACCSkeys, GraphDescriptors, Lipinski, QED
from statistics import mean
from tqdm import tqdm

In [6]:
from xgboost import XGBRegressor, XGBClassifier
import sklearn
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.model_selection import train_test_split, KFold, cross_validate, GridSearchCV
from sklearn.metrics import make_scorer, r2_score, mean_absolute_error, mean_squared_error, roc_auc_score, f1_score

In [7]:
# Load the descriptor/fingerprint table used for classification and preview the first rows.
DATA_PATH = 'avalon_refined_classification_395.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Activity,MW,AlogP,PSA,#ROTB,#ALERTS,MlogP,#HeavyAtoms,#NHOH,#AromaticHeterocycles,...,A_502,A_503,A_504,A_505,A_506,A_507,A_508,A_509,A_510,A_511
0,0,398.4,3.7,141.31,6.0,1.0,3.7,28.0,3.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,520.48,-1.13,206.3,9.0,4.0,-1.13,36.0,4.0,1.0,...,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0
2,0,305.5,3.2,33.12,0.0,1.0,3.2,13.0,1.0,1.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0,342.27,2.11,73.1,6.0,5.0,2.11,19.0,2.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0,461.47,-1.24,149.15,3.0,1.0,-1.24,33.0,5.0,0.0,...,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0


### Обучение модели

In [8]:
# 5-fold shuffled split with a fixed seed so the folds are reproducible between runs.
split = KFold(n_splits=5, shuffle=True, random_state=41)

# Use scikit-learn's built-in scorer names instead of make_scorer(..., needs_threshold=True):
# `needs_threshold` was deprecated in scikit-learn 1.4 and removed in 1.6, whereas the "f1" and
# "roc_auc" names resolve to the same scorers (binary F1 on predicted labels, ROC AUC on
# continuous scores) on every version.
# The dict keys become the `test_F1` / `test_AUC` entries returned by cross_validate.
scores = {
    "F1": "f1",
    "AUC": "roc_auc",
}

In [9]:
# Select the label by name rather than by position. With positional slicing, a CSV that was
# saved together with its index (pandas reads that as an "Unnamed: N" column) would turn the
# row id into the label and leak the real label into the feature matrix.
TARGET_COLUMN = "Activity"

# Saved-index columns are row ids, not descriptors, so they are excluded from the features.
index_like_columns = [col for col in df.columns if str(col).startswith("Unnamed:")]

X = df.drop(columns=[TARGET_COLUMN, *index_like_columns]).values
y = df[TARGET_COLUMN].values

In [10]:
# Hold out a stratified 20% test set. Previously X_test / y_test were aliases of the training
# data, so the final F1 / ROC_AUC were measured on samples the model had already been fitted on.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [11]:
# Standardise features: the scaler learns its statistics from the training matrix only and the
# same fitted transform is then applied to the test matrix.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [12]:
# Shell command (IPython `!`): print the GPU, driver and CUDA information visible to this runtime.
!nvidia-smi

Mon Jan 15 19:43:01 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [13]:
# Hyper-parameters were obtained from a grid search (see the commented-out GridSearchCV cells).
# XGBoost 2.0 deprecated tree_method='gpu_hist' and gpu_id in favour of tree_method='hist'
# combined with `device`; 'cuda:0' selects the same GPU that gpu_id=0 did.
model = XGBClassifier(
    random_state=42,
    learning_rate=0.001,
    max_depth=8,
    n_estimators=1000,
    subsample=0.3,
    tree_method='hist',
    device='cuda:0',
)

In [14]:
# Hyper-parameter grid for the (currently commented-out) grid search cells.
parameters = dict(
    max_depth=range(6, 12, 2),
    learning_rate=[0.1, 0.01, 0.001],
    subsample=[1, 0.75, 0.5, 0.3, 0.1],
    n_estimators=[1000, 1500, 2000],
)

In [15]:
# roc_auc_scorer = sklearn.metrics.make_scorer(roc_auc_score, greater_is_better=True,
#                              needs_threshold=True)

In [16]:
# grid_search = GridSearchCV(
#     estimator=model,
#     param_grid=parameters,
#     # scoring='neg_mean_squared_error',
#     scoring=roc_auc_scorer,
#     verbose=3)


In [17]:
# grid_search.fit(X, y)

In [18]:
# grid_search.best_params_

In [19]:
# grid_search.best_score_

In [None]:
# Cross-validate the model on the training matrix with the configured folds and scorers,
# then report mean ± standard deviation across folds for each metric.
cv_scores = cross_validate(model, X_train, y_train, scoring=scores, cv=split)

print("On cross-validation:")
for metric_label, result_key in (("ROC_AUC", "test_AUC"), ("F1", "test_F1")):
    fold_values = cv_scores[result_key]
    print(f"Mean {metric_label} score is {fold_values.mean().round(3)} ± {fold_values.std().round(3)}")

In [None]:
# Fit on the training matrix, then score the predictions for X_test against y_test.
model.fit(X_train, y_train)

# F1 is computed from hard class labels.
# The built-in round() is used instead of the NumPy-only `.round()` method: depending on the
# scikit-learn version the metric functions return a plain Python float, which has no `.round()`.
y_pred_labels = model.predict(X_test)
print(f"F1: {round(f1_score(y_test, y_pred_labels), 3)}")

# ROC AUC is computed from the probability in column 1 of predict_proba.
# The name `y_pred` is kept for the probability matrix so that any later cell reading it
# sees the same array as before.
y_pred = model.predict_proba(X_test)
print(f"ROC_AUC: {round(roc_auc_score(y_test, y_pred[:, 1]), 3)}")