In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams

from statistics import mean
from tqdm import tqdm

In [None]:
from xgboost import XGBRegressor, XGBClassifier
import sklearn
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.model_selection import train_test_split, KFold, cross_validate, GridSearchCV
from sklearn.metrics import make_scorer, r2_score, mean_absolute_error, mean_squared_error, roc_auc_score, f1_score

In [None]:
# Read the regression dataset from the working directory and preview the
# first five rows.
df = pd.read_csv(filepath_or_buffer='regression_dataset.csv')
df.head(n=5)

Unnamed: 0,Class,MW,AlogP,#HBA,#HBD,PSA,#ROTB,#ALERTS,MlogP,#MR,...,c[X],E_1,E_2,E_3,E_4,E_5,E_6,E_7,E_8,Activity
0,0,182.17,-3.59,6.0,6.0,121.38,5.0,0.0,-3.59,38.2,...,0,0.0,0.0,52.176914,0.0,0.0,0.0,-6.391944,-1.451636,-1.6
1,0,275.34,1.67,4.0,3.0,106.39,3.0,2.0,1.67,77.32,...,0,0.0,19.201681,5.07883,12.969815,-0.134704,7.406556,0.0,1.464642,-1.57
2,0,415.29,4.98,5.0,2.0,75.11,4.0,1.0,4.98,90.27,...,0,76.40847,19.134407,11.887089,-2.200566,-1.765334,6.596278,-9.113484,-0.113526,-1.57
3,0,543.48,-5.6,17.0,8.0,285.14,9.0,1.0,-5.6,101.05,...,0,84.727417,11.389837,53.05147,0.0,-0.728697,-1.556118,-16.981647,-1.05756,-1.54
4,0,314.38,1.18,6.0,4.0,137.5,3.0,4.0,1.18,88.75,...,0,0.0,12.180389,16.443945,13.096557,0.335015,7.534264,1.814401,0.0,-1.54


### **Model training**

In [None]:
def _rmse(y_true, y_pred):
    """Return the root-mean-squared error between `y_true` and `y_pred`.

    Computed as sqrt(MSE) instead of `mean_squared_error(..., squared=False)`,
    because the `squared` keyword was deprecated in scikit-learn 1.4 and
    removed in 1.6. For a single-output target the two are equivalent.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


# Shuffled 5-fold splitter with a fixed seed so the folds are reproducible.
split = KFold(n_splits=5, random_state=41, shuffle=True)

# Metrics handed to cross_validate; results appear under "test_<key>".
scores = {
    "Q2": "r2",
    # The key stays "MSE" because the reporting cell reads cv_scores['test_MSE'];
    # the value produced is the RMSE.
    "MSE": make_scorer(_rmse),
}

In [None]:
# Features are every column except the last one; the last column is the target.
# 'Unnamed: 0' looks like a row counter saved into the CSV (0, 1, 2, ... in the
# preview) and the preview rows appear ordered by the target, so it is excluded
# from the features. errors="ignore" keeps the cell working if the column is
# absent.
# NOTE(review): 'Class' is still used as a feature - confirm it is not derived
# from the target column.
feature_frame = df.iloc[:, :-1].drop(columns=["Unnamed: 0"], errors="ignore")
X = feature_frame.values
y = df.iloc[:, -1].values

In [None]:
# Hold out 20% of the rows for the final evaluation so that scores computed on
# X_test / y_test are not measured on the rows the model was fitted on.
# train_test_split shuffles by default; the seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Learn the standardisation statistics from the training features only, then
# apply that same fitted transform to both the training and the test features.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Shell escape: print the NVIDIA driver / GPU status of this runtime before
# configuring the GPU-based model below.
!nvidia-smi

Mon Dec 11 16:47:08 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# these model parameters were calculated by GridSearch
# GPU training is requested with tree_method='hist' plus device='cuda:0'.
# The former tree_method='gpu_hist' / gpu_id=0 spelling is deprecated in
# XGBoost 2.x and produces the warnings that recommend
# `tree_method = "hist", device = "cuda"`.
model = XGBRegressor(
    random_state=42,
    max_depth=10,
    learning_rate=0.01,
    subsample=0.5,
    n_estimators=1500,
    tree_method='hist',
    device='cuda:0',
)

In [None]:
# Hyper-parameter grid for the grid search over the XGBoost regressor.
parameters = dict(
    max_depth=range(6, 12, 2),  # i.e. 6, 8, 10
    learning_rate=[0.1, 0.01, 0.001],
    subsample=[1, 0.75, 0.5, 0.3, 0.1],
    n_estimators=[1000, 1500, 2000],
)

In [None]:
# ROC-AUC scorer built from roc_auc_score; needs_threshold=True asks the scorer
# to evaluate continuous decision values instead of predicted labels.
# NOTE(review): this is a classification metric and it is not referenced by any
# other visible cell of this regression notebook - confirm it is still needed.
roc_auc_scorer = make_scorer(
    roc_auc_score,
    greater_is_better=True,
    needs_threshold=True,
)

In [None]:
# RMSE scorer for model selection: squared=False is forwarded to
# mean_squared_error so it yields the root of the MSE, and
# greater_is_better=False makes the scorer sign-flip the value so that a
# larger score means a better model.
rmse_mod = make_scorer(
    mean_squared_error,
    greater_is_better=False,
    squared=False,
)

In [None]:
# Exhaustive search over `parameters`, scored with the RMSE scorer `rmse_mod`.
# cv=split reuses the shuffled, seeded 5-fold splitter that cross_validate
# uses, so the search and the reported CV metrics are computed on the same
# kind of folds (without it GridSearchCV falls back to its default folds).
grid_search = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring=rmse_mod,
    cv=split,
    verbose=3,
)


In [None]:
# grid_search.fit(X, y)

In [None]:
# grid_search.best_params_

In [None]:
# grid_search.best_score_

In [None]:
# Cross-validate the model on the training data with the `split` folds and the
# metrics declared in `scores`.
cv_scores = cross_validate(model, X_train, y_train, scoring=scores, cv=split)

# Per-fold results; the scorer registered under "MSE" reports the root of the
# MSE, hence the RMSE label in the printed summary.
rmse_folds = cv_scores['test_MSE']
q2_folds = cv_scores['test_Q2']

print("On cross-validation:")
print(f"Mean RMSE score is {rmse_folds.mean().round(3)} ± {rmse_folds.std().round(3)}")
print(f"Mean Q2 score is {q2_folds.mean().round(3)} ± {q2_folds.std().round(3)}")


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



On cross-validation:
Mean RMSE score is 0.308 ± 0.022
Mean Q2 score is 0.792 ± 0.023



    E.g. tree_method = "hist", device = "cuda"



In [None]:
# Fit on the training data and predict for X_test.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Built-in round() is used instead of the `.round()` method so the line works
# whether r2_score returns a NumPy scalar or a plain Python float (a plain
# float has no `.round` attribute).
print(f"R2: {round(r2_score(y_test, y_pred), 3)}")


    E.g. tree_method = "hist", device = "cuda"



R2: 0.988



    E.g. tree_method = "hist", device = "cuda"

