### XGBoost Covalent Unit Count Prediction

#### Importing Libraries

In [90]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor # Importing our Boosting Model
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score # Used to evaluted precision of KNN model
from sklearn.model_selection import train_test_split #Split the dataset into training and testing sets 

#### Dataset Visualization

In [91]:
# set our dataset equal to ds variable
ds = pd.read_csv('DDH Data with Properties.csv')

# view first 5 lines of dataset
ds.head() 

Unnamed: 0,CID,SMILES,MolecularFormula,MolecularWeight,InChI,InChIKey,IUPACName,XLogP,ExactMass,MonoisotopicMass,...,FeatureAcceptorCount3D,FeatureDonorCount3D,FeatureAnionCount3D,FeatureCationCount3D,FeatureRingCount3D,FeatureHydrophobeCount3D,ConformerModelRMSD3D,EffectiveRotorCount3D,ConformerCount3D,pIC50
0,2744814,ClC1=CC(NC(=O)CSC2=NC=CC(=N2)C2=CSC(=N2)C2=CC=...,C21H14Cl2N4OS2,473.4,InChI=1S/C21H14Cl2N4OS2/c22-14-8-15(23)10-16(9...,LILOEJREEQFTPM-UHFFFAOYSA-N,"N-(3,5-dichlorophenyl)-2-[4-(2-phenyl-1,3-thia...",5.6,471.998609,471.998609,...,3.0,1.0,0.0,1.0,4.0,0.0,1.0,7.0,10.0,-0.477121255
1,2821293,CN1N=C(C=C1C(F)(F)F)C1=CC=C(S1)C1=CC=NC(SCC(=O...,C21H15ClF3N5OS2,510.0,"InChI=1S/C21H15ClF3N5OS2/c1-30-18(21(23,24)25)...",AWQBIBTZJKFLEW-UHFFFAOYSA-N,N-(4-chlorophenyl)-2-[4-[5-[1-methyl-5-(triflu...,4.9,509.035865,509.035865,...,3.0,1.0,0.0,1.0,4.0,0.0,1.2,8.0,10.0,-1
2,2820912,CSC1=C(C(C)=C(S1)C1=NC(C)=CS1)C1=CC=NC(SCC(=O)...,C22H19ClN4OS4,519.1,InChI=1S/C22H19ClN4OS4/c1-12-10-30-20(25-12)19...,WRXXISITJDZVCL-UHFFFAOYSA-N,N-(4-chlorophenyl)-2-[4-[4-methyl-2-methylsulf...,6.3,518.013024,518.013024,...,3.0,1.0,0.0,1.0,4.0,1.0,1.0,8.0,10.0,-1.041392685
3,2820914,CSC1=C(C(C)=C(S1)C1=NC(C)=CS1)C1=CC=NC(SCC(=O)...,C22H19ClN4OS4,519.1,InChI=1S/C22H19ClN4OS4/c1-12-10-30-20(25-12)19...,NNVVKOVHRSDRSQ-UHFFFAOYSA-N,N-(2-chlorophenyl)-2-[4-[4-methyl-2-methylsulf...,6.3,518.013024,518.013024,...,3.0,1.0,0.0,1.0,4.0,1.0,1.2,8.0,10.0,BLINDED
4,2744846,CC1=NC(=CS1)C1=NC(=CS1)C1=NC(SCC(=O)NC2=CC=C(C...,C19H14ClN5OS3,460.0,InChI=1S/C19H14ClN5OS3/c1-11-22-16(9-27-11)18-...,JEZYTEDGOJCVQS-UHFFFAOYSA-N,"N-(4-chlorophenyl)-2-[4-[2-(2-methyl-1,3-thiaz...",4.4,459.004901,459.004901,...,4.0,1.0,0.0,1.0,4.0,0.0,1.0,7.0,10.0,-1.146128036


#### Preprocessing

In [92]:
# dropping  non-float values
ds.drop(['CID','SMILES','InChI','InChIKey','IUPACName','MolecularFormula'],axis=1,inplace=True)

# replacing non existing values in columns
ds.replace(np.nan,0,inplace=True)

# replacing binded value with float value
ds = ds[ds['pIC50'] != 'BLINDED']
# ds.loc[ds['pIC50']=='BLINDED',:]
ds['pIC50']=ds['pIC50'].astype('float64')

#### Splitting Dataset

In [93]:
# Setting X equal to all columns except for covalent unit count
X=ds.loc[:,ds.columns!='CovalentUnitCount']

# Setting y equal to covalent unit count
y=ds['CovalentUnitCount']

# 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Scaling Data

In [94]:
# Initialize the scaler
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

# Scalling split data
X_train = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

#### Building and Training Model

In [95]:
# XGBoost model built using XGBoost Regreesor
boost_model = GradientBoostingRegressor(learning_rate=0.1, n_estimators=100, max_depth=3, random_state=42)
# fitting our XGBoost model with the X train and X test sets that have been scaled accordingly with their features.
boost_model.fit(X_train, y_train)
# producing our prediction using our scaled test set
y_pred = boost_model.predict(X_test)



#### Evaluating Model

In [96]:
# measuring how precise our model was in its predictions
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))

Mean Squared Error: 2.4666821156062076e-05
