# LOADING PRE-PROCESSED DATASET

In [81]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE

df = pd.read_csv('dataset.csv')
df

Unnamed: 0,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME
0,COC(=O)C1=CC=C(C=C1)COC(=O)C2=CC=CC=N2,0
1,CCCCN1C2=C(C(=O)NC1=O)N(C(=N2)CN(C)CC3=C(C(=CC...,1
2,CC(CCC1=CC=CC=C1)NC(=O)C2=CC=C(O2)Br,0
3,COC1=CC(=C(C=C1)OC)NC(=O)CN2C(=O)C3=CC=CC=C3S2...,0
4,CCOC(=O)C1=CC=C(C=C1)N2C(=O)CC(C2=O)N3CCCC(C3)C,0
...,...,...
1955,COC1=CC=C(C=C1)NC(=S)NC(=O)C2=CC(=CN=C2)Br,0
1956,CCCCOC1=CC=CC(=C1)C(=O)NNC(=S)NC2CCCCC2,0
1957,COC1=C(C=C(C=C1)C2=NOC(=N2)CCCC(=O)NC3=CC=C(C=...,0
1958,CC1=CC=C(C=C1)N2C(=O)C3=CC(=C(C=C3N(C2=O)CC(=O...,0


# COMPUTING MOLECULAR DESCRIPTORS

In [82]:
smiles_list = df['PUBCHEM_EXT_DATASOURCE_SMILES'].tolist() 
mols = [Chem.MolFromSmiles(smi) for smi in smiles_list]

In [83]:
def morgan_fps(data):
    fps = [AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048) for mol in data]
    fp_array = [np.array(fp) for fp in fps]  
    column_names = ["morgan_" + str(i) for i in range(2048)]  
    return pd.DataFrame(fp_array, columns=column_names)

X = morgan_fps(mols)
X



Unnamed: 0,morgan_0,morgan_1,morgan_2,morgan_3,morgan_4,morgan_5,morgan_6,morgan_7,morgan_8,morgan_9,...,morgan_2038,morgan_2039,morgan_2040,morgan_2041,morgan_2042,morgan_2043,morgan_2044,morgan_2045,morgan_2046,morgan_2047
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1955,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1956,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1957,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1958,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


MORGAN DESCRIPTORS GENERATED 2048 FEATURES. DATA STRUCTURE 1960 rows X 2048 columns

In [84]:
y = df["PUBCHEM_ACTIVITY_OUTCOME"]
y

0       0
1       1
2       0
3       0
4       0
       ..
1955    0
1956    0
1957    0
1958    0
1959    0
Name: PUBCHEM_ACTIVITY_OUTCOME, Length: 1960, dtype: int64

# TRAIN -TEST WITH RATIO 80:20

In [85]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 18)
X_train

Unnamed: 0,morgan_0,morgan_1,morgan_2,morgan_3,morgan_4,morgan_5,morgan_6,morgan_7,morgan_8,morgan_9,...,morgan_2038,morgan_2039,morgan_2040,morgan_2041,morgan_2042,morgan_2043,morgan_2044,morgan_2045,morgan_2046,morgan_2047
737,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
118,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
755,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1174,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1726,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
837,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1144,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
275,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [86]:
X_train

Unnamed: 0,morgan_0,morgan_1,morgan_2,morgan_3,morgan_4,morgan_5,morgan_6,morgan_7,morgan_8,morgan_9,...,morgan_2038,morgan_2039,morgan_2040,morgan_2041,morgan_2042,morgan_2043,morgan_2044,morgan_2045,morgan_2046,morgan_2047
737,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
118,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
755,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1174,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1726,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
837,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1144,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
275,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [87]:
y_train

737     0
118     0
38      1
755     0
1174    0
       ..
1726    0
837     1
1144    0
275     1
1322    1
Name: PUBCHEM_ACTIVITY_OUTCOME, Length: 1568, dtype: int64

In [88]:
X_test

Unnamed: 0,morgan_0,morgan_1,morgan_2,morgan_3,morgan_4,morgan_5,morgan_6,morgan_7,morgan_8,morgan_9,...,morgan_2038,morgan_2039,morgan_2040,morgan_2041,morgan_2042,morgan_2043,morgan_2044,morgan_2045,morgan_2046,morgan_2047
279,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
682,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
774,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
63,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1292,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1856,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1008,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1555,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1683,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [89]:
y_test

279     0
682     0
774     0
63      0
1292    0
       ..
1856    0
1008    0
1555    1
1683    0
1905    0
Name: PUBCHEM_ACTIVITY_OUTCOME, Length: 392, dtype: int64

# BALANCING DATA USING SMOTE

In [90]:
smote = SMOTE(sampling_strategy='minority',random_state = 9)

X_sm, y_sm = smote.fit_resample(X_train, y_train)


In [91]:
X_sm

Unnamed: 0,morgan_0,morgan_1,morgan_2,morgan_3,morgan_4,morgan_5,morgan_6,morgan_7,morgan_8,morgan_9,...,morgan_2038,morgan_2039,morgan_2040,morgan_2041,morgan_2042,morgan_2043,morgan_2044,morgan_2045,morgan_2046,morgan_2047
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1981,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1982,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1983,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1984,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [92]:
y_sm

0       0
1       0
2       1
3       0
4       0
       ..
1981    1
1982    1
1983    1
1984    1
1985    1
Name: PUBCHEM_ACTIVITY_OUTCOME, Length: 1986, dtype: int64

In [93]:
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# BUILDING RANDOM FOREST MODEL

In [22]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(max_features = 5 , n_estimators = 185, random_state=2)
rfc.fit(X_sm, y_sm)
 
preds = rfc.predict(X_test)
proba = rfc.predict_proba(X_test)

In [23]:
tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
accuracy = (tn + tp) / (tn+tp+fn+fp)
balanced_accuracy = (sensitivity + specificity) / 2
precision = tp / (tp + fp)
f1 = 2 * (precision * sensitivity) / (precision + sensitivity)
roc_auc = roc_auc_score(y_test, proba[:, 1]) 


print(f"sensitivity: {sensitivity}, specificity: {specificity}, precision: {precision}, f1: {f1} accuracy: {accuracy}, roc_auc: {roc_auc} \nbalanced_accuracy: {balanced_accuracy}")

sensitivity: 0.8456375838926175, specificity: 0.8806584362139918, precision: 0.8129032258064516, f1: 0.8289473684210525 accuracy: 0.8673469387755102, roc_auc: 0.9070345513298533 
balanced_accuracy: 0.8631480100533047


# TUNNING HYPERPARAMETER

In [None]:
param_grid = {
    'n_estimators': np.linspace(100, 1000, num=5, dtype=int).tolist(),  # Range of values
    'max_features': ['sqrt', 'log2', None],  
    'max_depth': [10, 20, 30, None],  
    'min_samples_split': np.arange(2, 11, 2).tolist(),  # Even numbers from 2 to 10
    'min_samples_leaf': np.arange(1, 6, 1).tolist()  # 1 to 5
}

rfc = RandomForestClassifier(random_state=2)

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=2)

grid_search = GridSearchCV(
    estimator=rfc,
    param_grid=param_grid,
    scoring='accuracy',  
    cv=cv,  
    n_jobs=-1,  
    verbose=1  
)

grid_search.fit(X_sm, y_sm)

print("Best parameters found using GridSearchCV:")
print(grid_search.best_params_)
print()

print("Best accuracy achieved:")
print(grid_search.best_score_)

In [24]:
rfc = RandomForestClassifier(max_features = 5, n_estimators = 185, random_state = 3)
rfc.fit(X_sm, y_sm)
 
preds = rfc.predict(X_test)
proba = rfc.predict_proba(X_test)

In [25]:
tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
accuracy = (tn + tp) / (tn+tp+fn+fp)
balanced_accuracy = (sensitivity + specificity) / 2
precision = tp / (tp + fp)
f1 = 2 * (precision * sensitivity) / (precision + sensitivity)
roc_auc = roc_auc_score(y_test, proba[:, 1]) 


print(f"sensitivity: {sensitivity}, specificity: {specificity}, precision: {precision}, f1: {f1} accuracy: {accuracy}, roc_auc: {roc_auc} \nbalanced_accuracy: {balanced_accuracy}")

sensitivity: 0.8456375838926175, specificity: 0.8600823045267489, precision: 0.7875, f1: 0.8155339805825242 accuracy: 0.8545918367346939, roc_auc: 0.9130831054768415 
balanced_accuracy: 0.8528599442096831


# VALIDATING RANDOM FOREST MODEL

In [39]:
dp= pd.read_csv('val_data.csv')
dp

Unnamed: 0,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME
0,CCN1CCN(CC1)CC(=O)NC2=C(C=CC(=C2)C(F)(F)F)OCCO...,1
1,CC(=O)NC1=CC(=O)C2=C(N1)N=CC=C2,1
2,CN1[C@H]2C[C@H](N(C[C@H]2CO1)CC3=CC=C(C=C3)Cl)...,1
3,CCOC(=O)N1CCN(CC1)C2=C(C(=O)C2=O)N3CCN(CC3)C4=...,0
4,C1CN(CCN1C2=CC=CC=C2)C3=C(C=CC=N3)C(=O)NC4=CC=...,1
5,CN(CC1=C(C=CC2=C1OC(=O)C3=CC=CC=C23)O)C4CCCCC4,1
6,CC1=C2COC(CC2=C(C(=N1)N3CCOCC3)C#N)(C)C,0
7,COC1=CC=C(C=C1)C2C3=C(C=C(C=C3)O)OC4=C2C(=N)N(...,0
8,COC1=C(C(=C2CCC3=C(C2=C1)ON=C3)OC)OC,0
9,CN(C)C1=CC=C(C=C1)C(CCN2CCOCC2)C3=C(C=C(C=C3OC...,1


In [40]:
valdata = dp.copy()
colname = ["PUBCHEM_EXT_DATASOURCE_SMILES", "PUBCHEM_ACTIVITY_OUTCOME"]
valdata = valdata[colname]

valdata.drop_duplicates(inplace = True, ignore_index = True)
valdata.reset_index(inplace = True, drop = True)
valdata

Unnamed: 0,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME
0,CCN1CCN(CC1)CC(=O)NC2=C(C=CC(=C2)C(F)(F)F)OCCO...,1
1,CC(=O)NC1=CC(=O)C2=C(N1)N=CC=C2,1
2,CN1[C@H]2C[C@H](N(C[C@H]2CO1)CC3=CC=C(C=C3)Cl)...,1
3,CCOC(=O)N1CCN(CC1)C2=C(C(=O)C2=O)N3CCN(CC3)C4=...,0
4,C1CN(CCN1C2=CC=CC=C2)C3=C(C=CC=N3)C(=O)NC4=CC=...,1
5,CN(CC1=C(C=CC2=C1OC(=O)C3=CC=CC=C23)O)C4CCCCC4,1
6,CC1=C2COC(CC2=C(C(=N1)N3CCOCC3)C#N)(C)C,0
7,COC1=CC=C(C=C1)C2C3=C(C=C(C=C3)O)OC4=C2C(=N)N(...,0
8,COC1=C(C(=C2CCC3=C(C2=C1)ON=C3)OC)OC,0
9,CN(C)C1=CC=C(C=C1)C(CCN2CCOCC2)C3=C(C=C(C=C3OC...,1


In [41]:
smi_list = valdata["PUBCHEM_EXT_DATASOURCE_SMILES"].to_list()
molsv = [Chem.MolFromSmiles(smi) for smi in smi_list]

X_val = morgan_fps(molsv)
X_val



Unnamed: 0,morgan_0,morgan_1,morgan_2,morgan_3,morgan_4,morgan_5,morgan_6,morgan_7,morgan_8,morgan_9,...,morgan_2038,morgan_2039,morgan_2040,morgan_2041,morgan_2042,morgan_2043,morgan_2044,morgan_2045,morgan_2046,morgan_2047
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
y_val = valdata["PUBCHEM_ACTIVITY_OUTCOME"]
y_val

0     1
1     1
2     1
3     0
4     1
5     1
6     0
7     0
8     0
9     1
10    0
11    1
12    1
13    0
14    0
15    1
16    1
17    0
18    1
19    0
20    1
21    1
22    1
23    0
24    0
25    1
26    0
27    0
28    1
29    0
30    0
31    0
32    1
33    0
34    1
35    0
36    1
37    1
38    0
39    0
Name: PUBCHEM_ACTIVITY_OUTCOME, dtype: int64

In [43]:
y_valpred = rfc.predict(X_val)
y_valproba = rfc.predict_proba(X_val)

In [44]:
tn, fp, fn, tp = confusion_matrix(y_val, y_valpred).ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
accuracy = (tn + tp) / (tn+tp+fn+fp)
balanced_accuracy = (sensitivity + specificity) / 2
precision = tp / (tp + fp)
f1 = 2 * (precision * sensitivity) / (precision + sensitivity)
#roc_auc = roc_auc_score(y_val, y_valproba[:, 1]) # calculate ROC AUC

print(f"sensitivity: {sensitivity}, specificity: {specificity}, precision: {precision}, f1: {f1}, accuracy: {accuracy}, roc_auc: {roc_auc} \nbalanced_accuracy: {balanced_accuracy}")


sensitivity: 0.8, specificity: 0.8, precision: 0.8, f1: 0.8000000000000002, accuracy: 0.8, roc_auc: 0.9130831054768415 
balanced_accuracy: 0.8


# Y_RANDOMIZATION TEST

In [46]:
y_val_randomized = np.random.permutation(y_val)
y_val_randomized

array([1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0])

In [47]:
y_valpred = rfc.predict(X_val)

In [48]:
tn, fp, fn, tp = confusion_matrix(y_val_randomized, y_valpred).ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
accuracy = (tn + tp) / (tn+tp+fn+fp)
balanced_accuracy = (sensitivity + specificity) / 2
precision = tp / (tp + fp)
f1 = 2 * (precision * sensitivity) / (precision + sensitivity)
#roc_auc = roc_auc_score(y_val, y_valproba[:, 1]) # calculate ROC AUC

print(f"sensitivity: {sensitivity}, specificity: {specificity}, precision: {precision}, f1: {f1}, accuracy: {accuracy}, roc_auc: {roc_auc} \nbalanced_accuracy: {balanced_accuracy}")


sensitivity: 0.6, specificity: 0.6, precision: 0.6, f1: 0.6, accuracy: 0.6, roc_auc: 0.9130831054768415 
balanced_accuracy: 0.6


# BUILDING GRADIENT BOOST CLASSIFIER

In [94]:
from sklearn.ensemble import GradientBoostingClassifier

gbm = GradientBoostingClassifier(n_estimators=150, max_depth=9, learning_rate=0.2, random_state=5)

# Fit the model to your training data
gbm.fit(X_sm, y_sm)

# Make predictions using the test set
preds = gbm.predict(X_test)
proba = gbm.predict_proba(X_test)


In [95]:
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score

tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
accuracy = (tn + tp) / (tn+tp+fn+fp)
balanced_accuracy = (sensitivity + specificity) / 2
precision = tp / (tp + fp)
f1 = 2 * (precision * sensitivity) / (precision + sensitivity)
roc_auc = roc_auc_score(y_test, proba[:, 1]) 


print(f"sensitivity: {sensitivity}, specificity: {specificity}, precision: {precision}, f1: {f1} accuracy: {accuracy}, roc_auc: {roc_auc} \nbalanced_accuracy: {balanced_accuracy}")

sensitivity: 0.8321678321678322, specificity: 0.8875502008032129, precision: 0.8095238095238095, f1: 0.8206896551724138 accuracy: 0.8673469387755102, roc_auc: 0.9202123178026793 
balanced_accuracy: 0.8598590164855225


# VALIDATING GRADIENT BOOST CLASSIFIER

In [124]:
dp= pd.read_csv('val_data.csv')
dp

Unnamed: 0,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME
0,CCN1CCN(CC1)CC(=O)NC2=C(C=CC(=C2)C(F)(F)F)OCCO...,1
1,CC(=O)NC1=CC(=O)C2=C(N1)N=CC=C2,1
2,CN1[C@H]2C[C@H](N(C[C@H]2CO1)CC3=CC=C(C=C3)Cl)...,1
3,CCOC(=O)N1CCN(CC1)C2=C(C(=O)C2=O)N3CCN(CC3)C4=...,0
4,C1CN(CCN1C2=CC=CC=C2)C3=C(C=CC=N3)C(=O)NC4=CC=...,1
5,CN(CC1=C(C=CC2=C1OC(=O)C3=CC=CC=C23)O)C4CCCCC4,1
6,CC1=C2COC(CC2=C(C(=N1)N3CCOCC3)C#N)(C)C,0
7,COC1=CC=C(C=C1)C2C3=C(C=C(C=C3)O)OC4=C2C(=N)N(...,0
8,COC1=C(C(=C2CCC3=C(C2=C1)ON=C3)OC)OC,0
9,CN(C)C1=CC=C(C=C1)C(CCN2CCOCC2)C3=C(C=C(C=C3OC...,1


In [125]:
valdata = dp.copy()
colname = ["PUBCHEM_EXT_DATASOURCE_SMILES", "PUBCHEM_ACTIVITY_OUTCOME"]
valdata = valdata[colname]

valdata.drop_duplicates(inplace = True, ignore_index = True)
valdata.reset_index(inplace = True, drop = True)
valdata

Unnamed: 0,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME
0,CCN1CCN(CC1)CC(=O)NC2=C(C=CC(=C2)C(F)(F)F)OCCO...,1
1,CC(=O)NC1=CC(=O)C2=C(N1)N=CC=C2,1
2,CN1[C@H]2C[C@H](N(C[C@H]2CO1)CC3=CC=C(C=C3)Cl)...,1
3,CCOC(=O)N1CCN(CC1)C2=C(C(=O)C2=O)N3CCN(CC3)C4=...,0
4,C1CN(CCN1C2=CC=CC=C2)C3=C(C=CC=N3)C(=O)NC4=CC=...,1
5,CN(CC1=C(C=CC2=C1OC(=O)C3=CC=CC=C23)O)C4CCCCC4,1
6,CC1=C2COC(CC2=C(C(=N1)N3CCOCC3)C#N)(C)C,0
7,COC1=CC=C(C=C1)C2C3=C(C=C(C=C3)O)OC4=C2C(=N)N(...,0
8,COC1=C(C(=C2CCC3=C(C2=C1)ON=C3)OC)OC,0
9,CN(C)C1=CC=C(C=C1)C(CCN2CCOCC2)C3=C(C=C(C=C3OC...,1


In [126]:
smi_list = valdata["PUBCHEM_EXT_DATASOURCE_SMILES"].to_list()
molsv = [Chem.MolFromSmiles(smi) for smi in smi_list]

X_val = morgan_fps(molsv)
X_val



Unnamed: 0,morgan_0,morgan_1,morgan_2,morgan_3,morgan_4,morgan_5,morgan_6,morgan_7,morgan_8,morgan_9,...,morgan_2038,morgan_2039,morgan_2040,morgan_2041,morgan_2042,morgan_2043,morgan_2044,morgan_2045,morgan_2046,morgan_2047
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [127]:
y_val = valdata["PUBCHEM_ACTIVITY_OUTCOME"]
y_val

0     1
1     1
2     1
3     0
4     1
5     1
6     0
7     0
8     0
9     1
10    0
11    1
12    1
13    0
14    0
15    1
16    1
17    0
18    1
19    0
20    1
21    1
22    1
23    0
24    0
25    1
26    0
27    0
28    1
29    0
30    0
31    0
32    1
33    0
34    1
35    0
36    1
37    1
38    0
39    0
Name: PUBCHEM_ACTIVITY_OUTCOME, dtype: int64

In [128]:
y_valpred = gbm.predict(X_val)
y_valproba = gbm.predict_proba(X_val)

In [129]:
tn, fp, fn, tp = confusion_matrix(y_val, y_valpred).ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
accuracy = (tn + tp) / (tn+tp+fn+fp)
balanced_accuracy = (sensitivity + specificity) / 2
precision = tp / (tp + fp)
f1 = 2 * (precision * sensitivity) / (precision + sensitivity)
#roc_auc = roc_auc_score(y_val, y_valproba[:, 1]) # calculate ROC AUC

print(f"sensitivity: {sensitivity}, specificity: {specificity}, precision: {precision}, f1: {f1}, accuracy: {accuracy}, roc_auc: {roc_auc} \nbalanced_accuracy: {balanced_accuracy}")


sensitivity: 0.8, specificity: 0.95, precision: 0.9411764705882353, f1: 0.8648648648648648, accuracy: 0.875, roc_auc: 0.9038391327547953 
balanced_accuracy: 0.875


# Y_RANDOMIZATION ON GRADIENT BOOST MACHINE MODEL

In [130]:
y_val_randomized = np.random.permutation(y_val)
y_val_randomized

array([1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1])

In [131]:
y_valpred = gbm.predict(X_val)

In [132]:
tn, fp, fn, tp = confusion_matrix(y_val_randomized, y_valpred).ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
accuracy = (tn + tp) / (tn+tp+fn+fp)
balanced_accuracy = (sensitivity + specificity) / 2
precision = tp / (tp + fp)
f1 = 2 * (precision * sensitivity) / (precision + sensitivity)
#roc_auc = roc_auc_score(y_val, y_valproba[:, 1]) # calculate ROC AUC

print(f"sensitivity: {sensitivity}, specificity: {specificity}, precision: {precision}, f1: {f1}, accuracy: {accuracy}, roc_auc: {roc_auc} \nbalanced_accuracy: {balanced_accuracy}")


sensitivity: 0.4, specificity: 0.55, precision: 0.47058823529411764, f1: 0.4324324324324324, accuracy: 0.475, roc_auc: 0.9038391327547953 
balanced_accuracy: 0.47500000000000003


# BUILDING CATBOOST MODEL

In [96]:
from catboost import CatBoostClassifier

catboost = CatBoostClassifier(
    iterations=185,            
    depth=5,                   
    learning_rate=0.1,         
    random_seed=2,             
    verbose=0,                 
    loss_function='Logloss',   
)

catboost.fit(X_sm, y_sm)

preds = catboost.predict(X_test)
proba = catboost.predict_proba(X_test)

In [97]:
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score

tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
accuracy = (tn + tp) / (tn+tp+fn+fp)
balanced_accuracy = (sensitivity + specificity) / 2
precision = tp / (tp + fp)
f1 = 2 * (precision * sensitivity) / (precision + sensitivity)
roc_auc = roc_auc_score(y_test, proba[:, 1]) 


print(f"sensitivity: {sensitivity}, specificity: {specificity}, precision: {precision}, f1: {f1} accuracy: {accuracy}, roc_auc: {roc_auc} \nbalanced_accuracy: {balanced_accuracy}")

sensitivity: 0.8741258741258742, specificity: 0.8755020080321285, precision: 0.8012820512820513, f1: 0.8361204013377928 accuracy: 0.875, roc_auc: 0.926053865812902 
balanced_accuracy: 0.8748139410790013


# VALIDATING CATBOOST MODEL

In [102]:
dp= pd.read_csv('val_data.csv')
dp

Unnamed: 0,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME
0,CCN1CCN(CC1)CC(=O)NC2=C(C=CC(=C2)C(F)(F)F)OCCO...,1
1,CC(=O)NC1=CC(=O)C2=C(N1)N=CC=C2,1
2,CN1[C@H]2C[C@H](N(C[C@H]2CO1)CC3=CC=C(C=C3)Cl)...,1
3,CCOC(=O)N1CCN(CC1)C2=C(C(=O)C2=O)N3CCN(CC3)C4=...,0
4,C1CN(CCN1C2=CC=CC=C2)C3=C(C=CC=N3)C(=O)NC4=CC=...,1
5,CN(CC1=C(C=CC2=C1OC(=O)C3=CC=CC=C23)O)C4CCCCC4,1
6,CC1=C2COC(CC2=C(C(=N1)N3CCOCC3)C#N)(C)C,0
7,COC1=CC=C(C=C1)C2C3=C(C=C(C=C3)O)OC4=C2C(=N)N(...,0
8,COC1=C(C(=C2CCC3=C(C2=C1)ON=C3)OC)OC,0
9,CN(C)C1=CC=C(C=C1)C(CCN2CCOCC2)C3=C(C=C(C=C3OC...,1


In [103]:
valdata = dp.copy()
colname = ["PUBCHEM_EXT_DATASOURCE_SMILES", "PUBCHEM_ACTIVITY_OUTCOME"]
valdata = valdata[colname]

valdata.drop_duplicates(inplace = True, ignore_index = True)
valdata.reset_index(inplace = True, drop = True)
valdata

Unnamed: 0,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME
0,CCN1CCN(CC1)CC(=O)NC2=C(C=CC(=C2)C(F)(F)F)OCCO...,1
1,CC(=O)NC1=CC(=O)C2=C(N1)N=CC=C2,1
2,CN1[C@H]2C[C@H](N(C[C@H]2CO1)CC3=CC=C(C=C3)Cl)...,1
3,CCOC(=O)N1CCN(CC1)C2=C(C(=O)C2=O)N3CCN(CC3)C4=...,0
4,C1CN(CCN1C2=CC=CC=C2)C3=C(C=CC=N3)C(=O)NC4=CC=...,1
5,CN(CC1=C(C=CC2=C1OC(=O)C3=CC=CC=C23)O)C4CCCCC4,1
6,CC1=C2COC(CC2=C(C(=N1)N3CCOCC3)C#N)(C)C,0
7,COC1=CC=C(C=C1)C2C3=C(C=C(C=C3)O)OC4=C2C(=N)N(...,0
8,COC1=C(C(=C2CCC3=C(C2=C1)ON=C3)OC)OC,0
9,CN(C)C1=CC=C(C=C1)C(CCN2CCOCC2)C3=C(C=C(C=C3OC...,1


In [104]:
smi_list = valdata["PUBCHEM_EXT_DATASOURCE_SMILES"].to_list()
molsv = [Chem.MolFromSmiles(smi) for smi in smi_list]

X_val = morgan_fps(molsv)
X_val



Unnamed: 0,morgan_0,morgan_1,morgan_2,morgan_3,morgan_4,morgan_5,morgan_6,morgan_7,morgan_8,morgan_9,...,morgan_2038,morgan_2039,morgan_2040,morgan_2041,morgan_2042,morgan_2043,morgan_2044,morgan_2045,morgan_2046,morgan_2047
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [106]:
y_val = valdata["PUBCHEM_ACTIVITY_OUTCOME"]
y_val

0     1
1     1
2     1
3     0
4     1
5     1
6     0
7     0
8     0
9     1
10    0
11    1
12    1
13    0
14    0
15    1
16    1
17    0
18    1
19    0
20    1
21    1
22    1
23    0
24    0
25    1
26    0
27    0
28    1
29    0
30    0
31    0
32    1
33    0
34    1
35    0
36    1
37    1
38    0
39    0
Name: PUBCHEM_ACTIVITY_OUTCOME, dtype: int64

In [107]:
y_valpred = catboost.predict(X_val)
y_valproba = catboost.predict_proba(X_val)

In [108]:
tn, fp, fn, tp = confusion_matrix(y_val, y_valpred).ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
accuracy = (tn + tp) / (tn+tp+fn+fp)
balanced_accuracy = (sensitivity + specificity) / 2
precision = tp / (tp + fp)
f1 = 2 * (precision * sensitivity) / (precision + sensitivity)
#roc_auc = roc_auc_score(y_val, y_valproba[:, 1]) # calculate ROC AUC

print(f"sensitivity: {sensitivity}, specificity: {specificity}, precision: {precision}, f1: {f1}, accuracy: {accuracy}, roc_auc: {roc_auc} \nbalanced_accuracy: {balanced_accuracy}")


sensitivity: 0.85, specificity: 0.85, precision: 0.85, f1: 0.85, accuracy: 0.85, roc_auc: 0.9038391327547953 
balanced_accuracy: 0.85


# Y_RANDOMIZATION ON CATBOOST MODEL

In [109]:
y_val_randomized = np.random.permutation(y_val)
y_val_randomized

array([1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1])

In [110]:
tn, fp, fn, tp = confusion_matrix(y_val_randomized, y_valpred).ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
accuracy = (tn + tp) / (tn+tp+fn+fp)
balanced_accuracy = (sensitivity + specificity) / 2
precision = tp / (tp + fp)
f1 = 2 * (precision * sensitivity) / (precision + sensitivity)
#roc_auc = roc_auc_score(y_val, y_valproba[:, 1]) # calculate ROC AUC

print(f"sensitivity: {sensitivity}, specificity: {specificity}, precision: {precision}, f1: {f1}, accuracy: {accuracy}, roc_auc: {roc_auc} \nbalanced_accuracy: {balanced_accuracy}")


sensitivity: 0.55, specificity: 0.55, precision: 0.55, f1: 0.55, accuracy: 0.55, roc_auc: 0.9038391327547953 
balanced_accuracy: 0.55


# BUILDING ADABOOST MODEL

In [98]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

base_estimator = DecisionTreeClassifier(max_depth=5, random_state=2)

adaboost = AdaBoostClassifier(
    estimator=base_estimator, 
    n_estimators=185,         
    learning_rate=1.0,        
    random_state=2            
)

adaboost.fit(X_sm, y_sm)

preds = adaboost.predict(X_test)
proba = adaboost.predict_proba(X_test)



In [99]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score

tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
accuracy = (tn + tp) / (tn+tp+fn+fp)
balanced_accuracy = (sensitivity + specificity) / 2
precision = tp / (tp + fp)
f1 = 2 * (precision * sensitivity) / (precision + sensitivity)
roc_auc = roc_auc_score(y_test, proba[:, 1]) 


print(f"sensitivity: {sensitivity}, specificity: {specificity}, precision: {precision}, f1: {f1} accuracy: {accuracy}, roc_auc: {roc_auc} \nbalanced_accuracy: {balanced_accuracy}")

sensitivity: 0.7692307692307693, specificity: 0.9196787148594378, precision: 0.8461538461538461, f1: 0.8058608058608059 accuracy: 0.8647959183673469, roc_auc: 0.9286657117982419 
balanced_accuracy: 0.8444547420451035


# VALIDATING ADABOOST MODEL

In [111]:
dp= pd.read_csv('val_data.csv')
dp

Unnamed: 0,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME
0,CCN1CCN(CC1)CC(=O)NC2=C(C=CC(=C2)C(F)(F)F)OCCO...,1
1,CC(=O)NC1=CC(=O)C2=C(N1)N=CC=C2,1
2,CN1[C@H]2C[C@H](N(C[C@H]2CO1)CC3=CC=C(C=C3)Cl)...,1
3,CCOC(=O)N1CCN(CC1)C2=C(C(=O)C2=O)N3CCN(CC3)C4=...,0
4,C1CN(CCN1C2=CC=CC=C2)C3=C(C=CC=N3)C(=O)NC4=CC=...,1
5,CN(CC1=C(C=CC2=C1OC(=O)C3=CC=CC=C23)O)C4CCCCC4,1
6,CC1=C2COC(CC2=C(C(=N1)N3CCOCC3)C#N)(C)C,0
7,COC1=CC=C(C=C1)C2C3=C(C=C(C=C3)O)OC4=C2C(=N)N(...,0
8,COC1=C(C(=C2CCC3=C(C2=C1)ON=C3)OC)OC,0
9,CN(C)C1=CC=C(C=C1)C(CCN2CCOCC2)C3=C(C=C(C=C3OC...,1


In [112]:
valdata = dp.copy()
colname = ["PUBCHEM_EXT_DATASOURCE_SMILES", "PUBCHEM_ACTIVITY_OUTCOME"]
valdata = valdata[colname]

valdata.drop_duplicates(inplace = True, ignore_index = True)
valdata.reset_index(inplace = True, drop = True)
valdata

Unnamed: 0,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME
0,CCN1CCN(CC1)CC(=O)NC2=C(C=CC(=C2)C(F)(F)F)OCCO...,1
1,CC(=O)NC1=CC(=O)C2=C(N1)N=CC=C2,1
2,CN1[C@H]2C[C@H](N(C[C@H]2CO1)CC3=CC=C(C=C3)Cl)...,1
3,CCOC(=O)N1CCN(CC1)C2=C(C(=O)C2=O)N3CCN(CC3)C4=...,0
4,C1CN(CCN1C2=CC=CC=C2)C3=C(C=CC=N3)C(=O)NC4=CC=...,1
5,CN(CC1=C(C=CC2=C1OC(=O)C3=CC=CC=C23)O)C4CCCCC4,1
6,CC1=C2COC(CC2=C(C(=N1)N3CCOCC3)C#N)(C)C,0
7,COC1=CC=C(C=C1)C2C3=C(C=C(C=C3)O)OC4=C2C(=N)N(...,0
8,COC1=C(C(=C2CCC3=C(C2=C1)ON=C3)OC)OC,0
9,CN(C)C1=CC=C(C=C1)C(CCN2CCOCC2)C3=C(C=C(C=C3OC...,1


In [113]:
smi_list = valdata["PUBCHEM_EXT_DATASOURCE_SMILES"].to_list()
molsv = [Chem.MolFromSmiles(smi) for smi in smi_list]

X_val = morgan_fps(molsv)
X_val




Unnamed: 0,morgan_0,morgan_1,morgan_2,morgan_3,morgan_4,morgan_5,morgan_6,morgan_7,morgan_8,morgan_9,...,morgan_2038,morgan_2039,morgan_2040,morgan_2041,morgan_2042,morgan_2043,morgan_2044,morgan_2045,morgan_2046,morgan_2047
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [114]:
y_val = valdata["PUBCHEM_ACTIVITY_OUTCOME"]
y_val

0     1
1     1
2     1
3     0
4     1
5     1
6     0
7     0
8     0
9     1
10    0
11    1
12    1
13    0
14    0
15    1
16    1
17    0
18    1
19    0
20    1
21    1
22    1
23    0
24    0
25    1
26    0
27    0
28    1
29    0
30    0
31    0
32    1
33    0
34    1
35    0
36    1
37    1
38    0
39    0
Name: PUBCHEM_ACTIVITY_OUTCOME, dtype: int64

In [115]:
y_valpred = adaboost.predict(X_val)
y_valproba = adaboost.predict_proba(X_val)

In [116]:
tn, fp, fn, tp = confusion_matrix(y_val, y_valpred).ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
accuracy = (tn + tp) / (tn+tp+fn+fp)
balanced_accuracy = (sensitivity + specificity) / 2
precision = tp / (tp + fp)
f1 = 2 * (precision * sensitivity) / (precision + sensitivity)
#roc_auc = roc_auc_score(y_val, y_valproba[:, 1]) # calculate ROC AUC

print(f"sensitivity: {sensitivity}, specificity: {specificity}, precision: {precision}, f1: {f1}, accuracy: {accuracy}, roc_auc: {roc_auc} \nbalanced_accuracy: {balanced_accuracy}")


sensitivity: 0.8, specificity: 0.9, precision: 0.8888888888888888, f1: 0.8421052631578948, accuracy: 0.85, roc_auc: 0.9038391327547953 
balanced_accuracy: 0.8500000000000001


# Y_RANDOMIZATION ON ADABOOST MODEL

In [117]:
tn, fp, fn, tp = confusion_matrix(y_val_randomized, y_valpred).ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
accuracy = (tn + tp) / (tn+tp+fn+fp)
balanced_accuracy = (sensitivity + specificity) / 2
precision = tp / (tp + fp)
f1 = 2 * (precision * sensitivity) / (precision + sensitivity)
#roc_auc = roc_auc_score(y_val, y_valproba[:, 1]) # calculate ROC AUC

print(f"sensitivity: {sensitivity}, specificity: {specificity}, precision: {precision}, f1: {f1}, accuracy: {accuracy}, roc_auc: {roc_auc} \nbalanced_accuracy: {balanced_accuracy}")


sensitivity: 0.35, specificity: 0.45, precision: 0.3888888888888889, f1: 0.36842105263157887, accuracy: 0.4, roc_auc: 0.9038391327547953 
balanced_accuracy: 0.4


# BUILDING SUPPORT VECTOR MACHINE

In [100]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report

# Define the scaler
scaler = StandardScaler()

# Fit on training set only
scaler.fit(X_sm)

# Apply transform to both the training set and the test set
X_sm= scaler.transform(X_sm)
X_test = scaler.transform(X_test)  # Make sure X_test is defined and appropriate

# Define the model
svm = SVC(kernel='poly', degree=3, coef0=1.0, probability=True)

# Train the model
svm.fit(X_sm, y_sm)

# Now you can make predictions on your test set and evaluate the model

preds = svm.predict(X_test)
proba = svm.predict_proba(X_test)

In [101]:
tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
accuracy = (tn + tp) / (tn+tp+fn+fp)
balanced_accuracy = (sensitivity + specificity) / 2
precision = tp / (tp + fp)
f1 = 2 * (precision * sensitivity) / (precision + sensitivity)
roc_auc = roc_auc_score(y_test, proba[:, 1]) 


print(f"sensitivity: {sensitivity}, specificity: {specificity}, precision: {precision}, f1: {f1} accuracy: {accuracy}, roc_auc: {roc_auc} \nbalanced_accuracy: {balanced_accuracy}")

sensitivity: 0.8601398601398601, specificity: 0.8072289156626506, precision: 0.7192982456140351, f1: 0.7834394904458599 accuracy: 0.826530612244898, roc_auc: 0.9038391327547953 
balanced_accuracy: 0.8336843879012554


# VALIDATING SUPPORT VECTOR MACHINE MODEL

In [118]:
dp= pd.read_csv('val_data.csv')
dp

Unnamed: 0,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME
0,CCN1CCN(CC1)CC(=O)NC2=C(C=CC(=C2)C(F)(F)F)OCCO...,1
1,CC(=O)NC1=CC(=O)C2=C(N1)N=CC=C2,1
2,CN1[C@H]2C[C@H](N(C[C@H]2CO1)CC3=CC=C(C=C3)Cl)...,1
3,CCOC(=O)N1CCN(CC1)C2=C(C(=O)C2=O)N3CCN(CC3)C4=...,0
4,C1CN(CCN1C2=CC=CC=C2)C3=C(C=CC=N3)C(=O)NC4=CC=...,1
5,CN(CC1=C(C=CC2=C1OC(=O)C3=CC=CC=C23)O)C4CCCCC4,1
6,CC1=C2COC(CC2=C(C(=N1)N3CCOCC3)C#N)(C)C,0
7,COC1=CC=C(C=C1)C2C3=C(C=C(C=C3)O)OC4=C2C(=N)N(...,0
8,COC1=C(C(=C2CCC3=C(C2=C1)ON=C3)OC)OC,0
9,CN(C)C1=CC=C(C=C1)C(CCN2CCOCC2)C3=C(C=C(C=C3OC...,1


In [119]:
valdata = dp.copy()
colname = ["PUBCHEM_EXT_DATASOURCE_SMILES", "PUBCHEM_ACTIVITY_OUTCOME"]
valdata = valdata[colname]

valdata.drop_duplicates(inplace = True, ignore_index = True)
valdata.reset_index(inplace = True, drop = True)
valdata

Unnamed: 0,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME
0,CCN1CCN(CC1)CC(=O)NC2=C(C=CC(=C2)C(F)(F)F)OCCO...,1
1,CC(=O)NC1=CC(=O)C2=C(N1)N=CC=C2,1
2,CN1[C@H]2C[C@H](N(C[C@H]2CO1)CC3=CC=C(C=C3)Cl)...,1
3,CCOC(=O)N1CCN(CC1)C2=C(C(=O)C2=O)N3CCN(CC3)C4=...,0
4,C1CN(CCN1C2=CC=CC=C2)C3=C(C=CC=N3)C(=O)NC4=CC=...,1
5,CN(CC1=C(C=CC2=C1OC(=O)C3=CC=CC=C23)O)C4CCCCC4,1
6,CC1=C2COC(CC2=C(C(=N1)N3CCOCC3)C#N)(C)C,0
7,COC1=CC=C(C=C1)C2C3=C(C=C(C=C3)O)OC4=C2C(=N)N(...,0
8,COC1=C(C(=C2CCC3=C(C2=C1)ON=C3)OC)OC,0
9,CN(C)C1=CC=C(C=C1)C(CCN2CCOCC2)C3=C(C=C(C=C3OC...,1


In [120]:
smi_list = valdata["PUBCHEM_EXT_DATASOURCE_SMILES"].to_list()
molsv = [Chem.MolFromSmiles(smi) for smi in smi_list]

X_val = morgan_fps(molsv)
X_val



Unnamed: 0,morgan_0,morgan_1,morgan_2,morgan_3,morgan_4,morgan_5,morgan_6,morgan_7,morgan_8,morgan_9,...,morgan_2038,morgan_2039,morgan_2040,morgan_2041,morgan_2042,morgan_2043,morgan_2044,morgan_2045,morgan_2046,morgan_2047
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [121]:
y_val = valdata["PUBCHEM_ACTIVITY_OUTCOME"]
y_val

0     1
1     1
2     1
3     0
4     1
5     1
6     0
7     0
8     0
9     1
10    0
11    1
12    1
13    0
14    0
15    1
16    1
17    0
18    1
19    0
20    1
21    1
22    1
23    0
24    0
25    1
26    0
27    0
28    1
29    0
30    0
31    0
32    1
33    0
34    1
35    0
36    1
37    1
38    0
39    0
Name: PUBCHEM_ACTIVITY_OUTCOME, dtype: int64

In [122]:
y_valpred = svm.predict(X_val)
y_valproba = svm.predict_proba(X_val)



In [123]:
tn, fp, fn, tp = confusion_matrix(y_val, y_valpred).ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
accuracy = (tn + tp) / (tn+tp+fn+fp)
balanced_accuracy = (sensitivity + specificity) / 2
precision = tp / (tp + fp)
f1 = 2 * (precision * sensitivity) / (precision + sensitivity)
#roc_auc = roc_auc_score(y_val, y_valproba[:, 1]) # calculate ROC AUC

print(f"sensitivity: {sensitivity}, specificity: {specificity}, precision: {precision}, f1: {f1}, accuracy: {accuracy}, roc_auc: {roc_auc} \nbalanced_accuracy: {balanced_accuracy}")


sensitivity: 0.35, specificity: 1.0, precision: 1.0, f1: 0.5185185185185185, accuracy: 0.675, roc_auc: 0.9038391327547953 
balanced_accuracy: 0.675
