In [1]:
import pandas as pd
import numpy as np

# dataset.csv stores its saved row index as an unnamed first column, which a plain
# read_csv loads as a data column called "Unnamed: 0". Reading it as the index keeps
# the frame down to the SMILES and activity-outcome columns used below.
df = pd.read_csv('dataset.csv', index_col=0)
df

Unnamed: 0,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME
0,COC(=O)C1=CC=C(C=C1)COC(=O)C2=CC=CC=N2,0
1,CCCCN1C2=C(C(=O)NC1=O)N(C(=N2)CN(C)CC3=C(C(=CC...,1
2,CC(CCC1=CC=CC=C1)NC(=O)C2=CC=C(O2)Br,0
3,COC1=CC(=C(C=C1)OC)NC(=O)CN2C(=O)C3=CC=CC=C3S2...,0
4,CCOC(=O)C1=CC=C(C=C1)N2C(=O)CC(C2=O)N3CCCC(C3)C,0
...,...,...
1955,COC1=CC=C(C=C1)NC(=S)NC(=O)C2=CC(=CN=C2)Br,0
1956,CCCCOC1=CC=CC(=C1)C(=O)NNC(=S)NC2CCCCC2,0
1957,COC1=C(C=C(C=C1)C2=NOC(=N2)CCCC(=O)NC3=CC=C(C=...,0
1958,CC1=CC=C(C=C1)N2C(=O)C3=CC(=C(C=C3N(C2=O)CC(=O...,0


In [2]:
from rdkit import Chem
from rdkit.Chem import AllChem

# Take the SMILES column as a plain Python list, then parse each string into an
# RDKit molecule object (order is preserved, so mols[i] belongs to row i of df).
smiles_list = df['PUBCHEM_EXT_DATASOURCE_SMILES'].to_list()
mols = list(map(Chem.MolFromSmiles, smiles_list))

In [3]:
def morgan_fps(data, radius=2, n_bits=2048):
    """Build a table of Morgan fingerprint bits, one row per molecule.

    Parameters
    ----------
    data : iterable
        RDKit molecule objects to fingerprint, in the order the rows should appear.
    radius : int, default 2
        Morgan radius passed to RDKit.
    n_bits : int, default 2048
        Length of each bit vector, and therefore the number of columns.

    Returns
    -------
    pandas.DataFrame
        One row per molecule with columns ``morgan_0`` ... ``morgan_{n_bits - 1}``.
        An empty ``data`` yields an empty frame that still has all the columns.

    Raises
    ------
    ValueError
        If any entry of ``data`` is ``None`` (which is what ``Chem.MolFromSmiles``
        hands back for a SMILES string it cannot parse).
    """
    mol_list = list(data)  # materialise once so generators can be validated and reused

    # Fail early with the offending positions instead of a low-level RDKit
    # argument error raised from inside the fingerprint call.
    missing = [idx for idx, mol in enumerate(mol_list) if mol is None]
    if missing:
        raise ValueError(
            f"Cannot fingerprint {len(missing)} molecule(s) that are None "
            f"(positions: {missing[:10]}{'...' if len(missing) > 10 else ''})"
        )

    # NOTE(review): recent RDKit releases flag GetMorganFingerprintAsBitVect as
    # deprecated in favour of rdFingerprintGenerator - consider migrating.
    fps = [
        AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits)
        for mol in mol_list
    ]
    fp_array = [np.array(fp) for fp in fps]
    # Column count is tied to n_bits so the names can never drift from the vector length.
    column_names = [f"morgan_{i}" for i in range(n_bits)]
    return pd.DataFrame(fp_array, columns=column_names)

# Feature matrix for the training data: one fingerprint row per parsed molecule.
X = morgan_fps(data=mols)
X





KeyboardInterrupt: 

In [None]:
# Target vector: the PUBCHEM_ACTIVITY_OUTCOME column of the training table.
y = df.loc[:, "PUBCHEM_ACTIVITY_OUTCOME"]
y

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows as a test split; the fixed random_state keeps the
# partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=18,
)
X_train


In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class of the training split only (the test split is
# left untouched); the seed fixes which synthetic samples are generated.
smote = SMOTE(random_state=9, sampling_strategy='minority')
X_sm, y_sm = smote.fit_resample(X_train, y_train)


In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Weak learner boosted by AdaBoost: a depth-limited decision tree.
base_estimator = DecisionTreeClassifier(random_state=2, max_depth=5)

# Boosted ensemble trained on the SMOTE-resampled training data.
adaboost = AdaBoostClassifier(
    estimator=base_estimator, n_estimators=185, learning_rate=1.0, random_state=2
)
adaboost.fit(X_sm, y_sm)

# Class probabilities and hard labels for the held-out test split.
proba = adaboost.predict_proba(X_test)
preds = adaboost.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score

# Test-split metrics derived from the confusion matrix.
# labels=[0, 1] pins the matrix to 2x2 in (tn, fp, fn, tp) order, so the unpacking
# cannot fail when one class is missing from both y_test and preds.
tn, fp, fn, tp = confusion_matrix(y_test, preds, labels=[0, 1]).ravel()
sensitivity = tp / (tp + fn)  # recall on the positive (1) class
specificity = tn / (tn + fp)  # recall on the negative (0) class
accuracy = (tn + tp) / (tn + tp + fn + fp)
balanced_accuracy = (sensitivity + specificity) / 2
precision = tp / (tp + fp)
f1 = 2 * (precision * sensitivity) / (precision + sensitivity)
# ROC AUC is ranked on the predicted probability of class 1, not on the hard labels.
roc_auc = roc_auc_score(y_test, proba[:, 1])

# The separator after f1 was missing, which ran "f1" and "accuracy" together.
print(f"sensitivity: {sensitivity}, specificity: {specificity}, precision: {precision}, f1: {f1}, accuracy: {accuracy}, roc_auc: {roc_auc} \nbalanced_accuracy: {balanced_accuracy}")

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
import numpy as np

# Hyper-parameter search for the boosted trees.
#
# Searching on X_sm / y_sm leaks information: SMOTE has already interpolated
# between all training samples, so every CV "validation" fold contains synthetic
# points built from samples in the training folds, which inflates the CV score.
# Putting SMOTE inside an imblearn Pipeline makes it run only while fitting, on the
# training folds of each split, while the held-out fold stays real, unresampled data.
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# Keys are prefixed with the pipeline step name ("adaboost__"), so best_params_
# reports them with that prefix as well.
param_grid = {
    'adaboost__n_estimators': np.linspace(50, 300, num=5, dtype=int).tolist(),  # Range of values
    'adaboost__learning_rate': np.linspace(0.1, 2.0, num=5).tolist(),  # Range of learning rates
    'adaboost__estimator__max_depth': [3, 5, 7, 10],  # Depth of base estimator
}

base_estimator = DecisionTreeClassifier(random_state=2)

resample_then_boost = Pipeline(steps=[
    ('smote', SMOTE(sampling_strategy='minority', random_state=9)),
    ('adaboost', AdaBoostClassifier(estimator=base_estimator, random_state=2)),
])

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=2)

grid_search = GridSearchCV(
    estimator=resample_then_boost,
    param_grid=param_grid,
    # The held-out folds are now imbalanced, where plain accuracy rewards always
    # predicting the majority class; balanced accuracy weights both classes equally.
    scoring='balanced_accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

# Fit on the original training split; resampling happens inside each CV fold.
grid_search.fit(X_train, y_train)

print("Best parameters found using GridSearchCV:")
print(grid_search.best_params_)
print()

print("Best balanced accuracy achieved:")
print(grid_search.best_score_)


In [None]:
import pandas as pd
import numpy as np
# Read val_data.csv into its own frame, kept separate from the training table.
dp = pd.read_csv(filepath_or_buffer='val_data.csv')
dp

In [None]:
# Keep only the SMILES and outcome columns, then drop exact duplicate rows.
# Column selection and drop_duplicates both return new frames, so dp itself is
# never modified; ignore_index=True renumbers the surviving rows 0..n-1.
colname = ["PUBCHEM_EXT_DATASOURCE_SMILES", "PUBCHEM_ACTIVITY_OUTCOME"]
valdata = dp[colname].drop_duplicates(ignore_index=True)
valdata

In [None]:
# Parse the validation SMILES into RDKit molecules (same order as valdata rows).
smi_list = valdata["PUBCHEM_EXT_DATASOURCE_SMILES"].tolist()
molsv = list(map(Chem.MolFromSmiles, smi_list))

# Fingerprint them with the same function used for the training features.
X_val = morgan_fps(data=molsv)
X_val


In [None]:
# X_val is already built by the preceding cell from the same valdata column;
# re-parsing every SMILES and re-fingerprinting here would only repeat that work.
# Check instead that the validation features line up with the validation table
# and with the feature columns the model was trained on.
assert len(X_val) == len(valdata), "X_val must have one row per validation molecule"
assert list(X_val.columns) == list(X.columns), "validation and training features must share the same columns"
X_val

In [None]:
# True labels for the validation table, row-aligned with X_val.
y_val = valdata.loc[:, "PUBCHEM_ACTIVITY_OUTCOME"]
y_val

In [None]:
# Score the validation fingerprints with the fitted AdaBoost model:
# per-class probabilities first, then the hard class labels.
y_valproba = adaboost.predict_proba(X_val)
y_valpred = adaboost.predict(X_val)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score

# Validation-set metrics derived from the confusion matrix.
# labels=[0, 1] pins the matrix to 2x2 in (tn, fp, fn, tp) order, so the unpacking
# cannot fail when one class is missing from both y_val and y_valpred.
tn, fp, fn, tp = confusion_matrix(y_val, y_valpred, labels=[0, 1]).ravel()
sensitivity = tp / (tp + fn)  # recall on the positive (1) class
specificity = tn / (tn + fp)  # recall on the negative (0) class
accuracy = (tn + tp) / (tn + tp + fn + fp)
balanced_accuracy = (sensitivity + specificity) / 2
precision = tp / (tp + fp)
f1 = 2 * (precision * sensitivity) / (precision + sensitivity)
# Compute ROC AUC from the validation probabilities. With this line disabled, the
# print below silently reported the roc_auc left over from the test-split cell.
roc_auc = roc_auc_score(y_val, y_valproba[:, 1])

print(f"sensitivity: {sensitivity}, specificity: {specificity}, precision: {precision}, f1: {f1}, accuracy: {accuracy}, roc_auc: {roc_auc} \nbalanced_accuracy: {balanced_accuracy}")


In [None]:
# Y-randomization control: shuffle the validation labels so they no longer belong
# to their molecules. A seeded generator makes the shuffle (and every metric
# computed from it) identical on each run; the unseeded global RNG did not.
rng = np.random.default_rng(2)
y_val_randomized = rng.permutation(np.asarray(y_val))
y_val_randomized

In [None]:
# Hard class labels for the validation fingerprints (same call as the earlier
# validation-prediction cell; the model itself is unchanged by the label shuffle).
y_valpred = adaboost.predict(X=X_val)

In [None]:
# The first import had a stray statement pasted into the middle of "sklearn",
# which made the whole cell a SyntaxError.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score

# Metrics of the unchanged predictions against the shuffled labels (y-randomization).
# labels=[0, 1] pins the matrix to 2x2 in (tn, fp, fn, tp) order.
tn, fp, fn, tp = confusion_matrix(y_val_randomized, y_valpred, labels=[0, 1]).ravel()
sensitivity = tp / (tp + fn)  # recall on the positive (1) class
specificity = tn / (tn + fp)  # recall on the negative (0) class
accuracy = (tn + tp) / (tn + tp + fn + fp)
balanced_accuracy = (sensitivity + specificity) / 2
precision = tp / (tp + fp)
f1 = 2 * (precision * sensitivity) / (precision + sensitivity)
# Score the probabilities against the *shuffled* labels. Without this line the
# print below reused the roc_auc left in the kernel by an earlier cell.
roc_auc = roc_auc_score(y_val_randomized, y_valproba[:, 1])

print(f"sensitivity: {sensitivity}, specificity: {specificity}, precision: {precision}, f1: {f1}, accuracy: {accuracy}, roc_auc: {roc_auc} \nbalanced_accuracy: {balanced_accuracy}")


In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score

# Metrics of the unchanged predictions against the shuffled labels (y-randomization).
# labels=[0, 1] pins the matrix to 2x2 in (tn, fp, fn, tp) order.
tn, fp, fn, tp = confusion_matrix(y_val_randomized, y_valpred, labels=[0, 1]).ravel()
sensitivity = tp / (tp + fn)  # recall on the positive (1) class
specificity = tn / (tn + fp)  # recall on the negative (0) class
accuracy = (tn + tp) / (tn + tp + fn + fp)
balanced_accuracy = (sensitivity + specificity) / 2
precision = tp / (tp + fp)
f1 = 2 * (precision * sensitivity) / (precision + sensitivity)
# Score the probabilities against the *shuffled* labels. Without this line the
# print below reused the roc_auc left in the kernel by an earlier cell.
roc_auc = roc_auc_score(y_val_randomized, y_valproba[:, 1])

print(f"sensitivity: {sensitivity}, specificity: {specificity}, precision: {precision}, f1: {f1}, accuracy: {accuracy}, roc_auc: {roc_auc} \nbalanced_accuracy: {balanced_accuracy}")

