In [None]:
import pandas as pd
import numpy as np

# Read the assay table from disk. Later cells take the SMILES column
# (features) and the activity-outcome column (labels) from this frame.
df = pd.read_csv(filepath_or_buffer='dataset.csv')
df

In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem

# Extract the SMILES strings as a plain list, then parse each one into an
# RDKit molecule object, keeping the same order as the rows of `df`.
# NOTE(review): MolFromSmiles presumably yields None for unparsable SMILES --
# confirm the list contains no None before it reaches the fingerprint step.
smiles_list = df['PUBCHEM_EXT_DATASOURCE_SMILES'].to_list()
mols = list(map(Chem.MolFromSmiles, smiles_list))

In [None]:
def morgan_fps(data, radius=2, n_bits=2048):
    """Build a table of Morgan fingerprint bits, one row per molecule.

    Parameters
    ----------
    data : iterable
        Molecule objects accepted by
        ``AllChem.GetMorganFingerprintAsBitVect``. Entries must not be None.
    radius : int, default 2
        Morgan radius passed through to RDKit.
    n_bits : int, default 2048
        Length of each bit vector, and therefore the number of columns.

    Returns
    -------
    pandas.DataFrame
        One row per input molecule (in input order) and ``n_bits`` columns
        named ``morgan_0`` ... ``morgan_{n_bits - 1}``. An empty input gives
        an empty frame that still carries all the column names.

    Raises
    ------
    ValueError
        If any entry of ``data`` is None, reporting the offending positions.
    """
    mols = list(data)

    # Fail early with the offending positions instead of letting RDKit raise
    # an opaque argument error halfway through the batch.
    missing = [idx for idx, mol in enumerate(mols) if mol is None]
    if missing:
        raise ValueError(
            f"morgan_fps received None instead of a molecule at positions {missing}; "
            "drop or fix the corresponding SMILES first"
        )

    column_names = [f"morgan_{i}" for i in range(n_bits)]
    # NOTE(review): newer RDKit releases reportedly deprecate
    # GetMorganFingerprintAsBitVect in favour of rdFingerprintGenerator -- confirm
    # against the installed version before migrating.
    fp_rows = [
        np.array(AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits))
        for mol in mols
    ]
    return pd.DataFrame(fp_rows, columns=column_names)

# Fingerprint every parsed molecule; X is the feature table that the
# train/test split below consumes.
X = morgan_fps(mols)
X

In [None]:
# Target labels: the activity-outcome column, row-aligned with `df`.
y = df.loc[:, "PUBCHEM_ACTIVITY_OUTCOME"]
y

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
# The split is stratified on y so the train and test partitions keep the
# same class proportions; the classes are treated as imbalanced later on
# (SMOTE oversamples the minority class), so an unstratified split could
# leave the test set with too few minority examples.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=18,
    stratify=y,
)
X_train


In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class of the training split only; the test split
# is not passed through SMOTE.
smote = SMOTE(random_state=9, sampling_strategy='minority')
X_sm, y_sm = smote.fit_resample(X_train, y_train)


In [None]:
from catboost import CatBoostClassifier

# Standalone CatBoost classifier trained on the SMOTE-resampled training data.
catboost = CatBoostClassifier(
    loss_function='Logloss',
    iterations=185,
    learning_rate=0.1,
    depth=5,
    random_seed=2,
    verbose=0,  # silence per-iteration training output
)
catboost.fit(X_sm, y_sm)

# Class probabilities and hard labels for the held-out test split.
proba = catboost.predict_proba(X_test)
preds = catboost.predict(X_test)

In [None]:
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score

# Test-set metrics for the standalone CatBoost model.
# For a binary problem scikit-learn's confusion matrix flattens to
# (tn, fp, fn, tp), which is what the unpacking below relies on.
tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()

sensitivity = tp / (tp + fn)          # recall of the positive class
specificity = tn / (tn + fp)          # recall of the negative class
precision = tp / (tp + fp)
accuracy = (tn + tp) / (tn + tp + fn + fp)
balanced_accuracy = (sensitivity + specificity) / 2
f1 = 2 * (precision * sensitivity) / (precision + sensitivity)
# ROC AUC uses the probability in the second column of predict_proba.
roc_auc = roc_auc_score(y_test, proba[:, 1])

# The separator after f1 was missing; it now matches the other report cells.
print(
    f"sensitivity: {sensitivity}, specificity: {specificity}, precision: {precision}, "
    f"f1: {f1}, accuracy: {accuracy}, roc_auc: {roc_auc} \n"
    f"balanced_accuracy: {balanced_accuracy}"
)

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
import numpy as np

# Hyper-parameter grid. The "model__" prefix routes each entry to the
# CatBoost step of the pipeline defined below.
param_grid = {
    'model__iterations': np.linspace(50, 300, num=5, dtype=int).tolist(),  # Range of iterations
    'model__depth': [3, 5, 7, 10],  # Range of tree depths
    'model__learning_rate': np.linspace(0.01, 0.3, num=5).tolist()  # Range of learning rates
}

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=2)

# SMOTE has to run inside cross-validation. Resampling first and then
# splitting (the previous X_sm / y_sm input) lets synthetic points derived
# from a validation fold end up in the matching training folds, which
# inflates the CV score. With an imblearn Pipeline the sampler is fitted on
# each training fold only and the validation fold is scored untouched.
search_pipeline = Pipeline([
    ("Smote", SMOTE(sampling_strategy='minority', random_state=9)),
    ("model", CatBoostClassifier(random_seed=2, verbose=0, loss_function='Logloss')),
])

grid_search = GridSearchCV(
    estimator=search_pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=1
)

# Fit on the original (un-resampled) training split; SMOTE is applied per fold.
grid_search.fit(X_train, y_train)

print("Best parameters found using GridSearchCV:")
# Strip the "model__" routing prefix so the printed names are plain CatBoost arguments.
print({name.split("__", 1)[-1]: value for name, value in grid_search.best_params_.items()})
print()

print("Best accuracy achieved:")
print(grid_search.best_score_)


In [None]:
import pandas as pd
import numpy as np
# Read the external validation table; the next cell reduces it to the
# SMILES and activity-outcome columns.
dp = pd.read_csv(filepath_or_buffer='val_data.csv')
dp

In [None]:
# Keep only the SMILES and label columns, drop exact duplicate rows and
# renumber the index from zero. `dp` itself is left unmodified.
colname = ["PUBCHEM_EXT_DATASOURCE_SMILES", "PUBCHEM_ACTIVITY_OUTCOME"]
valdata = (
    dp[colname]
    .drop_duplicates(ignore_index=True)
    .reset_index(drop=True)
)
valdata

In [None]:
# Same featurisation as the training data: SMILES -> RDKit molecule -> Morgan bits.
smi_list = valdata["PUBCHEM_EXT_DATASOURCE_SMILES"].tolist()
molsv = list(map(Chem.MolFromSmiles, smi_list))

X_val = morgan_fps(molsv)
X_val

In [None]:
# Validation labels, row-aligned with X_val.
y_val = valdata.loc[:, "PUBCHEM_ACTIVITY_OUTCOME"]
y_val

In [None]:
# Score the validation set with the standalone CatBoost model:
# class probabilities first, then hard labels.
y_valproba = catboost.predict_proba(X_val)
y_valpred = catboost.predict(X_val)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score

# Validation-set metrics for the standalone CatBoost model.
# For a binary problem scikit-learn's confusion matrix flattens to
# (tn, fp, fn, tp), which is what the unpacking below relies on.
tn, fp, fn, tp = confusion_matrix(y_val, y_valpred).ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
accuracy = (tn + tp) / (tn+tp+fn+fp)
balanced_accuracy = (sensitivity + specificity) / 2
precision = tp / (tp + fp)
f1 = 2 * (precision * sensitivity) / (precision + sensitivity)
# Recompute ROC AUC on the validation data. With this line commented out the
# print below reported the value left over from the test-set cell.
roc_auc = roc_auc_score(y_val, y_valproba[:, 1])

print(f"sensitivity: {sensitivity}, specificity: {specificity}, precision: {precision}, f1: {f1}, accuracy: {accuracy}, roc_auc: {roc_auc} \nbalanced_accuracy: {balanced_accuracy}")


In [None]:
# Shuffle the validation labels to break the link between features and
# labels. The shuffle is seeded so the cell gives the same permutation on
# every run, like the other seeded steps in this notebook.
y_val_randomized = np.random.default_rng(18).permutation(y_val.to_numpy())
y_val_randomized

In [None]:
# Hard-label predictions of the standalone CatBoost model on the validation
# features; the next cell compares them against the shuffled labels.
y_valpred = catboost.predict(X_val)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score

# Metrics of the model's validation predictions against the SHUFFLED labels.
# For a binary problem scikit-learn's confusion matrix flattens to
# (tn, fp, fn, tp), which is what the unpacking below relies on.
tn, fp, fn, tp = confusion_matrix(y_val_randomized, y_valpred).ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
accuracy = (tn + tp) / (tn+tp+fn+fp)
balanced_accuracy = (sensitivity + specificity) / 2
precision = tp / (tp + fp)
f1 = 2 * (precision * sensitivity) / (precision + sensitivity)
# Compute ROC AUC against the shuffled labels as well; otherwise the print
# below would repeat a value computed in an earlier cell.
roc_auc = roc_auc_score(y_val_randomized, y_valproba[:, 1])

print(f"sensitivity: {sensitivity}, specificity: {specificity}, precision: {precision}, f1: {f1}, accuracy: {accuracy}, roc_auc: {roc_auc} \nbalanced_accuracy: {balanced_accuracy}")


In [None]:
from imblearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
from imblearn.over_sampling import SMOTE
from catboost import CatBoostClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.spatial import distance
import numpy as np

# SMOTE + CatBoost bundled as a single estimator, so resampling happens as
# part of fit() on whatever training data the pipeline is given.
# The hyper-parameters mirror the standalone `catboost` model trained earlier
# in the notebook. learning_rate and loss_function are spelled out so both
# models are configured identically; without learning_rate CatBoost would
# pick its own value.
catboost_pipeline = Pipeline([
    ("Smote", SMOTE(sampling_strategy='minority', random_state=9)),
    ("model", CatBoostClassifier(
        iterations=185,
        depth=5,
        learning_rate=0.1,
        random_seed=2,
        verbose=0,
        loss_function='Logloss',
    )),
])


In [None]:
# Fit the SMOTE + CatBoost pipeline on the original (un-resampled) training split.
catboost_pipeline.fit(X_train,y_train)

In [None]:
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score

# Test-set metrics for the fitted pipeline. Predict with the pipeline here;
# the existing `preds` variable still holds the standalone model's output at
# this point, so scoring it would not evaluate the pipeline at all.
pipeline_preds = catboost_pipeline.predict(X_test)

# For a binary problem scikit-learn's confusion matrix flattens to
# (tn, fp, fn, tp), which is what the unpacking below relies on.
tn, fp, fn, tp = confusion_matrix(y_test, pipeline_preds).ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
accuracy = (tn + tp) / (tn+tp+fn+fp)
balanced_accuracy = (sensitivity + specificity) / 2
precision = tp / (tp + fp)
f1 = 2 * (precision * sensitivity) / (precision + sensitivity)

print(f"sensitivity: {sensitivity}, specificity: {specificity}, precision: {precision}, f1: {f1}, accuracy: {accuracy}, \nbalanced_accuracy: {balanced_accuracy}")


In [None]:
import pickle

# Serialise the fitted pipeline and write the bytes to disk.
with open('catboost_pipeline.pkl', 'wb') as file:
    file.write(pickle.dumps(catboost_pipeline))

In [None]:
import pickle

# Read the pickled pipeline back from disk and rebind `catboost_pipeline`.
# Unpickling executes code embedded in the file, so only load pickles that
# were produced by this notebook or another trusted source.
with open('catboost_pipeline.pkl', 'rb') as file:
    catboost_pipeline = pickle.loads(file.read())


In [None]:
# Score the held-out test split with the reloaded pipeline:
# class probabilities and hard class labels.
proba = catboost_pipeline.predict_proba(X_test)
preds = catboost_pipeline.predict(X_test)

print("Predictions:", preds)
print("Probabilities:", proba)
