In [None]:
# Colab-only: mount Google Drive at /content/drive so the dataset paths used
# by the loading cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2025.9.3-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.2 kB)
Downloading rdkit-2025.9.3-cp312-cp312-manylinux_2_28_x86_64.whl (36.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.4/36.4 MB[0m [31m52.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2025.9.3


# Importing and Data Preprocessing

In [None]:
import pandas as pd
import numpy as np

from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

from scipy.stats import randint as sp_randint

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdFingerprintGenerator
from rdkit.DataStructs import BitVectToText

In [None]:
# Compound table (CSV). Later cells read its 'Smiles', 'QED', 'Docking Score'
# and 'SAScore' columns.
file_path = '/content/drive/My Drive/Datasets/AKT1_smiles_3.csv' ##Replace with your own paths
df = pd.read_csv(file_path)

In [None]:
# ChEMBL export (Excel). In the cells visible here only its 'pChEMBL Value'
# column is used.
# NOTE(review): that column is later copied into `df` by index, so this sheet
# presumably has the same row order as the CSV — confirm.
file_path = '/content/drive/My Drive/Datasets/AKT1 CHEMBL.xlsx' ##Replace with your own paths
df1 = pd.read_excel(file_path)


In [None]:
def label_fromIC50(string, threshold=6.0):
  """Turn a numeric potency value into a binary activity label.

  Despite the parameter name, the argument is compared numerically; in this
  notebook it is fed the 'pChEMBL Value' column.

  Args:
    string: Numeric potency value, or a missing value (None / NaN / pd.NA).
    threshold: Cut-off; values strictly greater than it are labelled active.
      Defaults to 6.0, the cut-off originally hard-coded here.

  Returns:
    1.0 if the value is strictly above `threshold`, 0.0 if it is not, and
    NaN if the value is missing.
  """
  # A missing measurement must stay missing: NaN > threshold is False, which
  # would otherwise silently label the compound as inactive.
  if pd.isna(string):
    return float('nan')
  if string > threshold:
    return 1.0
  return 0.0

In [None]:
# Copy the pChEMBL values across from the ChEMBL sheet (pandas aligns them on
# the row index), derive the binary label from them, then discard every row
# that still contains a missing value in any column.
df = (
    df
    .assign(IC50=df1['pChEMBL Value'])
    .assign(Labels=lambda frame: frame['IC50'].apply(label_fromIC50))
    .dropna()
)

# Independent copies of the descriptor columns and of the target column.
descriptor_columns = ['QED', 'Docking Score', 'SAScore']
df_features = df.loc[:, descriptor_columns].copy()
df_labels = df.loc[:, 'Labels'].copy()

# Morgan Fingerprints

In [None]:
def smiles_to_morgan_fingerprint(smiles, radius=2, nBits=2048):
    """Compute a Morgan fingerprint for a SMILES string as a text bit string.

    Uses the `rdFingerprintGenerator` API instead of
    `AllChem.GetMorganFingerprintAsBitVect`, which is deprecated in current
    RDKit releases and emits a warning on every call.

    Args:
        smiles: SMILES string describing the molecule.
        radius: Morgan fingerprint radius.
        nBits: Length of the folded fingerprint.

    Returns:
        The fingerprint rendered by `BitVectToText` (one character per bit),
        or None when the SMILES cannot be parsed or fingerprinting fails.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # RDKit signals an unparsable SMILES by returning None.
            return None
        generator = rdFingerprintGenerator.GetMorganGenerator(
            radius=radius,
            fpSize=nBits
        )
        fingerprint = generator.GetFingerprint(mol)
        return BitVectToText(fingerprint)

    except Exception:
        # Deliberate best-effort: a single bad molecule yields None so the
        # caller can drop that row instead of aborting the whole column.
        return None

In [None]:
# Fingerprint every compound; SMILES that cannot be processed come back as None.
df['Morgan_FP_2048'] = df['Smiles'].astype(str).apply(smiles_to_morgan_fingerprint)

# Keep only the rows for which a fingerprint was produced.
df_clean = df.dropna(subset=['Morgan_FP_2048'])

# Expand each text bit string into a row of 0/1 integers.
fingerprint_matrix = np.array(
    [list(map(int, fp)) for fp in df_clean['Morgan_FP_2048']]
)

print("\n### Fingerprint Matrix for ML (NumPy Array) ###")
print(f"Shape of FP Matrix: {fingerprint_matrix.shape}")

# Surface silently dropped compounds instead of hiding them.
n_dropped = len(df) - len(df_clean)
if n_dropped:
    print(f"Compounds dropped because no fingerprint could be generated: {n_dropped}")

# Indexing row 0 of an empty matrix would raise, so guard the preview.
if len(fingerprint_matrix):
    print(f"First 10 bits of the first valid compound:\n{fingerprint_matrix[0, :10]}")
else:
    print("No valid fingerprints were generated.")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m



### Fingerprint Matrix for ML (NumPy Array) ###
Shape of FP Matrix: (5923, 2048)
First 10 bits of the first valid compound (Ethanol):
[0 0 0 0 0 0 0 0 0 0]


# Model Training

In [None]:
# Take descriptors and labels from df_clean — the exact rows the fingerprint
# matrix was built from — so every fingerprint row stays paired with its own
# descriptors and label even when some SMILES failed to produce a fingerprint.
feature_columns = ['QED', 'Docking Score', 'SAScore']
X_features = df_clean[feature_columns].to_numpy()
y = df_clean['Labels'].to_numpy()

# Fail early with a clear message rather than inside np.hstack.
assert X_features.shape[0] == fingerprint_matrix.shape[0] == y.shape[0], (
    "Descriptor, fingerprint and label row counts differ: "
    f"{X_features.shape[0]}, {fingerprint_matrix.shape[0]}, {y.shape[0]}"
)

# Final design matrix: 3 descriptor columns followed by the fingerprint bits.
X = np.hstack((X_features, fingerprint_matrix))

print(f"Features (X) shape: {X.shape}")
print(f"Labels (y) shape: {y.shape}")

# Stratified 80/20 split keeps the class ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTraining set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")

Features (X) shape: (5923, 2051)
Labels (y) shape: (5923,)

Training set size: 4738 samples
Testing set size: 1185 samples


In [None]:
# Baseline random forest; hyperparameters are collected in one place so they
# are easy to read and tweak.
rf_settings = dict(
    n_estimators=190,
    criterion='entropy',
    min_samples_leaf=1,
    n_jobs=-1,        # train on every available core
    random_state=42,  # fixed seed so repeated runs give the same forest
)
model = RandomForestClassifier(**rf_settings)

print("\nStarting model training...")
model.fit(X_train, y_train)
print("Model training complete.")


Starting model training...
Model training complete.


# Metrics

In [None]:
# Hard class predictions feed accuracy and the classification report.
y_pred = model.predict(X_test)

# ROC AUC must be computed from continuous scores: with hard 0/1 predictions
# the ROC curve has a single operating point and the "AUC" collapses to
# balanced accuracy. Use the predicted probability of the positive class (1.0).
positive_col = list(model.classes_).index(1.0)
y_score = model.predict_proba(X_test)[:, positive_col]

accuracy = accuracy_score(y_test, y_pred)
auc_score = roc_auc_score(y_test, y_score)

print("\n--- Model Evaluation ---")
print(f"Accuracy on Test Set: {accuracy:.4f}")
print(f"AUC Score: {auc_score}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


--- Model Evaluation ---
Accuracy on Test Set: 0.9316
AUC Score: 0.8827323533915173

Classification Report:
              precision    recall  f1-score   support

         0.0       0.87      0.80      0.83       252
         1.0       0.95      0.97      0.96       933

    accuracy                           0.93      1185
   macro avg       0.91      0.88      0.89      1185
weighted avg       0.93      0.93      0.93      1185



# Hyperparameter Tuning

In [None]:
# Search space for the randomized search: each entry is either a distribution
# to sample from or a list of candidate values for that hyperparameter.
# Note that scipy's randint excludes its upper bound.
param_dist = dict(
    n_estimators=sp_randint(150, 300),
    max_depth=[None],
    min_samples_leaf=sp_randint(1, 5),
    max_features=['sqrt', 1.0],
    criterion=["entropy"],
)

In [None]:
# Base estimator whose hyperparameters the search will vary.
rf = RandomForestClassifier(random_state=42)

# Search configuration:
#   n_iter  - number of sampled parameter settings (more is better but slower)
#   cv      - number of cross-validation folds
#   scoring - metric the search optimises (e.g. 'accuracy', 'f1', 'roc_auc')
search_settings = {
    "n_iter": 20,
    "cv": 5,
    "scoring": 'roc_auc',
    "random_state": 42,
    "n_jobs": -1,
    "verbose": 1,
}

random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    **search_settings
)

In [None]:
# Fit the randomized search on the training split only, so the held-out test
# split plays no part in choosing hyperparameters. This is the slow step of
# the notebook: every sampled candidate is refit once per CV fold.
print("Starting Hyperparameter Tuning with Randomized Search...")
random_search.fit(X_train, y_train)
print("Tuning complete.")

Starting Hyperparameter Tuning with Randomized Search...
Fitting 5 folds for each of 20 candidates, totalling 100 fits


KeyboardInterrupt: 

In [None]:
# Pull the winning configuration out of the fitted search object. The score is
# the mean cross-validated value of whatever metric the search was set up with.
best_model = random_search.best_estimator_
best_score = random_search.best_score_
best_params = random_search.best_params_

print("--- Tuning Results ---")
print(f"Best Cross-Validation Score: {best_score:.4f}")
print("Best Parameters Found:")
for name in best_params:
    print(f"  {name}: {best_params[name]}")

--- Tuning Results ---
Best Cross-Validation Accuracy Score: 0.8690
Best Parameters Found:
  criterion: entropy
  max_depth: None
  max_features: sqrt
  min_samples_leaf: 1
  n_estimators: 222


In [None]:
# Accuracy of the tuned model on the held-out test split (`.score` on a
# classifier returns mean accuracy).
final_test_score = best_model.score(X_test, y_test)
print(f"\nAccuracy on FINAL Unseen Test Set: {final_test_score:.4f}")

# The search optimised ROC AUC, so also report that metric on the test split;
# this makes the result comparable with the cross-validation score. The AUC is
# computed from the predicted probability of the positive class (1.0).
positive_col = list(best_model.classes_).index(1.0)
final_test_auc = roc_auc_score(y_test, best_model.predict_proba(X_test)[:, positive_col])
print(f"ROC AUC on FINAL Unseen Test Set: {final_test_auc:.4f}")


Accuracy on FINAL Unseen Test Set: 0.8970
