# 1. Title & Overview

# 01 – Model Training for Drug-Likeness Prediction

This notebook demonstrates how to:
1. Load the descriptor dataset  
2. Explore label balance and key feature distributions  
3. Train a Random Forest classifier  
4. Evaluate performance (classification report, confusion matrix, ROC curve)  
5. Inspect feature importances  
6. Save the trained model for later use  

# 2. Setup & Imports

In [None]:
# (Re)install any missing dependencies in Colab
!pip install -q scikit-learn matplotlib pandas joblib

from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    RocCurveDisplay,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split

# 3. Load Descriptor Data

In [None]:
# Location of the descriptor CSV; change this if your file lives elsewhere
csv_path = '../data/drug_likeness_descriptors.csv'
df = pd.read_csv(csv_path)

# Report (rows, columns), then preview the first rows as the cell output
print(f"Dataset shape: {df.shape}")
df.head()

# 4. Quick Exploratory Data Analysis



*   Label distribution





In [None]:
# Count rows per class. value_counts() orders by frequency, so the majority
# class would land at position 0 regardless of its label; sort_index() orders
# the bars by label value so they line up with the hard-coded tick labels below.
class_counts = df['Label'].value_counts().sort_index()
ax = class_counts.plot.bar(title='Class Balance')
ax.set_ylabel('Count')
# Positions 0 and 1 now hold labels 0 and 1 (assumes a binary 0/1 Label column).
plt.xticks([0,1], ['Decoy (0)', 'Drug (1)'])
plt.show()



*   Example descriptor summary




In [None]:
# Summary statistics for the columns at positions 3-9, one row per column.
# NOTE(review): presumably the first three columns are ID/Target/Label — confirm.
df.iloc[:, 3:10].describe().transpose()

# 5. Train/Test Split

In [None]:
# Columns excluded from the feature matrix (the label itself plus ID/Target)
non_feature_cols = ['ID', 'Target', 'Label']
X = df.drop(columns=non_feature_cols)
y = df['Label']

# Hold out 20% of the rows; stratifying on y keeps the class ratio in both parts
split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split
print(f"Train: {len(y_train)} | Test: {len(y_test)}")

# 6. Random Forest Training

In [None]:
# Hyper-parameters gathered in one place: 100 trees, class weights balanced
# against label frequency, and a fixed seed for reproducible results
rf_params = {
    'n_estimators': 100,
    'class_weight': 'balanced',
    'random_state': 42,
}
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train, y_train)

# 7. Evaluation


*   Classification report




In [None]:
# Hard class predictions for the held-out rows
y_pred = rf.predict(X_test)

# Column 1 of predict_proba is the probability of the second class in rf.classes_
class_probabilities = rf.predict_proba(X_test)
y_proba = class_probabilities[:, 1]

print(classification_report(y_true=y_test, y_pred=y_pred))



*   Confusion matrix



In [None]:
# Rows are the true classes, columns the predicted classes
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)



*   ROC curve & AUC



In [None]:
# Area under the ROC curve, computed from the positive-class probabilities
auc = roc_auc_score(y_true=y_test, y_score=y_proba)
print(f'ROC AUC = {auc:.3f}')

# Draw the ROC curve for the fitted forest on the held-out set
RocCurveDisplay.from_estimator(estimator=rf, X=X_test, y=y_test)
plt.show()

# 8. Feature Importance

In [None]:
# Importance of every feature reported by the fitted forest, keyed by column name
importances = pd.Series(rf.feature_importances_, index=X.columns)
top_features = importances.nlargest(10)

# barh draws the first row at the bottom; invert the y-axis so the most
# important feature appears at the top and the chart reads top-down.
ax = top_features.plot.barh(title='Top 10 Feature Importances')
ax.invert_yaxis()
ax.set_xlabel('Importance')
plt.show()

# 9. Save Trained Model

In [None]:
# Persist the fitted classifier so it can be reloaded without retraining.
model_path = Path('../models/rf_drug_likeness_model.joblib')
# joblib.dump does not create missing directories; make sure the target exists.
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(rf, model_path)
# Report the path that was actually written.
print(f"Model saved to {model_path}")

# 10. Next Steps
**Next:**  
- Hyperparameter tuning (GridSearchCV on `n_estimators`, `max_depth`)  
- Try XGBoost or logistic regression baselines  
- Build a Streamlit demo in `app/streamlit_app.py`