# BELKA-mini Starter Notebook 🧪
Predict small-molecule binding to protein targets using ECFP fingerprints plus a one-hot protein encoding

Model: Random Forest (sklearn)


## 🔧 Setup

In [None]:
# !pip install -q rdkit  # `rdkit-pypi` is the legacy package name; current wheels are published as `rdkit`

In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import average_precision_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

## 📦 Load Data

In [None]:
import os
# NOTE(review): presumably the directory that holds train/val/test CSVs; the
# empty string would mean "current working directory" — confirm against the
# cell that loads the data.
data_path = ""

In [None]:
# Resolve every split relative to `data_path` so the notebook can be pointed at
# another directory by editing one variable. With data_path == "" the joined
# paths are just the bare file names, i.e. the current working directory.
train = pd.read_csv(os.path.join(data_path, "train.csv"))
val = pd.read_csv(os.path.join(data_path, "val.csv"))
test = pd.read_csv(os.path.join(data_path, "test.csv"))  # contains binds for now

# Quick sanity check of split sizes and the available columns.
print(f"Train size: {len(train)}")
print(f"Val size:   {len(val)}")
print(f"Test size:  {len(test)}")
print("\nTrain columns:", train.columns.tolist())

## 👀 Peek at the data

In [None]:
# Last expression of the cell: the notebook renders the first rows of `train`.
train.head()

## 🧪 Featurization: ECFP + One-Hot Protein

In [None]:
def smiles_to_ecfp(smiles, radius=2, n_bits=1024):
    """Encode a SMILES string as a Morgan (ECFP-style) bit fingerprint.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule. Non-string values (e.g. NaN coming
        from an empty CSV cell) and strings RDKit cannot parse are both
        treated as invalid.
    radius : int
        Morgan radius forwarded to RDKit.
    n_bits : int
        Length of the folded fingerprint.

    Returns
    -------
    numpy.ndarray
        1-D ``uint8`` array of length ``n_bits`` with 1 at every set bit.
        Invalid input yields an all-zero vector, so every row of a column
        built with this function has the same type and shape.
    """
    bits = np.zeros(n_bits, dtype=np.uint8)

    # Guard before calling RDKit: MolFromSmiles raises on non-string input.
    if not isinstance(smiles, str):
        return bits

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return bits

    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    # Copy the set bits into the array so callers always receive an ndarray
    # rather than an RDKit ExplicitBitVect (which NumPy can only convert
    # element by element).
    on_bits = np.fromiter(fp.GetOnBits(), dtype=np.intp)
    bits[on_bits] = 1
    return bits

In [None]:
# Featurize molecules: add an 'ecfp' column (one fingerprint per row) to each split
for _split in (train, val, test):
    _split['ecfp'] = _split['molecule_smiles'].apply(smiles_to_ecfp)

In [None]:
# One-hot protein: the categories are learned from the training split only
train_protein_frame = train[['protein_name']]
protein_encoder = OneHotEncoder(sparse_output=False)
protein_encoder.fit(train_protein_frame)

In [None]:
# Encode the protein column of every split with the encoder fitted on `train`
X_train_prot, X_val_prot, X_test_prot = (
    protein_encoder.transform(split[['protein_name']])
    for split in (train, val, test)
)

In [None]:
# Stack features: each row becomes [fingerprint bits | protein one-hot]
X_train = [np.concatenate(pair) for pair in zip(train['ecfp'], X_train_prot)]
X_val = [np.concatenate(pair) for pair in zip(val['ecfp'], X_val_prot)]
X_test = [np.concatenate(pair) for pair in zip(test['ecfp'], X_test_prot)]

In [None]:
# Targets: the 'binds' column of the training and validation splits
y_train, y_val = train['binds'], val['binds']

## 🧠 Model: Random Forest

In [None]:
# Hyperparameters collected in one place; random_state fixes the seed and
# class_weight="balanced" reweights classes by inverse frequency.
rf_params = {
    "n_estimators": 100,
    "max_depth": 25,
    "random_state": 42,
    "class_weight": "balanced",
}
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train, y_train)

In [None]:
# predict_proba returns one column per class; column 1 is the second entry of
# rf.classes_ (presumably binds == 1 — confirm the labels are 0/1).
val_proba = rf.predict_proba(X_val)
val_preds = val_proba[:, 1]
ap_score = average_precision_score(y_val, val_preds)

In [None]:
# Report the validation average precision rounded to four decimals.
print(f"📊 Validation Average Precision (AP): {ap_score:.4f}")

## 📤 Make Submission

In [None]:
# We assume test.csv includes labels for now — remove them before submission!
# Score every test row and keep only the probability in column 1.
test_proba = rf.predict_proba(X_test)
test_preds = test_proba[:, 1]
submission_columns = {'id': test['id'], 'binds': test_preds}
submission = pd.DataFrame(submission_columns)

In [None]:
# Save to file: writes submission.csv to the working directory; index=False
# keeps the DataFrame index out of the file so only 'id' and 'binds' are written.
submission.to_csv("submission.csv", index=False)

In [None]:
# Preview: last expression of the cell, so the notebook renders the first rows
# of the submission frame.
submission.head()