# BELKA-mini Starter Notebook 🧪
Predict small-molecule–protein binding from ECFP (Morgan) fingerprints of each molecule plus a one-hot encoding of the target protein.

Model: Random Forest (sklearn)


## 🔧 Setup

In [1]:
# Optional: uncomment to install RDKit if it is not already available in this kernel's environment.
# !pip install -q rdkit-pypi

In [2]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import average_precision_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

## 📦 Load Data

In [3]:
import os
# Base directory for input files; an empty string resolves relative to the current working directory.
# NOTE(review): presumably meant as the prefix for the CSV paths — confirm the load cell uses it.
data_path = ""

In [4]:
# Read the three splits from `data_path` so the input location is configured in one place.
# With the default empty `data_path`, os.path.join("", "train.csv") is just "train.csv",
# i.e. the files are still looked up in the current working directory.
train = pd.read_csv(os.path.join(data_path, "train.csv"))
val = pd.read_csv(os.path.join(data_path, "val.csv"))
test = pd.read_csv(os.path.join(data_path, "test.csv"))  # contains binds for now

# Quick sanity check of split sizes and the available columns.
print(f"Train size: {len(train)}")
print(f"Val size:   {len(val)}")
print(f"Test size:  {len(test)}")
print("\nTrain columns:", train.columns.tolist())

Train size: 66923
Val size:   16731
Test size:  40000

Train columns: ['id', 'buildingblock1_smiles', 'buildingblock2_smiles', 'buildingblock3_smiles', 'molecule_smiles', 'protein_name', 'binds']


## 👀 Peek at the data

In [5]:
# Display the first rows of the training split to eyeball the SMILES, protein and label columns.
train.head()

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name,binds
0,182153914,O=C(Nc1ccc(C(=O)O)c(C(=O)O)c1)OCC1c2ccccc2-c2c...,COC(CN)CC(N)=O.Cl,NCc1nc2c(s1)CCCCC2,COC(CNc1nc(NCc2nc3c(s2)CCCCC3)nc(Nc2ccc(C(=O)N...,HSA,0
1,245366148,O=C(O)C[C@@H](Cc1ccc(Cl)c(Cl)c1)NC(=O)OCC1c2cc...,COc1cc(C#N)c(F)cc1N,CNC(=O)c1cc(Oc2ccc(N)cc2)ccn1,CNC(=O)c1cc(Oc2ccc(Nc3nc(Nc4cc(F)c(C#N)cc4OC)n...,BRD4,0
2,99662502,O=C(N[C@@H](Cc1ccc(F)cc1F)C(=O)O)OCC1c2ccccc2-...,Cl.Cl.NCc1nnc2n1CCOCC2,COC(=O)c1cc(N)cs1,COC(=O)c1cc(Nc2nc(NCc3nnc4n3CCOCC4)nc(N[C@@H](...,BRD4,0
3,11914877,CC(=O)c1ccc(C[C@H](NC(=O)OCC2c3ccccc3-c3ccccc3...,Nc1cnc(Cl)cn1,Nc1nc(F)nc2nc[nH]c12,CC(=O)c1ccc(C[C@H](Nc2nc(Nc3cnc(Cl)cn3)nc(Nc3n...,sEH,0
4,209057132,O=C(Nc1cccc(Br)c1C(=O)O)OCC1c2ccccc2-c2ccccc21,Nc1ccc2c(c1)oc1ccccc12,Cc1cc([N+](=O)[O-])c(Cl)cc1N,Cc1cc([N+](=O)[O-])c(Cl)cc1Nc1nc(Nc2ccc3c(c2)o...,sEH,0


## 🧪 Featurization: ECFP + One-Hot Protein

In [6]:
def smiles_to_ecfp(smiles, radius=2, n_bits=1024):
    """Convert a SMILES string into a Morgan (ECFP-style) bit fingerprint.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule. Non-string values (for
        example the float NaN pandas uses for missing cells) are treated
        as unparseable instead of raising inside RDKit.
    radius : int, default 2
        Morgan fingerprint radius passed to RDKit.
    n_bits : int, default 1024
        Length of the folded bit vector.

    Returns
    -------
    numpy.ndarray
        1-D ``uint8`` array of length ``n_bits`` holding 0/1 bits. The array
        is all zeros when the input cannot be parsed, so every call returns
        the same type and shape.
    """
    # Guard before touching RDKit: MolFromSmiles rejects non-string input.
    if not isinstance(smiles, str):
        return np.zeros(n_bits, dtype=np.uint8)

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return np.zeros(n_bits, dtype=np.uint8)

    # Local import keeps this cell self-contained; the module is cached after
    # the first call, so repeated imports are cheap.
    from rdkit import DataStructs

    bit_vect = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    # Convert once to a real ndarray; leaving the ExplicitBitVect forces numpy
    # to iterate over it bit by bit whenever it is concatenated later.
    fingerprint = np.zeros(n_bits, dtype=np.uint8)
    DataStructs.ConvertToNumpyArray(bit_vect, fingerprint)
    return fingerprint

In [7]:
# Featurize molecules
# Add an 'ecfp' fingerprint column to every split, computed from the full molecule SMILES.
for split_df in (train, val, test):
    split_df['ecfp'] = split_df['molecule_smiles'].apply(smiles_to_ecfp)

In [8]:
# One-hot protein
# Dense one-hot encoder for the protein name; categories are learned from the training split only.
protein_encoder = OneHotEncoder(sparse_output=False)
protein_encoder.fit(train.loc[:, ['protein_name']])

In [9]:
# Apply the fitted encoder to each split, in train / val / test order.
X_train_prot, X_val_prot, X_test_prot = (
    protein_encoder.transform(split_df[['protein_name']])
    for split_df in (train, val, test)
)

In [10]:
# Stack features
def _join_features(fingerprints, protein_onehot):
    """Pair each fingerprint with its protein one-hot row and concatenate them into one vector."""
    rows = []
    for fp, prot in zip(fingerprints, protein_onehot):
        rows.append(np.concatenate([fp, prot]))
    return rows


# One feature vector per row: fingerprint bits followed by the protein one-hot columns.
X_train = _join_features(train['ecfp'], X_train_prot)
X_val = _join_features(val['ecfp'], X_val_prot)
X_test = _join_features(test['ecfp'], X_test_prot)

In [11]:
# Binary binding labels for the training and validation splits.
y_train, y_val = train['binds'], val['binds']

## 🧠 Model: Random Forest

In [None]:
# Hyperparameters collected in one place; the fixed seed keeps the forest reproducible and
# class_weight="balanced" reweights the classes by their frequency in y_train.
rf_params = {
    "n_estimators": 100,
    "max_depth": 25,
    "random_state": 42,
    "class_weight": "balanced",
}
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train, y_train)

In [None]:
# Score the validation split; column 1 of predict_proba is taken as the score for the
# positive class (presumably binds == 1 — verify against rf.classes_).
val_proba = rf.predict_proba(X_val)
val_preds = val_proba[:, 1]
ap_score = average_precision_score(y_true=y_val, y_score=val_preds)

In [None]:
# Report the validation average precision rounded to four decimals.
print(f"📊 Validation Average Precision (AP): {ap_score:.4f}")

## 📤 Make Submission

In [None]:
# We assume test.csv includes labels for now — remove them before submission!
# Only the 'id' column is carried over from test; the predicted probability of the
# second class becomes the 'binds' column.
test_preds = rf.predict_proba(X_test)[:, 1]
submission = test[['id']].assign(binds=test_preds)

In [None]:
# Save to file in the current working directory; index=False keeps the row index out of the CSV
# so only the 'id' and 'binds' columns are written.
submission.to_csv("submission.csv", index=False)

In [None]:
# Preview the first rows of the submission frame as a final sanity check.
submission.head()