# BELKA-mini: Exploratory Hint Notebook 🧠

## 🧰 Setup

In [None]:
# !pip install -q rdkit-pypi

import pandas as pd
import numpy as np
from rdkit import Chem
from collections import Counter
import matplotlib.pyplot as plt

In [None]:
# Load the three dataset splits from CSV files in the working directory.
# Original note on the test split: includes is_novel + binds for now
train, val, test = map(pd.read_csv, ("train.csv", "val.csv", "test.csv"))

## 📊 Basic Stats

In [None]:
# Report the number of rows in each split.
split_sizes = (
    ("Train size:", train),
    ("Val size:", val),
    ("Test size:", test),
)
for caption, frame in split_sizes:
    print(caption, len(frame))

# List the column names of the training split.
column_names = train.columns.tolist()
print("\n🔬 Columns:", column_names)

## 🔍 Binding Imbalance

In [None]:
def plot_class_balance(df, label):
    """Show a bar chart of how many rows fall into each ``binds`` class.

    Args:
        df: DataFrame with a ``binds`` column.
        label: Split name used as the prefix of the chart title.

    The chart is displayed with ``plt.show()``; nothing is returned.
    """
    class_counts = df['binds'].value_counts().sort_index()

    # Label each bar by the class it actually counts. Fixed labels at
    # positions [0, 1] would name the single bar "Non-binders (0)" when a
    # split contains only binders.
    class_names = {0: 'Non-binders (0)', 1: 'Binders (1)'}
    tick_labels = [class_names.get(value, str(value)) for value in class_counts.index]

    class_counts.plot(kind='bar', title=f"{label} Binding Distribution")
    plt.xticks(list(range(len(class_counts))), tick_labels, rotation=0)
    plt.ylabel("Count")
    plt.tight_layout()
    plt.show()

In [None]:
# Show the binding class balance for the training and validation splits.
for frame, split_label in ((train, "Train"), (val, "Validation")):
    plot_class_balance(frame, split_label)

## 🧬 Protein Distribution

In [None]:
def plot_protein_distribution(df, label):
    """Show a bar chart of row counts per ``protein_name`` value.

    Args:
        df: DataFrame with a ``protein_name`` column.
        label: Split name used as the prefix of the chart title.

    Bars appear in the order returned by ``value_counts()``. The chart is
    displayed with ``plt.show()``; nothing is returned.
    """
    protein_counts = df['protein_name'].value_counts()
    chart_title = f"{label} Protein Distribution"
    protein_counts.plot.bar(title=chart_title)

    # Keep the protein names horizontal.
    plt.xticks(rotation=0)
    plt.ylabel("Count")
    plt.tight_layout()
    plt.show()

In [None]:
# Show the per-protein row counts for the training and test splits.
for frame, split_label in ((train, "Train"), (test, "Test")):
    plot_protein_distribution(frame, split_label)

## 🧩 Building Block Coverage

In [None]:
def unique_bbs(df, slot):
    """Return the distinct values of one building-block SMILES column as a set.

    Args:
        df: DataFrame containing a ``buildingblock{slot}_smiles`` column.
        slot: Value interpolated into the column name (e.g. 1, 2 or 3).

    Returns:
        A ``set`` of the unique values found in that column.
    """
    column_name = f'buildingblock{slot}_smiles'
    distinct_values = df[column_name].unique()
    return set(distinct_values)

In [None]:
# Distinct building blocks per slot, keyed by slot number, for train and test.
slots = (1, 2, 3)
bb_train = {slot: unique_bbs(train, slot) for slot in slots}
bb_test = {slot: unique_bbs(test, slot) for slot in slots}

In [None]:
# For each building-block slot, report how many of the test split's distinct
# building blocks also occur in the train split.
for i in [1,2,3]:
    overlap = len(bb_train[i] & bb_test[i])
    total = len(bb_test[i])
    # Avoid dividing by zero when the test split has no building blocks for this slot.
    pct = 100 * overlap / total if total else 0.0
    # Printed inside the loop so each slot gets its own report line.
    print(f"🧱 BB{i}: {overlap} / {total} ({pct:.2f}%) overlap between test and train")