# Step 1 — Data Curation (ChEMBL205: Carbonic Anhydrase II)

This notebook loads the raw ChEMBL export, filters to **Homo sapiens**, removes missing values, resolves duplicates by averaging per compound, and exports a curated dataset for modeling.

**Input:** `data/raw/chembl_CAII_IC50.csv`  
**Output:** `data/processed/ca2_curated.csv`

In [None]:
from pathlib import Path
import pandas as pd

# Input (raw ChEMBL export) and output (curated table) locations,
# both relative to the current working directory.
DATA_RAW = Path('data/raw/chembl_CAII_IC50.csv')
DATA_OUT = Path('data/processed/ca2_curated.csv')

# Columns that must be present in the raw export; checked right after loading
# so a wrong file or wrong delimiter fails here instead of further down.
EXPECTED_COLS = {
    'Molecule ChEMBL ID', 'Assay Organism', 'pChEMBL Value',
    'AlogP', 'Molecular Weight',
}

# The raw file is parsed as semicolon-delimited text.
df = pd.read_csv(DATA_RAW, sep=';')

# Fail fast when any required column is absent.
missing = EXPECTED_COLS - set(df.columns)
if missing:
    raise ValueError(f'Missing expected columns: {missing}')

# Number of rows in the raw export.
len(df)

In [None]:
# Columns carried forward: the compound identifier plus the numeric fields.
keep_cols = ['Molecule ChEMBL ID', 'pChEMBL Value', 'AlogP', 'Molecular Weight']
numeric_cols = keep_cols[1:]

# Keep only rows whose assay organism is exactly 'Homo sapiens',
# restricted to the columns of interest.
is_human = df['Assay Organism'] == 'Homo sapiens'
df = df.loc[is_human, keep_cols].copy()

# Force the value columns to numeric; anything unparseable becomes NaN.
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Discard rows lacking any of the numeric values.
df = df.dropna(subset=numeric_cols)

# Number of rows remaining after filtering and cleaning.
len(df)

In [None]:
# Collapse repeated measurements to one row per compound by taking the mean
# of each numeric column; the output names (pIC50, MW) are assigned in the
# same aggregation step.
df = df.groupby('Molecule ChEMBL ID', as_index=False).agg(
    pIC50=('pChEMBL Value', 'mean'),
    AlogP=('AlogP', 'mean'),
    MW=('Molecular Weight', 'mean'),
)

# Write the curated table, creating the output directory if it does not exist.
out_dir = DATA_OUT.parent
out_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(DATA_OUT, index=False)

# Preview the first rows of the curated table.
df.head()

In [None]:
# Summary statistics of the table held in `df`.
summary = df.describe()
summary