# Build Erlotinib Biological Dataset

This notebook reconstructs a **biologically enriched Erlotinib dataset** from the original
PANCANCER files:

- `PANCANCER_IC_*.csv` – drug response (IC50, etc.)
- `PANCANCER_Genetic_features_*.csv` – genetic features per cell line
- `Cell_list*.csv` – cell line metadata (tissue, cancer type)
- `Drug_list*.csv` – (optional) drug metadata

Steps:

1. Load raw files.
2. Extract Erlotinib IC50 per cell line.
3. Merge genetic features.
4. Merge tissue / cancer-type metadata.
5. Save an enriched dataset for modeling.


In [1]:
import pandas as pd

# === Configure your file names here if needed ===
ic_file   = "PANCANCER_IC_Sun Nov  2 18_33_31 2025.csv"
gen_file  = "PANCANCER_Genetic_features_Sun Nov  2 18_34_28 2025.csv"
cell_file = "Cell_listSun Nov  2 18_30_32 2025.csv"
drug_file = "Drug_listSun Nov  2 18_27_59 2025 1.csv"  # optional, not strictly required

ic    = pd.read_csv(ic_file)
gen   = pd.read_csv(gen_file)
cells = pd.read_csv(cell_file)
drugs = pd.read_csv(drug_file)

print("IC shape:", ic.shape)
print("Genetic shape:", gen.shape)
print("Cell list shape:", cells.shape)
print("Drug list shape:", drugs.shape)

print("\nIC columns:", ic.columns.tolist()[:15])
print("\nGenetic columns:", gen.columns.tolist()[:15])
print("\nCell list columns:", cells.columns.tolist()[:15])


  gen   = pd.read_csv(gen_file)


IC shape: (333292, 13)
Genetic shape: (699665, 9)
Cell list shape: (1939, 8)
Drug list shape: (700, 9)

IC columns: ['Drug Name', 'Drug ID', 'Cell Line Name', 'Cosmic ID', 'TCGA Classification', 'Tissue', 'Tissue Sub-type', 'IC50', 'AUC', 'Max Conc', 'RMSE', 'Z score', 'Dataset Version']

Genetic columns: ['Cell Line Name', 'COSMIC ID', 'GDSC Desc1', 'GDSC Desc2', 'TCGA Desc', 'Genetic Feature', 'IS Mutated', 'Recurrent Gain Loss', 'Genes in Segment']

Cell list columns: ['Cell line Name', 'Model ID', ' COSMIC ID', ' TCGA Classfication', ' Tissue', 'Tissue sub-type', ' Datasets', ' number of drugs']


In [2]:
# === Extract Erlotinib IC50 per cell line ===

# Filter rows for Erlotinib
erl = ic[ic["Drug Name"].str.strip().str.lower() == "erlotinib"].copy()
print("Erlotinib rows:", len(erl))

# Find COSMIC ID column in the IC50 table
ic_id_cols = [c for c in erl.columns if "cosmic" in c.lower()]
print("Possible IC ID columns:", ic_id_cols)
ic_id_col = ic_id_cols[0]

erl = erl.rename(columns={ic_id_col: "COSMIC_ID"})

# Find IC50 column
ic50_cols = [c for c in erl.columns if "ic50" in c.lower()]
print("Possible IC50 columns:", ic50_cols)
ic50_col = ic50_cols[0]

# Keep one IC50 per cell line (average if multiple rows)
erl = erl[["COSMIC_ID", ic50_col]].dropna()
erl = erl.rename(columns={ic50_col: "IC50"})
erl = erl.groupby("COSMIC_ID", as_index=False)["IC50"].mean()

print("Unique Erlotinib cell lines:", erl["COSMIC_ID"].nunique())
erl.head()


Erlotinib rows: 393
Possible IC ID columns: ['Cosmic ID']
Possible IC50 columns: ['IC50']
Unique Erlotinib cell lines: 393


Unnamed: 0,COSMIC_ID,IC50
0,684057,3.966813
1,684059,2.69209
2,684062,2.47799
3,684072,2.033564
4,687448,2.966007


In [3]:
# === Prepare genetic features ===

gen_id_cols = [c for c in gen.columns if "cosmic" in c.lower()]
print("Possible genetic ID columns:", gen_id_cols)
gen_id_col = gen_id_cols[0]

gen = gen.rename(columns={gen_id_col: "COSMIC_ID"})

# Drop duplicate rows per cell line
gen = gen.drop_duplicates(subset=["COSMIC_ID"])
print("Genetic data shape after de-dup:", gen.shape)
gen.head()


Possible genetic ID columns: ['COSMIC ID']
Genetic data shape after de-dup: (970, 9)


Unnamed: 0,Cell Line Name,COSMIC_ID,GDSC Desc1,GDSC Desc2,TCGA Desc,Genetic Feature,IS Mutated,Recurrent Gain Loss,Genes in Segment
0,CAL-29,1290730,urogenital_system,bladder,BLCA,CDC27_mut,0,,
252,CAL-33,753541,aero_digestive_tract,head_and_neck,HNSC,ABCB1_mut,0,,
376,697,906800,blood,lymphoblastic_leukemia,UNCLASSIFIED,cnaPANCAN1,0,gain,"C19orf12,CCNE1,PLEKHF1,POP4,TSHZ3,UQCRFS1,URI1..."
801,5637,687452,urogenital_system,bladder,BLCA,cnaPANCAN1,0,gain,"C19orf12,CCNE1,PLEKHF1,POP4,TSHZ3,UQCRFS1,URI1..."
1333,CAL-39,924107,urogenital_system,cervix,CESC,ABCB1_mut,0,,


In [4]:
# === Prepare cell metadata (tissue / cancer type) ===

cell_id_cols = [c for c in cells.columns if "cosmic" in c.lower()]
print("Possible cell list ID columns:", cell_id_cols)
cell_id_col = cell_id_cols[0]

cells = cells.rename(columns={cell_id_col: "COSMIC_ID"})

meta_cols = ["COSMIC_ID"]
for col in ["TCGA_Desc", "TCGA Desc", "GDSC_Desc2", "GDSC Desc2",
            "Tissue", "Tissue_type", "Cancer Type", "Cancer.Type", "Site"]:
    if col in cells.columns:
        meta_cols.append(col)

cells_small = cells[meta_cols].drop_duplicates(subset=["COSMIC_ID"])

print("Cell metadata shape:", cells_small.shape)
cells_small.head()


Possible cell list ID columns: [' COSMIC ID']
Cell metadata shape: (978, 1)


Unnamed: 0,COSMIC_ID
0,924100
2,910924
4,687561
6,1287706
8,687452


In [5]:
# === Merge IC50 + genetics + metadata into one dataset ===

erl_full = erl.merge(gen, on="COSMIC_ID", how="inner")
erl_full = erl_full.merge(cells_small, on="COSMIC_ID", how="left")

print("Final merged shape:", erl_full.shape)
print("Columns sample:", erl_full.columns.tolist()[:25])
erl_full.head()


Final merged shape: (393, 10)
Columns sample: ['COSMIC_ID', 'IC50', 'Cell Line Name', 'GDSC Desc1', 'GDSC Desc2', 'TCGA Desc', 'Genetic Feature', 'IS Mutated', 'Recurrent Gain Loss', 'Genes in Segment']


Unnamed: 0,COSMIC_ID,IC50,Cell Line Name,GDSC Desc1,GDSC Desc2,TCGA Desc,Genetic Feature,IS Mutated,Recurrent Gain Loss,Genes in Segment
0,684057,3.966813,ES5,bone,ewings_sarcoma,UNCLASSIFIED,cnaPANCAN1,0,gain,"C19orf12,CCNE1,PLEKHF1,POP4,TSHZ3,UQCRFS1,URI1..."
1,684059,2.69209,ES7,bone,ewings_sarcoma,UNCLASSIFIED,cnaPANCAN1,0,gain,"C19orf12,CCNE1,PLEKHF1,POP4,TSHZ3,UQCRFS1,URI1..."
2,684062,2.47799,EW-11,bone,ewings_sarcoma,UNCLASSIFIED,cnaPANCAN1,0,gain,"C19orf12,CCNE1,PLEKHF1,POP4,TSHZ3,UQCRFS1,URI1..."
3,684072,2.033564,SK-ES-1,bone,ewings_sarcoma,UNCLASSIFIED,cnaPANCAN1,0,gain,"C19orf12,CCNE1,PLEKHF1,POP4,TSHZ3,UQCRFS1,URI1..."
4,687448,2.966007,COLO-829,skin,melanoma,SKCM,cnaPANCAN1,0,gain,"C19orf12,CCNE1,PLEKHF1,POP4,TSHZ3,UQCRFS1,URI1..."


In [6]:
# === Save enriched Erlotinib dataset ===

out_file = "Erlotinib_Biological_Enriched.csv"
erl_full.to_csv(out_file, index=False)
print(f"Saved enriched dataset to: {out_file}")


Saved enriched dataset to: Erlotinib_Biological_Enriched.csv
