# Pre-processing pipeline 

In [10]:
import pandas as pd

# Read the raw genetic-feature and IC50 tables; the first CSV column of each
# file is used as the DataFrame index.
df_genetic = pd.read_csv("Data/df_genetic.csv", index_col=0)
df_ic50 = pd.read_csv("Data/df_ic50.csv", index_col=0)

# Drugs to build datasets for. Rows are selected by matching these names
# against the IC50 table's index.
selected_drugs = [
    "Camptothecin",
    "Vinblastine",
    "Cisplatin",
    "Cytarabine",
    "Docetaxel",
]
is_selected_drug = df_ic50.index.isin(selected_drugs)
df_ic50_filtered = df_ic50.loc[is_selected_drug]

# STEP 1: Reshape the genetic table from long to wide format: one row per
# cell line, one column per genetic feature, cells holding "IS Mutated".
# When a (cell line, feature) pair occurs more than once, the first value
# is kept.
df_genetic_wide = pd.pivot_table(
    df_genetic,
    values="IS Mutated",  # swap in another omics signal column if preferred
    index="Cell Line Name",
    columns="Genetic Feature",
    aggfunc="first",
)

# STEP 2: Build one ML-ready table per drug, keyed by drug name.
drug_datasets = {}

for drug in selected_drugs:
    # IC50 measurements belonging to the current drug only.
    drug_rows = df_ic50_filtered.index == drug
    df_drug = df_ic50_filtered.loc[drug_rows]

    # Inner join on "Cell Line Name": only cell lines present in both the
    # IC50 rows and the wide genetic table are kept.
    df_merged = pd.merge(
        df_drug,
        df_genetic_wide,
        how="inner",
        on="Cell Line Name",
    )

    # Keep the result in memory for later use.
    drug_datasets[drug] = df_merged

    # Also persist it to "<drug>_ML_dataset.csv" in the working directory,
    # without the row index.
    df_merged.to_csv(f"{drug}_ML_dataset.csv", index=False)

# Preview the first rows of one of the resulting datasets.
drug_datasets["Camptothecin"].head()


Unnamed: 0,Drug ID,Cell Line Name,Cosmic ID,TCGA Classification,Tissue,Tissue Sub-type,IC50,AUC,Max Conc,RMSE,...,cnaBRCA6,cnaBRCA60,cnaBRCA61,cnaBRCA62,cnaBRCA63,cnaBRCA64,cnaBRCA65,cnaBRCA7,cnaBRCA8,cnaBRCA9
0,1003,HCC1954,749709,BRCA,breast,breast,0.317741,0.983262,0.1,0.082831,...,0,0,0,1,1,1,0,0,0,0
1,1003,HCC1143,749710,BRCA,breast,breast,0.636184,0.96905,0.1,0.077198,...,0,0,0,0,0,0,0,0,0,0
2,1003,HCC1187,749711,BRCA,breast,breast,1.235544,0.992326,0.1,0.079605,...,0,0,0,1,0,0,0,0,0,0
3,1003,HCC1395,749712,BRCA,breast,breast,-2.255899,0.861208,0.1,0.096794,...,0,0,0,0,0,0,0,1,0,0
4,1003,HCC1599,749713,BRCA,breast,breast,-3.247021,0.768404,0.1,0.111973,...,0,0,0,1,0,0,0,0,0,0
