## Load initial files

In [None]:
import pandas as pd
import numpy as np
from pycaret.classification import *  # For later ML work

# Input locations, relative to the notebook's working directory.
clinical_fp, methylation_fp, mrna_fp = "clinical.csv", "methylation.csv", "mrna.csv"

# Read each table into its own DataFrame.
# NOTE(review): the clinical file is parsed as tab-separated despite its .csv
# extension, while the other two use the default comma separator -- confirm
# this matches the actual files.
df_clinical = pd.read_csv(clinical_fp, sep="\t")
df_methylation = pd.read_csv(methylation_fp)
df_mrna = pd.read_csv(mrna_fp)

# Report (rows, columns) per table as a quick sanity check on the load.
for _label, _frame in (
    ("Clinical", df_clinical),
    ("Methylation", df_methylation),
    ("mRNA", df_mrna),
):
    print(f"Loaded {_label} Data shape:", _frame.shape)

## Verify that RFS_STATUS is present in each dataset

In [None]:
# Label -> frame lookup for the three source tables.
dataframes = dict(Clinical=df_clinical, Methylation=df_methylation, mRNA=df_mrna)

# Report, per table, whether the RFS_STATUS column is available.
for _label, _frame in dataframes.items():
    _verdict = "contains" if "RFS_STATUS" in _frame.columns else "DOES NOT contain"
    print(f"{_label} DataFrame {_verdict} 'RFS_STATUS'.")

In [None]:
import pandas as pd

# Patient-level lookup table read with the default comma separator
# (adjust the separator if the file uses a different one).
df_rfs = pd.read_csv("patient_ids_RFS_STATUS.csv")

print("Patient IDs RFS_STATUS:")
print(df_rfs.head())


def _with_rfs_status(frame, label):
    """Return `frame` left-joined with `df_rfs` on PATIENT_ID.

    A frame that already has an RFS_STATUS column is returned unchanged.
    `label` is only used in the progress message.
    """
    if "RFS_STATUS" in frame.columns:
        print(f"{label} DataFrame already contains RFS_STATUS.")
        return frame
    enriched = frame.merge(df_rfs, on="PATIENT_ID", how="left")
    print(f"Merged RFS_STATUS into {label} DataFrame. New shape:", enriched.shape)
    return enriched


# Rebind the module-level frames; the guard inside the helper makes a
# re-run of this cell a no-op for frames that already carry the column.
df_methylation = _with_rfs_status(df_methylation, "Methylation")
df_mrna = _with_rfs_status(df_mrna, "mRNA")

## ML Pipeline (individual files)

In [None]:
from pycaret.classification import *
import pandas as pd
import matplotlib.pyplot as plt

def run_ml_pipeline(df, target, ignore_features, model_save_name, fold=5):
    """
    Runs a PyCaret classification pipeline on the provided DataFrame.

    Steps, in order: drop rows whose target is missing, initialise PyCaret via
    `setup`, pick the best model with `compare_models`, attempt a fixed set of
    evaluation plots (displayed in the notebook), and save the best model.

    Parameters:
      - df: pandas DataFrame containing the data. It is not modified.
      - target: Name of the target column (e.g., "RFS_STATUS").
      - ignore_features: List of columns to ignore during training.
      - model_save_name: Base name for saving the best model.
      - fold: Number of cross-validation folds passed to `setup` (default 5).

    Raises:
      - KeyError: if `target` is not a column of `df`.
      - ValueError: if no rows remain after dropping missing target values.
    """
    # Fail early with a readable message instead of pandas' terse KeyError.
    if target not in df.columns:
        raise KeyError(f"Target column '{target}' not found in the DataFrame.")

    # Drop rows with missing target values (returns a new frame).
    labelled = df.dropna(subset=[target])
    print(f"After dropping missing '{target}' values, dataset shape: {labelled.shape}")

    # Nothing to train on: stop here rather than failing somewhere inside PyCaret.
    if labelled.empty:
        raise ValueError(
            f"No rows left after dropping missing '{target}' values; cannot train a model."
        )

    # Initialise the PyCaret experiment. The returned experiment object is not
    # needed: the functional API calls below operate on the active experiment.
    setup(
        data=labelled,
        target=target,
        session_id=42,           # Fixed seed for reproducibility
        fold=fold,
        ignore_features=ignore_features,
        preprocess=True,         # Use PyCaret's default preprocessing pipeline
        html=False,
        verbose=True
    )

    # Compare models and select the best based on default metrics.
    best_model = compare_models()

    # Output evaluation plots using PyCaret (plots will display in the notebook).
    # Plotting is best-effort: a plot that cannot be produced for the selected
    # model is reported and skipped so the model is still saved.
    for plot_type in ('auc', 'confusion_matrix', 'feature', 'class_report'):
        try:
            plot_model(best_model, plot=plot_type)
        except Exception as e:
            print(f"Could not generate plot '{plot_type}': {e}")

    # Save the best model.
    save_model(best_model, model_save_name)
    print(f"Best model saved as {model_save_name}.pkl")
    
# Train one model per source table, in the order Clinical -> Methylation -> mRNA.
# Each entry pairs a dataset with the base name its best model is saved under.
# NOTE(review): only PATIENT_ID is excluded from the features -- confirm that no
# other outcome-derived columns remain in these tables.
_individual_runs = [
    (df_clinical, "best_model_clinical"),
    (df_methylation, "best_model_methylation"),
    (df_mrna, "best_model_mrna"),
]

for _dataset, _save_name in _individual_runs:
    run_ml_pipeline(
        _dataset,
        target="RFS_STATUS",
        ignore_features=["PATIENT_ID"],
        model_save_name=_save_name,
    )

## Combine datasets

In [None]:
import pandas as pd

def merge_and_drop_target(df1, df2, key="PATIENT_ID", target="RFS_STATUS"):
    """
    Inner-joins two DataFrames on `key` while keeping a single `target` column.

    When both frames carry `target`, the copy in `df2` is dropped before the
    merge, so the result keeps `df1`'s values under the original column name
    (no `_x`/`_y` suffixes on the target). When only `df2` carries `target`,
    it is kept so the label column is not silently lost. Neither input frame
    is modified.

    Parameters:
      - df1: Left DataFrame; its `target` column takes precedence.
      - df2: Right DataFrame.
      - key: Column to join on (default "PATIENT_ID").
      - target: Name of the target column (default "RFS_STATUS").

    Returns:
      The inner-joined DataFrame.
    """
    # Only discard df2's target when df1 can supply it; otherwise the merged
    # frame would end up with no target column at all.
    if target in df1.columns and target in df2.columns:
        df2 = df2.drop(columns=[target])  # new frame; caller's df2 is untouched
    return pd.merge(df1, df2, on=key, how='inner')

# Work on copies of the loaded DataFrames.
df_clin = df_clinical.copy()
df_mrna_cp = df_mrna.copy()
df_meth_cp = df_methylation.copy()

# Merge combinations (all inner joins on PATIENT_ID):
# 1. Clinical + mRNA (keeps RFS_STATUS from Clinical)
df_clin_mrna = merge_and_drop_target(df_clin, df_mrna_cp, key="PATIENT_ID", target="RFS_STATUS")
print("Clinical + mRNA merged shape:", df_clin_mrna.shape)

# 2. Clinical + Methylation (keeps RFS_STATUS from Clinical)
df_clin_meth = merge_and_drop_target(df_clin, df_meth_cp, key="PATIENT_ID", target="RFS_STATUS")
print("Clinical + Methylation merged shape:", df_clin_meth.shape)

# 3. mRNA + Methylation (here, arbitrarily keep RFS_STATUS from mRNA)
df_mrna_meth = merge_and_drop_target(df_mrna_cp, df_meth_cp, key="PATIENT_ID", target="RFS_STATUS")
print("mRNA + Methylation merged shape:", df_mrna_meth.shape)

# 4. All three combined: extend the Clinical + mRNA merge from step 1 with
#    Methylation. Step 1 already produced exactly this intermediate, so it is
#    reused instead of running the same merge a second time.
temp_merge = df_clin_mrna  # alias only; keeps the `temp_merge` name defined
df_all_combined = merge_and_drop_target(temp_merge, df_meth_cp, key="PATIENT_ID", target="RFS_STATUS")
print("Clinical + mRNA + Methylation merged shape:", df_all_combined.shape)


In [None]:


# Count distinct PATIENT_ID values, first for each source table and then for
# each merged combination, printing one line per frame.
_id_report = [
    # Individual DataFrames
    ("Clinical", df_clin),
    ("Methylation", df_meth_cp),
    ("mRNA", df_mrna_cp),
    # Merged DataFrames
    ("Clinical + mRNA merge", df_clin_mrna),
    ("Clinical + Methylation merge", df_clin_meth),
    ("mRNA + Methylation merge", df_mrna_meth),
    ("Clinical + mRNA + Methylation merge", df_all_combined),
]

for _label, _frame in _id_report:
    print(f"Unique patient IDs in {_label}:", _frame["PATIENT_ID"].nunique())


# Now you can pass any of these merged DataFrames to PyCaret.
# PyCaret's setup() (with preprocess=True) will handle mixed categorical/numerical features automatically.

## ML Pipeline (combined datasets)


In [None]:
from pycaret.classification import *
import matplotlib.pyplot as plt

def run_ml_pipeline_merged(df, target, ignore_features, model_save_name, fold=10):
    """
    Runs PyCaret ML pipeline on a merged DataFrame.
    Assumes that `df` already has only one RFS_STATUS column.

    Steps, in order: drop rows whose target is missing, initialise PyCaret via
    `setup`, pick the best model with `compare_models`, attempt a fixed set of
    evaluation plots (displayed in the notebook), call `evaluate_model` for
    interactive review, and save the best model.

    Parameters:
      - df: pandas DataFrame containing the data. It is not modified.
      - target: Name of the target column (e.g., "RFS_STATUS").
      - ignore_features: List of columns to ignore during training.
      - model_save_name: Base name for saving the best model.
      - fold: Number of cross-validation folds passed to `setup` (default 10).

    Raises:
      - KeyError: if `target` is not a column of `df`.
      - ValueError: if no rows remain after dropping missing target values.
    """
    # Fail early with a readable message instead of pandas' terse KeyError.
    if target not in df.columns:
        raise KeyError(f"Target column '{target}' not found in the DataFrame.")

    # Drop rows with missing target values (returns a new frame).
    labelled = df.dropna(subset=[target])
    print(f"After dropping missing '{target}' values, dataset shape for {model_save_name}: {labelled.shape}")

    # Nothing to train on: stop here rather than failing somewhere inside PyCaret.
    if labelled.empty:
        raise ValueError(
            f"No rows left after dropping missing '{target}' values; cannot train a model."
        )

    # Initialise the PyCaret experiment. The returned experiment object is not
    # needed: the functional API calls below operate on the active experiment.
    setup(
        data=labelled,
        target=target,
        session_id=42,           # Fixed seed for reproducibility
        fold=fold,
        ignore_features=ignore_features,
        preprocess=True,         # Use PyCaret's default preprocessing pipeline
        html=False,
        verbose=True
    )

    # Compare models and select the best.
    best_model = compare_models()

    # Output evaluation plots in the notebook.
    # Plotting is best-effort: a plot that cannot be produced for the selected
    # model is reported and skipped so the model is still saved.
    for plot_type in ('auc', 'confusion_matrix', 'feature', 'class_report'):
        try:
            plot_model(best_model, plot=plot_type)
        except Exception as e:
            print(f"Could not generate plot '{plot_type}': {e}")

    # Optionally, review the model interactively.
    evaluate_model(best_model)

    # Save the best model.
    save_model(best_model, model_save_name)
    print(f"Best merged model saved as {model_save_name}.pkl")


# Run the ML pipeline on the three-way merged dataset (using "RFS_STATUS" as target).
# The save name was previously misspelled "best_model_comvbined"; anything that
# loads the model saved under the old name must be pointed at the corrected one.
run_ml_pipeline_merged(
    df_all_combined,
    target="RFS_STATUS",
    ignore_features=["PATIENT_ID"],
    model_save_name="best_model_combined",
)
# To get results for the other merged DataFrames, run the same call with a
# different DataFrame; run them one at a time to save resources.

In [None]:
# Clinical + mRNA merged dataset.
run_ml_pipeline_merged(
    df_clin_mrna,
    target="RFS_STATUS",
    ignore_features=["PATIENT_ID"],
    model_save_name="best_model_clinical_mrna",
)


In [None]:
# Clinical + Methylation merged dataset.
run_ml_pipeline_merged(
    df_clin_meth,
    target="RFS_STATUS",
    ignore_features=["PATIENT_ID"],
    model_save_name="best_model_clinical_methylation",
)