# Imports

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read data

In [None]:
# Alternative input: the 2025-03-29 data snapshot. The assignment is left
# commented out, so running this cell defines nothing.
# NOTE(review): the file name here ends in "treatment_centered_data.parquet",
# whereas the active path in the next cell ends in
# "treatment_centered_dataset.parquet" — confirm the name before enabling.
# path = "/cluster/projects/gliugroup/2BLAST/data/final/data_2025-03-29/processed/treatment_centered_data.parquet"

In [None]:
# Parquet file with the processed, treatment-centered dataset
# (2023-02-21 snapshot); read by the next cell.
path = (
    "/cluster/projects/gliugroup/2BLAST/data/final/data_2023-02-21/processed/treatment_centered_dataset.parquet"
)

In [None]:
# Load the parquet file located at `path` into a DataFrame
df = pd.read_parquet(path=path)

In [None]:
# Columns kept for the analysis, in the order they will appear in `df`
columns_to_select = [
    # Identifier, treatment date and regimen label
    'mrn', 'treatment_date', 'regimen',
    # Patient covariates read by the scoring function
    'age', 'hemoglobin', 'female',
    # Per-drug dose columns (names start with '%_ideal', the pattern the
    # scoring function uses to find them)
    '%_ideal_dose_given_CISPLATIN',
    '%_ideal_dose_given_FLUOROURACIL',
    '%_ideal_dose_given_ETOPOSIDE',
    '%_ideal_dose_given_CARBOPLATIN',
    '%_ideal_dose_given_OXALIPLATIN',
    '%_ideal_dose_given_PEMBROLIZUMAB',
    '%_ideal_dose_given_NIVOLUMAB',
    '%_ideal_dose_given_DOCETAXEL',
    '%_ideal_dose_given_CYCLOPHOSPHAMIDE',
    '%_ideal_dose_given_DURVALUMAB',
    '%_ideal_dose_given_CETUXIMAB',
    '%_ideal_dose_given_RAMUCIRUMAB',
    '%_ideal_dose_given_RALTITREXED',
    '%_ideal_dose_given_IPILIMUMAB',
    '%_ideal_dose_given_MITOMYCIN',
    '%_ideal_dose_given_PANITUMUMAB',
    '%_ideal_dose_given_CAPECITABINE',
    '%_ideal_dose_given_ERLOTINIB',
]

# Narrow the frame to those columns; .copy() gives an independent frame so the
# score columns added later are written to this frame, not to a slice of the
# full dataset
df = df.loc[:, columns_to_select].copy()

# Compute modified CARG score

In [None]:
def compute_modified_carg_scores(
    df: pd.DataFrame,
    drug_regex=r'^%_ideal',      # pattern selecting the per-drug dose columns
    gi_prefix='GI',
    age_cutoff=72,
    hi_dose_threshold=0.95,      # dose level counted as "high" (0.95 = 95% of ideal)
) -> pd.DataFrame:
    """
    Score every row on five risk components and add their total.

    The input frame is mutated: the columns listed below are written onto
    ``df`` itself, and that same object is returned.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'regimen', 'age', 'female' and 'hemoglobin'. Dose columns
        are those whose names match ``drug_regex``; their values are expected
        to be fractions of the ideal dose (0..1).
    drug_regex : str
        Regular expression matched against column names to find dose columns.
    gi_prefix : str
        Regimen prefix that earns the cancer-type points.
    age_cutoff : numeric
        Ages greater than or equal to this value earn the age points.
    hi_dose_threshold : float
        A dose value greater than or equal to this earns the dosage points.

    Columns written
    ---------------
    cancer_type_score     : 2 if 'regimen' starts with ``gi_prefix``, else 0
    age_score             : 2 if 'age' >= ``age_cutoff``, else 0
    number_of_drugs       : number of dose columns with a value > 0
    number_of_drugs_score : 2 if number_of_drugs > 1, else 0
    dosage_score          : 2 if any dose column >= ``hi_dose_threshold``, else 0
    hemoglobin_score      : 3 if 'hemoglobin' < 100 (female) or < 110
                            (otherwise), else 0; units assumed to be g/L
    modified_carg_score   : sum of the five ``*_score`` columns

    Missing values never earn points: a missing regimen, a missing or
    non-numeric age or hemoglobin, and missing doses all score 0. A missing
    'female' flag is treated as not female.
    """

    # Locate the dose columns up front
    dose_mask = df.columns.str.match(drug_regex, na=False)
    dose_columns = df.columns[dose_mask]

    # Cancer type: regimen label begins with the GI prefix
    regimen_text = df['regimen'].astype('string')
    has_gi_regimen = regimen_text.str.startswith(gi_prefix, na=False)
    df['cancer_type_score'] = 2 * has_gi_regimen.astype(int)

    # Age: at or above the cutoff (unparseable ages become NaN -> no points)
    age_numeric = pd.to_numeric(df['age'], errors='coerce')
    df['age_score'] = 2 * (age_numeric >= age_cutoff).astype(int)

    # Drug count and high-dose flag; all zero when no dose column exists
    if len(dose_columns) == 0:
        df['number_of_drugs'] = 0
        df['number_of_drugs_score'] = 0
        df['dosage_score'] = 0
    else:
        doses = df[dose_columns]
        df['number_of_drugs'] = (doses > 0).sum(axis=1)
        df['number_of_drugs_score'] = 2 * df['number_of_drugs'].gt(1).astype(int)
        df['dosage_score'] = 2 * (doses >= hi_dose_threshold).any(axis=1).astype(int)

    # Hemoglobin: sex-specific cutoff, 100 for female rows and 110 otherwise
    hemoglobin = pd.to_numeric(df['hemoglobin'], errors='coerce')
    female_flag = df['female'].fillna(False).astype(bool)
    hemoglobin_cutoff = np.where(female_flag, 100, 110)
    df['hemoglobin_score'] = 3 * hemoglobin.lt(hemoglobin_cutoff).astype(int)

    # Total of the five component scores
    component_columns = [
        'cancer_type_score',
        'age_score',
        'number_of_drugs_score',
        'dosage_score',
        'hemoglobin_score',
    ]
    df['modified_carg_score'] = df[component_columns].sum(axis=1)

    return df


In [None]:
# Add the score columns. The function writes them onto the frame it is given
# and returns that same frame, so this rebinds `df` to the same object.
df = compute_modified_carg_scores(df=df)

# Frequency of modified CARG scores

In [None]:
# Tally how many rows fall on each score, ordered by score value
score_counts = df['modified_carg_score'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(7, 4))
bars = ax.bar(score_counts.index, score_counts.values, color='#A7C7E7', edgecolor='black')

# Write each bar's count just above it, centred horizontally on the bar
for rect in bars:
    count = rect.get_height()
    ax.text(
        rect.get_x() + rect.get_width() / 2,
        count + 1,
        f'{int(count)}',
        ha='center', va='bottom', fontsize=10,
    )

ax.set_title('Frequency of Modified CARG Scores')
ax.set_xlabel('Modified CARG Score')
ax.set_ylabel('Count')
ax.set_xticks(score_counts.index)  # one tick per observed score
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()