1. cleaning up compiled data from Nico 
- starting w/ one file for now = convnext.csv
- clean up columns w/ less than 10 data points and rows w/o data in "permanent" column
- "permanent" column = final diagnoses
- want to delete 128+ columns so Nico can add more image features 
(max # of columns readable by models we will use later = 500)
- want to separate the data via diagnoses to see what diagnoses works w/ what data

*preliminary data cleaning via chatgpt shows:
OG: 522 rows x 407 columns
cleaned: 510 rows x 241 columns
dropped columns: 166
dropped rows: 12
dx groups created: 220

*too many different dx
check top 10 most common dx
generate table of dx counts listing all 222 dx groups w/ row counts 
- easier to filter/sort by row count

In [4]:
import pandas as pd
import re
import os

In [9]:
# ✅ 1. Check current working directory (just to confirm)
print("Current working directory:", os.getcwd())

# ✅ 2. Helper function: Standardize ALL text entries
def standardize_text(value):
    if pd.isna(value):
        return value
    if isinstance(value, str):
        value = value.replace("\xa0", " ")          # replace non-breaking spaces
        value = re.sub(r"\s+", " ", value).strip()  # collapse multiple spaces & trim
        value = value.lower()                       # lowercase everything
        value = re.sub(r"^[-–]+", "", value).strip() # remove leading dashes or en-dashes
    return value

# ✅ 3. Cleaning & Standardizing Function
def clean_and_standardize_all(df, filename_prefix, save_path="./"):
    original_shape = df.shape

    # --- Standardize text in all object columns ---
    for col in df.columns:
        if df[col].dtype == "object":
            df[col] = df[col].apply(standardize_text)

   # --- Drop rows with no permanent diagnosis (NaN or empty) ---
    rows_before = df.shape[0]
    df_cleaned = df[df["permanent"].notna() & (df["permanent"].str.strip() != "")]

    # --- Track dropped rows ---
    rows_missing_dx = df[~df.index.isin(df_cleaned.index)].copy()
    rows_missing_dx["drop_reason"] = "No permanent diagnosis"

   # --- Drop columns with <10 non-null entries ---
    cols_to_drop = [col for col in df_cleaned.columns if df_cleaned[col].notna().sum() < 10]
    dropped_cols_info = pd.DataFrame({
        "column": cols_to_drop,
        "non_null_count": [df_cleaned[c].notna().sum() for c in cols_to_drop],
        "drop_reason": "Fewer than 10 non-null entries"
    })
    df_cleaned = df_cleaned.drop(columns=cols_to_drop)

    # --- ✅ Sort/group by permanent diagnosis ---
    df_cleaned = df_cleaned.sort_values(by="permanent").reset_index(drop=True)

    # --- Save Cleaned Master CSV ---
    cleaned_file = os.path.join(save_path, f"{filename_prefix}_cleaned_master.csv")
    df_cleaned.to_csv(cleaned_file, index=False)
    print(f"✅ Cleaned data saved (grouped by diagnosis): {cleaned_file}")

    # --- Save dropped info CSV ---
    dropped_cols_info["row_index"] = "N/A"
    rows_missing_dx["column"] = "N/A"
    rows_missing_dx["non_null_count"] = "N/A"
    dropped_info_combined = pd.concat([dropped_cols_info, rows_missing_dx], ignore_index=True)
    dropped_file = os.path.join(save_path, f"{filename_prefix}_dropped_info.csv")
    dropped_info_combined.to_csv(dropped_file, index=False)
    print(f"✅ Dropped info saved: {dropped_file}")

    return {
        "original_shape": original_shape,
        "cleaned_shape": df_cleaned.shape,
        "columns_dropped": len(cols_to_drop),
        "rows_dropped": rows_before - df_cleaned.shape[0]
    }

# ✅ 4. Example Run for ConvNeXt (change filename_prefix for others)
data_path = "/Users/joi263/Documents/MultimodalTabData/data/OG_data_csv/convnext.csv"
save_path = "/Users/joi263/Documents/MultimodalTabData/data"

df_convnext = pd.read_csv(data_path)
summary = clean_and_standardize_all(df_convnext, "convnext", save_path=save_path)
print(summary)


Current working directory: /Users/joi263/Documents/MultimodalTabData/data
✅ Cleaned data saved (grouped by diagnosis): /Users/joi263/Documents/MultimodalTabData/data/convnext_cleaned_master.csv
✅ Dropped info saved: /Users/joi263/Documents/MultimodalTabData/data/convnext_dropped_info.csv
{'original_shape': (522, 407), 'cleaned_shape': (510, 241), 'columns_dropped': 166, 'rows_dropped': 12}


In [18]:
import pandas as pd

# Load your cleaned master CSV
df_cleaned = pd.read_csv("/Users/joi263/Documents/MultimodalTabData/data/convnext_cleaned_master2.csv")

# Check the number of columns
num_columns = df_cleaned.shape[1]
print(f"✅ The cleaned master CSV has {num_columns} columns.")


✅ The cleaned master CSV has 218 columns.


In [17]:
import pandas as pd

df = pd.read_csv("/Users/joi263/Documents/MultimodalTabData/data/convnext_cleaned_master.csv", sep=None, engine="python")
print(df.shape)


ParserError: 'e' expected after '"'

In [5]:
def generate_diagnosis_counts(df, filename_prefix, save_path="./"):
    diagnosis_counts = (
        df.groupby("permanent")
        .size()
        .reset_index(name="row_count")
        .sort_values(by="row_count", ascending=False)
    )
    counts_file = os.path.join(save_path, f"{filename_prefix}_diagnosis_counts.csv")
    diagnosis_counts.to_csv(counts_file, index=False)
    print(f"✅ Diagnosis counts saved: {counts_file}")
    return diagnosis_counts

# ✅ Example Run (after cleaning)
df_cleaned = pd.read_csv("/Users/joi263/Documents/MultimodalTabData/data/convnext_cleaned_master.csv")
diagnosis_counts = generate_diagnosis_counts(df_cleaned, "convnext", save_path="/Users/joi263/Documents/MultimodalTabData/data")
diagnosis_counts.head(10)


✅ Diagnosis counts saved: /Users/joi263/Documents/MultimodalTabData/data/convnext_diagnosis_counts.csv


Unnamed: 0,permanent,row_count
86,"Glioblastoma, CNS WHO grade 4",70
84,Glioblastoma WHO Grade IV,40
180,Pituitary adenoma,26
80,Glioblastoma,20
131,Meningioma,19
141,Metastatic carcinoma,16
126,Lymphoma,7
201,"Recurrent/residual glioblastoma, CNS WHO grade 4",6
147,Metastatic melanoma,6
55,Diffuse large B-cell lymphoma,6


In [7]:
def generate_feature_completeness_per_dx(df, filename_prefix, save_path="./"):
    # Calculate completeness per feature for each diagnosis group
    feature_completeness_per_dx = (
        df.groupby("permanent")
        .apply(lambda group: group.notna().sum() / len(group) * 100)
    )

    # Convert index (permanent) into a proper column without duplication
    feature_completeness_per_dx = feature_completeness_per_dx.copy()
    feature_completeness_per_dx["diagnosis"] = feature_completeness_per_dx.index
    feature_completeness_per_dx = feature_completeness_per_dx.reset_index(drop=True)

    # Melt to long format for easy sorting
    feature_completeness_per_dx = feature_completeness_per_dx.melt(
        id_vars=["diagnosis"], var_name="feature", value_name="percent_complete"
    ).sort_values(["diagnosis", "percent_complete"], ascending=[True, False])

    # Save to CSV
    completeness_file = os.path.join(save_path, f"{filename_prefix}_feature_completeness_per_diagnosis.csv")
    feature_completeness_per_dx.to_csv(completeness_file, index=False)
    print(f"✅ Feature completeness per diagnosis saved: {completeness_file}")

    return feature_completeness_per_dx

# ✅ Example Run (after cleaning)
feature_completeness = generate_feature_completeness_per_dx(
    df_cleaned,
    "convnext",
    save_path="/Users/joi263/Documents/MultimodalTabData/data"
)
feature_completeness.head(10)


✅ Feature completeness per diagnosis saved: /Users/joi263/Documents/MultimodalTabData/data/convnext_feature_completeness_per_diagnosis.csv


  .apply(lambda group: group.notna().sum() / len(group) * 100)


Unnamed: 0,diagnosis,feature,percent_complete
0,- Adenocarcinoma consistent with metastases fr...,case_number,100.0
220,- Adenocarcinoma consistent with metastases fr...,institution_x,100.0
440,- Adenocarcinoma consistent with metastases fr...,organ,100.0
660,- Adenocarcinoma consistent with metastases fr...,p19q_report,100.0
1320,- Adenocarcinoma consistent with metastases fr...,mgmt_pyro,100.0
2860,- Adenocarcinoma consistent with metastases fr...,oncomine_focus,100.0
7040,- Adenocarcinoma consistent with metastases fr...,genomic_tests_complete,100.0
7260,- Adenocarcinoma consistent with metastases fr...,ala_case,100.0
7700,- Adenocarcinoma consistent with metastases fr...,frozen,100.0
8140,- Adenocarcinoma consistent with metastases fr...,time_frozen_diagnosis,100.0
