1. Repeat the same cleaning procedure for the other 4 datasets
- Used the updated version Jack sent (with demographics)
- Cleaned outputs are saved in each dataset's respective folder

In [2]:
import pandas as pd
import re
import os

In [4]:
# Cleaning all the datasets

# ✅ 1. Helper function: standardize ALL text entries
def standardize_text(value):
    """Normalize one cell value so equal text compares equal.

    Non-string values (numbers, None/NaN, list-likes, ...) are returned
    unchanged. Strings are cleaned in this order: non-breaking spaces become
    regular spaces, runs of whitespace collapse to one space, the text is
    trimmed and lowercased, and a leading run of hyphens / en-dashes is removed.

    Args:
        value: A single cell value taken from a DataFrame column.

    Returns:
        The normalized string for string input; otherwise `value` as given.
    """
    # Checking the type first (instead of calling pd.isna) means list-like
    # cells cannot trigger an ambiguous-truth-value error.
    if not isinstance(value, str):
        return value

    value = value.replace("\xa0", " ")          # replace non-breaking spaces
    value = re.sub(r"\s+", " ", value).strip()  # collapse multiple spaces & trim
    value = value.lower()                       # lowercase everything
    # The en-dash is written as an escape so the character class cannot be
    # corrupted by a file-encoding mix-up (which would strip letters like "â").
    value = re.sub("^[-\u2013]+", "", value).strip()  # remove leading dashes or en-dashes
    return value

# ✅ 2. Cleaning & standardizing function
def clean_and_standardize_all(df, filename_prefix, save_path="./", min_non_null=10):
    """Clean one dataset, write the cleaned and audit CSVs, and return a summary.

    Steps, in order:
      1. Standardize every text column with `standardize_text`.
      2. Drop rows whose `permanent` value is missing or blank.
      3. Drop columns with fewer than `min_non_null` non-null values
         (`permanent` itself is always kept, since it is the sort key).
      4. Drop columns whose name contains "Unnamed" or is blank.
      5. Sort the remaining rows by `permanent` and reset the index.
      6. Write `<prefix>_cleaned_master.csv` and `<prefix>_dropped_info.csv`
         into `save_path` (created if missing).

    The input DataFrame is not modified; all work happens on a copy.

    Args:
        df: Raw dataset; must contain a `permanent` column.
        filename_prefix: Prefix used for both output file names.
        save_path: Directory that receives the two CSV files.
        min_non_null: Minimum number of non-null values a column needs in
            order to be kept. Defaults to 10.

    Returns:
        dict with keys `original_shape`, `cleaned_shape`, `columns_dropped`
        (sparse + unnamed columns) and `rows_dropped`.

    Raises:
        KeyError: If `df` has no `permanent` column.
    """
    if "permanent" not in df.columns:
        raise KeyError("'permanent' column is required but missing from the input data")

    original_shape = df.shape

    # Work on a copy so the caller's DataFrame is never mutated.
    df = df.copy()

    # --- Standardize text in all text columns ---
    # Both checks are needed: `object` is the classic text dtype, while newer
    # pandas versions may load text as a dedicated string dtype.
    for col in df.columns:
        column = df[col]
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            df[col] = column.apply(standardize_text)

    # --- Drop rows with no permanent diagnosis (NaN or empty) ---
    rows_before = df.shape[0]
    diagnosis = df["permanent"]
    # astype(str) keeps the blank check valid even if the column is not text;
    # the notna() term already excludes genuinely missing values.
    has_dx = diagnosis.notna() & (diagnosis.astype(str).str.strip() != "")
    df_cleaned = df[has_dx]

    # --- Track dropped rows, remembering where they sat in the input ---
    rows_missing_dx = df[~has_dx].copy()
    rows_missing_dx["drop_reason"] = "No permanent diagnosis"
    rows_missing_dx["row_index"] = rows_missing_dx.index

    # --- Drop columns with too few non-null entries ---
    non_null_counts = df_cleaned.notna().sum()
    cols_to_drop = [
        col for col in df_cleaned.columns
        # Never drop the diagnosis column: it is needed for sorting below,
        # even when only a handful of rows survive.
        if col != "permanent" and non_null_counts[col] < min_non_null
    ]
    dropped_records = [
        {
            "column": col,
            "non_null_count": int(non_null_counts[col]),
            "drop_reason": f"Fewer than {min_non_null} non-null entries",
        }
        for col in cols_to_drop
    ]
    df_cleaned = df_cleaned.drop(columns=cols_to_drop)

    # --- Explicitly drop any unnamed/blank columns (just in case) ---
    # str() guards against non-string column labels.
    unnamed_cols = [
        col for col in df_cleaned.columns
        if "Unnamed" in str(col) or str(col).strip() == ""
    ]
    # Record these too, so the audit file agrees with `columns_dropped`.
    dropped_records.extend(
        {
            "column": col,
            "non_null_count": int(non_null_counts[col]),
            "drop_reason": "Unnamed or blank column name",
        }
        for col in unnamed_cols
    )
    df_cleaned = df_cleaned.drop(columns=unnamed_cols)

    dropped_cols_info = pd.DataFrame(
        dropped_records, columns=["column", "non_null_count", "drop_reason"]
    )

    # --- Sort/group by permanent diagnosis ---
    # A stable sort keeps the input row order within each diagnosis.
    df_cleaned = (
        df_cleaned.sort_values(by="permanent", kind="mergesort").reset_index(drop=True)
    )

    # --- Save Cleaned Master CSV ---
    if save_path:
        os.makedirs(save_path, exist_ok=True)  # create folder if not exists
    cleaned_file = os.path.join(save_path, f"{filename_prefix}_cleaned_master.csv")
    df_cleaned.to_csv(cleaned_file, index=False)
    print(f"âœ… Cleaned data saved (grouped by diagnosis): {cleaned_file}")

    # --- Save Dropped Info CSV ---
    # Column records and row records share one file; fields that do not apply
    # to a record type are filled with "N/A".
    dropped_cols_info["row_index"] = "N/A"
    rows_missing_dx["column"] = "N/A"
    rows_missing_dx["non_null_count"] = "N/A"
    dropped_info_combined = pd.concat([dropped_cols_info, rows_missing_dx], ignore_index=True)
    dropped_file = os.path.join(save_path, f"{filename_prefix}_dropped_info.csv")
    dropped_info_combined.to_csv(dropped_file, index=False)
    print(f"âœ… Dropped info saved: {dropped_file}")

    return {
        "original_shape": original_shape,
        "cleaned_shape": df_cleaned.shape,
        "columns_dropped": len(cols_to_drop) + len(unnamed_cols),
        "rows_dropped": rows_before - df_cleaned.shape[0]
    }

# ✅ 3. Batch-clean the 4 datasets
# Single root for every input and output location, so relocating the project
# means editing one line instead of several path literals.
data_root = "/Users/joi263/Documents/MultimodalTabData/data"

# Folder holding the raw CSVs to be cleaned.
data_folder = os.path.join(data_root, "OG_data_csv")

# Dataset label -> raw CSV file name inside `data_folder`.
datasets = {
    "efficientnet": "efficientnet_new.csv",
    "imagenet_resnet50": "imagenet_resnet50_new.csv",
    "pretrained_resnet50": "pretrained_resnet50_new.csv",
    "vit_base": "vit_base_new.csv"
}

# Summary dict returned by the cleaning function, keyed by dataset label.
summaries = {}

for name, filename in datasets.items():
    print(f"\nðŸ”„ Cleaning {name}...")

    # Define dataset-specific save folder (e.g., efficientnet_data)
    save_folder = os.path.join(data_root, f"{name}_data")

    # Load and clean
    df_raw = pd.read_csv(os.path.join(data_folder, filename))
    summaries[name] = clean_and_standardize_all(df_raw, name, save_path=save_folder)

# Report the real number of datasets so the message stays correct if the
# `datasets` mapping is edited.
print(f"\nâœ… All {len(datasets)} datasets cleaned!")
print(summaries)



ðŸ”„ Cleaning efficientnet...
âœ… Cleaned data saved (grouped by diagnosis): /Users/joi263/Documents/MultimodalTabData/data/efficientnet_data/efficientnet_cleaned_master.csv
âœ… Dropped info saved: /Users/joi263/Documents/MultimodalTabData/data/efficientnet_data/efficientnet_dropped_info.csv

ðŸ”„ Cleaning imagenet_resnet50...
âœ… Cleaned data saved (grouped by diagnosis): /Users/joi263/Documents/MultimodalTabData/data/imagenet_resnet50_data/imagenet_resnet50_cleaned_master.csv
âœ… Dropped info saved: /Users/joi263/Documents/MultimodalTabData/data/imagenet_resnet50_data/imagenet_resnet50_dropped_info.csv

ðŸ”„ Cleaning pretrained_resnet50...
âœ… Cleaned data saved (grouped by diagnosis): /Users/joi263/Documents/MultimodalTabData/data/pretrained_resnet50_data/pretrained_resnet50_cleaned_master.csv
âœ… Dropped info saved: /Users/joi263/Documents/MultimodalTabData/data/pretrained_resnet50_data/pretrained_resnet50_dropped_info.csv

ðŸ”„ Cleaning vit_base...
âœ… Cleaned data saved (grouped

In [None]:
# Report how many columns each cleaned master CSV contains BEFORE the manual column drop.

# Cleaned master file for each dataset, keyed by dataset label.
cleaned_files = {
    "efficientnet": "/Users/joi263/Documents/MultimodalTabData/data/efficientnet_data/efficientnet_cleaned_master.csv",
    "imagenet_resnet50": "/Users/joi263/Documents/MultimodalTabData/data/imagenet_resnet50_data/imagenet_resnet50_cleaned_master.csv",
    "pretrained_resnet50": "/Users/joi263/Documents/MultimodalTabData/data/pretrained_resnet50_data/pretrained_resnet50_cleaned_master.csv",
    "vit_base": "/Users/joi263/Documents/MultimodalTabData/data/vit_base_data/vit_base_cleaned_master.csv"
}

print("âœ… Column Counts per Cleaned Master CSV:")
for name in cleaned_files:
    path = cleaned_files[name]
    df = pd.read_csv(path)
    # The number of column labels equals the second entry of df.shape.
    print(f"{name}: {len(df.columns)} columns")


âœ… Column Counts per Cleaned Master CSV:
efficientnet: 244 columns
imagenet_resnet50: 244 columns
pretrained_resnet50: 244 columns
vit_base: 244 columns


In [6]:
# Report how many columns each cleaned master CSV contains AFTER the manual column drop.

# Cleaned master file for each dataset, keyed by dataset label.
cleaned_files = {
    "efficientnet": "/Users/joi263/Documents/MultimodalTabData/data/efficientnet_data/efficientnet_cleaned_master.csv",
    "imagenet_resnet50": "/Users/joi263/Documents/MultimodalTabData/data/imagenet_resnet50_data/imagenet_resnet50_cleaned_master.csv",
    "pretrained_resnet50": "/Users/joi263/Documents/MultimodalTabData/data/pretrained_resnet50_data/pretrained_resnet50_cleaned_master.csv",
    "vit_base": "/Users/joi263/Documents/MultimodalTabData/data/vit_base_data/vit_base_cleaned_master.csv"
}

print("âœ… Column Counts per Cleaned Master CSV:")
for name in cleaned_files:
    path = cleaned_files[name]
    df = pd.read_csv(path)
    # The size of the column index is the column count.
    print(f"{name}: {df.columns.size} columns")


âœ… Column Counts per Cleaned Master CSV:
efficientnet: 228 columns
imagenet_resnet50: 228 columns
pretrained_resnet50: 228 columns
vit_base: 228 columns


- Each file started with 480 columns
- Dropped to 244 columns after the initial (automated) cleaning
- Dropped to 228 columns after manual cleaning (16 columns dropped manually)

2. Now generate diagnosis-count files for all 4 remaining datasets

In [7]:
# Generate per-diagnosis row counts for each cleaned master CSV.

# Dataset label -> absolute path of that dataset's cleaned master CSV.
# Each path points into the dataset's own "<label>_data" folder.
cleaned_files = {
    "efficientnet": "/Users/joi263/Documents/MultimodalTabData/data/efficientnet_data/efficientnet_cleaned_master.csv",
    "imagenet_resnet50": "/Users/joi263/Documents/MultimodalTabData/data/imagenet_resnet50_data/imagenet_resnet50_cleaned_master.csv",
    "pretrained_resnet50": "/Users/joi263/Documents/MultimodalTabData/data/pretrained_resnet50_data/pretrained_resnet50_cleaned_master.csv",
    "vit_base": "/Users/joi263/Documents/MultimodalTabData/data/vit_base_data/vit_base_cleaned_master.csv"
}

# ✅ Helper: generate diagnosis counts
def generate_diagnosis_counts(df, filename_prefix, save_path="./"):
    """Count rows per `permanent` diagnosis and save the table as a CSV.

    Rows whose `permanent` value is missing are not counted (the default
    behavior of `groupby`). The table is ordered by descending count, with
    ties broken alphabetically by diagnosis so the output is deterministic.

    Args:
        df: DataFrame containing a `permanent` column.
        filename_prefix: Prefix for the output file
            `<prefix>_diagnosis_counts.csv`.
        save_path: Directory that receives the CSV; created if missing.

    Returns:
        DataFrame with columns `permanent` and `row_count`.
    """
    diagnosis_counts = (
        df.groupby("permanent")
        .size()
        .reset_index(name="row_count")
        # The second sort key gives equal counts a fixed, reproducible order.
        .sort_values(by=["row_count", "permanent"], ascending=[False, True])
    )

    # Create the target folder, as the cleaning step does for its outputs, so
    # saving into a new directory does not fail. An empty path means the
    # current directory, which needs no creation.
    if save_path:
        os.makedirs(save_path, exist_ok=True)

    counts_file = os.path.join(save_path, f"{filename_prefix}_diagnosis_counts.csv")
    diagnosis_counts.to_csv(counts_file, index=False)
    print(f"âœ… Diagnosis counts saved: {counts_file}")
    return diagnosis_counts

# ✅ Batch run for all 4 datasets
# Diagnosis-count table for each dataset, keyed by dataset label.
all_counts = {}

for name in cleaned_files:
    path = cleaned_files[name]
    print(f"\nðŸ”„ Processing {name}...")
    df = pd.read_csv(path)
    # Write the counts next to the cleaned master they were computed from.
    save_path = os.path.dirname(path)
    all_counts[name] = generate_diagnosis_counts(
        df,
        filename_prefix=name,
        save_path=save_path,
    )

print("\nâœ… Diagnosis counts generated for all 4 datasets!")



ðŸ”„ Processing efficientnet...
âœ… Diagnosis counts saved: /Users/joi263/Documents/MultimodalTabData/data/efficientnet_data/efficientnet_diagnosis_counts.csv

ðŸ”„ Processing imagenet_resnet50...
âœ… Diagnosis counts saved: /Users/joi263/Documents/MultimodalTabData/data/imagenet_resnet50_data/imagenet_resnet50_diagnosis_counts.csv

ðŸ”„ Processing pretrained_resnet50...
âœ… Diagnosis counts saved: /Users/joi263/Documents/MultimodalTabData/data/pretrained_resnet50_data/pretrained_resnet50_diagnosis_counts.csv

ðŸ”„ Processing vit_base...
âœ… Diagnosis counts saved: /Users/joi263/Documents/MultimodalTabData/data/vit_base_data/vit_base_diagnosis_counts.csv

âœ… Diagnosis counts generated for all 4 datasets!


In [8]:
# Keep only the rows whose permanent diagnosis mentions glioma, glioblastoma, or meningioma.

# Cleaned master file for each dataset, keyed by dataset label.
cleaned_files = {
    "efficientnet": "/Users/joi263/Documents/MultimodalTabData/data/efficientnet_data/efficientnet_cleaned_master.csv",
    "imagenet_resnet50": "/Users/joi263/Documents/MultimodalTabData/data/imagenet_resnet50_data/imagenet_resnet50_cleaned_master.csv",
    "pretrained_resnet50": "/Users/joi263/Documents/MultimodalTabData/data/pretrained_resnet50_data/pretrained_resnet50_cleaned_master.csv",
    "vit_base": "/Users/joi263/Documents/MultimodalTabData/data/vit_base_data/vit_base_cleaned_master.csv"
}

# Diagnosis substrings to keep, OR-ed into a single regex alternation:
# "glioma|glioblastoma|meningioma".
keywords = ["glioma", "glioblastoma", "meningioma"]
pattern = "|".join(keywords)

for name in cleaned_files:
    path = cleaned_files[name]
    print(f"\nðŸ”„ Filtering {name} for {keywords}...")

    df = pd.read_csv(path)
    # Case-insensitive substring match; rows with a missing diagnosis are
    # treated as non-matching (na=False).
    df_filtered = df.loc[df["permanent"].str.contains(pattern, case=False, na=False)]

    # The filtered file is written beside the cleaned master it came from.
    save_path = os.path.join(os.path.dirname(path), f"{name}_omas_only.csv")
    df_filtered.to_csv(save_path, index=False)

    print(f"âœ… Filtered CSV saved: {save_path}")
    print(f"âœ… Shape: {df_filtered.shape} (rows, columns)")

print("\nâœ… Filtering complete for all 4 datasets!")



ðŸ”„ Filtering efficientnet for ['glioma', 'glioblastoma', 'meningioma']...
âœ… Filtered CSV saved: /Users/joi263/Documents/MultimodalTabData/data/efficientnet_data/efficientnet_omas_only.csv
âœ… Shape: (273, 228) (rows, columns)

ðŸ”„ Filtering imagenet_resnet50 for ['glioma', 'glioblastoma', 'meningioma']...
âœ… Filtered CSV saved: /Users/joi263/Documents/MultimodalTabData/data/imagenet_resnet50_data/imagenet_resnet50_omas_only.csv
âœ… Shape: (273, 228) (rows, columns)

ðŸ”„ Filtering pretrained_resnet50 for ['glioma', 'glioblastoma', 'meningioma']...
âœ… Filtered CSV saved: /Users/joi263/Documents/MultimodalTabData/data/pretrained_resnet50_data/pretrained_resnet50_omas_only.csv
âœ… Shape: (273, 228) (rows, columns)

ðŸ”„ Filtering vit_base for ['glioma', 'glioblastoma', 'meningioma']...
âœ… Filtered CSV saved: /Users/joi263/Documents/MultimodalTabData/data/vit_base_data/vit_base_omas_only.csv
âœ… Shape: (273, 228) (rows, columns)

âœ… Filtering complete for all 4 datasets!
