In [3]:
import pickle

import numpy as np
import pandas as pd

# Read the synthetic dataset and, in the same expression, relabel the
# "Unnamed: 0" column (the label pandas assigns to a header-less CSV column)
# as the patient identifier.
df = pd.read_csv("synthetic_data1.csv").rename(
    columns={"Unnamed: 0": "Patient_ID"}
)

# ---------------------------------------------------------------------------
# Build a family of nested feature sets: the baseline columns alone, then the
# baseline plus 5, 10, ..., 50 "irrelevant" columns. The irrelevant columns
# are shuffled once with a fixed seed, so every larger set is a superset of
# the smaller ones and the result is reproducible.
# ---------------------------------------------------------------------------

# Experiment configuration (kept together so the sanity check below always
# matches the loop bounds).
N_BASELINE_COLS = 16  # leading columns of `df` treated as the baseline features
STEP_SIZE = 5         # irrelevant columns added per perturbation level
N_STEPS = 10          # number of perturbation levels (5, 10, ..., 50)
SEED = 42             # seed for the one-off column shuffle

# Define baseline and irrelevant features.
# NOTE: the split is positional, so it relies on the CSV column order.
baseline_cols = list(df.columns[:N_BASELINE_COLS])
irrelevant_cols = list(df.columns[N_BASELINE_COLS:])

# Sanity check: the largest perturbation must not run out of columns.
max_added = STEP_SIZE * N_STEPS
assert len(irrelevant_cols) >= max_added, (
    f"Not enough irrelevant features: need {max_added}, "
    f"found {len(irrelevant_cols)}"
)

# Reproducibility
rng = np.random.default_rng(seed=SEED)

# Randomize irrelevant columns once; each level takes a prefix of this order.
shuffled_irrelevant = rng.permutation(irrelevant_cols)

# Store dataframes, keyed by "baseline" / "baseline_plus_<n>".
# `.copy()` makes every entry an independent frame, so editing one later does
# not raise pandas' SettingWithCopyWarning.
dfs = {"baseline": df[baseline_cols].copy()}

# Incremental perturbations
for k in range(1, N_STEPS + 1):  # 1 → N_STEPS
    n_added = STEP_SIZE * k
    # .tolist() turns the numpy string scalars back into plain Python str labels
    added_cols = shuffled_irrelevant[:n_added].tolist()
    df_name = f"baseline_plus_{n_added}"
    dfs[df_name] = df[baseline_cols + added_cols].copy()

# Example access:
# dfs["baseline"]
# dfs["baseline_plus_5"]
# dfs["baseline_plus_50"]


In [9]:
# List the label of every stored feature-set variant, in insertion order.
for variant_label in dfs.keys():
    print(variant_label)

baseline
baseline_plus_5
baseline_plus_10
baseline_plus_15
baseline_plus_20
baseline_plus_25
baseline_plus_30
baseline_plus_35
baseline_plus_40
baseline_plus_45
baseline_plus_50


In [10]:
# Preview the first five rows of the baseline-only feature set.
dfs["baseline"].head(n=5)


Unnamed: 0,Patient_ID,Age,BMI,Family_Health_History,Current_Diagnosis,Race,Gender,Sexual_Orientation,Weekly_Alcohol_Intake,Smoking,HAM-D,Been_Sad_or_Fatigued,Loss_of_Interest,Hours_of_Sleep,Employment,Socioeconomic_Status
0,0,37,20.3,Heart Disease,Scizophrenia,Asian,Female,Homosexual,3,Yes,25+: Severe Depression,No,Yes,6.3,Employed,Middle
1,1,22,23.8,Cancer,Anxiety,Other,Non-binary,Heterosexual,7,Yes,25+: Severe Depression,No,No,6.3,Student,Low
2,2,40,34.6,Cancer,Bipolar,White,Non-binary,Bisexual,4,No,7–17: Mild Depression,No,Yes,8.9,Employed,Middle
3,3,64,29.9,Asthma,Depression,Native American,Male,Heterosexual,2,No,18–24: Moderate Depression,No,Yes,6.4,Retired,Middle
4,4,40,16.7,Hypertension,Depression,Asian,Other,Asexual,3,Yes,18–24: Moderate Depression,Yes,Yes,8.0,Employed,Low


In [11]:
# Preview the first five rows of the baseline + 5 irrelevant columns variant.
dfs["baseline_plus_5"].head(n=5)

Unnamed: 0,Patient_ID,Age,BMI,Family_Health_History,Current_Diagnosis,Race,Gender,Sexual_Orientation,Weekly_Alcohol_Intake,Smoking,...,Been_Sad_or_Fatigued,Loss_of_Interest,Hours_of_Sleep,Employment,Socioeconomic_Status,Drinks_Coffee,Likes_Vegetables,Favourite_Sport,Votes,Height_in_Inches
0,0,37,20.3,Heart Disease,Scizophrenia,Asian,Female,Homosexual,3,Yes,...,No,Yes,6.3,Employed,Middle,No,Yes,Basketball,Never,71
1,1,22,23.8,Cancer,Anxiety,Other,Non-binary,Heterosexual,7,Yes,...,No,No,6.3,Student,Low,No,Yes,Hockey,Sometimes,59
2,2,40,34.6,Cancer,Bipolar,White,Non-binary,Bisexual,4,No,...,No,Yes,8.9,Employed,Middle,Yes,No,,Always,70
3,3,64,29.9,Asthma,Depression,Native American,Male,Heterosexual,2,No,...,No,Yes,6.4,Retired,Middle,Yes,Yes,Soccer,Always,65
4,4,40,16.7,Hypertension,Depression,Asian,Other,Asexual,3,Yes,...,Yes,Yes,8.0,Employed,Low,Yes,No,Soccer,Always,71


In [12]:
# Preview the first five rows of the baseline + 10 irrelevant columns variant.
dfs["baseline_plus_10"].head(n=5)

Unnamed: 0,Patient_ID,Age,BMI,Family_Health_History,Current_Diagnosis,Race,Gender,Sexual_Orientation,Weekly_Alcohol_Intake,Smoking,...,Drinks_Coffee,Likes_Vegetables,Favourite_Sport,Votes,Height_in_Inches,Has_Drivers_License,Picks_Up_Litter,Uses_Public_Transportation,Donated_Blood,Birthday_Month
0,0,37,20.3,Heart Disease,Scizophrenia,Asian,Female,Homosexual,3,Yes,...,No,Yes,Basketball,Never,71,Yes,Sometimes,Never,Yes,June
1,1,22,23.8,Cancer,Anxiety,Other,Non-binary,Heterosexual,7,Yes,...,No,Yes,Hockey,Sometimes,59,No,Sometimes,Never,No,May
2,2,40,34.6,Cancer,Bipolar,White,Non-binary,Bisexual,4,No,...,Yes,No,,Always,70,Yes,Never,Sometimes,No,November
3,3,64,29.9,Asthma,Depression,Native American,Male,Heterosexual,2,No,...,Yes,Yes,Soccer,Always,65,Yes,Never,Sometimes,Yes,March
4,4,40,16.7,Hypertension,Depression,Asian,Other,Asexual,3,Yes,...,Yes,No,Soccer,Always,71,No,Sometimes,Sometimes,No,February


In [4]:
# Persist the whole dict of feature-set variants as a single pickle file.
# The path is relative, so the file lands in the current working directory
# (normally the notebook's folder).
# Only load this file back from a trusted location: unpickling can execute
# arbitrary code.
with open("dfs.pkl", "wb") as f:
    pickle.dump(dfs, f)
