In [5]:
#Dependencies
import pandas as pd
import numpy as np

# Seed NumPy's global RNG so every run reproduces the same synthetic sample.
np.random.seed(51)

# Number of synthetic respondents (rows) to generate.
n = 50

# --- Socioeconomic status (SES) ---------------------------------------------
# Marginal SES distribution; change the weights to match your target
# population. Keeping label and weight in one mapping keeps them in lockstep.
ses_distribution = {'Low': 0.3, 'Middle': 0.5, 'High': 0.2}
ses_choices = list(ses_distribution)
ses_probs = list(ses_distribution.values())
ses = np.random.choice(ses_choices, size=n, p=ses_probs)

# --- Smoking status, conditional on SES -------------------------------------
# P(smoker | SES)
p_smoke_given_ses = {
    'Low': 0.35,
    'Middle': 0.20,
    'High': 0.10,
}

# 'Former' is listed as a category but always given probability 0.0 below,
# so it is never actually drawn.
smoking_labels = ['Yes', 'No', 'Former']

# One independent draw per respondent, using that respondent's SES level to
# pick the smoker probability.
smoking = np.array([
    np.random.choice(
        smoking_labels,
        p=[p_smoke_given_ses[level], 1 - p_smoke_given_ses[level], 0.0],
    )
    for level in ses
])

# --- Sexual orientation -------------------------------------------------------
# Marginal distribution; labels and weights are positionally paired.
sex_orient_choices = ['Heterosexual', 'Homosexual', 'Bisexual', 'Asexual', 'Other']
sex_orient_probs = [0.75, 0.07, 0.10, 0.03, 0.05]
sex_orient = np.random.choice(sex_orient_choices, size=n, p=sex_orient_probs)

# --- Political ideology, conditional on sexual orientation --------------------
# Possible ideologies; each probability row below follows this order.
ideology_choices = ['Conservative', 'Liberal', 'Moderate', 'Apolitical']

# P(ideology | sexual orientation); every row sums to 1.0.
orientation_to_ideology_probs = {
    'Heterosexual': [0.45, 0.35, 0.15, 0.05],
    'Homosexual':  [0.05, 0.75, 0.15, 0.05],
    'Bisexual':    [0.10, 0.70, 0.15, 0.05],
    'Asexual':     [0.05, 0.10, 0.20, 0.65],
    'Other':       [0.20, 0.40, 0.25, 0.15]
}

# One draw per respondent, using the probability row that matches that
# respondent's sampled orientation.
political_ideology = np.array([
    np.random.choice(ideology_choices, p=orientation_to_ideology_probs[orientation])
    for orientation in sex_orient
])

# --- Employment status and age (age depends on employment) --------------------
# Marginal distribution of employment status.
employment_choices = ['Employed', 'Unemployed', 'Student', 'Retired', 'Homemaker']
employment_probs = [0.63, 0.05, 0.17, 0.13, 0.02]

# Age bounds per employment status as (first age, stop); the stop value is
# exclusive, exactly like range().
age_bounds = {
    'Employed': (18, 63),
    'Unemployed': (18, 63),
    'Student': (18, 23),
    'Retired': (62, 91),
    'Homemaker': (18, 91),
}

# Candidate ages for each status, expanded into explicit lists. Despite the
# name, these are pools to draw from uniformly, not probabilities.
p_age_given_employment = {
    status: list(range(first, stop)) for status, (first, stop) in age_bounds.items()
}

# Draw an employment status for every respondent.
employment = np.random.choice(employment_choices, size=n, p=employment_probs)

# Fill in ages group by group: everyone sharing a status gets a uniform draw
# (with replacement) from that status's age pool.
ages = np.zeros(n, dtype=int)
for status in employment_choices:
    in_group = employment == status
    group_size = in_group.sum()
    ages[in_group] = np.random.choice(
        p_age_given_employment[status], size=group_size, replace=True
    )

# Optional lookup: employment status -> list of ages sampled for that status.
employment_age_dict = {}
for status in employment_choices:
    employment_age_dict[status] = ages[employment == status].tolist()

# HAM-D severity band for each respondent, drawn independently of every other
# variable. Keys are the band labels, values are the sampling weights.
ham_d_distribution = {
    '0–6: Normal / No Depression': 0.45,
    '7–17: Mild Depression': 0.30,
    '18–24: Moderate Depression': 0.15,
    '25+: Severe Depression': 0.10,
}
HAM_D = np.random.choice(
    list(ham_d_distribution),
    size=n,
    p=list(ham_d_distribution.values()),
)

# Synthetic distributions using psychosocial and clinically relevant factors.
# Every variable here is drawn independently of the others; categorical draws
# that pass no `p` are uniform over the listed labels.
# NOTE(review): the label 'None' appears blank in the CSV preview further down,
# presumably because it is parsed as a missing value on read-back; confirm.
yes_no = ['Yes', 'No']

# Normal(mean 27, sd 5), rounded to one decimal place.
bmis = np.round(np.random.normal(loc=27, scale=5, size=n), 1)
family_health_history = np.random.choice(
    ['Diabetes', 'Hypertension', 'None', 'Asthma', 'Heart Disease','Cancer'],
    size=n,
)
# NOTE(review): 'Scizophrenia' is reproduced exactly as spelled in the data.
current_diagnosis = np.random.choice(
    ['None', 'Anxiety', 'Depression', 'Bipolar', 'Scizophrenia'],
    size=n,
)
race = np.random.choice(
    ['White', 'Black', 'Asian', 'Hispanic', 'Native American', 'Other'],
    size=n,
)
gender = np.random.choice(['Male', 'Female', 'Non-binary', 'Other'], size=n)
# Poisson counts with mean 3.
weekly_alcohol_intake = np.random.poisson(lam=3, size=n)
sad_fatigued = np.random.choice(yes_no, size=n)
loss_interest = np.random.choice(yes_no, size=n)
# Normal(mean 7, sd 1.5), rounded to one decimal place.
hours_sleep = np.round(np.random.normal(loc=7, scale=1.5, size=n), 1)
#HAM-D
#


# Nonsensical variables: filler attributes, each drawn independently and
# (unless `p` is given) uniformly over its labels.
yes_no = ['Yes', 'No']

likes_chocolate = np.random.choice(yes_no, size=n)
pet_preference = np.random.choice(
    ['Dog', 'Cat', 'Bird', 'Fish', 'Guinea Pig', 'Other' ],
    size=n,
)
drinks_coffee = np.random.choice(yes_no, size=n)
favourite_color = np.random.choice(
    ['Blue', 'Green','Yellow','Red','Orange','Purple'],
    size=n,
)
favourite_season = np.random.choice(['Fall','Winter','Spring','Summer'], size=n)
favourite_music_genre = np.random.choice(['Pop','HipHop','Country','Indie'], size=n)
favourite_movie_genre = np.random.choice(
    ['Comedy','Romance','Thriller','Indie', 'Foreign', 'Action', 'Science Fiction'],
    size=n,
)
favourite_cuisine = np.random.choice(
    ['Thai','Mexican','American','Indian','Chinese','Italian','Mediterranean'],
    size=n,
)
orders_takeout = np.random.choice(['Daily', 'Weekly', 'Monthly', 'Never'], size=n)
spotify_vs_applemusic = np.random.choice(['Spotify', 'Apple Music'], size=n)
preferred_grocery_store = np.random.choice(
    ['Wegmans', 'Giant', 'Whole Foods', 'Harris Teeter', 'Kroger'],
    size=n,
)
favourite_super_store = np.random.choice(['Walmart', 'Target',], size=n)
morning_or_night = np.random.choice(['Morning', 'Night'], size=n)
birthday_month = np.random.choice(
    [
        'January', 'February','March','April','May','June',
        'July','August','September','October','November','December',
    ],
    size=n,
)
favourite_meal = np.random.choice(['Breakfast', 'Lunch','Dinner'], size=n)
likes_art = np.random.choice(yes_no, size=n)
favourite_sport = np.random.choice(
    ['Basketball', 'Soccer', 'Hockey', 'Gymnastics', 'None'],
    size=n,
)
religious = np.random.choice(yes_no, size=n)
exercises = np.random.choice(yes_no, size=n)
# The only weighted draw in this section: 20% 'No', 80% 'Yes'.
likes_kids = np.random.choice(['No', 'Yes'], size=n, p=[0.2, 0.8])
travels = np.random.choice(yes_no, size=n)
likes_vegetables = np.random.choice(yes_no, size=n)
phone = np.random.choice(['Apple', 'Android'], size=n)
has_drivers_liscense = np.random.choice(yes_no, size=n)
can_ride_bicyle = np.random.choice(yes_no, size=n)
best_highschool_subject = np.random.choice(
    ['Math', 'Science', 'English','History'],
    size=n,
)
skips_breakfast = np.random.choice(yes_no, size=n)
bilingual = np.random.choice(yes_no, size=n)
binge_watches_tv = np.random.choice(yes_no, size=n)
has_cheated_on_an_assignment_before = np.random.choice(yes_no, size=n)
relationship_status = np.random.choice(['Dating', 'Single'], size=n)
academic_preference = np.random.choice(['STEM', 'Humanities'], size=n)
reads_novels = np.random.choice(yes_no, size=n)
# Normal(mean 44, sd 1), rounded to one decimal place. Source for the mean:
# https://www.cdc.gov/nutrition/php/data-research/fast-facts-water-consumption.html
water_intake_oz = np.round(np.random.normal(loc=44, scale=1, size=n), 1)


# Physical/genetic traits, each drawn independently of the others.
eye_color = np.random.choice(['Brown', 'Hazel', 'Blue', 'Green'], size=n)
hair_color = np.random.choice(['Brown', 'Black', 'Blonde', 'Red'], size=n)
# randint's upper bound is exclusive, so heights fall in 58..74 inches.
# TODO: check this range (flagged in the original code).
height_in_inches_rounded = np.random.randint(low=58, high=75, size=n)
# Weighted draw: 10% left-handed, 90% right-handed.
L_R_handed = np.random.choice(['Left', 'Right'], size=n, p=[0.1, 0.9])
wears_glasses = np.random.choice(['Yes', 'No'], size=n)
# NOTE(review): 'Penecillin' is reproduced exactly as spelled in the data.
allergy = np.random.choice(
    [
        'None', 'Pollen','Tree Nuts', 'Peanut', 'Gluten', 'Dust',
        'Animal Dander', 'Mold', 'Milk/Eggs','Shellfish','Penecillin',
    ],
    size=n,
)


# Altruistic behaviours: each is an independent, uniform draw from either a
# three-level frequency scale or a yes/no answer.
frequency_levels = ['Always', 'Sometimes', 'Never']
yes_no = ['Yes', 'No']

reclycling_habits = np.random.choice(frequency_levels, size=n)
donates_to_charity_yearly = np.random.choice(frequency_levels, size=n)
registered_organ_donor = np.random.choice(yes_no, size=n)
volunteers = np.random.choice(frequency_levels, size=n)
uses_public_transportation = np.random.choice(frequency_levels, size=n)
donated_blood = np.random.choice(yes_no, size=n)
picks_up_litter = np.random.choice(frequency_levels, size=n)
helps_elderly = np.random.choice(frequency_levels, size=n)
votes = np.random.choice(frequency_levels, size=n)





# Combine every sampled variable into one DataFrame, one row per respondent.
# Column order follows the insertion order of this dict. Each key must appear
# only once: a repeated key in a dict literal silently overwrites the earlier
# entry (the original listed 'Likes_Chocolate' twice).
df = pd.DataFrame({
    # Demographic and clinical variables
    'Age': ages,
    'BMI': bmis,
    'Family_Health_History': family_health_history,
    'Current_Diagnosis': current_diagnosis,
    'Race': race,
    'Gender': gender,
    'Sexual_Orientation': sex_orient,
    'Weekly_Alcohol_Intake': weekly_alcohol_intake,
    'Smoking': smoking,
    'HAM-D': HAM_D,
    'Been_Sad_or_Fatigued': sad_fatigued,
    'Loss_of_Interest': loss_interest,
    'Hours_of_Sleep': hours_sleep,
    'Employment': employment,
    'Socioeconomic_Status': ses,
    'Political_Ideology': political_ideology,
    # Preferences, habits and other filler variables
    'Eye_Color': eye_color,
    'Likes_Chocolate': likes_chocolate,
    'Pet_Preference': pet_preference,
    'Recycling_Habits': reclycling_habits,
    'Drinks_Coffee': drinks_coffee,
    'Annual_Charity_Donation': donates_to_charity_yearly,
    'Favourite_Color': favourite_color,
    'Favourite_Season': favourite_season,
    'Favourite_Music_Genre': favourite_music_genre,
    'Favourite_Movie_Genre': favourite_movie_genre,
    'Favourite_Cuisine': favourite_cuisine,
    'Orders_Takeout': orders_takeout,
    'Spotify_vs_AppleMusic': spotify_vs_applemusic,
    'Preferred_Grocery_Store': preferred_grocery_store,
    'Favourite_Super_Store': favourite_super_store,
    'Morning_or_Night': morning_or_night,
    'Birthday_Month': birthday_month,
    'Favourite_Meal': favourite_meal,
    'Likes_Art': likes_art,
    'Favourite_Sport': favourite_sport,
    'Religious': religious,
    'Exercises': exercises,
    'Likes_Kids': likes_kids,
    'Travels': travels,
    'Likes_Vegetables': likes_vegetables,
    'Phone': phone,
    'Has_Drivers_License': has_drivers_liscense,
    'Can_Ride_Bicycle': can_ride_bicyle,
    'Best_Highschool_Subject': best_highschool_subject,
    'Skips_Breakfast': skips_breakfast,
    'Bilingual': bilingual,
    'Binge_Watches_TV': binge_watches_tv,
    'Has_Cheated_on_an_Assignment_Before': has_cheated_on_an_assignment_before,
    'Relationship_Status': relationship_status,
    'Academic_Preference': academic_preference,
    'Reads_Novels': reads_novels,
    'Water_Intake_oz': water_intake_oz,
    # Physical traits
    'Hair_Color': hair_color,
    'Height_in_Inches': height_in_inches_rounded,
    'Handedness': L_R_handed,
    'Wears_Glasses': wears_glasses,
    'Allergy': allergy,
    # Altruistic behaviours
    'Registered_Organ_Donor': registered_organ_donor,
    'Volunteers': volunteers,
    'Uses_Public_Transportation': uses_public_transportation,
    'Donated_Blood': donated_blood,
    'Picks_Up_Litter': picks_up_litter,
    'Helps_Elderly': helps_elderly,
    'Votes': votes,
})

# Write the dataset to disk in the current working directory.
# NOTE(review): with the default index=True the row index is written as an
# unnamed first column, which comes back as 'Unnamed: 0' when the file is
# re-read. Pass index=False if that column is not wanted downstream.
df.to_csv("synthetic_data1.csv")


Unnamed: 0,Age,BMI,Family_Health_History,Current_Diagnosis,Race,Gender,Sexual_Orientation,Weekly_Alcohol_Intake,Smoking,HAM-D,...,Handedness,Wears_Glasses,Allergy,Registered_Organ_Donor,Volunteers,Uses_Public_Transportation,Donated_Blood,Picks_Up_Litter,Helps_Elderly,Votes
0,37,20.3,Heart Disease,Scizophrenia,Asian,Female,Homosexual,3,Yes,25+: Severe Depression,...,Right,Yes,Gluten,No,Sometimes,Never,Yes,Sometimes,Sometimes,Never
1,22,23.8,Cancer,Anxiety,Other,Non-binary,Heterosexual,7,Yes,25+: Severe Depression,...,Right,Yes,Gluten,No,Sometimes,Never,No,Sometimes,Always,Sometimes
2,40,34.6,Cancer,Bipolar,White,Non-binary,Bisexual,4,No,7–17: Mild Depression,...,Right,Yes,,Yes,Sometimes,Sometimes,No,Never,Always,Always
3,64,29.9,Asthma,Depression,Native American,Male,Heterosexual,2,No,18–24: Moderate Depression,...,Right,Yes,Animal Dander,Yes,Sometimes,Sometimes,Yes,Never,Never,Always
4,40,16.7,Hypertension,Depression,Asian,Other,Asexual,3,Yes,18–24: Moderate Depression,...,Right,No,Mold,No,Sometimes,Sometimes,No,Sometimes,Never,Always
5,61,25.8,Asthma,,Black,Female,Heterosexual,1,No,7–17: Mild Depression,...,Right,Yes,Dust,Yes,Never,Sometimes,No,Sometimes,Sometimes,Sometimes
6,36,31.3,Asthma,Anxiety,White,Male,Heterosexual,5,No,0–6: Normal / No Depression,...,Right,No,Pollen,No,Always,Always,Yes,Always,Always,Sometimes
7,18,24.5,Asthma,Scizophrenia,Native American,Male,Heterosexual,0,Yes,18–24: Moderate Depression,...,Right,No,Peanut,Yes,Never,Never,Yes,Always,Always,Sometimes
8,29,25.9,,Anxiety,Asian,Male,Heterosexual,5,No,0–6: Normal / No Depression,...,Right,Yes,Animal Dander,Yes,Always,Sometimes,Yes,Never,Never,Sometimes
9,35,26.8,,Bipolar,Asian,Male,Heterosexual,4,No,7–17: Mild Depression,...,Right,No,Shellfish,Yes,Sometimes,Never,No,Always,Never,Never
