# In [3]:  (Jupyter notebook cell marker)
import pandas as pd
import numpy as np

# Define the diseases, symptoms, and other relevant data.
# Each record is laid out in the column order listed in `_disease_columns`.
# (You can replace these with your own symptoms list)
_disease_columns = ['Disease', 'Prevalence_USA', 'Percent Male', 'Mean Age', 'Age Variance', 'Symptoms']
_disease_records = [
    ('Cold', 0.116, 0.50, 30, 15,
     ['Sneezing', 'Runny Nose', 'Itchy Eyes', 'Cough', 'Mild Sore Throat']),
    ('Flu', 0.058, 0.50, 30, 20,
     ['Fever', 'Cough', 'Sore Throat', 'Chills', 'Muscle Aches']),
    ('Asthma', 0.046, 0.50, 20, 15,
     ['Wheezing', 'Shortness of Breath', 'Chest Tightness', 'Trouble Sleeping', 'Coughing or Whistling Sound']),
    ('Hypertension', 0.191, 0.50, 50, 10,
     ['Headache', 'Shortness of Breath', 'Nosebleeds', 'Flushing', 'Dizziness']),
    ('Heart Disease', 0.069, 0.55, 60, 10,
     ['Chest Pain', 'Shortness of Breath', 'Fatigue', 'Swelling in Legs, Ankles, and Feet', 'Irregular Heartbeat']),
    ('Diabetes', 0.058, 0.52, 50, 10,
     ['Increased Thirst', 'Frequent Urination', 'Unexplained Weight Loss', 'Fatigue', 'Blurred Vision']),
    ('Anxiety', 0.110, 0.37, 30, 10,
     ['Excessive Worry', 'Restlessness', 'Fatigue', 'Irritability', 'Tense Muscles']),
    ('Depression', 0.098, 0.40, 30, 10,
     ['Depressed Mood', 'Loss of Interest', 'Fatigue', 'Sleep Problems', 'Changes in Appetite']),
    ('Allergies', 0.173, 0.50, 20, 15,
     ['Sneezing', 'Runny Nose', 'Itchy Eyes', 'Shortness of Breath', 'Cough']),
    ('Breast Cancer', 0.006, 0.01, 60, 10,
     ['Lump in the Breast', 'Change in Breast Size', 'Change in Breast Shape', 'Skin Dimpling', 'Inverted Nipple']),
    ('Prostate Cancer', 0.006, 1.00, 65, 10,
     ['Trouble Urinating', 'Blood in Semen', 'Discomfort in the Pelvic Area', 'Erectile Dysfunction', 'Bone Pain']),
    ('Colon Cancer', 0.006, 0.50, 60, 10,
     ['Abdominal Pain', 'Blood in Stool', 'Change in Bowel Habits', 'Fatigue', 'Unintended Weight Loss']),
    ('IBD', 0.006, 0.50, 30, 10,
     ['Abdominal Pain', 'Diarrhea', 'Fatigue', 'Reduced Appetite', 'Fever']),
    ('Osteoarthritis', 0.058, 0.40, 60, 10,
     ['Joint Pain', 'Stiffness', 'Loss of Flexibility', 'Bone Tenderness', 'Swelling']),
    ('Cystic Fibrosis', 0.0001, 0.50, 20, 10,
     ['Persistent Coughing', 'Frequent Lung Infections', 'Wheezing', 'Fatigue', 'Poor Growth']),
    ('Tuberculosis', 0.0001, 0.50, 40, 20,
     ['Coughing', 'Chest Pain', 'Weight Loss', 'Fatigue', 'Night Sweats']),
    ('Scleroderma', 0.0002, 0.20, 50, 10,
     ['Hardened Skin', 'Swollen Fingers', 'Heartburn', 'Difficulty Swallowing', 'Joint Pain']),
]

# Transpose the records into a column-name -> list-of-values mapping
data = {
    column: list(values)
    for column, values in zip(_disease_columns, zip(*_disease_records))
}

disease_symptoms_info = pd.DataFrame(data)

# Calculate the scaled prevalence (each prevalence divided by the column total, so the weights sum to 1)
_prevalence = disease_symptoms_info['Prevalence_USA']
disease_symptoms_info['Scaled Prevalence'] = _prevalence / _prevalence.sum()

# Save the disease_symptoms_info DataFrame to a .csv file
disease_symptoms_info.to_csv('disease_symptoms_info.csv', index=False)

# Define common symptoms to add as noise
common_symptoms = ['Headache', 'Backache', 'Dizziness', 'Fatigue', 'Nausea']


# Define function to add noise to symptoms
def add_noise(symptoms):
    """Randomly perturb a list of symptoms in place and return it.

    One of three actions is drawn with probabilities 0.33 / 0.33 / 0.34:
    'add' appends one entry from ``common_symptoms``, 'remove' deletes one
    randomly chosen entry, and 'both' does the addition followed by the
    removal (so the removal may hit the entry that was just added).

    Args:
        symptoms: Mutable list of symptom strings. It is modified in place,
            so pass a copy if the original must be kept.

    Returns:
        The same list object that was passed in, after the perturbation.
    """
    choice = str(np.random.choice(['add', 'remove', 'both'], p=[0.33, 0.33, 0.34]))

    if choice in ('add', 'both'):
        # Only offer common symptoms that are not already listed, so the
        # result never contains the same symptom twice.
        candidates = [s for s in common_symptoms if s not in symptoms]
        if candidates:
            # np.random.choice returns a NumPy string scalar; convert it to a
            # plain str so the list prints as ordinary strings when serialized.
            symptoms.append(str(np.random.choice(candidates)))

    # Never remove the last remaining symptom.
    if choice in ('remove', 'both') and len(symptoms) > 1:
        # Pick a position rather than a value; choice(n) draws from range(n).
        symptoms.pop(int(np.random.choice(len(symptoms))))

    return symptoms

# Define function to generate a search query
def generate_search_query(symptoms):
    """Build a natural-language search query from a list of symptoms.

    A sentence template is picked at random and filled with the symptoms
    joined as "a, b and c". A single symptom is used as-is, and an empty
    list leaves the symptom slot empty.

    Args:
        symptoms: Sequence of symptom strings.

    Returns:
        The query as a plain ``str``.
    """
    pattern = str(np.random.choice([
        'I am feeling {}. What could be the cause?',
        'What could be the cause of {}?',
        'What does it mean if I have {}?',
        'Why am I experiencing {}?',
    ]))

    if len(symptoms) > 1:
        symptoms_str = ', '.join(symptoms[:-1]) + ' and ' + symptoms[-1]
    else:
        # With one symptom there is nothing to join; previously this case
        # dropped the symptom and produced an empty slot in the query.
        symptoms_str = ''.join(symptoms)

    return pattern.format(symptoms_str)

# Define the raters
raters = ['Yiwei', 'Josue', 'Olivia', 'Matias', 'Jane']


def _simulate_patient(patient_id):
    """Draw one simulated patient as [id, age, sex, disease, noisy symptoms].

    The random draws happen in a fixed order: disease, sex, age, symptom noise.
    """
    # Pick a disease, weighted by its scaled prevalence, and fetch its row
    disease = np.random.choice(
        disease_symptoms_info['Disease'],
        p=disease_symptoms_info['Scaled Prevalence'],
    )
    is_match = disease_symptoms_info['Disease'] == disease
    disease_row = disease_symptoms_info[is_match].iloc[0]

    # Pick the sex according to the disease's % male column
    male_share = disease_row['Percent Male']
    sex = np.random.choice(['Male', 'Female'], p=[male_share, 1 - male_share])

    # Pick the age from a normal distribution (std = sqrt of the variance column),
    # redrawing until the value is not negative
    mean_age = disease_row['Mean Age']
    age_std = np.sqrt(disease_row['Age Variance'])
    age = np.random.normal(mean_age, age_std)
    while age < 0:
        age = np.random.normal(mean_age, age_std)

    # Work on a copy so the reference symptom list is not altered by the noise
    noisy_symptoms = add_noise(disease_row['Symptoms'].copy())

    return [patient_id, int(age), sex, disease, noisy_symptoms]


# Generate 300 patient observations, with IDs starting at 1
patients = [_simulate_patient(patient_id) for patient_id in range(1, 301)]

# Creating a DataFrame from the patient data
patient_data = pd.DataFrame(
    patients,
    columns=['Patient ID', 'Age', 'Sex', 'Disease', 'Symptoms'],
)

# Generate search queries, one per patient in row order
patient_data['Search Query'] = [
    generate_search_query(symptom_list) for symptom_list in patient_data['Symptoms']
]

# Save the DataFrame to a .csv file
patient_data.to_csv('simulated_patient_data_with_adjusted_noise_and_id.csv', index=False)

# Drop the 'Disease' column to blind the raters to the diagnosis
patient_data_blinded = patient_data.drop(columns=['Disease'])

# Assign records evenly among the raters by cycling through the rater list
patient_data_blinded['Rater'] = [
    raters[position % len(raters)] for position in range(len(patient_data_blinded))
]

# Save the blinded DataFrame to a .csv file
patient_data_blinded.to_csv('blinded_patient_data_with_adjusted_noise_and_id.csv', index=False)
