# Import libraries

In [2]:
import numpy as np
import pandas as pd
from scipy.stats import truncnorm

# Generate baseline systolic blood pressure data

In [3]:
# Seed NumPy's global random generator so the np.random.* draws below are reproducible
np.random.seed(42)

# Number of distinct patients to generate (passed to generate_baseline_data);
# the duplicate rows that function appends come on top of this count
N = 300

# Generate column 'age' values
age_lower = 40  ## Youngest age that may be drawn
age_upper = 75  ## Oldest age that may be drawn
age_mean = 60   ## Mean of the underlying (untruncated) normal distribution
age_std = 10    ## Standard deviation of the underlying normal distribution

def generate_age(n, mean=age_mean, std=age_std, low=age_lower, high=age_upper):
    """Draw ``n`` integer ages from a normal distribution truncated to [low, high].

    Parameters
    ----------
    n : int
        Number of ages to draw.
    mean, std : float
        Location and scale of the normal distribution before truncation.
    low, high : float
        Lower and upper truncation limits, in years.

    Returns
    -------
    numpy.ndarray
        One-dimensional integer array of length ``n``. The fractional part of
        each draw is discarded by ``astype(int)`` (not rounded).
    """
    # truncnorm takes its limits in standard-deviation units relative to
    # loc/scale. Both must be plain scalars: the original had a stray trailing
    # comma that silently turned the lower limit into a 1-tuple.
    lower_trunc = (low - mean) / std
    upper_trunc = (high - mean) / std
    age = truncnorm.rvs(lower_trunc, upper_trunc, loc=mean, scale=std, size=n).astype(int)
    return age

# Generate column 'sex' values
## Raw labels are deliberately inconsistent spellings; None and '' stand in for missing values
sex_labels = ['M', 'male', 'm', 'F', 'female', 'FEMALE', None, '']

## Weights follow sex_labels order: the six spelled-out labels are scaled to 98%,
## and the two missing-value labels receive 1% each (2% missing in total)
probs_under_55 = np.concatenate((
    0.98 * np.array([0.30, 0.20, 0.05, 0.15, 0.05, 0.25]),  ## Male/Female mix for age < 55
    [0.01, 0.01],
))
probs_over_55 = np.concatenate((
    0.98 * np.array([0.20, 0.20, 0.05, 0.50, 0.025, 0.025]),  ## Male/Female mix for age >= 55
    [0.01, 0.01],
))

def generate_sex(age):
    """Return one raw sex label drawn with the weights of the patient's age band.

    Ages below 55 use ``probs_under_55``; every other age (55 included) uses
    ``probs_over_55``. The result is one entry of ``sex_labels``, so it can be
    ``None`` or an empty string.
    """
    weights = probs_under_55 if age < 55 else probs_over_55
    return np.random.choice(sex_labels, p=weights)

# Generate column 'ethnicity' values
## None stands in for a missing ethnicity
ethnicity_labels = ['White', 'Black', 'Hispanic', 'Asian', 'Other', None]

## Named groups are scaled to 97% of the weight; the remaining 3% goes to the missing label
ethnicity_probs = np.concatenate((
    0.97 * np.array([0.5, 0.3, 0.15, 0.03, 0.02]),  ## Ethnicity distribution
    [0.03],
))

def generate_ethnicity():
    """Return one entry of ``ethnicity_labels`` drawn with ``ethnicity_probs`` (may be ``None``)."""
    return np.random.choice(ethnicity_labels, p=ethnicity_probs)

# Generate column 'baseline_sbp'
baseline_sbp_labels = ['Normal', 'Outlier']
baseline_sbp_prob = np.array([0.95, 0.05]) ## 95% regular readings, 5% outlier readings

def generate_baseline_sbp():
    """Return one simulated baseline systolic blood pressure as a rounded whole number.

    A label is drawn first. 'Normal' readings come from a normal distribution
    with mean 150 and standard deviation 10. 'Outlier' readings draw one value
    around 190 and one around 110 (standard deviation 5 each) and keep one of
    the two at random.
    """
    reading_kind = np.random.choice(baseline_sbp_labels, p = baseline_sbp_prob)

    if reading_kind != 'Outlier':
        ## Regular case: sample from the hypertension blood pressure distribution
        return round(np.random.normal(loc = 150, scale = 10))

    ## Outlier case: both candidates are always drawn (high first, then low)
    ## so the sequence of random numbers consumed stays the same
    high_reading = np.random.normal(190, 5)
    low_reading = np.random.normal(110, 5)
    return round(np.random.choice([high_reading, low_reading]))

# Generate full dataset
def generate_baseline_data(n, n_duplicates=5):
    """Build the synthetic baseline table, including deliberate duplicate rows.

    Parameters
    ----------
    n : int
        Number of distinct patients to generate. IDs run from ``P1000`` upward.
    n_duplicates : int, optional
        Number of patients whose row is repeated once (default 5). The value
        is clamped to the number of generated patients, so a small ``n`` no
        longer makes the without-replacement draw raise ``ValueError``.

    Returns
    -------
    pandas.DataFrame
        Columns ``patient_id``, ``age``, ``sex``, ``ethnicity`` and
        ``baseline_sbp``; rows are shuffled and the index is reset.
    """
    ages = generate_age(n)

    # One row per patient. The list is evaluated left to right, so the random
    # draws happen in the order sex -> ethnicity -> baseline_sbp for each patient.
    data = [
        [f"P{1000+i}", age, generate_sex(age), generate_ethnicity(), generate_baseline_sbp()]
        for i, age in enumerate(ages)
    ]

    df = pd.DataFrame(data, columns=['patient_id', 'age', 'sex', 'ethnicity', 'baseline_sbp'])

    ## Introduce a few duplicate rows (same patient ID)
    # Sampling without replacement cannot return more IDs than exist
    n_dup = max(0, min(n_duplicates, len(df)))
    if n_dup:
        duplicate_ids = np.random.choice(df['patient_id'], size=n_dup, replace=False)
        duplicates = df[df['patient_id'].isin(duplicate_ids)]
        df = pd.concat([df, duplicates], ignore_index=True)

    # Shuffle rows so the duplicates are not grouped at the end
    df = df.sample(frac=1).reset_index(drop=True)

    return df

# Create the dataset: N distinct patients plus the duplicated rows that
# generate_baseline_data appends, returned in shuffled order
baseline_df = generate_baseline_data(N)

In [5]:
baseline_df

Unnamed: 0,patient_id,age,sex,ethnicity,baseline_sbp
0,P1275,59,male,White,149
1,P1272,68,F,Other,158
2,P1197,69,F,Hispanic,155
3,P1164,47,F,White,153
4,P1095,59,F,Black,146
...,...,...,...,...,...
300,P1118,69,F,Hispanic,158
301,P1221,61,F,Black,147
302,P1031,50,FEMALE,,145
303,P1121,69,male,White,154


In [7]:
# Save to CSV
## baseline_df.to_csv('baseline_sbp_data.csv', index=False)

# Generate post-experiment systolic blood pressure data

In [32]:
# Read the patient table; it must already contain 'variant' and 'baseline_sbp' columns
df = pd.read_csv("variants_data.csv")

# Fix the random state so the simulated columns are reproducible
np.random.seed(42)

# --- Side effects -----------------------------------------------------------
# Everyone starts without a side effect
df["side_effect"] = "No"

# Only patients in the treatment arm can be flagged
treatment_mask = df["variant"] == "Treatment"
treatment_indices = df.index[treatment_mask]
n_treatment = len(treatment_indices)

# Flag 2% of the treatment arm (count truncated to a whole number), sampled without replacement
n_side_effect = int(0.02 * n_treatment)
side_effect_indices = np.random.choice(treatment_indices, size=n_side_effect, replace=False)
df.loc[side_effect_indices, "side_effect"] = "Yes"

# --- Post-experiment SBP ----------------------------------------------------
# Control arm: baseline plus zero-centred noise (standard deviation 3)
control_mask = df["variant"] == "Control"
n_control = control_mask.sum()
control_noise = np.random.normal(loc=0, scale=3, size=n_control)
df.loc[control_mask, "post_sbp"] = df.loc[control_mask, "baseline_sbp"] + control_noise

# Treatment arm: baseline plus a shift averaging -10 (standard deviation 5)
treatment_shift = np.random.normal(loc=-10, scale=5, size=n_treatment)
df.loc[treatment_mask, "post_sbp"] = df.loc[treatment_mask, "baseline_sbp"] + treatment_shift

# Store 'post_sbp' as whole numbers
df["post_sbp"] = df["post_sbp"].round().astype(int)

# post_df is another name for the same DataFrame object, not a copy
post_df = df

In [34]:
post_df

Unnamed: 0,patient_id,age,age_group,sex,ethnicity,baseline_sbp,variant,side_effect,post_sbp
0,P1275,59,50s,Male,White,149,Control,No,153
1,P1272,68,60s,Female,Other,158,Treatment,No,152
2,P1197,69,60s,Female,Hispanic,155,Treatment,No,152
3,P1164,47,40s,Female,White,153,Control,No,157
4,P1095,59,50s,Female,Black,146,Treatment,No,129
...,...,...,...,...,...,...,...,...,...
253,P1267,64,60s,Male,White,137,Control,No,133
254,P1118,69,60s,Female,Hispanic,158,Control,No,158
255,P1221,61,60s,Female,Black,147,Control,No,148
256,P1121,69,60s,Male,White,154,Treatment,No,144


In [36]:
# Save to CSV
## post_df.to_csv("post_sbp_data.csv", index=False)