In [1]:
import os
import numpy as np
import pandas as pd
import statsmodels.api as sm
import math

In [None]:
## Function to create a cohort of pregnant women, some of whom are exposed
## The function creates a simulated cohort and three different periods during which the exposure effect applies

def create_simulation_exposure(range_weeks = [22, 37],
                               frequency_exposure = 0.02,
                               duration_effect_1 = 2,
                               induction_period_1 = 0,
                               duration_effect_2 = 6,
                               induction_period_2 = 0,
                               duration_effect_3 = 16,
                               induction_period_3 = 0,
                               number_pts = 10000):
    """Simulate a cohort and the weeks during which an exposure effect applies.

    For every patient, exposure is drawn independently for each week in
    ``range_weeks[0] .. range_weeks[1] - 1`` with probability
    ``frequency_exposure``. For patients with at least one exposed week, three
    effect windows are derived from the first exposed week:
    ``first_effect_k = first_exposure + induction_period_k`` and
    ``end_effect_k = first_effect_k + duration_effect_k``.

    Parameters
    ----------
    range_weeks : sequence of two ints
        Start (inclusive) and end (exclusive) week of the exposure window.
    frequency_exposure : float
        Per-week probability of being exposed.
    duration_effect_1, duration_effect_2, duration_effect_3 : number
        Weeks added to ``first_effect_k`` to obtain ``end_effect_k``.
    induction_period_1, induction_period_2, induction_period_3 : number
        Weeks added to the first exposure week to obtain ``first_effect_k``.
    number_pts : int
        Number of simulated patients; ids run from 1 to ``number_pts``.

    Returns
    -------
    pandas.DataFrame
        One row per patient (index 0..n-1) with columns ``id``, ``exposed``,
        ``first_exposure``, ``first_effect_k`` / ``end_effect_k`` for
        k = 1..3 (NaN for unexposed patients), followed by the constant
        ``duration_effect_k`` / ``induction_period_k`` columns.
    """
    window_start = range_weeks[0]
    size_longitudinal = len(np.arange(range_weeks[0], range_weeks[1]))

    # (induction period, duration) of each effect assumption, keyed by its suffix.
    effect_settings = {
        1: (induction_period_1, duration_effect_1),
        2: (induction_period_2, duration_effect_2),
        3: (induction_period_3, duration_effect_3),
    }

    columns = ['id', 'exposed', 'first_exposure']
    for effect_number in effect_settings:
        columns.append('first_effect_%d' % effect_number)
        columns.append('end_effect_%d' % effect_number)

    # Collect plain records and build the frame once: concatenating inside the
    # loop copies the whole frame on every iteration.
    records = []
    for current_id in np.arange(1, number_pts + 1):

        # One draw per patient, so results for a given seed do not change.
        current_exposure = np.random.choice(a=[0, 1],
                                            size=size_longitudinal,
                                            p=[(1 - frequency_exposure), frequency_exposure])
        exposed_positions = np.flatnonzero(current_exposure)

        if exposed_positions.size > 0:
            exposed = 1
            # Positions are offsets from the start of the window, so convert
            # using the window start rather than a fixed week number.
            first_exposure = exposed_positions[0] + window_start
        else:
            exposed = 0
            first_exposure = np.nan

        record = {'id': current_id,
                  'exposed': exposed,
                  'first_exposure': first_exposure}

        # NaN propagates through the additions for unexposed patients.
        for effect_number, (induction_period, duration_effect) in effect_settings.items():
            first_effect = first_exposure + induction_period
            record['first_effect_%d' % effect_number] = first_effect
            record['end_effect_%d' % effect_number] = first_effect + duration_effect

        records.append(record)

    # Passing the columns explicitly keeps the schema when number_pts is 0.
    Data_Sim = pd.DataFrame(records, columns=columns)

    Data_Sim['duration_effect_1'] = duration_effect_1
    Data_Sim['induction_period_1'] = induction_period_1
    Data_Sim['duration_effect_2'] = duration_effect_2
    Data_Sim['induction_period_2'] = induction_period_2
    Data_Sim['duration_effect_3'] = duration_effect_3
    Data_Sim['induction_period_3'] = induction_period_3

    return Data_Sim

In [None]:
## Generate a dataset that simulates preterm birth based on the baseline risk and the effect of exposure
## "risk_per_week_data" is a dataset with the baseline risk of preterm birth in each gestational week (columns "week" and "pret_risk_baseline")
## "exposure_data" is generated by the function "create_simulation_exposure"
## "multiplicative_effect" is the factor by which exposure multiplies the probability of preterm birth in a gestational week
## "effect" specifies which of the three effect-period assumptions should be used (see function "create_simulation_exposure" above)

def generate_pret_data(exposure_data,
                       risk_per_week_data,
                       multiplicative_effect = 5,
                       first_week = 22,
                       effect = 1):
    """Simulate the week each pregnancy ends, given baseline risk and exposure.

    Each patient is followed week by week from ``first_week`` through week 36.
    In every week a preterm birth is drawn with the baseline risk for that
    week, multiplied by ``multiplicative_effect`` while the week lies inside
    the patient's effect window. Follow-up stops at the first preterm birth,
    or at week 36 without one.

    Parameters
    ----------
    exposure_data : pandas.DataFrame
        Needs columns ``id``, ``exposed``, ``first_exposure``,
        ``first_effect_<effect>``, ``end_effect_<effect>``,
        ``duration_effect_<effect>`` and ``induction_period_<effect>``.
        If an id occurs more than once, its first row is used.
    risk_per_week_data : pandas.DataFrame
        Needs columns ``week`` and ``pret_risk_baseline``; the first row
        matching a week supplies that week's risk.
    multiplicative_effect : number
        Factor applied to the weekly risk inside the effect window.
    first_week : int
        First gestational week of follow-up.
    effect : int
        Suffix selecting which effect-window columns of ``exposure_data`` to use.

    Returns
    -------
    pandas.DataFrame
        One row per patient, ordered by id, with columns ``id``, ``exposed``,
        ``end_pregnancy_week``, ``first_exposure``, ``current_preterm``,
        ``duration_effect`` and ``induction_period``.

    Raises
    ------
    IndexError
        If ``risk_per_week_data`` has no row for a follow-up week, or
        ``exposure_data`` is empty.
    ValueError
        If a weekly risk (after applying the effect) is not within [0, 1].
    """
    term_week = 37               # follow-up covers first_week .. term_week - 1
    last_week = term_week - 1

    first_effect_var = 'first_effect_' + str(effect)
    end_effect_var = 'end_effect_' + str(effect)

    # Baseline risks do not depend on the patient, so look them up once.
    follow_up_weeks = np.arange(first_week, term_week)
    baseline_risks = [
        risk_per_week_data.loc[(risk_per_week_data['week'] == week), 'pret_risk_baseline'].values[0]
        for week in follow_up_weeks
    ]

    # One row per id (first occurrence), processed in ascending id order.
    patients = exposure_data.drop_duplicates(subset='id').sort_values('id')

    columns = ['id', 'exposed', 'end_pregnancy_week', 'first_exposure', 'current_preterm']
    records = []

    patient_rows = zip(patients['id'],
                       patients['exposed'],
                       patients['first_exposure'],
                       patients[first_effect_var],
                       patients[end_effect_var])

    for current_id, exposed_current, current_first_exposure, current_first_effect, current_end_effect in patient_rows:

        for current_week, baseline_risk in zip(follow_up_weeks, baseline_risks):

            current_risk = baseline_risk
            # Comparisons against NaN are False, so unexposed patients keep
            # the baseline risk. Both bounds are inclusive.
            # NOTE(review): with end = first + duration this window spans
            # duration + 1 weeks; confirm that is intended.
            if (current_week >= current_first_effect) and (current_week <= current_end_effect):
                current_risk = current_risk * multiplicative_effect

            if not (0 <= current_risk <= 1):
                raise ValueError(
                    'Preterm risk %r in week %s for id %s is not a probability in [0, 1]; '
                    'check pret_risk_baseline and multiplicative_effect.'
                    % (current_risk, current_week, current_id))

            # Same draw as before, one per patient-week, so seeded runs reproduce.
            current_preterm = np.random.choice(a = [0, 1], size = 1, p = [(1 - current_risk), current_risk])[0]

            # Record the outcome at the first preterm birth, or at the last
            # follow-up week if none occurred.
            if current_preterm == 1 or current_week == last_week:
                records.append({'id': current_id,
                                'exposed': exposed_current,
                                'end_pregnancy_week': current_week,
                                'first_exposure': current_first_exposure,
                                'current_preterm': current_preterm})
                break

    Overall_Data = pd.DataFrame(records, columns=columns)

    duration_effect_var = 'duration_effect_' + str(effect)
    induction_period_var = 'induction_period_' + str(effect)

    duration_effect = exposure_data[duration_effect_var].unique()[0]
    induction_period = exposure_data[induction_period_var].unique()[0]

    Overall_Data['duration_effect'] = duration_effect
    Overall_Data['induction_period'] = induction_period

    Overall_Data = Overall_Data.reset_index(drop = True)

    return Overall_Data