In [1]:
import numpy as np
import os
import pandas as pd
from pygformula import ParametricGformula
from pygformula.parametric_gformula.interventions import static
from pygformula.data import load_basicdata_nocomp
from IPython import get_ipython

### Toy example with real data

In [2]:
# Load the prepared cohort sample; the path is relative to the working directory.
cohort_csv = 'included_cohort_sample_prep.csv'
data = pd.read_csv(cohort_csv)

In [3]:
# Drop the 'Unnamed: 0' column (presumably an index saved with the CSV — confirm).
# Rebinding with errors='ignore' keeps the cell safe to re-run: the original
# in-place drop raised KeyError on a second execution because the column was gone.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [4]:
# Keep a reproducible 10% subsample (fixed seed).
# Note: this rebinds `data`, so running the cell again subsamples the
# already-reduced frame; re-run from the loading cell instead.
sampled_rows = data.sample(frac=0.1, random_state=1)
data = sampled_rows.copy()

In [5]:
# Encode sex as an integer: 'H' -> 0, 'D' -> 1.
# Guarded so the cell can be re-run: mapping an already-encoded column would
# silently turn every value into NaN, because 0/1 are not keys of the mapping.
sex_codes = {'H': 0, 'D': 1}
if data['sexe'].isin(list(sex_codes)).any():
    data['sexe'] = data['sexe'].map(sex_codes)

In [6]:
# Number of rows left after subsampling.
data.shape[0]

37495

In [7]:
# List the column names of the frame before it is reshaped to long format.
data.columns

Index(['NIA', 'sexe', 'data_naixement', 'abs_c', 'abs', 'pais_c',
       'N_vaccine_total', 'VACUNA_1_DATA', 'VACUNA_1_MOTIU', 'VACUNA_2_DATA',
       'VACUNA_2_MOTIU', 'VACUNA_3_DATA', 'VACUNA_3_MOTIU', 'VACUNA_1_DATA_pp',
       'VACUNA_2_DATA_pp', 'VACUNA_3_DATA_pp', 'DATA_DM_min', 'DM',
       'covid_bef_vax', 'test_date_covid_1', 'test_res_covid_1',
       'test_date_covid_2', 'test_res_covid_2', 'test_date_covid_3',
       'test_res_covid_3', 'test_date_imc_1', 'test_res_imc_1',
       'test_date_imc_2', 'test_res_imc_2', 'test_date_imc_3',
       'test_res_imc_3', 'test_date_sp_1', 'test_res_sp_1', 'test_date_sp_2',
       'test_res_sp_2', 'test_date_sp_3', 'test_res_sp_3', 'test_date_dp_1',
       'test_res_dp_1', 'test_date_dp_2', 'test_res_dp_2', 'test_date_dp_3',
       'test_res_dp_3', 'test_date_abdo_1', 'test_res_abdo_1',
       'test_date_abdo_2', 'test_res_abdo_2', 'test_date_abdo_3',
       'test_res_abdo_3', 'test_date_bg_1', 'test_res_bg_1', 'test_date_bg_2',
       

In [8]:
###### RESHAPE THE DATA FROM WIDE TO LONG FORMAT (PIVOT)

In [9]:
# Move the wave number to the end of each vaccine column name
# (e.g. VACUNA_1_DATA -> VACUNA_DATA_1) so every repeated column shares the
# "<stub><wave>" layout.
for wave in range(1, 4):
    print(wave)
    renamed = {
        'VACUNA_{}_DATA'.format(wave): 'VACUNA_DATA_{}'.format(wave),
        'VACUNA_{}_MOTIU'.format(wave): 'VACUNA_MOTIU_{}'.format(wave),
        'VACUNA_{}_DATA_pp'.format(wave): 'VACUNA_DATA_pp_{}'.format(wave),
    }
    data.rename(columns=renamed, inplace=True)

1
2
3


In [10]:
# Names of every wave-suffixed column, generated instead of typed out by hand.
# The order matches the hand-written list: vaccine date/reason per wave,
# per-protocol vaccine dates, then date/result pairs per test and wave, and
# finally the treatment indicator per wave.
waves = range(1, 4)
test_kinds = ['covid', 'imc', 'sp', 'dp', 'abdo', 'bg', 'chol',
              'smoking', 'sociostat', 'gma']

stubnames = (
    ['VACUNA_{}_{}'.format(field, wave)
     for wave in waves for field in ('DATA', 'MOTIU')]
    + ['VACUNA_DATA_pp_{}'.format(wave) for wave in waves]
    + ['test_{}_{}_{}'.format(part, kind, wave)
       for kind in test_kinds for wave in waves for part in ('date', 'res')]
    + ['Vacuna_{}'.format(wave) for wave in waves]
)


In [11]:
# Remove the trailing wave number from every name, keeping the stub
# (e.g. 'test_res_covid_1' -> 'test_res_covid_').
# rstrip only removes trailing digits, so re-running the cell is harmless; the
# original slice [0:-1] chopped one more character off on every execution.
stubnames = [name.rstrip('0123456789') for name in stubnames]

In [12]:
# Reshape from one row per subject to one row per subject and wave.
# dict.fromkeys removes the duplicated stubs while keeping their first-seen
# order; list(set(...)) gave a column order that changes between kernel
# sessions because string hashing is randomized.
unique_stubs = list(dict.fromkeys(stubnames))
data_piv = (
    pd.wide_to_long(data, stubnames=unique_stubs, i='NIA', j='time')
    .reset_index()  # bring NIA and time back as ordinary columns
)

In [13]:
# Remove underscore _ from treatment name

In [14]:
# The treatment stub keeps its trailing underscore after the reshape; rename it.
treatment_rename = {'Vacuna_': 'Vacuna'}
data_piv.rename(columns=treatment_rename, inplace=True)

In [15]:
# Make the time index start at 0.
# Subtracting the minimum instead of a fixed 1 gives the same result on the
# first run (waves are numbered from 1) and leaves the column untouched if the
# cell is executed again, where `time - 1` would have produced negative times.
data_piv['time'] = data_piv['time'] - data_piv['time'].min()

In [16]:
# Transform outcome for binary eof

In [17]:
# Blank the outcome on rows with time 1 or 2, leaving DM only on the remaining rows.
# np.nan replaces np.NaN, an alias that NumPy 2.0 removed.
# NOTE(review): with a 0-based time index this keeps DM at time 0 and removes
# it at the last time point. For an end-of-follow-up outcome the value is
# normally expected on the final row instead — confirm which rows should be kept.
rows_to_blank = data_piv['time'].isin([1, 2])
data_piv.loc[rows_to_blank, 'DM'] = np.nan

In [18]:
# Keep only the identifier, the time-varying covariates, the treatment, the
# outcome, the baseline covariates and the time index used by the model.
model_columns = [
    'NIA',
    'test_res_covid_',
    'test_res_gma_',
    'test_res_sociostat_',
    'Vacuna',
    'DM',
    'sexe',
    'pais_c',
    'time',
]
data_piv_final = data_piv[model_columns].copy()

In [19]:
# Outcome model GLM and covariate models ML
# NOTE(review): `ymodel_type` is defined below but is not passed to
# ParametricGformula in the fitting cell — confirm it has any effect.

time_name = 'time'
# NOTE: `id` shadows the Python builtin; the name is kept because the fitting
# cell passes it as `id = id`.
id = 'NIA'
# Number of time points: the time index is 0-based, so it is the largest value + 1.
# Cast to a plain Python int so np.zeros/np.ones and the library receive an
# ordinary integer rather than a NumPy scalar.
time_points = int(data_piv[time_name].max()) + 1

# Time-varying covariates; the treatment 'Vacuna' is modelled as the last one.
covnames = ['test_res_covid_', # binary (int)
            'test_res_gma_', # cont (float)
            'test_res_sociostat_', # categorical ordinal (int)
            'Vacuna']

# One type per entry of covnames, in the same order.
covtypes = ['unknown-binary',
            'unknown-continuous',
            'unknown-continuous',
            'unknown-binary']

trunc_params = ['NA', 'NA', 'NA', 'NA']

# One formula per covariate: its own first and second lag plus time.
covmodels = ['test_res_covid_ ~ lag1_test_res_covid_ + lag2_test_res_covid_ + time',
             'test_res_gma_ ~ lag1_test_res_gma_ + lag2_test_res_gma_ + time',
             'test_res_sociostat_ ~ lag1_test_res_sociostat_ + lag2_test_res_sociostat_ + time',
             'Vacuna ~ lag1_Vacuna + lag2_Vacuna + time']

# Time-fixed (baseline) covariates.
basecovs = ['pais_c', 'sexe'] 

outcome_name = 'DM'
# Outcome formula: baseline covariates, every covariate with its two lags, and time.
ymodel = 'DM ~ sexe + pais_c + test_res_covid_ + lag1_test_res_covid_ + lag2_test_res_covid_ + test_res_gma_ + lag1_test_res_gma_ + lag2_test_res_gma_\
              + test_res_sociostat_ + lag1_test_res_sociostat_ + lag2_test_res_sociostat_ + Vacuna + lag1_Vacuna + lag2_Vacuna + time'
ymodel_type='ML'
 
outcome_type = 'binary_eof'
int_descript = ['Never treat', 'Treat on Vacuna only at t1', 'Treat on Vacuna only at t1 & t2', 'Treat on Vacuna at t1, t2 & t3']

# Each intervention is [function, per-time values, time indices]; the third
# element presumably lists the times at which the value is imposed — confirm
# against the pygformula documentation. Order matches int_descript.
Intervention1_Vacuna = [static, np.zeros(time_points), [0, 1, 2]]
Intervention2_Vacuna = [static, np.ones(time_points), [0]]
Intervention3_Vacuna = [static, np.ones(time_points), [0, 1]]
Intervention4_Vacuna = [static, np.ones(time_points), [0, 1, 2]]

In [23]:
# Build and fit the g-formula model on the long-format data.
# The worker count is capped at the number of CPUs on this machine; the
# original hard-coded 30 regardless of the hardware (30 remains the upper bound).
# NOTE(review): the saved output of this cell is a TypeError raised after
# "start simulating."; its cause is not identified here.
n_workers = min(30, os.cpu_count() or 1)

g3 = ParametricGformula(obs_data = data_piv_final, id = id, time_name=time_name,
                         time_points = time_points, int_descript = int_descript,
                         covnames=covnames, covtypes=covtypes, trunc_params=trunc_params,
                         covmodels=covmodels, basecovs=basecovs,
                         outcome_name=outcome_name, ymodel=ymodel, outcome_type=outcome_type,
                         Intervention1_Vacuna = Intervention1_Vacuna,
                         Intervention2_Vacuna = Intervention2_Vacuna,
                         Intervention3_Vacuna = Intervention3_Vacuna,
                         Intervention4_Vacuna = Intervention4_Vacuna,
                         nsamples=10, parallel=True, ncores=n_workers)

g3.fit()

print('********************************************************************')

start fitting parametric model.
start simulating.


TypeError: 'in <string>' requires string as left operand, not int

In [21]:
# Show the `g_results` attribute of the model object.
# NOTE(review): this cell's execution count (21) is lower than the fitting
# cell's (23), so the saved output comes from an earlier run — re-run it after fitting.
g3.g_results

[0.016272807293010075,
 0.02983172487622354,
 0.01308275338316604,
 0.013082753383166043,
 0.013082753383166043]

### Notes:
- Time has to start at 0, otherwise we get errors of type "operands of shape () could not be broadcast together"
- No NaNs in the covariates, otherwise we get errors of type "operands of shape () could not be broadcast together"
- The intervention target is not passed as a value: it is encoded in the keyword-argument name itself (e.g. `Intervention1_Vacuna` intervenes on the column `Vacuna`). This is easy to get wrong, so check that the suffix after `InterventionN_` matches the treatment column name exactly.