In [6]:
import gc
import os
import pickle

import numpy as np
import pandas as pd
from pygformula import ParametricGformula
from pygformula.data import load_basicdata_nocomp
from pygformula.parametric_gformula.interventions import static

### G-Formula

In [None]:
# Load the cohort table from 'included_cohort_prep.csv'; the path is relative,
# so it is resolved against the notebook's current working directory.
data = pd.read_csv('included_cohort_prep.csv')

In [3]:
# Drop the 'Unnamed: 0' column (presumably a row index saved with the CSV).
# errors='ignore' keeps the cell idempotent: re-running it once the column is
# already gone no longer raises KeyError.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [8]:
# Work on a reproducible random half of the cohort.
# Caution: `data` is overwritten, so running this cell twice halves it again.
SAMPLE_FRACTION = 0.5
SAMPLE_SEED = 1
data = data.sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED).copy()

In [9]:
# Sanity check: number of rows left in `data` after the subsampling step.
len(data)

1874766

In [10]:
# Inspect the column names of the wide-format table before reshaping.
data.columns

Index(['NIA', 'sexe', 'data_naixement', 'abs_c', 'abs', 'pais_c',
       'N_vaccine_total', 'VACUNA_1_DATA', 'VACUNA_1_MOTIU', 'VACUNA_2_DATA',
       'VACUNA_2_MOTIU', 'VACUNA_3_DATA', 'VACUNA_3_MOTIU', 'VACUNA_1_DATA_pp',
       'VACUNA_2_DATA_pp', 'VACUNA_3_DATA_pp', 'DATA_DM_min', 'DM',
       'covid_bef_vax', 'test_date_covid_1', 'test_res_covid_1',
       'test_date_covid_2', 'test_res_covid_2', 'test_date_covid_3',
       'test_res_covid_3', 'test_date_imc_1', 'test_res_imc_1',
       'test_date_imc_2', 'test_res_imc_2', 'test_date_imc_3',
       'test_res_imc_3', 'test_date_sp_1', 'test_res_sp_1', 'test_date_sp_2',
       'test_res_sp_2', 'test_date_sp_3', 'test_res_sp_3', 'test_date_dp_1',
       'test_res_dp_1', 'test_date_dp_2', 'test_res_dp_2', 'test_date_dp_3',
       'test_res_dp_3', 'test_date_abdo_1', 'test_res_abdo_1',
       'test_date_abdo_2', 'test_res_abdo_2', 'test_date_abdo_3',
       'test_res_abdo_3', 'test_date_bg_1', 'test_res_bg_1', 'test_date_bg_2',
       

In [11]:
###### RESHAPE THE DATA FROM WIDE TO LONG FORMAT (PIVOT)

In [12]:
# Move the dose number to the end of every vaccine column name
# (e.g. VACUNA_1_DATA -> VACUNA_DATA_1) so that all repeated columns end in
# their repeat index. Columns missing from `data` are silently skipped by rename.
vaccine_renames = {}
for dose in range(1, 4):
    print(dose)
    vaccine_renames[f'VACUNA_{dose}_DATA'] = f'VACUNA_DATA_{dose}'
    vaccine_renames[f'VACUNA_{dose}_MOTIU'] = f'VACUNA_MOTIU_{dose}'
    vaccine_renames[f'VACUNA_{dose}_DATA_pp'] = f'VACUNA_DATA_pp_{dose}'
data.rename(columns=vaccine_renames, inplace=True)

1
2
3


In [13]:
# Names of the wide-format columns that end in a repeat index (1, 2 or 3).
# The list is generated rather than typed out; the element order is the same
# as in the hand-written version.
_repeats = (1, 2, 3)
_measurements = ('covid', 'imc', 'sp', 'dp', 'abdo', 'bg', 'chol', 'smoking', 'gma')

stubnames = (
    # vaccination date / reason, interleaved per dose
    [f'VACUNA_{field}_{k}' for k in _repeats for field in ('DATA', 'MOTIU')]
    + [f'VACUNA_DATA_pp_{k}' for k in _repeats]
    # (date, result) pair for every measurement and repeat
    + [f'test_{field}_{name}_{k}'
       for name in _measurements
       for k in _repeats
       for field in ('date', 'res')]
    + [f'Vacuna_{k}' for k in _repeats]
)

In [14]:
# Remove the trailing repeat index so each entry becomes a stub such as
# 'test_res_sp_'. Stripping digits (instead of blindly dropping the last
# character) makes the cell idempotent: running it again leaves the stubs
# unchanged rather than chopping off the trailing underscore.
stubnames = [name.rstrip('0123456789') for name in stubnames]

In [16]:
# Reshape from one row per person (NIA) to one row per person and repeat index;
# the numeric suffix of each repeated column becomes the 'time' column.
# The stubs are de-duplicated with dict.fromkeys, which keeps first-seen order.
# A set would order them differently from one kernel to the next (string hash
# randomisation), making the column order of `data_piv` non-reproducible.
unique_stubs = list(dict.fromkeys(stubnames))
data_piv = pd.wide_to_long(data, unique_stubs, i='NIA', j='time').reset_index()

In [17]:
# Shift the time index down by one (the column suffixes start at 1).
# Caution: not idempotent - re-running this cell shifts the index again.
data_piv['time'] = data_piv['time'] - 1

In [18]:
# Transform outcome for binary eof

In [19]:
# With an end-of-follow-up outcome, DM is kept on the final period only and
# blanked on every earlier one.
# The previous condition (time == 1 or time == 2) was written for a 1-based
# index; after the shift to 0-based time it blanked the final period (2) and
# kept only the baseline row. Comparing against the maximum time is correct
# for either indexing and is safe to re-run.
# np.nan replaces np.NaN, an alias that was removed in NumPy 2.0.
is_final_period = data_piv['time'] == data_piv['time'].max()
data_piv.loc[~is_final_period, 'DM'] = np.nan

In [20]:
# G-formula

In [21]:
# Column holding the period index and column identifying each person.
time_name = 'time'
# NOTE: `id` shadows the Python builtin; the name is kept because the fitting
# cell passes it on as `id = id`.
id = 'NIA'
# Number of periods = largest time index + 1.
time_points = data_piv[time_name].max() + 1

In [22]:
# Configuration for ParametricGformula.
# covnames, covtypes, trunc_params and covmodels are parallel lists: entry k of
# each one describes the same time-varying covariate.
covnames = [
  'test_res_sp_',
  'test_res_smoking_',
  'test_res_chol_',
  'test_res_abdo_',
  'test_res_dp_',
  'test_res_imc_',
  'test_res_bg_',
  'test_res_covid_',
  'test_res_gma_',
  'Vacuna_']

# Distribution type of each covariate, in the same order as covnames.
covtypes = [
  'normal',
  'categorical',
  'normal',
  'normal',
  'normal',
  'normal',
  'normal',
  'categorical',
  'truncated normal',
  'binary']

# Truncation settings: 'NA' for every covariate except test_res_gma_
# (the only 'truncated normal' entry), which uses [0, 'left'].
trunc_params = ['NA', 'NA', 'NA', 'NA', 'NA', 'NA','NA', 'NA', [0, 'left'], 'NA']

# One model formula per covariate, in covnames order. Each model includes the
# covariate's own first lag (lag1_*) and time; most also include baseline
# variables and selected same-period covariates.
# NOTE(review): some models use same-period covariates that appear LATER in
# covnames (e.g. test_res_sp_ uses test_res_imc_ and test_res_smoking_) -
# confirm this is compatible with the order in which pygformula simulates them.
# NOTE(review): Vacuna_ / lag1_Vacuna_ appear only in the Vacuna_ model itself,
# not in any other covariate model nor in ymodel below - confirm the treatment
# terms were not omitted by mistake, since the interventions act on Vacuna_.
covmodels = [
  'test_res_sp_ ~ lag1_test_res_sp_ + test_res_imc_ + data_naixement + pais_c + C(test_res_smoking_) + test_res_sociostat_1 + time',
  'test_res_smoking_ ~ C(lag1_test_res_smoking_) + data_naixement + pais_c + test_res_sociostat_1 + time',
  'test_res_chol_ ~ lag1_test_res_chol_ + data_naixement + pais_c + test_res_sociostat_1 + time',
  'test_res_abdo_ ~ lag1_test_res_abdo_ + test_res_imc_ +  + data_naixement + pais_c + C(test_res_smoking_) + time',
  'test_res_dp_ ~ lag1_test_res_dp_ + test_res_imc_ + data_naixement + pais_c + C(test_res_smoking_) + test_res_sociostat_1 + time',
  'test_res_imc_ ~ lag1_test_res_imc_ + data_naixement + pais_c + C(test_res_smoking_) + test_res_sociostat_1 + time',
  'test_res_bg_ ~ lag1_test_res_bg_ + test_res_imc_ + data_naixement + pais_c + test_res_sociostat_1 + time',
  'test_res_covid_ ~ C(lag1_test_res_covid_) + test_res_imc_ + data_naixement + pais_c + test_res_bg_ + C(test_res_smoking_) + test_res_sociostat_1 + time',
  'test_res_gma_ ~ lag1_test_res_gma_ + test_res_imc_ + data_naixement + test_res_dp_ + test_res_sp_ + test_res_chol_ + pais_c + test_res_bg_ + C(test_res_smoking_) + time',
  'Vacuna_ ~ lag1_Vacuna_ + time']

# Time-fixed (baseline) covariates.
basecovs = ['abs_c', 'pais_c', 'sexe', 'data_naixement', 'test_res_sociostat_1'] 

# Outcome model: DM regressed on the baseline covariates plus the current and
# lag-1 value of every time-varying covariate except Vacuna_.
outcome_name = 'DM'
ymodel = 'DM ~ abs_c + pais_c + sexe + data_naixement + test_res_sociostat_1 +\
  test_res_sp_ + lag1_test_res_sp_ +\
  C(test_res_smoking_) + C(lag1_test_res_smoking_)+\
  test_res_chol_ + lag1_test_res_chol_+\
  test_res_abdo_ + lag1_test_res_abdo_+\
  test_res_dp_ + lag1_test_res_dp_+\
  test_res_imc_ + lag1_test_res_imc_+\
  test_res_bg_ + lag1_test_res_bg_+\
  C(test_res_covid_) + C(lag1_test_res_covid_)+\
  test_res_gma_ + lag1_test_res_gma_'
 
outcome_type = 'binary_eof'
# Labels for the four interventions defined below, in the same order.
int_descript = ['Never treat', 'Treat on Vacuna only at t1', 'Treat on Vacuna only at t1 & t2', 'Treat on Vacuna at t1, t2 & t3']
ymodel_type = None

# Each intervention is [static, array with one value per time point, optional list].
# The optional third element is presumably the list of time indices at which the
# intervention is applied (it lines up with int_descript) - confirm against the
# pygformula documentation.
Intervention1_Vacuna = [static, np.zeros(time_points)]
Intervention2_Vacuna = [static, np.ones(time_points), [0]]
Intervention3_Vacuna = [static, np.ones(time_points), [0, 1]]
Intervention4_Vacuna = [static, np.ones(time_points), [0, 1, 2]]

In [27]:
# Downcast every float64 column to float32, halving the memory those columns use.
float64_cols = data_piv.select_dtypes(include='float64').columns.tolist()
data_piv[float64_cols] = data_piv[float64_cols].astype('float32')

In [29]:
# Force a garbage-collection pass; the displayed value is the return value of gc.collect().
gc.collect()

663

In [25]:
%%capture cap
%%time
# Build the ParametricGformula object from the configuration defined in the
# earlier cells and fit it. %%capture stores this cell's output in `cap`;
# %%time reports the run time. Both magics must stay on the first two lines.
# NOTE(review): the recorded run of this cell ended in a MemoryError for a
# (3749532, 17823) float64 array. That column count suggests a high-cardinality
# variable (possibly data_naixement, pais_c or abs_c stored as strings) is being
# expanded into dummy columns - TODO confirm the dtypes before refitting.
print('********************************************************************')
g = ParametricGformula(obs_data = data_piv, id = id, time_name=time_name,
             time_points = time_points, int_descript = int_descript,
             covnames=covnames, covtypes=covtypes, trunc_params=trunc_params,
             covmodels=covmodels, basecovs=basecovs,
             outcome_name=outcome_name, ymodel=ymodel, ymodel_type=ymodel_type, outcome_type=outcome_type,
             Intervention1_Vacuna = Intervention1_Vacuna,
             Intervention2_Vacuna = Intervention2_Vacuna,
             Intervention3_Vacuna = Intervention3_Vacuna,
             Intervention4_Vacuna = Intervention4_Vacuna,
             parallel=True, ncores=18)


g.fit()
print('********************************************************************')

MemoryError: Unable to allocate 498. GiB for an array with shape (3749532, 17823) and data type float64

In [26]:
# Save the captured output to a text file
# Append the stdout captured in `cap` to a plain-text log file.
with open('output.txt', 'a') as log_file:
    log_file.write(cap.stdout)

In [27]:
# Serialize the object to a binary format
# Pickle the g-formula object `g` to disk so it can be reloaded later.
with open('gform.pkl', 'wb') as pkl_file:
    pickle.dump(g, pkl_file)

In [10]:
# Deserialize the object from the binary file
#with open('gform.pkl', 'rb') as file:
#    g = pickle.load(file)

### Notes:
- The time index has to start at 0; otherwise we get errors such as "operands of shape () could not be broadcast together".
- The covariates must not contain NaNs; otherwise we get the same kind of error ("operands of shape () could not be broadcast together").

### Full version with real data

In [None]:
### Investigate other intervention options (grace period etc.)

In [None]:
### Investigate possibility of using ML model