In [25]:
import pandas as pd
import numpy as np
import statsmodels.api as sm 
from sklearn.preprocessing import PowerTransformer
from imblearn.over_sampling import RandomOverSampler

In [26]:
# Load the source table from the local HDF5 file "ltc_df.h5" (stored under key 'ltc_df').
ltc_df=pd.read_hdf("ltc_df.h5",key='ltc_df')

In [27]:
# Widen the pandas display limits used when a DataFrame is rendered:
# room for every column of ltc_df (plus one spare) and up to 200 rows.
pd.options.display.max_columns = len(ltc_df.columns) + 1
pd.options.display.max_rows = 200

# Pre-processing

In [28]:
# Work on a copy so the frame loaded from disk is left untouched.
ltc_df_desc_t2_ = ltc_df.copy()

# Collection period as a "<year>-<month>" string label, used later as a
# categorical variable.
ltc_df_desc_t2_['y_m_'] = (
    ltc_df_desc_t2_['specimen_year_collected'].astype(str)
    + '-'
    + ltc_df_desc_t2_['specimen_month_collected'].astype(str)
)

# Continuous month counter: month - 2 when the year is 2020, month + 10 for
# any other year (e.g. 2020-3 -> 1, 2020-12 -> 10, 2021-1 -> 11).
# Vectorised with np.where instead of a row-wise apply(): the previous lambda
# read the row with x[0] / x[1], i.e. integer keys on a label-indexed Series,
# which only works through pandas' deprecated positional fallback.
ltc_df_desc_t2_['specimen_year_month_collected_con'] = np.where(
    ltc_df_desc_t2_['specimen_year_collected'] == 2020,
    ltc_df_desc_t2_['specimen_month_collected'] - 2,
    ltc_df_desc_t2_['specimen_month_collected'] + 10,
)


In [29]:
# Comorbidity flag columns (one column per condition).
list_of_cmbdt = [
    'chf_2_years_full',
    'htn_unc_2_years_full',
    'cancer_mets_2_years_full',
    'fluid_elec_dis_2_years_full',
    'card_arrh_2_years_full',
    'valv_dis_2_years_full',
    'pcd_2_years_full',
    'pvd_2_years_full',
    'htn_c_2_years_full',
    'paral_2_years_full',
    'oth_neur_dis_2_years_full',
    'cpd_2_years_full',
    'diab_2_years_full',
    'hypothyroidism_2_years_full',
    'ren_fail_2_years_full',
    'liver_dis_2_years_full',
    'pep_ulc_exc_bld_2_years_full',
    'aids_hiv_2_years_full',
    'lymph_2_years_full',
    'tumour_no_mets_2_years_full',
    'rheum_col_vasc_dis_2_years_full',
    'coag_2_years_full',
    'obes_2_years_full',
    'wt_loss_2_years_full',
    'anemia_2_years_full',
    'alc_abuse_2_years_full',
    'drug_abuse_2_years_full',
    'psych_2_years_full',
    'depress_2_years_full',
]

# Continuous predictors.
list_of_continious_features = [
    'num_elixhauser_2_years_full',
    'elix_index',
    'num_admits_1_year',
    'num_scu_admits_1_year',
    'num_procs_dad_1_year',
    'num_procs_nacrs_1_year',
    'num_procs_tot_1_year',
    'spec_pat_num_age',
    'specimen_year_month_collected_con',
]

# Categorical predictors that are used as-is (no dummy expansion below).
list_of_categorical_features = [
    # where the specimen was collected
    'collected_in_hospital',
    'collected_in_ed',
    'collected_in_ltc',
    'collected_in_dsl',
    # residency at collection time
    'ltc_resident_during_collection',
    'dsl_resident_during_collection',
]

# Categorical predictors that get expanded into dummy columns.
cat_vars = [
    'spec_pat_agecat',
    'patient_gender',
    'num_elixhauser_2_years_full_cat',
    'symptomatic_during_collection',
    'interp_result',
    'y_m_',
]


# Two working frames cut from the pre-processed table; both keep the outcome
# 'died_within_60_days' as their last column.
outcome_column = ['died_within_60_days']
model_columns = list_of_continious_features + list_of_categorical_features + cat_vars
# Continuous + categorical predictors.
ltc_df_desc_t2 = ltc_df_desc_t2_.loc[:, model_columns + outcome_column]
# Comorbidity flags.
ltc_df_desc_t2_cmd = ltc_df_desc_t2_.loc[:, list_of_cmbdt + outcome_column]

### Create dummy variables
That is, indicator variables that take only two values: zero and one.

In [30]:
# --- Dummy columns for the multi-level categorical variables ---------------
# One indicator column per level, named "<variable>_<level>"; the original
# categorical columns are kept next to their indicators.
# dtype is pinned to uint8 (the historical get_dummies default) because
# pandas >= 2.0 switched the default to bool; this keeps the indicator columns
# numeric regardless of the installed pandas version.
# All dummy frames are built first and joined once instead of growing the
# frame inside a loop.
# NOTE: this cell rebinds ltc_df_desc_t2, so re-running it without re-running
# the previous cell makes join() fail on the already-present dummy columns.
categorical_dummies = pd.concat(
    [pd.get_dummies(ltc_df_desc_t2[var], prefix=var, dtype=np.uint8) for var in cat_vars],
    axis=1,
)
ltc_df_desc_t2 = ltc_df_desc_t2.join(categorical_dummies)

data_vars = ltc_df_desc_t2.columns.tolist()
data_final = ltc_df_desc_t2

# --- Design matrices for the logistic regressions --------------------------
# Continuous features plus a handful of binary indicators; each column of this
# frame is later fitted on its own.
ltc_df_LR_t2_1 = data_final[
    list_of_continious_features
    + [
        'dsl_resident_during_collection',
        'patient_gender_M',
        'interp_result_Positive',
        'num_elixhauser_2_years_full_cat_0',
        'num_elixhauser_2_years_full_cat_1+',
    ]
]  # base is dsl
# Collection place; base is LTC (collected_in_ltc is left out).
ltc_df_LR_t2_2 = data_final[['collected_in_ed', 'collected_in_hospital', 'collected_in_dsl']]
# Age category; base is 80+.
ltc_df_LR_t2_3 = data_final[
    [
        'spec_pat_agecat_30-39',
        'spec_pat_agecat_40-49',
        'spec_pat_agecat_50-59',
        'spec_pat_agecat_60-69',
        'spec_pat_agecat_70-79',
        'spec_pat_agecat_18-29',
    ]
]
# Symptomatic status; base is N.
ltc_df_LR_t2_4 = data_final[['symptomatic_during_collection_U', 'symptomatic_during_collection_Y']]
# Collection year-month. No 'y_m__2020-6' dummy is listed -- presumably that
# month is the reference level; TODO confirm.
ltc_df_LR_t2_5 = data_final[
    [
        'y_m__2020-10',
        'y_m__2020-11',
        'y_m__2020-12',
        'y_m__2020-3',
        'y_m__2020-4',
        'y_m__2020-5',
        'y_m__2021-3',
        'y_m__2020-7',
        'y_m__2020-8',
        'y_m__2020-9',
        'y_m__2021-1',
        'y_m__2021-2',
    ]
]

# --- Comorbidity indicators ------------------------------------------------
# For every comorbidity column (all columns except the trailing outcome) keep
# the indicator of its SECOND level (for 0/1 flags that is "<name>_1").
comorbidity_dummies = []
for var in ltc_df_desc_t2_cmd.columns[:-1]:
    levels = pd.get_dummies(ltc_df_desc_t2_cmd[var], prefix=var, dtype=np.uint8)
    if levels.shape[1] < 2:
        # Previously surfaced as an opaque out-of-bounds IndexError.
        raise ValueError(
            "comorbidity column %r has a single level; cannot build its indicator" % var
        )
    comorbidity_dummies.append(levels.iloc[:, 1])

# Outcome first, then one indicator per comorbidity. The frame is assembled
# directly rather than sliced with a hard-coded iloc[:, 29:], which silently
# depended on the comorbidity list holding exactly 29 names.
ltc_df_desc_t2_cmd_1 = pd.concat(
    [ltc_df_desc_t2_cmd.iloc[:, [-1]]] + comorbidity_dummies,
    axis=1,
)

# Append the comorbidity indicators (everything but the outcome) to data_final.
data_final = data_final.join(ltc_df_desc_t2_cmd_1.iloc[:, 1:])

# Odds ratios with imbalanced classes

In [None]:
# Univariable odds ratio for each comorbidity indicator: one logistic
# regression (intercept + that single indicator) per column.
y = ltc_df_desc_t2_cmd_1['died_within_60_days']
cmd = ltc_df_desc_t2_cmd_1.iloc[:, 1:]  # every column after the outcome

# Rows are collected in a list and turned into a DataFrame once:
# DataFrame.append() in a loop is quadratic and was removed in pandas 2.0.
odds_records = []
for feature in cmd.columns.values:
    X = sm.add_constant(cmd[feature])
    logit_model = sm.Logit(y, X)
    result = logit_model.fit()

    # Look the coefficient up by its name rather than by position 1.
    ci_lower, ci_upper = np.exp(result.conf_int().loc[feature])
    odds_records.append({
        'Feature': feature,
        'OR': np.round(np.exp(result.params[feature]), 2),
        'Coef': np.round(result.params[feature], 3),
        'p-value': np.round(result.pvalues[feature], 2),
        '2.5%': np.round(ci_lower, 3),
        '97.5%': np.round(ci_upper, 3),
        'LLR p-value': result.llr_pvalue,
        'Pseudo R-squared': result.prsquared,
    })

model_odds = pd.DataFrame(
    odds_records,
    columns=['Feature', 'OR', 'Coef', 'p-value', '2.5%', '97.5%', 'LLR p-value', 'Pseudo R-squared'],
)
    


In [32]:
# Headline columns of the odds-ratio table built in the previous cell.
model_odds.loc[:, ['Feature', 'OR', 'p-value', '2.5%', '97.5%']]

Unnamed: 0,Feature,OR,p-value,2.5%,97.5%
0,chf_2_years_full_1,1.6,0.0,1.457,1.75
1,htn_unc_2_years_full_1,0.97,0.53,0.898,1.057
2,cancer_mets_2_years_full_1,1.92,0.0,1.476,2.499
3,fluid_elec_dis_2_years_full_1,1.47,0.0,1.35,1.611
4,card_arrh_2_years_full_1,1.36,0.0,1.246,1.493
5,valv_dis_2_years_full_1,1.48,0.0,1.196,1.83
6,pcd_2_years_full_1,1.59,0.0,1.327,1.902
7,pvd_2_years_full_1,1.35,0.0,1.188,1.535
8,htn_c_2_years_full_1,1.22,0.32,0.824,1.797
9,paral_2_years_full_1,0.63,0.0,0.507,0.788


In [None]:
# Univariable odds ratio for each column of ltc_df_LR_t2_1 (continuous
# features plus a few binary indicators): one logistic regression
# (intercept + that single column) per feature.
y = data_final['died_within_60_days']

# Rows are collected in a list and turned into a DataFrame once:
# DataFrame.append() in a loop is quadratic and was removed in pandas 2.0.
odds_records = []
for feature in ltc_df_LR_t2_1.columns.values:
    X = sm.add_constant(ltc_df_LR_t2_1[feature])
    logit_model = sm.Logit(y, X)
    result = logit_model.fit()

    # Look the coefficient up by its name rather than by position 1.
    ci_lower, ci_upper = np.exp(result.conf_int().loc[feature])
    odds_records.append({
        'Feature': feature,
        'OR': np.round(np.exp(result.params[feature]), 3),
        'Coef': np.round(result.params[feature], 3),
        'p-value': np.round(result.pvalues[feature], 3),
        '2.5%': np.round(ci_lower, 3),
        '97.5%': np.round(ci_upper, 3),
        'LLR p-value': result.llr_pvalue,
        'Pseudo R-squared': result.prsquared,
    })

model_odds = pd.DataFrame(
    odds_records,
    columns=['Feature', 'OR', 'Coef', 'p-value', '2.5%', '97.5%', 'LLR p-value', 'Pseudo R-squared'],
)

In [34]:
# Render the odds-ratio table without its index column.
# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0 in
# favour of Styler.hide(axis="index"); use whichever this pandas provides.
odds_styler = model_odds.style
odds_styler.hide(axis="index") if hasattr(odds_styler, "hide") else odds_styler.hide_index()

Feature,OR,Coef,p-value,2.5%,97.5%,LLR p-value,Pseudo R-squared
num_elixhauser_2_years_full,1.053,0.051,0.0,1.038,1.068,0.0,0.002956
elix_index,1.03,0.029,0.0,1.025,1.034,0.0,0.009596
num_admits_1_year,1.117,0.111,0.0,1.078,1.158,0.0,0.002032
num_scu_admits_1_year,1.212,0.193,0.129,0.945,1.555,0.146053,0.000126
num_procs_dad_1_year,1.04,0.039,0.006,1.011,1.069,0.008808,0.000409
num_procs_nacrs_1_year,1.001,0.001,0.469,0.999,1.002,0.478762,3e-05
num_procs_tot_1_year,1.001,0.001,0.388,0.999,1.002,0.399449,4.2e-05
spec_pat_num_age,1.038,0.037,0.0,1.034,1.042,0.0,0.021949
specimen_year_month_collected_con,1.081,0.078,0.0,1.068,1.095,0.0,0.008234
dsl_resident_during_collection,0.495,-0.704,0.0,0.452,0.542,0.0,0.014944


In [35]:
# Multivariable logistic regression per categorical family, each printed as
# odds ratios followed by their confidence intervals.
# Each entry: (printed title, design matrix, round the odds ratios to 2 dp?,
# heading printed above the confidence intervals).
report_banner = "#################################################################"
odds_reports = [
    ("collected place", ltc_df_LR_t2_2, False, 'Confidence interval :'),
    ("age category", ltc_df_LR_t2_3, True, 'Confidence interval :'),
    ("symptomatic_during_collection", ltc_df_LR_t2_4, True, 'Confidence interval:'),
    ("test year and month", ltc_df_LR_t2_5, True, 'Confidence interval :'),
]

for report_title, design, round_odds, ci_heading in odds_reports:
    # y is the outcome series defined in an earlier cell.
    X = sm.add_constant(design)
    logit_model = sm.Logit(y, X)
    result = logit_model.fit()

    odds_ratios = np.exp(result.params)
    print(report_banner)
    print(report_title)
    print('Odds ratio :')
    print(round(odds_ratios, 2) if round_odds else odds_ratios)
    print('')
    print(ci_heading)
    print(np.round(np.exp(result.conf_int()), 2))
    print(report_banner)

  x = pd.concat(x[::order], 1)


Optimization terminated successfully.
         Current function value: 0.315348
         Iterations 7
#################################################################
collected place
Odds ratio :
const                    0.133657
collected_in_ed          3.086610
collected_in_hospital    2.330146
collected_in_dsl         0.421603
dtype: float64

Confidence interval :
                          0     1
const                  0.13  0.14
collected_in_ed        2.61  3.64
collected_in_hospital  1.92  2.82
collected_in_dsl       0.38  0.47
#################################################################
Optimization terminated successfully.
         Current function value: 0.322026
         Iterations 8
#################################################################
age category
Odds ratio :
const                    0.14
spec_pat_agecat_30-39    0.19
spec_pat_agecat_40-49    0.10
spec_pat_agecat_50-59    0.20
spec_pat_agecat_60-69    0.46
spec_pat_agecat_70-79    0.61
spec_pat_agecat_18-

  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)


# Odds ratios with balanced data

In [None]:
# Transformer / sampler objects for the balanced-data section.
scaler_PT = PowerTransformer()
over_sampler = RandomOverSampler(random_state=42)

# Univariable odds ratio for each comorbidity indicator, fitted on randomly
# over-sampled data: one logistic regression (intercept + that single
# indicator) per column.
# NOTE(review): the resampled rows are passed to Logit as if they were
# independent observations, so the confidence intervals and p-values below
# reflect the enlarged sample size -- confirm this is intended.
y = ltc_df_desc_t2_cmd_1['died_within_60_days']
cmd = ltc_df_desc_t2_cmd_1.iloc[:, 1:]  # every column after the outcome

# Rows are collected in a list and turned into a DataFrame once:
# DataFrame.append() in a loop is quadratic and was removed in pandas 2.0.
odds_records = []
for feature in cmd.columns.values:
    X = sm.add_constant(cmd[feature])
    X_res, y_res = over_sampler.fit_resample(X, y)
    logit_model = sm.Logit(y_res, X_res)
    result = logit_model.fit()

    # Look the coefficient up by its name rather than by position 1.
    ci_lower, ci_upper = np.exp(result.conf_int().loc[feature])
    odds_records.append({
        'Feature': feature,
        'OR': np.round(np.exp(result.params[feature]), 2),
        'Coef': np.round(result.params[feature], 3),
        'p-value': np.round(result.pvalues[feature], 2),
        '2.5%': np.round(ci_lower, 3),
        '97.5%': np.round(ci_upper, 3),
        'LLR p-value': result.llr_pvalue,
        'Pseudo R-squared': result.prsquared,
    })

model_odds = pd.DataFrame(
    odds_records,
    columns=['Feature', 'OR', 'Coef', 'p-value', '2.5%', '97.5%', 'LLR p-value', 'Pseudo R-squared'],
)

In [37]:
# Headline columns of the odds-ratio table built in the previous cell.
model_odds.loc[:, ['Feature', 'OR', 'p-value', '2.5%', '97.5%']]

Unnamed: 0,Feature,OR,p-value,2.5%,97.5%
0,chf_2_years_full_1,1.62,0.0,1.556,1.697
1,htn_unc_2_years_full_1,1.0,0.82,0.96,1.033
2,cancer_mets_2_years_full_1,1.91,0.0,1.669,2.192
3,fluid_elec_dis_2_years_full_1,1.48,0.0,1.416,1.537
4,card_arrh_2_years_full_1,1.36,0.0,1.308,1.423
5,valv_dis_2_years_full_1,1.42,0.0,1.281,1.578
6,pcd_2_years_full_1,1.65,0.0,1.512,1.803
7,pvd_2_years_full_1,1.32,0.0,1.238,1.398
8,htn_c_2_years_full_1,1.27,0.01,1.062,1.524
9,paral_2_years_full_1,0.66,0.0,0.602,0.721


In [None]:
# Univariable odds ratio for each column of ltc_df_LR_t2_1, fitted on randomly
# over-sampled data: one logistic regression (intercept + that single column)
# per feature. Uses the over_sampler created in an earlier cell.
# NOTE(review): the resampled rows are passed to Logit as if they were
# independent observations, so the confidence intervals and p-values below
# reflect the enlarged sample size -- confirm this is intended.
y = data_final['died_within_60_days']

# Rows are collected in a list and turned into a DataFrame once:
# DataFrame.append() in a loop is quadratic and was removed in pandas 2.0.
odds_records = []
for feature in ltc_df_LR_t2_1.columns.values:
    X = sm.add_constant(ltc_df_LR_t2_1[feature])
    X_res, y_res = over_sampler.fit_resample(X, y)
    logit_model = sm.Logit(y_res, X_res)
    result = logit_model.fit()

    # Look the coefficient up by its name rather than by position 1.
    ci_lower, ci_upper = np.exp(result.conf_int().loc[feature])
    odds_records.append({
        'Feature': feature,
        'OR': np.round(np.exp(result.params[feature]), 3),
        'p-value': np.round(result.pvalues[feature], 3),
        '2.5%': np.round(ci_lower, 3),
        '97.5%': np.round(ci_upper, 3),
        'LLR p-value': result.llr_pvalue,
        'Pseudo R-squared': result.prsquared,
    })

model_odds = pd.DataFrame(
    odds_records,
    columns=['Feature', 'OR', 'p-value', '2.5%', '97.5%', 'LLR p-value', 'Pseudo R-squared'],
)

In [39]:
# Render the odds-ratio table without its index column.
# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0 in
# favour of Styler.hide(axis="index"); use whichever this pandas provides.
odds_styler = model_odds.style
odds_styler.hide(axis="index") if hasattr(odds_styler, "hide") else odds_styler.hide_index()

Feature,OR,p-value,2.5%,97.5%,LLR p-value,Pseudo R-squared
num_elixhauser_2_years_full,1.051,0.0,1.044,1.058,0.0,0.003671
elix_index,1.03,0.0,1.028,1.032,0.0,0.012059
num_admits_1_year,1.125,0.0,1.105,1.145,0.0,0.002762
num_scu_admits_1_year,1.218,0.002,1.074,1.381,0.001928,0.000151
num_procs_dad_1_year,1.036,0.0,1.021,1.05,1e-06,0.000387
num_procs_nacrs_1_year,1.0,0.552,0.999,1.001,0.551801,6e-06
num_procs_tot_1_year,1.0,0.384,1.0,1.001,0.383593,1.2e-05
spec_pat_num_age,1.037,0.0,1.036,1.039,0.0,0.029138
specimen_year_month_collected_con,1.073,0.0,1.067,1.079,0.0,0.00943
dsl_resident_during_collection,0.496,0.0,0.477,0.516,0.0,0.019785


In [None]:
# Refit the four categorical-family models on randomly over-sampled data.
# Design matrices, in order: collection place, age category, symptomatic
# status, collection year-month. y and over_sampler come from earlier cells.
balanced_fits = []
for design in (ltc_df_LR_t2_2, ltc_df_LR_t2_3, ltc_df_LR_t2_4, ltc_df_LR_t2_5):
    X = sm.add_constant(design)
    X_res, y_res = over_sampler.fit_resample(X, y)
    logit_model = sm.Logit(y_res, X_res)
    balanced_fits.append(logit_model.fit())

# Fitted results under the names the reporting cell reads.
result_collected_place, result_age, result_symptomatic, result_y_m = balanced_fits




In [41]:
# Print the odds ratios (rounded to 3 dp) of each over-sampled model, framed
# by a banner line before and after.
report_banner = "#################################################################"
balanced_reports = (
    ("collected place", result_collected_place),
    ("Age category", result_age),
    ("symptomatic_during_collection", result_symptomatic),
    ("Year and month of the test collection", result_y_m),
)

for report_title, fitted in balanced_reports:
    print(report_banner)
    print(report_title)
    print('Odds ratio :')
    print(round(np.exp(fitted.params), 3))
    print(report_banner)


#################################################################
collected place
Odds ratio :
const                    1.179
collected_in_ed          3.090
collected_in_hospital    2.507
collected_in_dsl         0.426
dtype: float64
#################################################################
#################################################################
Age category
Odds ratio :
const                    1.221
spec_pat_agecat_30-39    0.231
spec_pat_agecat_40-49    0.089
spec_pat_agecat_50-59    0.205
spec_pat_agecat_60-69    0.468
spec_pat_agecat_70-79    0.621
spec_pat_agecat_18-29    0.237
dtype: float64
#################################################################
#################################################################
symptomatic_during_collection
Odds ratio :
const                              0.789
symptomatic_during_collection_U    1.987
symptomatic_during_collection_Y    2.375
dtype: float64
###############################################################

In [42]:
# Persist the final modelling frame to "data_final.h5" under key 'data_final'.
data_final.to_hdf("data_final.h5", key='data_final')