In [8]:
# Import the libraries
import pyreadstat
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import re
import os
import tqdm
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()
import dataframe_image as dfi
import time
from alive_progress import config_handler
import graphviz
import h5py
import statsmodels.api as sm
import pandas_profiling as pp
from pandas_profiling import ProfileReport

# Raise pandas' display limits so wide/long frames and index listings are
# printed in full instead of being truncated.
pd.set_option("display.max_rows", 2000)
pd.set_option("display.max_info_columns", 2000)
pd.set_option("display.max_seq_items", 1000)
pd.set_option("display.max_columns", 1000)

# Load the subjects table from its HDF5 store.
subjects_h = pd.read_hdf('data/df_subjects_retro_homelss.h5', key='df_subjects_retro_homelss')


def age_band(age):
    """Return the age-band label for ``age``.

    Bands are 18-29, 30-39, 40-49, 50-59 and 60+. Any value for which no
    comparison holds (e.g. below 18, or NaN) is labelled with an empty string.
    """
    if age >= 60:
        return '60+'
    if age >= 50:
        return '50-59'
    if age >= 40:
        return '40-49'
    if age >= 30:
        return '30-39'
    if age >= 18:
        return '18-29'
    return ''


# Derive the categorical age band used by the descriptive and age models.
subjects_h['age_categorical'] = subjects_h['age'].map(age_band)

# Replace missing values in homeless_followup with 0 so they are not later
# binarised to 1 (NaN != 0).
subjects_h['homeless_followup'] = subjects_h['homeless_followup'].fillna(0)

print("shape of records where homeless_followup is not 0")
print(subjects_h[subjects_h['homeless_followup'] != 0].shape)

# This only inspects the raw values; nothing is modified here. The column is
# collapsed to 0/1 further down, because 'homeless_followup' is in col_list.
print("unique values in homeless_followup column")
print(subjects_h['homeless_followup'].unique())

# Columns that are collapsed to a 0/1 indicator (0 stays 0, anything else -> 1).
# Grouped by name prefix for readability; the order is unchanged.
col_list = [
    # un-prefixed indicator columns
    'substance', 'mood', 'anxiety', 'psychotic', 'cognitive', 'otherpsych', 'selfharm',
    # visit_* columns
    'visit_emr_MH_non_elect', 'visit_emr_NonMH', 'visit_emr_visit',
    'visit_family_gp', 'visit_hosp_visit',
    'visit_hospitalized_MH', 'visit_hospitalized_NonMH',
    'visit_im', 'visit_neurology', 'visit_other', 'visit_psychiatry',
    # EX_* / Ex_* columns
    'EX_CHF', 'EX_Arrhy', 'EX_VD', 'EX_PCD', 'EX_PVD', 'EX_HPTN_UC', 'EX_HPTN_C', 'EX_Para',
    'Ex_OthND', 'Ex_COPD', 'Ex_Diab_UC', 'Ex_Diab_C', 'Ex_Hptothy', 'Ex_RF', 'Ex_LD',
    'Ex_PUD_NB', 'Ex_HIV', 'Ex_Lymp', 'Ex_METS', 'Ex_Tumor', 'Ex_Rheum_A', 'Ex_Coag',
    'Ex_Obesity', 'Ex_WL', 'Ex_Fluid', 'Ex_BLA', 'Ex_DA', 'Ex_Alcohol', 'Ex_Drug',
    'Ex_Psycho', 'Ex_Dep', 'Ex_Stroke', 'Ex_Dyslipid', 'Ex_Sleep', 'Ex_IHD',
    'EX_Fall', 'EX_Urinary', 'EX_Visual', 'EX_Hearing', 'EX_Tobacco', 'EX_Delirium',
    'Ex_MS', 'EX_parkinsons',
    # outcome column
    'homeless_followup',
]

def return_zero_one(value):
    """Collapse ``value`` to a binary indicator.

    Returns 0 when ``value == 0`` and 1 for everything else. Note that NaN
    does not compare equal to 0, so a missing value is returned as 1.
    """
    return 0 if value == 0 else 1

# Collapse every column named in col_list to a 0/1 indicator.
for indicator_column in col_list:
    subjects_h[indicator_column] = subjects_h[indicator_column].map(return_zero_one)

# Keep only the columns used downstream, in this order.
selected_columns = [
    'subject_id', 'sex', 'age', 'age_categorical',
    'db_claim', 'db_nacrs', 'db_dad', 'db_pin',
    'substance', 'mood', 'anxiety', 'psychotic', 'cognitive', 'otherpsych', 'selfharm',
    'visit_emr_MH_non_elect', 'visit_emr_NonMH', 'visit_emr_visit',
    'visit_family_gp', 'visit_hosp_visit',
    'visit_hospitalized_MH', 'visit_hospitalized_NonMH',
    'visit_im', 'visit_neurology', 'visit_other', 'visit_pharmacy', 'visit_psychiatry',
    'EX_CHF', 'EX_Arrhy', 'EX_VD', 'EX_PCD', 'EX_PVD', 'EX_HPTN_UC', 'EX_HPTN_C', 'EX_Para',
    'Ex_OthND', 'Ex_COPD', 'Ex_Diab_UC', 'Ex_Diab_C', 'Ex_Hptothy', 'Ex_RF', 'Ex_LD',
    'Ex_PUD_NB', 'Ex_HIV', 'Ex_Lymp', 'Ex_METS', 'Ex_Tumor', 'Ex_Rheum_A', 'Ex_Coag',
    'Ex_Obesity', 'Ex_WL', 'Ex_Fluid', 'Ex_BLA', 'Ex_DA', 'Ex_Alcohol', 'Ex_Drug',
    'Ex_Psycho', 'Ex_Dep', 'Ex_Stroke', 'Ex_Dyslipid', 'Ex_Sleep', 'Ex_IHD',
    'EX_Fall', 'EX_Urinary', 'EX_Visual', 'EX_Hearing', 'EX_Tobacco', 'EX_Delirium',
    'Ex_MS', 'EX_parkinsons',
    'homeless_followup',
]
subjects_h = subjects_h[selected_columns]

# Quick sanity checks on the derived columns.
print("unique values in age_categorical column")
print(subjects_h['age_categorical'].unique())

print("shape of records where homeless_followup is 1")
print(subjects_h[subjects_h['homeless_followup'] == 1].shape)


shape of records where homeless_followup is not 0
(1800, 83)
unique values in homeless_followup column
[  0.  13.   1.   2.   9.   6.  12.   3.   8.  25.   4.  27.   5.  20.
  19.  59.  18.  45.  11.   7. 360.  16.  68.  10.  35.  15.  99.  69.
  21.  14.  23. 146.  17.  50.  64.  89. 172.  26.  43.  37.  38.  30.
  32.  60.  22.  44.  62.  72.  24.  71.  34.]
unique values in age_categorical column
['50-59' '18-29' '30-39' '40-49' '60+']
shape of records where homeless_followup is 1
(1800, 73)


In [None]:
def write_minimal_profile(frame, output_path):
    """Build a minimal pandas-profiling report for ``frame``, write it to
    ``output_path`` and return the report object."""
    report = pp.ProfileReport(frame, minimal=True)
    report.to_file(output_file=output_path)
    return report


# Whole cohort.
profile = write_minimal_profile(subjects_h, "Descriptive_analysis_homelessness.html")

# Subjects whose homeless_followup indicator is 1.
subjects_h_y = subjects_h[subjects_h.homeless_followup == 1]
profile = write_minimal_profile(
    subjects_h_y, "Descriptive_analysis_individuals_with_homelessness.html"
)

# Subjects whose homeless_followup indicator is 0.
subjects_h_n = subjects_h[subjects_h.homeless_followup == 0]
profile = write_minimal_profile(
    subjects_h_n, "Descriptive_analysis_individuals_without_homelessness.html"
)


# Odds Ratio

In [None]:

# Univariate odds ratios: one logistic regression (intercept + one predictor)
# per feature against the binary homeless_followup outcome.
feature_columns = [
    'sex_M', 'substance', 'mood', 'anxiety', 'psychotic', 'cognitive', 'otherpsych',
    'selfharm', 'visit_emr_MH_non_elect', 'visit_emr_NonMH',
    'visit_emr_visit', 'visit_hosp_visit',
    'visit_hospitalized_MH', 'visit_hospitalized_NonMH', 'visit_family_gp', 'visit_im',
    'visit_neurology', 'visit_other',
    'visit_psychiatry',
    'EX_CHF', 'EX_Arrhy', 'EX_VD', 'EX_PCD', 'EX_PVD', 'EX_HPTN_UC',
    'EX_HPTN_C', 'EX_Para', 'Ex_OthND', 'Ex_COPD', 'Ex_Diab_UC',
    'Ex_Diab_C', 'Ex_Hptothy', 'Ex_RF', 'Ex_LD', 'Ex_PUD_NB', 'Ex_HIV',
    'Ex_Lymp', 'Ex_METS', 'Ex_Tumor', 'Ex_Rheum_A', 'Ex_Coag', 'Ex_Obesity',
    'Ex_WL', 'Ex_Fluid', 'Ex_BLA', 'Ex_DA', 'Ex_Alcohol', 'Ex_Drug',
    'Ex_Psycho', 'Ex_Dep', 'Ex_Stroke', 'Ex_Dyslipid', 'Ex_Sleep', 'Ex_IHD',
    'EX_Fall', 'EX_Urinary', 'EX_Visual', 'EX_Hearing', 'EX_Tobacco',
    'EX_Delirium', 'Ex_MS', 'EX_parkinsons',
]

# 'sex_M' is not among the columns kept by the preprocessing cell, so on a
# fresh kernel the selection below would raise KeyError. Derive it when it is
# missing, without modifying subjects_h itself.
# NOTE(review): assumes the 'sex' column contains the literal level 'M' (the
# name matches pd.get_dummies(..., prefix='sex') output) -- confirm against
# the raw data. If the level is absent this still fails loudly with KeyError.
model_frame = subjects_h
if 'sex_M' not in model_frame.columns:
    sex_dummies = pd.get_dummies(model_frame['sex'], prefix='sex')
    model_frame = model_frame.assign(sex_M=sex_dummies['sex_M'].astype(float))

features = model_frame[feature_columns]

# Set the target variable
y = subjects_h['homeless_followup']

# Collect one record per feature and build the frame once at the end
# (DataFrame.append was removed in pandas 2.0 and is quadratic in a loop).
odds_records = []
for x in features.columns.values:
    X = sm.add_constant(features[x])  # intercept + the single predictor
    logit_model = sm.Logit(y, X)
    result = logit_model.fit()

    # Exponentiated confidence bounds; columns 0 and 1 are lower and upper.
    conf_int_or = np.exp(result.conf_int())

    # Look everything up by the feature's own label instead of position 1,
    # which relied on deprecated positional indexing of a labelled Series.
    odds_records.append({
        'Feature': x,
        'OR': np.round(np.exp(result.params[x]), 2),
        'Coef': np.round(result.params[x], 3),
        'p-value': np.round(result.pvalues[x], 2),
        '2.5%': np.round(conf_int_or.loc[x, 0], 2),
        '97.5%': np.round(conf_int_or.loc[x, 1], 2),
        'LLR p-value': np.round(result.llr_pvalue, 5),
        'Pseudo R-squared': result.prsquared,
    })

# One row per feature, in the order of feature_columns.
model_odds = pd.DataFrame(odds_records)


In [16]:
# Show the univariate results ranked from the largest odds ratio down.
report_columns = ['Feature', 'OR', '2.5%', '97.5%', 'p-value']
model_odds.loc[:, report_columns].sort_values(by="OR", ascending=False)

Unnamed: 0,Feature,OR,2.5%,97.5%,p-value
47,Ex_Drug,20.5,18.62,22.56,0.0
46,Ex_Alcohol,16.2,14.7,17.84,0.0
1,substance,15.24,13.62,17.05,0.0
8,visit_emr_MH_non_elect,13.97,12.61,15.48,0.0
13,visit_hospitalized_NonMH,10.72,9.63,11.93,0.0
35,Ex_HIV,8.28,5.6,12.25,0.0
59,EX_Delirium,8.17,6.68,9.98,0.0
12,visit_hospitalized_MH,8.13,7.28,9.08,0.0
9,visit_emr_NonMH,7.39,5.46,10.0,0.0
48,Ex_Psycho,7.1,6.42,7.86,0.0


In [17]:
# Age model: a single logistic regression of homeless_followup (the `y`
# defined in the odds-ratio cell) on the age-band dummies. '30-39' has no
# dummy, so it is the reference band.
age_dummy_columns = ['age_18-29', 'age_40-49', 'age_50-59', 'age_60+']

# The dummy columns are not created anywhere in the preprocessing cell, so on
# a fresh kernel selecting them from subjects_h raises KeyError. Use them when
# present, otherwise derive them from age_categorical.
if set(age_dummy_columns).issubset(subjects_h.columns):
    features_age = subjects_h[age_dummy_columns]
else:
    age_dummies = pd.get_dummies(subjects_h['age_categorical'], prefix='age')
    # Indexing (not reindex) so a missing band fails loudly instead of
    # silently producing an all-zero predictor.
    features_age = age_dummies[age_dummy_columns].astype(float)

# Add the intercept and fit the logistic regression.
X = sm.add_constant(features_age)
logit_model = sm.Logit(y, X)
result = logit_model.fit()

# Exponentiate coefficients and confidence bounds to get odds ratios.
df_results = np.round(np.exp(result.conf_int()), 2)
df_results.columns = ['2.5%', '97.5%']
df_results['OR'] = np.round(np.exp(result.params), 2)
df_results['P_value'] = np.round(result.pvalues, 2)

# Select the columns of interest for the results DataFrame
df_results = df_results[['OR', '2.5%', '97.5%', 'P_value']]


Optimization terminated successfully.
         Current function value: 0.044251
         Iterations 10


  x = pd.concat(x[::order], 1)


In [18]:
# Display the age-band odds-ratio table (OR, 2.5% / 97.5% bounds, p-value).
df_results

Unnamed: 0,OR,2.5%,97.5%,P_value
const,0.01,0.01,0.01,0.0
age_18-29,1.17,1.03,1.32,0.01
age_40-49,0.73,0.63,0.83,0.0
age_50-59,0.69,0.6,0.79,0.0
age_60+,0.44,0.36,0.55,0.0


In [19]:
# Persist the preprocessed subjects table to its own HDF5 store.
# `key` is passed by keyword: positional use is deprecated since pandas 2.1
# and keyword-only in pandas 3.0.
# NOTE(review): the recorded PyTables warning shows the object-dtype columns
# being pickled; casting them to concrete dtypes first would avoid that.
subjects_h.to_hdf(
    "data/df_subjects_retro_homelss_preproc.h5",
    key='df_subjects_retro_homelss_preproc',
)

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['subject_id', 'sex', 'age', 'age_categorical', 'quintmat', 'quintsoc'], dtype='object')]

  subjects_h.to_hdf("data/df_subjects_retro_homelss_preproc.h5",'df_subjects_retro_homelss_preproc')
