In [None]:
# Import the libraries
import pyreadstat
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import re
import os
import tqdm
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()
import dataframe_image as dfi
import time
from alive_progress import config_handler
import graphviz
import h5py

import statsmodels.api as sm

import pandas_profiling as pp
from pandas_profiling import ProfileReport

# Raise pandas' display limits (rows, info columns, sequence items, columns)
for option_name, limit in (
    ("display.max_rows", 2000),
    ("display.max_info_columns", 2000),
    ("display.max_seq_items", 1000),
    ("display.max_columns", 1000),
):
    pd.set_option(option_name, limit)

# Load the subjects frame stored under key 'df_subjects_homelessnes' in the HDF5 file
subjects_h = pd.read_hdf(
    'data/df_subjects_homelessnes.h5',
    key='df_subjects_homelessnes',
)

# Label each age with its band: decade-wide bands from 18 up to 59, '60+' from
# 60 upward, and '' when no band matches (ages below 18, and missing ages,
# for which every comparison is False).
age_values = subjects_h['age']
age_band_conditions = [
    (age_values >= 18) & (age_values < 30),
    (age_values >= 30) & (age_values < 40),
    (age_values >= 40) & (age_values < 50),
    (age_values >= 50) & (age_values < 60),
    age_values >= 60,
]
age_band_labels = ['18-29', '30-39', '40-49', '50-59', '60+']
subjects_h['age_categorical'] = np.select(age_band_conditions, age_band_labels, default='')

# Columns that are later collapsed to 0/1 flags, grouped here by name prefix.
col_list = [
    # un-prefixed columns
    'substance', 'mood', 'anxiety', 'psychotic', 'cognitive', 'otherpsych', 'selfharm',
    # visit_* columns
    'visit_emr_MH_non_elect', 'visit_emr_NonMH', 'visit_emr_visit',
    'visit_family_gp', 'visit_hosp_visit',
    'visit_hospitalized_MH', 'visit_hospitalized_NonMH',
    'visit_im', 'visit_neurology', 'visit_other', 'visit_pharmacy', 'visit_psychiatry',
    # EX_* / Ex_* columns (prefix casing is kept exactly as in the data)
    'EX_CHF', 'EX_Arrhy', 'EX_VD', 'EX_PCD', 'EX_PVD', 'EX_HPTN_UC', 'EX_HPTN_C', 'EX_Para',
    'Ex_OthND', 'Ex_COPD', 'Ex_Diab_UC', 'Ex_Diab_C', 'Ex_Hptothy', 'Ex_RF', 'Ex_LD',
    'Ex_PUD_NB', 'Ex_HIV', 'Ex_Lymp', 'Ex_METS', 'Ex_Tumor', 'Ex_Rheum_A', 'Ex_Coag',
    'Ex_Obesity', 'Ex_WL', 'Ex_Fluid', 'Ex_BLA', 'Ex_DA', 'Ex_Alcohol', 'Ex_Drug',
    'Ex_Psycho', 'Ex_Dep', 'Ex_Stroke', 'Ex_Dyslipid', 'Ex_Sleep', 'Ex_IHD',
    'EX_Fall', 'EX_Urinary', 'EX_Visual', 'EX_Hearing', 'EX_Tobacco', 'EX_Delirium',
    'Ex_MS', 'EX_parkinsons',
]

def return_zero_one(value):
    """Collapse a value to a binary flag.

    Returns 0 when ``value == 0`` and 1 for everything else. Anything that
    does not compare equal to zero yields 1, which includes NaN and None.
    """
    return 0 if value == 0 else 1

# Replace every column named in col_list with its 0/1 version
binary_flags = {name: subjects_h[name].map(return_zero_one) for name in col_list}
subjects_h = subjects_h.assign(**binary_flags)

# Keep, in this order: the leading descriptive and db_* columns, every
# indicator named in col_list, and the trailing homelessness-related columns.
leading_columns = ['subject_id', 'sex', 'age', 'age_categorical', 'quintmat', 'quintsoc',
                   'db_claim', 'db_nacrs', 'db_dad', 'db_pin']
trailing_columns = ['homeless_past', 'homeless_recent', 'police_interaction', 'homeless',
                    'date_difference']
subjects_h = subjects_h[leading_columns + col_list + trailing_columns]

# Distinct age_categorical labels present in the data
subjects_h.age_categorical.unique()


def _write_profile(frame, html_path):
    """Build a minimal profiling report for ``frame``, write it to ``html_path``, and return it."""
    report = pp.ProfileReport(frame, minimal=True)
    report.to_file(output_file=html_path)
    return report


# Report for the full cohort
profile = _write_profile(subjects_h, "Descriptive_analysis_homelessness.html")

# Report for the rows where homeless == 1
subjects_h_y = subjects_h[subjects_h.homeless == 1]
profile = _write_profile(subjects_h_y, "Descriptive_analysis_individuals_with_homelessness.html")

# Report for the rows where homeless == 0
subjects_h_n = subjects_h[subjects_h.homeless == 0]
profile = _write_profile(subjects_h_n, "Descriptive_analysis_individuals_without_homelessness.html")


# Odds Ratio

In [None]:
# One-hot encode the categorical covariates and append the indicator columns.
# dtype=np.uint8 pins the indicators to a numeric dtype (the pre-2.0 pandas
# default); pandas >= 2.0 would otherwise emit booleans, which become an
# object-dtype design matrix next to the float constant and are rejected by
# statsmodels.
for source_column, dummy_prefix in (
    ('sex', 'sex'),
    ('age_categorical', 'age'),
    ('quintmat', 'quintmat'),
    ('quintsoc', 'quintsoc'),
):
    subjects_h = subjects_h.join(
        pd.get_dummies(subjects_h[source_column], prefix=dummy_prefix, dtype=np.uint8)
    )

# Predictors for the one-at-a-time models: the male indicator plus every 0/1
# column named in col_list (same columns, same order as before).
features = subjects_h[['sex_M'] + col_list]

y = subjects_h['homeless']


def _univariate_odds(outcome, predictor):
    """Fit ``outcome ~ const + predictor`` with a logit model and summarise it.

    Parameters
    ----------
    outcome : pandas.Series
        Binary response passed to ``sm.Logit`` as the endogenous variable.
    predictor : pandas.Series
        Single explanatory column; its ``name`` is reported as 'Feature'.

    Returns
    -------
    dict
        One summary record: odds ratio, coefficient, p-value, the bounds
        returned by ``conf_int()`` on the odds scale (labelled 2.5% / 97.5%),
        the likelihood-ratio p-value and the pseudo R-squared.
    """
    design = sm.add_constant(predictor)
    fit = sm.Logit(outcome, design).fit()

    # Position 0 is the constant prepended by add_constant, position 1 is the
    # predictor. .iloc makes the positional lookup explicit instead of relying
    # on integer [] access to a label-indexed Series.
    coefficient = fit.params.iloc[1]
    odds_ci = np.exp(fit.conf_int())  # computed once, used for both bounds

    return {
        'Feature': predictor.name,
        'OR': np.round(np.exp(coefficient), 2),
        'Coef': np.round(coefficient, 3),
        'p-value': np.round(fit.pvalues.iloc[1], 2),
        '2.5%': np.round(odds_ci.iloc[1, 0], 2),
        '97.5%': np.round(odds_ci.iloc[1, 1], 2),
        'LLR p-value': np.round(fit.llr_pvalue, 5),
        'Pseudo R-squared': fit.prsquared,
    }


# Build the table in one go from a list of records: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row is quadratic anyway.
model_odds = pd.DataFrame(
    [_univariate_odds(y, features[feature_name]) for feature_name in features.columns]
)



In [None]:
# Age-band indicators entering the model. 'age_30-39' is not among them, so the
# odds ratios below are relative to the rows in none of the listed bands.
# NOTE(review): presumably 30-39 is the intended reference group — confirm.
features_age = subjects_h[['age_18-29', 'age_40-49', 'age_50-59', 'age_60+']]

# Multivariable logistic regression of y on the age indicators plus a constant.
# The float cast gives the design matrix a single numeric dtype even when the
# indicator columns were created as booleans.
X = sm.add_constant(features_age.astype(float))
logit_model = sm.Logit(y, X)
result = logit_model.fit()

# Odds-scale bounds from conf_int(), odds ratios and p-values, rounded to two decimals
df_results = np.round(np.exp(result.conf_int()), 2)
df_results.columns = ['2.5%', '97.5%']
df_results['OR'] = np.round(np.exp(result.params), 2)
df_results['P_value'] = np.round(result.pvalues.values, 2)

# Display the results table
df_results


In [None]:
# Write subjects_h to an HDF5 file under the key 'df_sub_homeless_preproc'.
# key is passed by keyword: positional arguments after the path are deprecated
# in recent pandas releases.
subjects_h.to_hdf("df_sub_homeless_preproc.h5", key='df_sub_homeless_preproc')