In [1]:
# Import libraries
import pyreadstat
import pandas as pd 
import numpy as np
from datetime import datetime, timedelta
import re
import os
import tqdm
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()
import dataframe_image as dfi
import time
from alive_progress import config_handler
import graphviz 
import h5py

import statsmodels.api as sm 

import pandas_profiling as pp
from pandas_profiling import ProfileReport

# Raise pandas' display limits so wide/long frames are not truncated in output
pd.set_option("display.max_rows", 2000)
pd.set_option("display.max_info_columns", 2000)
pd.set_option("display.max_seq_items", 1000)
pd.set_option("display.max_columns", 1000)

# Load the subject-level table stored under the 'df_subjects_policing' key
subjects_p = pd.read_hdf('df_subjects_policing.h5', key='df_subjects_policing')

def _age_group_label(age):
    """Return the age-band label for ``age``.

    Bands are '18-29', '30-39', '40-49', '50-59' and '60+'. Any value that
    satisfies none of the lower bounds (below 18, or NaN, for which every
    comparison is False) gets the empty string.
    """
    if age >= 60:
        return '60+'
    if age >= 50:
        return '50-59'
    if age >= 40:
        return '40-49'
    if age >= 30:
        return '30-39'
    if age >= 18:
        return '18-29'
    return ''


# Derive the age band of every subject from the 'age' column
subjects_p['age_categorical'] = subjects_p['age'].map(_age_group_label)

# Columns that get collapsed to a 0/1 indicator further down
col_list = [
    # diagnosis / self-harm flags
    'substance', 'mood', 'anxiety', 'psychotic', 'cognitive', 'otherpsych', 'selfharm',
    # visit_* columns
    'visit_emr_MH_non_elect', 'visit_emr_NonMH', 'visit_emr_visit',
    'visit_family_gp', 'visit_hosp_visit',
    'visit_hospitalized_MH', 'visit_hospitalized_NonMH',
    'visit_im', 'visit_neurology', 'visit_other', 'visit_pharmacy', 'visit_psychiatry',
    # EX_* / Ex_* columns
    'EX_CHF', 'EX_Arrhy', 'EX_VD', 'EX_PCD', 'EX_PVD', 'EX_HPTN_UC', 'EX_HPTN_C', 'EX_Para',
    'Ex_OthND', 'Ex_COPD', 'Ex_Diab_UC', 'Ex_Diab_C', 'Ex_Hptothy', 'Ex_RF', 'Ex_LD',
    'Ex_PUD_NB', 'Ex_HIV', 'Ex_Lymp', 'Ex_METS', 'Ex_Tumor', 'Ex_Rheum_A', 'Ex_Coag',
    'Ex_Obesity', 'Ex_WL', 'Ex_Fluid', 'Ex_BLA', 'Ex_DA', 'Ex_Alcohol', 'Ex_Drug',
    'Ex_Psycho', 'Ex_Dep', 'Ex_Stroke', 'Ex_Dyslipid', 'Ex_Sleep', 'Ex_IHD',
    'EX_Fall', 'EX_Urinary', 'EX_Visual', 'EX_Hearing', 'EX_Tobacco', 'EX_Delirium',
    'Ex_MS', 'EX_parkinsons',
]

# Convert selected columns to binary (0 or 1)
def return_zero_one(value):
    """Collapse ``value`` to a binary flag.

    Returns 0 when ``value`` compares equal to 0 and 1 for everything else.
    Note that NaN does not compare equal to 0, so it is mapped to 1.
    """
    return 0 if value == 0 else 1

# Replace every column named in col_list by its 0/1 version
subjects_p = subjects_p.assign(
    **{name: subjects_p[name].map(return_zero_one) for name in col_list}
)

# Keep the leading descriptive columns, then every column named in col_list
# (same names, same order), and finally 'police_interaction'
leading_columns = ['subject_id', 'sex', 'age', 'age_categorical', 'quintmat', 'quintsoc']
subjects_p = subjects_p[leading_columns + col_list + ['police_interaction']]


  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,


In [None]:
def write_profile_report(frame, output_file):
    """Build a pandas-profiling report for ``frame`` and write it to ``output_file``.

    The report is created with ``minimal=True`` and saved through
    ``ProfileReport.to_file``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to profile.
    output_file : str
        Path of the HTML file to write.

    Returns
    -------
    The ``ProfileReport`` object that was written.
    """
    report = pp.ProfileReport(frame, minimal=True)
    report.to_file(output_file=output_file)
    return report


# Report for the entire dataset
profile = write_profile_report(
    subjects_p, "Descriptive_analysis_police_interaction.html"
)

# Report for individuals with police interaction
subjects_p_y = subjects_p[subjects_p.police_interaction == 1]
profile = write_profile_report(
    subjects_p_y, "Descriptive_analysis_individuals_with_police_interaction.html"
)

# Report for individuals without police interaction
subjects_p_n = subjects_p[subjects_p.police_interaction == 0]
profile = write_profile_report(
    subjects_p_n, "Descriptive_analysis_individuals_without_police_interaction.html"
)


# Odds Ratio

In [9]:
# Create dummy variables for the 'sex' and 'age_categorical' columns.
# The dtype is pinned to uint8 (the default before pandas 2.0): newer pandas
# returns bool columns, and statsmodels refuses the object array that results
# once add_constant() puts a float 'const' column next to them.
sex_dummies = pd.get_dummies(subjects_p['sex'], prefix='sex', dtype='uint8')
age_dummies = pd.get_dummies(subjects_p['age_categorical'], prefix='age', dtype='uint8')

# NOTE: join() raises on overlapping column names, so this cell can only be
# run once per freshly loaded subjects_p.
subjects_p = subjects_p.join(sex_dummies)
subjects_p = subjects_p.join(age_dummies)

# Predictors for the per-feature logistic regressions. The order is kept
# exactly as written because it determines the row order of the results table.
feature_names = [
    'sex_M',
    'substance', 'mood', 'anxiety', 'psychotic', 'cognitive', 'otherpsych', 'selfharm',
    'visit_emr_MH_non_elect', 'visit_emr_NonMH', 'visit_emr_visit',
    'visit_hosp_visit', 'visit_hospitalized_MH', 'visit_hospitalized_NonMH',
    'visit_family_gp', 'visit_im', 'visit_neurology', 'visit_other', 'visit_pharmacy',
    'visit_psychiatry',
    'EX_CHF', 'EX_Arrhy', 'EX_VD', 'EX_PCD', 'EX_PVD', 'EX_HPTN_UC', 'EX_HPTN_C', 'EX_Para',
    'Ex_OthND', 'Ex_COPD', 'Ex_Diab_UC', 'Ex_Diab_C', 'Ex_Hptothy', 'Ex_RF', 'Ex_LD',
    'Ex_PUD_NB', 'Ex_HIV', 'Ex_Lymp', 'Ex_METS', 'Ex_Tumor', 'Ex_Rheum_A', 'Ex_Coag',
    'Ex_Obesity', 'Ex_WL', 'Ex_Fluid', 'Ex_BLA', 'Ex_DA', 'Ex_Alcohol', 'Ex_Drug',
    'Ex_Psycho', 'Ex_Dep', 'Ex_Stroke', 'Ex_Dyslipid', 'Ex_Sleep', 'Ex_IHD',
    'EX_Fall', 'EX_Urinary', 'EX_Visual', 'EX_Hearing', 'EX_Tobacco', 'EX_Delirium',
    'Ex_MS', 'EX_parkinsons',
]
features = subjects_p[feature_names]

# Response variable of the regressions
y = subjects_p['police_interaction']

# Fit one logistic regression per feature (intercept + that single predictor)
# and collect its odds ratio, coefficient, p-value and 95% interval.
# Rows are gathered in a plain list and turned into a DataFrame once at the
# end: DataFrame.append no longer exists in pandas 2.x and copied the whole
# frame on every iteration.
odds_records = []

for feature_name in features.columns.values:
    # Add constant to the predictor variable
    X = sm.add_constant(features[feature_name])

    # Fit the logistic regression model
    logit_model = sm.Logit(y, X)
    result = logit_model.fit()

    # Position 0 is the constant, position 1 is the feature itself; .iloc keeps
    # the lookup positional even though the results are indexed by label.
    conf_int = result.conf_int()
    odds_records.append({
        'Feature': feature_name,
        'OR': np.round(np.exp(result.params.iloc[1]), 2),
        'Coef': np.round(result.params.iloc[1], 3),
        'p-value': np.round(result.pvalues.iloc[1], 2),
        '2.5%': np.round(np.exp(conf_int.iloc[1, 0]), 2),
        '97.5%': np.round(np.exp(conf_int.iloc[1, 1]), 2),
        'LLR p-value': result.llr_pvalue,
        'Pseudo R-squared': result.prsquared,
    })

# One row per feature, in the order of features.columns
model_odds = pd.DataFrame(odds_records)


In [14]:
# Display the per-feature results, largest odds ratio first
model_odds.sort_values(by="OR", ascending=False)

Unnamed: 0,Feature,OR,Coef,p-value,2.5%,97.5%,LLR p-value,Pseudo R-squared
49,Ex_Psycho,11.77,2.466,0.0,11.03,12.56,0.0,0.116286
4,psychotic,10.55,2.356,0.0,9.83,11.32,0.0,0.078402
48,Ex_Drug,8.86,2.182,0.0,8.31,9.46,0.0,0.092688
19,visit_psychiatry,7.75,2.047,0.0,7.19,8.35,0.0,0.089725
47,Ex_Alcohol,7.37,1.998,0.0,6.86,7.93,0.0,0.053594
12,visit_hospitalized_MH,7.23,1.978,0.0,6.69,7.81,0.0,0.045224
8,visit_emr_MH_non_elect,6.43,1.861,0.0,6.02,6.86,0.0,0.077739
1,substance,5.58,1.719,0.0,5.23,5.96,0.0,0.067407
7,selfharm,5.32,1.672,0.0,4.46,6.35,1.438657e-51,0.005799
13,visit_hospitalized_NonMH,5.18,1.646,0.0,4.75,5.65,1.816431e-217,0.025173


In [15]:
# Odds ratios for the age groups, estimated jointly in one logistic regression.
# 'age_30-39' is not among the predictors, which presumably makes it the
# reference group -- confirm.
features_age = subjects_p[['age_18-29', 'age_40-49', 'age_50-59', 'age_60+']]

X = sm.add_constant(features_age)
logit_model = sm.Logit(y, X)
result = logit_model.fit()

# Exponentiate the coefficient intervals and estimates to the odds-ratio scale
df_results = np.round(np.exp(result.conf_int()), 2)
df_results.columns = ['2.5%', '97.5%']
df_results['OR'] = np.round(np.exp(result.params), 2)
df_results['P_value'] = np.round(result.pvalues, 2)

df_results[['OR', '2.5%', '97.5%', 'P_value']]

In [104]:
# Persist the preprocessed table. `key` is passed by keyword because newer
# pandas releases deprecate positional arguments after the path.
# NOTE(review): object-dtype columns get pickled by PyTables; casting them to
# concrete dtypes before saving would avoid that -- confirm the intended dtypes.
subjects_p.to_hdf("df_sub_policing_preproc.h5", key='df_sub_policing_preproc')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['subject_id', 'sex', 'age', 'age_categorical', 'quintmat', 'quintsoc'], dtype='object')]

  subjects_p.to_hdf("df_sub_policing_preproc.h5",'df_sub_policing_preproc')
