In [4]:
# Import libraries
import pyreadstat
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import re
import os
import tqdm
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()
import dataframe_image as dfi
import time
from alive_progress import config_handler
import graphviz
import h5py

import statsmodels.api as sm

import pandas_profiling as pp
from pandas_profiling import ProfileReport

# Raise pandas' display limits so long frames, wide frames, .info() listings
# and long sequences are printed in full instead of being truncated.
pd.set_option('display.max_rows', 2000)
pd.set_option('display.max_info_columns', 2000)
pd.set_option('display.max_seq_items', 1000)
pd.set_option('display.max_columns', 1000)

# Load the subject table stored under the 'df_subjects_retro_policing' key
# of the HDF5 file, then report its (rows, columns) shape.
subjects_h = pd.read_hdf(
    'data/df_subjects_retro_policing.h5',
    key='df_subjects_retro_policing',
)
print("the shape of the dataframe", subjects_h.shape, sep="\n")

# Create age_categorical column.
# Vectorised banding of 'age' into half-open intervals [18, 30), [30, 40),
# [40, 50), [50, 60) and [60, inf). Any value matching no band (below 18 or
# NaN, whose comparisons are all False) receives the empty string, exactly as
# the previous per-row lambda did.
age_values = subjects_h['age']
subjects_h['age_categorical'] = np.select(
    [
        (age_values >= 18) & (age_values < 30),
        (age_values >= 30) & (age_values < 40),
        (age_values >= 40) & (age_values < 50),
        (age_values >= 50) & (age_values < 60),
        age_values >= 60,
    ],
    ['18-29', '30-39', '40-49', '50-59', '60+'],
    default='',
)

# Replace missing entries of police_interaction_followup with 0, then list the
# distinct values that remain in the column.
followup_col = 'police_interaction_followup'
subjects_h[followup_col] = subjects_h[followup_col].fillna(0)

print("unique values in police_interaction_followup column",
      subjects_h[followup_col].unique(),
      sep="\n")

# Columns that get collapsed to 0/1 indicators further down.
# The order is significant only for readability; it matches the original list.
col_list = (
    # columns without a common prefix
    ['substance', 'mood', 'anxiety', 'psychotic', 'cognitive', 'otherpsych', 'selfharm']
    # visit_* columns
    + ['visit_emr_MH_non_elect', 'visit_emr_NonMH', 'visit_emr_visit', 'visit_family_gp',
       'visit_hosp_visit', 'visit_hospitalized_MH', 'visit_hospitalized_NonMH', 'visit_im',
       'visit_neurology', 'visit_other', 'visit_pharmacy', 'visit_psychiatry']
    # EX_* / Ex_* columns (note the mixed capitalisation of the prefix)
    + ['EX_CHF', 'EX_Arrhy', 'EX_VD', 'EX_PCD', 'EX_PVD', 'EX_HPTN_UC', 'EX_HPTN_C', 'EX_Para']
    + ['Ex_OthND', 'Ex_COPD', 'Ex_Diab_UC', 'Ex_Diab_C', 'Ex_Hptothy', 'Ex_RF', 'Ex_LD']
    + ['Ex_PUD_NB', 'Ex_HIV', 'Ex_Lymp', 'Ex_METS', 'Ex_Tumor', 'Ex_Rheum_A', 'Ex_Coag']
    + ['Ex_Obesity', 'Ex_WL', 'Ex_Fluid', 'Ex_BLA', 'Ex_DA', 'Ex_Alcohol', 'Ex_Drug']
    + ['Ex_Psycho', 'Ex_Dep', 'Ex_Stroke', 'Ex_Dyslipid', 'Ex_Sleep', 'Ex_IHD']
    + ['EX_Fall', 'EX_Urinary', 'EX_Visual', 'EX_Hearing', 'EX_Tobacco', 'EX_Delirium']
    + ['Ex_MS', 'EX_parkinsons']
    # column later used as the regression target
    + ['police_interaction_followup']
)

# Function to convert values to 0 or 1
def return_zero_one(value):
    """Collapse ``value`` to a binary indicator.

    Returns 0 when ``value == 0`` and 1 for every other input.

    Note: NaN and None do not compare equal to 0, so they are mapped to 1.
    """
    return 0 if value == 0 else 1

# Convert columns to 0 and 1: a value equal to 0 stays 0, anything else becomes 1.
# Vectorised equivalent of mapping return_zero_one over every element; the
# comparison runs on whole columns instead of one Python call per cell.
# As with the element-wise version, NaN compares unequal to 0 and becomes 1.
for col in col_list:
    subjects_h[col] = subjects_h[col].ne(0).astype('int64')

# Columns retained for the rest of the notebook, in the order they should
# appear in the resulting frame.
columns_of_interest = [
    # columns without a common prefix
    'subject_id', 'sex', 'age', 'age_categorical', 'quintmat', 'quintsoc',
    # db_* columns
    'db_claim', 'db_nacrs', 'db_dad', 'db_pin',
    # flag columns without a common prefix
    'substance', 'mood', 'anxiety', 'psychotic', 'cognitive', 'otherpsych', 'selfharm',
    # visit_* columns
    'visit_emr_MH_non_elect', 'visit_emr_NonMH', 'visit_emr_visit',
    'visit_hosp_visit', 'visit_hospitalized_MH', 'visit_hospitalized_NonMH',
    'visit_family_gp', 'visit_im', 'visit_neurology', 'visit_other',
    'visit_pharmacy', 'visit_psychiatry',
    # EX_* / Ex_* columns
    'EX_CHF', 'EX_Arrhy', 'EX_VD', 'EX_PCD', 'EX_PVD', 'EX_HPTN_UC', 'EX_HPTN_C',
    'EX_Para', 'Ex_OthND', 'Ex_COPD', 'Ex_Diab_UC', 'Ex_Diab_C', 'Ex_Hptothy',
    'Ex_RF', 'Ex_LD', 'Ex_PUD_NB', 'Ex_HIV', 'Ex_Lymp', 'Ex_METS', 'Ex_Tumor',
    'Ex_Rheum_A', 'Ex_Coag', 'Ex_Obesity', 'Ex_WL', 'Ex_Fluid', 'Ex_BLA', 'Ex_DA',
    'Ex_Alcohol', 'Ex_Drug', 'Ex_Psycho', 'Ex_Dep', 'Ex_Stroke', 'Ex_Dyslipid',
    'Ex_Sleep', 'Ex_IHD', 'EX_Fall', 'EX_Urinary', 'EX_Visual', 'EX_Hearing',
    'EX_Tobacco', 'EX_Delirium', 'Ex_MS', 'EX_parkinsons',
    # column later used as the regression target
    'police_interaction_followup',
]
subjects_h = subjects_h.loc[:, columns_of_interest]

# Sanity check: list the age bands that actually occur in the data.
print("unique values in age_categorical column")
print(subjects_h['age_categorical'].unique())



the shape of the dataframe
(237141, 81)
unique values in police_interaction_followup column
[  0.   1.  71.  22.  52.   2.   3.  30.   5.  23.   9.  10.  12.  20.
  11.   4.  26.  31.  15.  18.   6.  37.   8.  13.  74.  19.   7.  17.
  32.  50.  29.  43.  45.  54.  27.  60. 100.  25.  58.  55.  44.  91.
  21.  41.  16.  28.  53.  14. 167.  34. 112.  42.  35. 180.  48. 254.
 101. 127.  24.  33.  63.]
unique values in age_categorical column
['50-59' '18-29' '30-39' '60+' '40-49']


In [5]:
# Subsets of subjects_h where police_interaction_followup equals 1 and 0.
subjects_h_y = subjects_h[subjects_h.police_interaction_followup == 1]
subjects_h_n = subjects_h[subjects_h.police_interaction_followup == 0]

# Write one minimal profiling report per frame: the full table first, then the
# police_interaction_followup == 1 subset, then the == 0 subset.
for cohort, report_path in (
    (subjects_h, "Descriptive_analysis_police_interaction.html"),
    (subjects_h_y, "Descriptive_analysis_individuals_with_police_interaction.html"),
    (subjects_h_n, "Descriptive_analysis_individuals_without_police_interaction.html"),
):
    profile = pp.ProfileReport(cohort, minimal=True)
    profile.to_file(output_file=report_path)


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

# Odds Ratio

In [None]:
# One-hot encode 'sex' and 'age_categorical' and attach the dummy columns.
# dtype='uint8' is the pre-2.0 pandas default; pinning it keeps the dummies
# numeric on pandas >= 2.0, where the default became bool.
sex_dummies = pd.get_dummies(subjects_h['sex'], prefix='sex', dtype='uint8')
age_dummies = pd.get_dummies(subjects_h['age_categorical'], prefix='age', dtype='uint8')
dummies = sex_dummies.join(age_dummies)

# Drop dummy columns left over from a previous run of this cell before joining,
# so re-running the cell does not fail on overlapping column names.
subjects_h = subjects_h.drop(columns=dummies.columns, errors='ignore').join(dummies)

# Select the features of interest
features = subjects_h[['sex_M', 'substance', 'mood', 'anxiety', 'psychotic', 'cognitive', 'otherpsych',
                      'selfharm', 'visit_emr_MH_non_elect', 'visit_emr_NonMH', 'visit_emr_visit',
                      'visit_hosp_visit', 'visit_hospitalized_MH', 'visit_hospitalized_NonMH',
                      'visit_family_gp', 'visit_im', 'visit_neurology', 'visit_other', 'visit_pharmacy',
                      'visit_psychiatry', 'EX_CHF', 'EX_Arrhy', 'EX_VD', 'EX_PCD', 'EX_PVD', 'EX_HPTN_UC',
                      'EX_HPTN_C', 'EX_Para', 'Ex_OthND', 'Ex_COPD', 'Ex_Diab_UC', 'Ex_Diab_C', 'Ex_Hptothy',
                      'Ex_RF', 'Ex_LD', 'Ex_PUD_NB', 'Ex_HIV', 'Ex_Lymp', 'Ex_METS', 'Ex_Tumor', 'Ex_Rheum_A',
                      'Ex_Coag', 'Ex_Obesity', 'Ex_WL', 'Ex_Fluid', 'Ex_BLA', 'Ex_DA', 'Ex_Alcohol', 'Ex_Drug',
                      'Ex_Psycho', 'Ex_Dep', 'Ex_Stroke', 'Ex_Dyslipid', 'Ex_Sleep', 'Ex_IHD', 'EX_Fall',
                      'EX_Urinary', 'EX_Visual', 'EX_Hearing', 'EX_Tobacco', 'EX_Delirium', 'Ex_MS',
                      'EX_parkinsons']]

# Set y as the target variable
y = subjects_h['police_interaction_followup']

# Fit one logistic regression per feature (intercept + that single feature)
# and collect one summary record per model. Records are gathered in a list and
# turned into a DataFrame once at the end: DataFrame.append was removed in
# pandas 2.0 and copies the whole frame on every call.
odds_records = []
for feature_name in features.columns.values:
    X = sm.add_constant(features[feature_name])
    logit_model = sm.Logit(y, X)
    result = logit_model.fit()

    # Exponentiated confidence bounds, computed once per model.
    # Columns 0 and 1 hold the lower and upper bound respectively.
    odds_ci = np.exp(result.conf_int())

    # Look the feature's row up by label rather than by position 1, so the
    # intercept can never be picked up by mistake.
    odds_records.append({
        'Feature': feature_name,
        'OR': np.round(np.exp(result.params[feature_name]), 2),
        'Coef': np.round(result.params[feature_name], 3),
        'p-value': np.round(result.pvalues[feature_name], 2),
        '2.5%': np.round(odds_ci.loc[feature_name, 0], 2),
        '97.5%': np.round(odds_ci.loc[feature_name, 1], 2),
        'LLR p-value': np.round(result.llr_pvalue, 5),
        'Pseudo R-squared': result.prsquared,
    })

# One row per feature, columns in the key order used above.
model_odds = pd.DataFrame(odds_records)


In [8]:
# Display the per-feature table built by the odds-ratio loop
# (one row per feature: OR, coefficient, p-value, CI bounds, LLR p-value, pseudo R-squared).
model_odds

Unnamed: 0,Feature,OR,Coef,p-value,2.5%,97.5%,LLR p-value,Pseudo R-squared
0,sex_M,3.37,1.216,0.0,2.88,3.95,0.0,0.02512084
1,substance,8.37,2.124,0.0,7.19,9.73,0.0,0.07853904
2,mood,3.0,1.1,0.0,2.58,3.5,0.0,0.02116889
3,anxiety,1.48,0.395,0.0,1.26,1.75,0.0,0.002363076
4,psychotic,7.55,2.022,0.0,6.36,8.96,0.0,0.03525499
5,cognitive,1.3,0.265,0.49,0.62,2.75,0.50418,4.358587e-05
6,otherpsych,1.84,0.612,0.0,1.59,2.14,0.0,0.006221421
7,selfharm,8.79,2.174,0.0,6.37,12.14,0.0,0.009816966
8,visit_emr_MH_non_elect,8.86,2.181,0.0,7.65,10.26,0.0,0.08168024
9,visit_emr_NonMH,3.97,1.38,0.0,2.8,5.63,0.0,0.009112638


In [131]:
# Select the age-related features. 'age_30-39' is deliberately not listed, so
# with the intercept added below it serves as the reference band.
features_age = subjects_h[['age_18-29', 'age_40-49', 'age_50-59', 'age_60+']]

# Add constant term to features
X = sm.add_constant(features_age)

# Fit a single logistic regression with all age dummies together
logit_model = sm.Logit(y, X)
result = logit_model.fit()

# Exponentiate coefficients and confidence bounds to get odds ratios.
# conf_int() columns 0 and 1 hold the lower and upper bound respectively.
odds_ci = np.exp(result.conf_int())

# Build the summary table in one step, already in the desired column order.
# Rows are indexed by model term (the constant and each age dummy).
df_results = pd.DataFrame({
    'OR': np.round(np.exp(result.params), 2),
    '2.5%': np.round(odds_ci[0], 2),
    '97.5%': np.round(odds_ci[1], 2),
    'P_value': np.round(result.pvalues, 2),
})


In [None]:
# Save preprocessed DataFrame to HDF file.
# `key` is passed by keyword: recent pandas versions deprecate passing any
# to_hdf argument other than the path positionally.
subjects_h.to_hdf("data/df_subjects_retro_policing_preproc.h5",
                  key='df_subjects_retro_policing_preproc')