In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Stretch the notebook container to the full browser width.
# display/HTML come from the public IPython.display module; importing them
# from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

#### Read in first dataset - Form I dataset includes baseline demographic and clinical information of persons who met eligibility criteria.

In [None]:
# Load the Form I CSV from the local Capstone data folder and show it.
f1 = pd.read_csv(
    filepath_or_buffer='C:/Users/dmora/Documents/NSS/Capstone/data/f1_public_2021.csv'
)
f1

#### Create df of base variables

In [None]:
# Subset the Form I frame to the baseline variables this notebook selects.
# Column order is kept exactly as listed.
f1_base = f1[[
    'UniID',
    'AAdmDt', 'ARbAdmDt', 'ADisDt',
    'AI2ADays', 'AI2RhADa',
    'AInjAge', 'ASex',
    'APResInj', 'APResDis',
    'AMarStIj', 'AEducLvl',
    'ABPHQ1', 'ABPHQ2', 'ABPHQ9', 'ABPHQ6', 'ABPHQMDS', 'ABPHQSDS',
    'APrLvlSt', 'AFmIncLv',
    'AHDaSyRb',
    'AFScorRb', 'AFScorDs',
]]

f1_base

#### Rename columns for easier understanding.

In [None]:
# Give the selected Form I columns readable snake_case names.
# AFScorRb and AFScorDs are not in the mapping and keep their original names.
f1_base = f1_base.rename(columns={
    'UniID': 'unique_id',
    'AAdmDt': 'sys_adm_dt',
    'ARbAdmDt': 'reh_adm_dt',
    'ADisDt': 'discharge_dt',
    'AI2ADays': 'days_inj_to_sys',
    'AI2RhADa': 'days_inj_to_reh',
    'AInjAge': 'age_at_injury',
    'ASex': 'sex',
    'APResInj': 'res_at_inj',
    'APResDis': 'res_at_dis',
    'AMarStIj': 'marital_stat_ai',
    'AEducLvl': 'educ_lvl_ai',
    'ABPHQ1': 'interest_or_pleasure',
    'ABPHQ2': 'depressed',
    'ABPHQ9': 'self_harm',
    'ABPHQ6': 'feel_bad',
    'ABPHQMDS': 'depressive_syndrome',
    'ABPHQSDS': 'severity_of_depression',
    'APrLvlSt': 'occup_status',
    'AFmIncLv': 'income_level',
    'AHDaSyRb': 'days_rehab1',
})

f1_base

#### Fill missing values in the reh_adm_dt column with 0 so the column can then be cast to int, keeping year values consistent across the df.

In [None]:
# Replace missing reh_adm_dt values with 0; all other columns are untouched.
f1_base = f1_base.fillna({'reh_adm_dt': 0})
f1_base

#### Change dtype of reh_adm_dt to integer

In [None]:
# Cast reh_adm_dt to integer (missing values were filled with 0 beforehand).
f1_base = f1_base.astype({'reh_adm_dt': 'int'})

#### Recode the numeric sex column into a new gender column that holds a letter label for each code

In [None]:
# Recode the numeric `sex` codes into a new single-letter `gender` column.
# Series.map does the lookup for the whole column at once instead of writing
# one cell per row; any code not listed in the mapping (and any missing value)
# becomes NaN, which is what the if/elif chain left for unmatched rows.
# The column is created even if no row matches.
f1_base['gender'] = f1_base['sex'].map({
    1: 'm',
    2: 'f',
    3: 't',
    9: 'u',
})

#### Drop sex column

In [None]:
# The coded column has been replaced by `gender`; remove it.
f1_base = f1_base.drop(columns='sex')
f1_base

#### Recode the numeric res_at_inj codes into a new res_at_injb column that holds the type of residence instead of the code number

In [None]:
# Recode the numeric residence-at-injury codes into text labels in a new
# `res_at_injb` column.
# Series.map replaces the row-by-row loop; codes that are not listed here
# (and missing values) become NaN, as they did with the if/elif chain.
# The column is created even if no row matches.
f1_base['res_at_injb'] = f1_base['res_at_inj'].map({
    1: 'pri_res',
    2: 'hospital',
    3: 'nurs_home',
    4: 'group',
    5: 'correctional',
    6: 'motel',
    7: 'deceased',
    8: 'other',
    9: 'homeless',
    10: 'assisted',
    99: 'unknown',
})

#### Drop original numerical res_at_inj column

In [None]:
# Remove the coded residence-at-injury column now that res_at_injb holds the labels.
f1_base = f1_base.drop(columns=['res_at_inj'])
f1_base

#### Recode the numeric res_at_dis codes into a new res_at_disb column that holds the type of residence instead of the code number

In [None]:
# Recode the numeric residence-at-discharge codes into text labels in a new
# `res_at_disb` column.
# Series.map replaces the row-by-row loop; codes that are not listed here
# (and missing values) become NaN, as they did with the if/elif chain.
# The column is created even if no row matches.
f1_base['res_at_disb'] = f1_base['res_at_dis'].map({
    1: 'pri_res',
    2: 'hospital',
    3: 'nurs_home',
    4: 'group',
    5: 'correctional',
    6: 'motel',
    7: 'deceased',
    8: 'other',
    9: 'homeless',
    10: 'assisted',
    99: 'unknown',
})

#### Drop original res_at_dis column

In [None]:
# Remove the coded residence-at-discharge column now that res_at_disb holds the labels.
f1_base = f1_base.drop(columns=['res_at_dis'])
f1_base

In [None]:
# Give the labelled residence columns the names of the coded columns they replaced.
f1_base = f1_base.rename(
    columns={
        'res_at_injb': 'res_at_inj',
        'res_at_disb': 'res_at_dis',
    }
)
f1_base

In [None]:
# Recode the numeric marital-status codes into text labels in a new
# `marital_stat_aib` column.
# Series.map replaces the row-by-row loop; codes that are not listed here
# (and missing values) become NaN, as they did with the if/elif chain.
# The column is created even if no row matches.
f1_base['marital_stat_aib'] = f1_base['marital_stat_ai'].map({
    1: 'single',
    2: 'married',
    3: 'divorced',
    4: 'separated',
    5: 'widowed',
    6: 'other',
    7: 'living_with',
    9: 'unknown',
})

In [None]:
# Remove the coded marital-status column; marital_stat_aib holds the labels.
f1_base = f1_base.drop(columns='marital_stat_ai')

In [None]:
# Recode the numeric education-level codes into text labels in a new
# `educ_lvl_aib` column.
# Series.map replaces the row-by-row loop; codes that are not listed here
# (and missing values) become NaN, as they did with the if/elif chain.
# The column is created even if no row matches.
f1_base['educ_lvl_aib'] = f1_base['educ_lvl_ai'].map({
    1: '8th_or_less',
    2: '9th_11th',
    3: 'hsd',
    4: 'ad',
    5: 'bd',
    6: 'md',
    7: 'doc',
    8: 'other',
    9: 'unknown',
})

In [None]:
# Remove the coded education column; educ_lvl_aib holds the labels.
f1_base = f1_base.drop(columns='educ_lvl_ai')

In [None]:
# Recode the numeric interest_or_pleasure responses into text labels in a new
# `int_or_pleas` column.
# Series.map replaces the row-by-row loop; codes that are not listed here
# (and missing values) become NaN, as they did with the if/elif chain.
# The column is created even if no row matches.
f1_base['int_or_pleas'] = f1_base['interest_or_pleasure'].map({
    0: 'none',
    1: 'several_days',
    2: 'more_than_half',
    3: 'nearly_every_day',
    7: 'declined',
    9: 'unknown',
})

In [None]:
# Remove the coded column; int_or_pleas holds the labels.
f1_base = f1_base.drop(columns='interest_or_pleasure')

In [None]:
# Recode the numeric `depressed` responses into text labels in a new
# `depressedb` column.
# Series.map replaces the row-by-row loop; codes that are not listed here
# (and missing values) become NaN, as they did with the if/elif chain.
# The column is created even if no row matches.
f1_base['depressedb'] = f1_base['depressed'].map({
    0: 'none',
    1: 'several_days',
    2: 'more_than_half',
    3: 'nearly_every_day',
    7: 'declined',
    9: 'unknown',
})

In [None]:
# Remove the coded column; depressedb holds the labels.
f1_base = f1_base.drop(columns='depressed')

In [None]:
# Recode the numeric `self_harm` responses into text labels in a new
# `self_harmb` column.
# Series.map replaces the row-by-row loop; codes that are not listed here
# (and missing values) become NaN, as they did with the if/elif chain.
# The column is created even if no row matches.
f1_base['self_harmb'] = f1_base['self_harm'].map({
    0: 'none',
    1: 'several_days',
    2: 'more_than_half',
    3: 'nearly_every_day',
    7: 'declined',
    9: 'unknown',
})

In [None]:
# Remove the coded column; self_harmb holds the labels.
f1_base = f1_base.drop(columns='self_harm')

In [None]:
# Recode the numeric `feel_bad` responses into text labels in a new
# `feel_badb` column.
# Series.map replaces the row-by-row loop; codes that are not listed here
# (and missing values) become NaN, as they did with the if/elif chain.
# The column is created even if no row matches.
f1_base['feel_badb'] = f1_base['feel_bad'].map({
    0: 'none',
    1: 'several_days',
    2: 'more_than_half',
    3: 'nearly_every_day',
    7: 'declined',
    9: 'unknown',
})

In [None]:
# Remove the coded column; feel_badb holds the labels.
f1_base = f1_base.drop(columns='feel_bad')

In [None]:
# Recode the numeric `depressive_syndrome` codes into text labels in a new
# `dep_syn` column.
# Series.map replaces the row-by-row loop; codes that are not listed here
# (and missing values) become NaN, as they did with the if/elif chain.
# The column is created even if no row matches.
f1_base['dep_syn'] = f1_base['depressive_syndrome'].map({
    0: 'none',
    1: 'major',
    2: 'other',
    9: 'unknown',
})

In [None]:
# Remove the coded column; dep_syn holds the labels.
f1_base = f1_base.drop(columns='depressive_syndrome')

In [None]:
# Recode the numeric `occup_status` codes into text labels in a new
# `occ_stat` column.
# Series.map replaces the row-by-row loop; codes that are not listed here
# (and missing values) become NaN, as they did with the if/elif chain.
# The column is created even if no row matches.
f1_base['occ_stat'] = f1_base['occup_status'].map({
    1: 'working',
    2: 'homemaker',
    3: 'otj_train',
    4: 'shel_work',
    5: 'retired',
    6: 'student',
    7: 'unemployed',
    8: 'retired_dis',
    9: 'retired_nondis',
    10: 'other',
    99: 'unknown',
})

In [None]:
# Remove the coded column; occ_stat holds the labels.
f1_base = f1_base.drop(columns='occup_status')

In [None]:
# Recode the numeric `income_level` codes into text labels in a new
# `inc_lvl1` column.
# Series.map replaces the row-by-row loop; codes that are not listed here
# (and missing values) become NaN, as they did with the if/elif chain.
# The column is created even if no row matches.
f1_base['inc_lvl1'] = f1_base['income_level'].map({
    1: 'lt25k',
    2: '25k_49999',
    3: '50k_74999',
    4: '75pl',
    6: 'not_sure',
    7: 'declined',
    9: 'unknown',
})

In [None]:
# Remove the coded column; inc_lvl1 holds the labels.
f1_base = f1_base.drop(columns='income_level')

In [None]:
# Shorten the baseline column names; most targets get a trailing "1"
# (sev_dep does not).
f1_base = f1_base.rename(columns={
    'age_at_injury': 'age1',
    'severity_of_depression': 'sev_dep',
    'res_at_inj': 'res1',
    'marital_stat_aib': 'mar_stat1',
    'educ_lvl_aib': 'educ_lvl1',
    'int_or_pleas': 'int_pleas1',
    'depressedb': 'depr1',
    'self_harmb': 'self_harm1',
    'feel_badb': 'feel_bad1',
    'dep_syn': 'dep_syn1',
    'occ_stat': 'occ_stat1',
})


In [None]:
# Display the column index of f1_base to check the names after the renames above.
f1_base.columns

In [None]:
# Slice out the id, age, score and depression-questionnaire columns of f1_base.
f1_menphy = f1_base.loc[:, [
    'unique_id',
    'age1',
    'sev_dep',
    'AFScorRb',
    'AFScorDs',
    'gender',
    'int_pleas1',
    'depr1',
    'self_harm1',
    'feel_bad1',
    'dep_syn1',
]]
f1_menphy

In [None]:
# Count how often each baseline interest/pleasure label occurs.
# The counts are read from f1_base['int_pleas1'] (the column f1_menphy was
# sliced from) so this cell gives the same result when it is re-run after
# f1_menphy has been overwritten with the counts table.
# Both output columns are named explicitly because the default name that
# value_counts() gives the counts column differs between pandas versions.
f1_menphy = (
    f1_base['int_pleas1']
    .value_counts()
    .rename_axis('int_pleas1')
    .reset_index(name='count')
)
f1_menphy

In [None]:
# Keep only real answers: rows labelled 'unknown' or 'declined' are removed.
f1_menphy = f1_menphy.loc[~f1_menphy.int_pleas1.isin(['unknown', 'declined'])]
f1_menphy

In [None]:
# Rename the counts column, then add the overall number of retained responses
# (`records`, same value on every row) and each label's percentage of it (`ai_per`).
f1_menphy = f1_menphy.rename(columns={'count': 'total'})
f1_menphy = f1_menphy.assign(
    records=f1_menphy.total.sum(),
    ai_per=lambda counts: (counts.total / counts.records) * 100,
)
f1_menphy

#### Read in second dataset - Form II dataset includes sociodemographic and outcome data of Form I participants obtained at follow-up.

In [None]:
# Load the Form II CSV from the local Capstone data folder.
f2 = pd.read_csv(
    filepath_or_buffer='C:/Users/dmora/Documents/NSS/Capstone/data/f2_public_2021.csv'
)

In [None]:
# Subset the Form II frame to the follow-up variables this notebook selects.
# Column order is kept exactly as in the original list; BRhspRs1..BRhspRs8 are
# generated rather than typed out.
f2_base = f2[[
    'UniID', 'BYear', 'BPlcRes',
    'BMarStat', 'BMarStCh', 'BEducLvl',
    'BBPHQ1', 'BBPHQ2', 'BBPHQ9', 'BBPHQ6', 'BBPHQMDS', 'BBPHQSDS',
    'BPrLvlSt', 'BFmIncLv',
    'BSPHthSt', 'BSPHthRC',
    *[f'BRhspRs{i}' for i in range(1, 9)],
    'BRhspNbr',
]]


In [None]:
# Give the selected Form II columns readable names. The BSPHth* and BRhsp*
# columns are not in the mapping and keep their original names.
f2_base = f2_base.rename(columns={
    'UniID': 'unique_id',
    'BYear': 'post_inj_yr',
    'BPlcRes': 'res_pi',
    'BMarStat': 'marital_stat_pi',
    'BMarStCh': 'marital_stat_change',
    'BEducLvl': 'educ_lvl_pi',
    'BBPHQ1': 'int_pleas2',
    'BBPHQ2': 'depr2',
    'BBPHQ9': 'self_harm2',
    'BBPHQ6': 'feel_bad2',
    'BBPHQMDS': 'dep_syn2',
    'BBPHQSDS': 'sev_dep2',
    'BPrLvlSt': 'occup_status_pi',
    'BFmIncLv': 'income_level',
})

In [None]:
# Display f2_base to check the selected and renamed Form II columns.
f2_base

In [None]:
# Slice out the id and depression-questionnaire columns of f2_base.
f2_menphy = f2_base.loc[:, [
    'unique_id',
    'sev_dep2',
    'int_pleas2',
    'depr2',
    'self_harm2',
    'feel_bad2',
    'dep_syn2',
]]
f2_menphy

In [None]:
# Recode the numeric follow-up interest/pleasure responses (int_pleas2) into
# text labels in a new `int_pleasb` column.
# Every code is written to the same column. Previously code 9 was written to a
# separate, misspelled 'int_pleas2b' column, which left those rows as NaN in
# int_pleasb and added a stray column to f2_base.
# NOTE: 'unknown' now appears as a category in int_pleasb, so any summary that
# should ignore non-answers has to filter it out in the same way as 'declined'.
# Series.map replaces the row-by-row loop; codes that are not listed here
# (and missing values) become NaN.
f2_base['int_pleasb'] = f2_base['int_pleas2'].map({
    0: 'none',
    1: 'several_days',
    2: 'more_than_half',
    3: 'nearly_every_day',
    7: 'declined',
    9: 'unknown',
})

In [None]:
# Remove the coded column; int_pleasb holds the labels.
f2_base = f2_base.drop(columns='int_pleas2')

In [None]:
# Keep only the id, follow-up year and labelled interest/pleasure response.
f2_int = f2_base.loc[:, ['unique_id', 'post_inj_yr', 'int_pleasb']]
f2_int

In [None]:
# Rows of f2_int whose follow-up year is 1.
yr1 = f2_int[f2_int['post_inj_yr'].eq(1)]
yr1

In [None]:
# Tally the year-1 responses, keeping NaN as its own bucket so missing labels
# are visible. The count is taken on the int_pleasb column only: counting whole
# rows (as DataFrame.value_counts does) also groups by unique_id, so it cannot
# show how many records carry each response.
yr1['int_pleasb'].value_counts(dropna=False)

In [None]:
# Count how often each interest/pleasure label occurs at follow-up year 1.
# The counts are read straight from f2_int so this cell gives the same result
# when it is re-run after yr1 has been overwritten with the counts table.
# Both output columns are named explicitly because the default name that
# value_counts() gives the counts column differs between pandas versions.
yr1 = (
    f2_int.loc[f2_int.post_inj_yr == 1, 'int_pleasb']
    .value_counts()
    .rename_axis('int_pleasb')
    .reset_index(name='count')
)
yr1

In [None]:
# Call the counts column `total`.
yr1 = yr1.rename({'count': 'total'}, axis='columns')
yr1

In [None]:
# Drop non-answers before any shares are computed. 'unknown' is excluded
# together with 'declined', matching the filter applied to the baseline table
# (f1_menphy). .copy() detaches the result from the unfiltered frame so later
# column assignments on yr1 do not raise SettingWithCopyWarning.
yr1 = yr1.loc[~yr1.int_pleasb.isin(['declined', 'unknown'])].copy()
yr1

In [None]:
# Total number of retained year-1 responses, repeated on every row.
# assign() builds a new frame instead of writing into the filtered slice,
# which avoids SettingWithCopyWarning.
yr1 = yr1.assign(records=yr1.total.sum())
yr1

In [None]:
# Each label's percentage of the retained year-1 responses.
# assign() builds a new frame instead of writing into the filtered slice,
# which avoids SettingWithCopyWarning.
yr1 = yr1.assign(yr1_per_int=(yr1.total / yr1.records) * 100)
yr1

In [None]:
# Percentage of each interest/pleasure response among follow-up year 5 records.
yr5 = (
    f2_int.loc[f2_int.post_inj_yr == 5, 'int_pleasb']
    .value_counts()
    # Name both output columns explicitly; the default name of the counts
    # column differs between pandas versions.
    .rename_axis('int_pleasb')
    .reset_index(name='total')
)
# Drop non-answers before computing shares. 'unknown' is excluded together with
# 'declined', matching the filter applied to the baseline table (f1_menphy).
# .copy() detaches the filtered frame so the assignments below do not raise
# SettingWithCopyWarning.
yr5 = yr5.loc[~yr5.int_pleasb.isin(['declined', 'unknown'])].copy()
yr5['records'] = yr5.total.sum()
yr5['yr5_per_int'] = (yr5.total / yr5.records) * 100
yr5

In [None]:
# Percentage of each interest/pleasure response among follow-up year 10 records.
yr10 = (
    f2_int.loc[f2_int.post_inj_yr == 10, 'int_pleasb']
    .value_counts()
    # Name both output columns explicitly; the default name of the counts
    # column differs between pandas versions.
    .rename_axis('int_pleasb')
    .reset_index(name='total')
)
# Drop non-answers before computing shares. 'unknown' is excluded together with
# 'declined', matching the filter applied to the baseline table (f1_menphy).
# .copy() detaches the filtered frame so the assignments below do not raise
# SettingWithCopyWarning.
yr10 = yr10.loc[~yr10.int_pleasb.isin(['declined', 'unknown'])].copy()
yr10['records'] = yr10.total.sum()
yr10['yr10_per_int'] = (yr10.total / yr10.records) * 100
yr10

In [None]:
# Percentage of each interest/pleasure response among follow-up year 15 records.
yr15 = (
    f2_int.loc[f2_int.post_inj_yr == 15, 'int_pleasb']
    .value_counts()
    # Name both output columns explicitly; the default name of the counts
    # column differs between pandas versions.
    .rename_axis('int_pleasb')
    .reset_index(name='total')
)
# Drop non-answers before computing shares. 'unknown' is excluded together with
# 'declined', matching the filter applied to the baseline table (f1_menphy).
# .copy() detaches the filtered frame so the assignments below do not raise
# SettingWithCopyWarning.
yr15 = yr15.loc[~yr15.int_pleasb.isin(['declined', 'unknown'])].copy()
yr15['records'] = yr15.total.sum()
yr15['yr15_per_int'] = (yr15.total / yr15.records) * 100
yr15

In [None]:
# Percentage of each interest/pleasure response among follow-up year 20 records.
yr20 = (
    f2_int.loc[f2_int.post_inj_yr == 20, 'int_pleasb']
    .value_counts()
    # Name both output columns explicitly; the default name of the counts
    # column differs between pandas versions.
    .rename_axis('int_pleasb')
    .reset_index(name='total')
)
# Drop non-answers before computing shares. 'unknown' is excluded together with
# 'declined', matching the filter applied to the baseline table (f1_menphy).
# .copy() detaches the filtered frame so the assignments below do not raise
# SettingWithCopyWarning.
yr20 = yr20.loc[~yr20.int_pleasb.isin(['declined', 'unknown'])].copy()
yr20['records'] = yr20.total.sum()
yr20['yr20_per_int'] = (yr20.total / yr20.records) * 100
yr20

In [None]:
# Percentage of each interest/pleasure response among follow-up year 25 records.
yr25 = (
    f2_int.loc[f2_int.post_inj_yr == 25, 'int_pleasb']
    .value_counts()
    # Name both output columns explicitly; the default name of the counts
    # column differs between pandas versions.
    .rename_axis('int_pleasb')
    .reset_index(name='total')
)
# Drop non-answers before computing shares. 'unknown' is excluded together with
# 'declined', matching the filter applied to the baseline table (f1_menphy).
# .copy() detaches the filtered frame so the assignments below do not raise
# SettingWithCopyWarning.
yr25 = yr25.loc[~yr25.int_pleasb.isin(['declined', 'unknown'])].copy()
yr25['records'] = yr25.total.sum()
yr25['yr25_per_int'] = (yr25.total / yr25.records) * 100
yr25

In [None]:
# Percentage of each interest/pleasure response among follow-up year 30 records.
yr30 = (
    f2_int.loc[f2_int.post_inj_yr == 30, 'int_pleasb']
    .value_counts()
    # Name both output columns explicitly; the default name of the counts
    # column differs between pandas versions.
    .rename_axis('int_pleasb')
    .reset_index(name='total')
)
# Drop non-answers before computing shares. 'unknown' is excluded together with
# 'declined', matching the filter applied to the baseline table (f1_menphy).
# .copy() detaches the filtered frame so the assignments below do not raise
# SettingWithCopyWarning.
yr30 = yr30.loc[~yr30.int_pleasb.isin(['declined', 'unknown'])].copy()
yr30['records'] = yr30.total.sum()
yr30['yr30_per_int'] = (yr30.total / yr30.records) * 100
yr30

In [None]:
# Join the year-1 and year-5 tables on the response label (inner join, so only
# labels present in both are kept).
yr1_5 = yr1.merge(yr5, on='int_pleasb', how='inner')
yr1_5

In [None]:
# Add the year-10 table, again matching on the response label (inner join).
yr1_10 = yr1_5.merge(yr10, on='int_pleasb', how='inner')
yr1_10

In [None]:
# Keep the label and the three percentage columns; the per-year total/records
# columns brought in by the merges are dropped.
yr1_10 = yr1_10.loc[:, [
    'int_pleasb',
    'yr1_per_int',
    'yr5_per_int',
    'yr10_per_int',
]]
yr1_10

In [None]:
# Add the year-15 table, matching on the response label (inner join).
yr1_15 = yr1_10.merge(yr15, on='int_pleasb', how='inner')
yr1_15

In [None]:
# Add the year-20 table, matching on the response label (inner join).
yr1_20 = yr1_15.merge(yr20, on='int_pleasb', how='inner')
yr1_20

In [None]:
# Add the year-25 table, matching on the response label (inner join).
yr1_25 = yr1_20.merge(yr25, on='int_pleasb', how='inner')
yr1_25

In [None]:
# Keep the label and the percentage columns for years 1 through 25.
yr1_25 = yr1_25.loc[:, [
    'int_pleasb',
    'yr1_per_int',
    'yr5_per_int',
    'yr10_per_int',
    'yr15_per_int',
    'yr20_per_int',
    'yr25_per_int',
]]
yr1_25

In [None]:
# Add the year-30 table, matching on the response label (inner join).
yr1_30 = yr1_25.merge(yr30, on='int_pleasb', how='inner')
yr1_30

In [None]:
# Keep the label and the percentage columns for years 1 through 30.
yr1_30 = yr1_30.loc[:, [
    'int_pleasb',
    'yr1_per_int',
    'yr5_per_int',
    'yr10_per_int',
    'yr15_per_int',
    'yr20_per_int',
    'yr25_per_int',
    'yr30_per_int',
]]
yr1_30

In [None]:
# Shorten the column names: the label column becomes `int_ples` and each
# `yrN_per_int` column becomes `yrN`.
yr1_30 = yr1_30.rename(columns={
    'int_pleasb': 'int_ples',
    **{f'yr{n}_per_int': f'yr{n}' for n in (1, 5, 10, 15, 20, 25, 30)},
})

yr1_30

In [None]:
# Take the label and percentage columns from the baseline table.
yr_ai = f1_menphy.loc[:, ['int_pleas1', 'ai_per']]
yr_ai

In [None]:
# Rename the baseline columns so the label column shares the name `int_ples`
# with the follow-up table.
yr_ai = yr_ai.rename(columns={'int_pleas1': 'int_ples', 'ai_per': 'ai'})

yr_ai

In [None]:
# Put the baseline percentages next to the follow-up percentages, matching on
# the response label (inner join).
yr_all_int = yr_ai.merge(yr1_30, on='int_ples', how='inner')
yr_all_int

In [None]:
# NOTE(review): by this point f1_menphy has been reassigned to the aggregated
# counts table (one row per response label), so this tallies the labels of that
# table rather than the raw baseline responses. It looks like a leftover scratch
# cell; confirm whether it is still needed. The result is only displayed.
f1_menphy.int_pleas1.value_counts().to_frame().reset_index()