## Aggregate Health Statistics on the UKHLS Dataset

#### March 22nd by Gavin Qu

In [5]:
import pandas as pd
import numpy as np
import pyreadstat

In [7]:
# Load the individual-response (indresp) Stata file for each of the 13 waves,
# identified by the letter prefixes a-m.
# use latin-1 encoding if UTF8 does not work
ukhls_dir = "/Users/gavinqu/Desktop/School/Dissertation/UKDA-6614-stata/stata/stata13_se/ukhls"
latin1_letters = ("c", "e")  # waves read with an explicit iso-8859-1 encoding

loaded = []
for letter in "abcdefghijklm":
    # Only the latin-1 waves pass an encoding; the others are read without one.
    read_options = {"encoding": "iso-8859-1"} if letter in latin1_letters else {}
    # `meta` is rebound on every read, so afterwards it holds the last wave's metadata only.
    frame, meta = pyreadstat.read_dta(f"{ukhls_dir}/{letter}_indresp.dta", **read_options)
    loaded.append(frame)

(wave1, wave2, wave3, wave4, wave5, wave6, wave7,
 wave8, wave9, wave10, wave11, wave12, wave13) = loaded

#### Load each wave into a pandas DataFrame

In [8]:
# Wrap every loaded wave in its own DataFrame, keeping the wave order (1-13).
(df1, df2, df3, df4, df5, df6, df7,
 df8, df9, df10, df11, df12, df13) = [
    pd.DataFrame(loaded_wave)
    for loaded_wave in (
        wave1, wave2, wave3, wave4, wave5, wave6, wave7,
        wave8, wave9, wave10, wave11, wave12, wave13,
    )
]

#### Clean the data before combining

In [None]:
waves = [df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11, df12, df13]
prefixes = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm']

# Recode of raw job-status codes into 1 = employed, 2 = unemployed, 3 = inactive.
# Codes that are not listed (e.g. negative missing-value codes) stay missing.
LF_STAT_MAPPING = {}
LF_STAT_MAPPING.update(dict.fromkeys([1, 2, 5, 9, 10, 11, 12, 13], 1))  # employed
LF_STAT_MAPPING[3] = 2                                                  # unemployed
LF_STAT_MAPPING.update(dict.fromkeys([4, 6, 7, 8], 3))                  # inactive

# Highest-qualification codes collapsed into four education categories.
EDUCATION_MAPPING = {1: 1, 2: 1, 3: 2, 4: 3, 5: 4, 6: 4, 7: 4, 8: 4, 9: 4}

# Codes -9..-1 in the pay variables are treated as missing.
PAY_MISSING_CODES = list(range(-9, 0))

# Divisor used to turn the pay variables into an hourly figure.
# NOTE(review): this assumes pay is monthly and that everyone works a fixed
# 40-hour week (reported hours in `jbhrs` are not used) -- confirm this is intended.
WEEKS_PER_MONTH = 4.333
ASSUMED_WEEKLY_HOURS = 40


def _clean_wave(wave, prefix):
    """Add the derived analysis columns to one wave, modifying it in place.

    Parameters
    ----------
    wave : pandas.DataFrame
        One wave of individual responses whose columns carry the wave prefix
        (``{prefix}_jbsect``, ``{prefix}_paygu_dv``, ...).
    prefix : str
        The wave's letter prefix (``'a'`` ... ``'m'``).

    Raises
    ------
    KeyError
        If a required source column is absent. Only ``empchk`` / ``notempchk``
        are optional; when they are missing a message is printed and
        ``{prefix}_changejbstat`` is left entirely missing.
    """
    def col(name):
        return f'{prefix}_{name}'

    # Labour-force status. The original code recoded the freshly created
    # (all-None) lf_stat column from itself, so no row was ever assigned.
    # NOTE(review): the raw codes are assumed to come from `{prefix}_jbstat`
    # -- confirm against the UKHLS codebook.
    wave[col('lf_stat')] = wave[col('jbstat')].map(LF_STAT_MAPPING)

    # Job-status change flag: code 1 in either check variable -> 0,
    # code 2 in either -> 1 (code 2 wins if both apply, as it is assigned last).
    # Initialised with NaN (not None) so the column is numeric.
    wave[col('changejbstat')] = np.nan
    if col('empchk') in wave.columns and col('notempchk') in wave.columns:
        answered_1 = (wave[col('empchk')] == 1) | (wave[col('notempchk')] == 1)
        answered_2 = (wave[col('empchk')] == 2) | (wave[col('notempchk')] == 2)
        wave.loc[answered_1, col('changejbstat')] = 0
        wave.loc[answered_2, col('changejbstat')] = 1
    else:
        print(f"'empchk' and 'notempchk' variables not found in wave {prefix}")

    # Private-sector dummy: jbsect 1 -> 1 (private), 2 -> 0 (other types),
    # anything else stays missing.
    wave[col('private')] = wave[col('jbsect')].map({1: 1.0, 2: 0.0})

    # Replace missing-value codes in the pay variables. The result is assigned
    # back explicitly: an in-place replace on `wave[var]` is a chained
    # operation that pandas warns about and that does nothing under
    # Copy-on-Write.
    for var in (col('paygu_dv'), col('paynu_dv')):
        wave[var] = wave[var].replace(PAY_MISSING_CODES, np.nan)

    # Convert to hourly pay (gross and net).
    hours_divisor = WEEKS_PER_MONTH * ASSUMED_WEEKLY_HOURS
    wave[col('hrgpay')] = wave[col('paygu_dv')] / hours_divisor
    wave[col('hrnpay')] = wave[col('paynu_dv')] / hours_divisor

    # Log hourly gross pay, defined only for strictly positive values.
    hourly_gross = wave[col('hrgpay')]
    wave[col('logpay')] = np.log(hourly_gross.where(hourly_gross > 0))

    # Earnings: gross pay, blanked where reported hours are zero or negative
    # (rows with missing hours keep their pay value).
    wave[col('earnings')] = wave[col('paygu_dv')].mask(wave[col('jbhrs')] <= 0)

    # Education categories.
    wave[col('education')] = wave[col('hiqual_dv')].map(EDUCATION_MAPPING)

    # Aggregate health variable where 1 is unhealthy, 2 is healthy; every
    # other code becomes missing.
    wave[col('agghealth')] = wave[col('health')].map({1: 1, 2: 2})


for wave, prefix in zip(waves, prefixes):
    _clean_wave(wave, prefix)

#### Combine data with the relevant variables