## Construction of the Frailty Index
#### April 28th by Gavin Qu
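First, we clean the data: each wave's individual-response file is loaded, the columns of interest are extracted, and the per-wave dataframes are concatenated with pandas.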

In [2]:
import io

import numpy as np
import pandas as pd
import pyreadstat

In [3]:
# Directory holding the UKHLS individual-response Stata files.
# NOTE(review): absolute, machine-specific path — point this at your own copy of the data.
UKHLS_DIR = "/Users/gavinqu/Desktop/School/Dissertation/UKDA-6614-stata/stata/stata13_se/ukhls"

# One letter per wave: wave 1 -> 'a', wave 2 -> 'b', ..., wave 13 -> 'm'
WAVE_LETTERS = "abcdefghijklm"

# use latin-1 encoding if UTF8 does not work (waves 3 and 5 are read this way)
LATIN1_WAVES = {"c", "e"}


def _read_indresp(letter):
    """Read `<letter>_indresp.dta` from UKHLS_DIR and return pyreadstat's (dataframe, metadata) pair."""
    path = f"{UKHLS_DIR}/{letter}_indresp.dta"
    if letter in LATIN1_WAVES:
        return pyreadstat.read_dta(path, encoding="iso-8859-1")
    return pyreadstat.read_dta(path)


# Load every wave once; `meta` is overwritten on each pass, so it ends up
# holding the metadata of the last wave read (wave 13).
waves_by_letter = {}
for letter in WAVE_LETTERS:
    waves_by_letter[letter], meta = _read_indresp(letter)

# Keep the wave1..wave13 names that the rest of the notebook relies on.
(wave1, wave2, wave3, wave4, wave5, wave6, wave7,
 wave8, wave9, wave10, wave11, wave12, wave13) = (
    waves_by_letter[letter] for letter in WAVE_LETTERS
)

Read the death data from `xhhrel.dta`; the relevant variable is `dcsedfl_dv`.

In [4]:
# Cross-wave xhhrel file (source of the death data).
# Note: this rebinds `meta` to the metadata of xhhrel.dta.
xhhrel_path = "/Users/gavinqu/Desktop/School/Dissertation/UKDA-6614-stata/stata/stata13_se/ukhls/xhhrel.dta"
death_df, meta = pyreadstat.read_dta(xhhrel_path)

#### Define Columns to extract

In [16]:
# Personal identifier, shared by every wave
base_columns = ['pidp']

# One age column per wave; waves are prefixed 'a' through 'm' (13 letters)
age_columns = [
    f'{letter}_age_dv'
    for letter in map(chr, range(ord('a'), ord('a') + 13))
]

# "Difficulty doing something" items disdif1..disdif12, repeated for each of the 13 waves
difficulty_columns = [
    f'{letter}_disdif{item}'
    for letter in map(chr, range(ord('a'), ord('a') + 13))
    for item in range(1, 13)
]

# Full list of columns of interest: ID first, then ages, then difficulty items
columns_to_extract = [*base_columns, *age_columns, *difficulty_columns]

#### Extract data and include waves

In [17]:
# Function to extract data and add wave information with corrected column name
def extract_wave_data(wave, wave_letter, strip_prefix=False):
    """Extract the ID, age and difficulty columns of one wave and tag them with the wave letter.

    Parameters
    ----------
    wave : pandas.DataFrame
        Dataframe of a single wave. It must contain 'pidp', '<wave_letter>_age_dv'
        and '<wave_letter>_disdif1' .. '<wave_letter>_disdif12'.
    wave_letter : str
        Prefix used in that wave's column names (e.g. 'a').
    strip_prefix : bool, default False
        If True, drop the '<wave_letter>_' prefix from the extracted column names
        (e.g. 'a_age_dv' -> 'age_dv') so frames from different waves share the same
        columns and stack into long format when concatenated. The default keeps the
        original, prefixed names.

    Returns
    -------
    pandas.DataFrame
        A new dataframe (the input is not modified) with the selected columns,
        'pidp' first, plus a 'wave' column holding `wave_letter`.

    Raises
    ------
    KeyError
        If any of the expected columns is missing from `wave`.
    """
    # 'pidp' first, then the wave-specific age and difficulty columns
    specific_columns = ['pidp', f'{wave_letter}_age_dv'] + [
        f'{wave_letter}_disdif{j}' for j in range(1, 13)
    ]

    # Take an explicit copy: assigning a new column to a bare selection of
    # `wave` triggers pandas' SettingWithCopyWarning.
    df = wave[specific_columns].copy()

    if strip_prefix:
        prefix = f'{wave_letter}_'
        df = df.rename(columns={
            col: col[len(prefix):]
            for col in specific_columns
            if col.startswith(prefix)
        })

    df['wave'] = wave_letter  # Add a wave identifier
    return df

#### Concatenate data frame

In [None]:
# Extract the columns of interest from every wave, pairing each loaded
# dataframe with its letter prefix (wave 1 -> 'a', ..., wave 13 -> 'm')
waves_extracted = [
    extract_wave_data(frame, letter)
    for frame, letter in zip(
        (wave1, wave2, wave3, wave4, wave5, wave6, wave7,
         wave8, wave9, wave10, wave11, wave12, wave13),
        'abcdefghijklm',
    )
]

# Stack the per-wave frames on top of each other with a fresh 0..n-1 index.
# NOTE(review): the extracted columns keep their wave prefix, so concat aligns
# them by name and each wave's rows are NaN in every other wave's columns
# (see the missing-value counts printed further down).
df_combined = pd.concat(waves_extracted, ignore_index=True)

In [None]:
# Call describe(): without the parentheses the cell only displays the
# bound-method repr instead of computing the summary statistics.
df_combined.describe()

In [20]:
# Flag every missing cell, then add the flags up column by column
missing_mask = df_combined.isna()
missing_values_count = missing_mask.sum()

# Show the per-column totals
print("Missing Values Count:\n", missing_values_count)

Missing Values Count:
 pidp               0
a_age_dv      482482
a_disdif1     482482
a_disdif2     482482
a_disdif3     482482
               ...  
m_disdif8     505478
m_disdif9     505478
m_disdif10    505478
m_disdif11    505478
m_disdif12    505478
Length: 171, dtype: int64


In [21]:
# Display basic statistics for numerical columns
summary_statistics = df_combined.describe()

# Display the summary
print("Summary Statistics:\n", summary_statistics)

# General information about the dataframe.
# DataFrame.info() writes its report to a buffer (stdout by default) and
# returns None, so capture the text explicitly instead of printing "None".
info_buffer = io.StringIO()
df_combined.info(buf=info_buffer)
df_info = info_buffer.getvalue()

# Display general information (includes data types and non-null counts)
print("DataFrame Information:\n", df_info)


Summary Statistics:
                pidp      a_age_dv     a_disdif1     a_disdif2     a_disdif3  \
count  5.334760e+05  50994.000000  50994.000000  50994.000000  50994.000000   
mean   7.791696e+08     45.638781     -5.166490     -5.142880     -5.235557   
std    4.651655e+08     18.189291      3.909239      3.943244      3.807181   
min    2.244500e+04     15.000000     -8.000000     -8.000000     -8.000000   
25%    4.080871e+08     31.000000     -8.000000     -8.000000     -8.000000   
50%    7.483971e+08     44.000000     -8.000000     -8.000000     -8.000000   
75%    1.157270e+09     59.000000      0.000000      0.000000      0.000000   
max    1.653277e+09    101.000000      1.000000      1.000000      1.000000   

          a_disdif4     a_disdif5    a_disdif6     a_disdif7     a_disdif8  \
count  50994.000000  50994.000000  50994.00000  50994.000000  50994.000000   
mean      -5.255638     -5.256187     -5.25954     -5.276307     -5.238303   
std        3.776754      3.775917