Loads the public-use data from the Survey of Income and Program Participation (SIPP), waves 1 and 4.

In [1]:
import pandas as pd
import io

Preselecting the columns reduces the size of the dataset to a manageable load

In [2]:
# Subset of variables to read from each wave file.
# NOTE(review): the groupings below are inferred from the variable names only;
# confirm meanings against the SIPP data dictionary.
columns = [
    # identifiers (ssuid and pnum are combined into `uid` by load_file)
    "ssuid", "shhadid", "pnum",
    # reference month
    "monthcode",
    # age / date-of-birth variables
    "aage", "adob_bmonth", "adob_byear", "edob_bmonth",
    "tage", "tage_ehc", "tdob_byear",
    # support-related variables
    "eothsuprt1yn", "eothsuprt2yn",
    "apartotamt", "tpartotamt", "tkidtotamt",
    # weight
    "wpfinwgt",
]

Read each file in chunks rather than all at once to avoid overloading memory.

In [3]:
def load_file(filename, usecols=None, chunksize=10000):
    """Read selected columns of a Stata file in chunks and add a ``uid`` column.

    Parameters
    ----------
    filename : str or path-like
        Path of the ``.dta`` file to read.
    usecols : list of str, optional
        Columns to keep. Defaults to the notebook-level ``columns`` list.
        Must include ``ssuid`` and ``pnum``, which are needed to build ``uid``.
    chunksize : int, default 10000
        Number of rows read from the file per chunk.

    Returns
    -------
    pandas.DataFrame
        The selected columns for every row of the file, plus a ``uid`` column
        of the form ``"<ssuid>-<pnum>"``.
    """
    selected = columns if usecols is None else usecols

    # The chunked reader is a context manager; ``with`` guarantees the file
    # handle is released even if reading a chunk raises.
    with pd.read_stata(
        filename,
        chunksize=chunksize,
        columns=selected,
        preserve_dtypes=True,
    ) as reader:
        chunks = list(reader)

    # ``DataFrame.append`` was removed in pandas 2.0 and re-copied the whole
    # accumulated frame on every iteration; concatenate once instead.
    # A file with no rows yields no chunks, so fall back to an empty frame
    # that still has the expected columns.
    if chunks:
        df = pd.concat(chunks)
    else:
        df = pd.DataFrame(columns=list(selected))

    # add uid
    return df.assign(
        uid=lambda x: x["ssuid"].astype(str) + "-" + x["pnum"].astype(str),
    )
    

In [4]:
# Wave 1: load the selected columns and save them as CSV (row index omitted).
w1 = load_file(filename="../data/pu2014w1.dta")
w1.to_csv(path_or_buf="../output/wave1.csv", index=False)

In [5]:
# Wave 4: load the selected columns and save them as CSV (row index omitted).
w4 = load_file(filename="../data/pu2014w4.dta")
w4.to_csv(path_or_buf="../output/wave4.csv", index=False)

---
---
---