### bd econ CPS extract

bd_CPS_revisions_reader.ipynb

January 24, 2019

Contact: Brian Dew, @bd_econ

-----

Initial goals:

1) Retrieve variable names and locations from data dictionary

2) Get the basic variables: YEAR, MONTH, HHID, PERSON LINE, AGE, SEX, RACE

3) Review the basic variables and adjust them to match the 1994-onward data

4) Check that the results make sense and match benchmarks

5) Repeat with more variables: EMP, OCC, IND, LMSTAT, etc.

In [48]:
import os, re, struct
import numpy as np
import pandas as pd

os.chdir('/home/brian/Documents/CPS/data/')

In [69]:
# User-defined functions
def id_dtype(size):
    '''Pick an integer dtype name for a fixed-width field.

    `size` is the field width in characters; it is passed through
    `int()`, so zero-padded strings such as '002' are accepted.

    Returns 'int8' for widths up to 2, 'int16' for widths 3-4,
    'int32' for widths 5-9 and 'intp' for anything wider.
    '''
    n_chars = int(size)
    # Check the narrowest widths first and fall through to the widest type.
    if n_chars <= 2:
        return 'int8'
    if n_chars <= 4:
        return 'int16'
    if n_chars <= 9:
        return 'int32'
    return 'intp'

def data_dict_reader(dd_file, var_list):
    '''Read data dictionary and return variable locations.

    dd_file  -- path of the data dictionary text file (read as iso-8859-1).
    var_list -- container of variable names to keep; dictionary entries
                whose name is not in it are ignored.

    Returns a dict mapping each kept variable name to a four-item list:
    [zero-based start offset, end offset, struct format ('<width>s'),
    dtype name from id_dtype(width)]. Entries appear in the order in
    which they are found in the dictionary file.
    '''
    # Raw strings keep the regex escapes (\w, \$, \d, ...) from being
    # parsed as string escapes, which newer Python versions warn about.
    pattern = (r'(\w{1,2}[\$\-%]\w*|PADDING)\s'
               r'*CHARACTER\*(\d{3})\s*\.{0,1}\s*\((\d*):(\d*)\).*')
    # Context manager guarantees the file handle is closed after reading.
    with open(dd_file, 'r', encoding='iso-8859-1') as dd:
        data_dict = dd.read()
    # Each match is (name, width, start, end); the dictionary's start
    # position is shifted by one to give a zero-based offset.
    return {name: [int(start) - 1, int(end), f'{int(width)}s', id_dtype(width)]
            for name, width, start, end in re.findall(pattern, data_dict)
            if name in var_list}

def struct_unpacker(d):
    '''Build a struct-based row parser from variable locations.

    `d` maps variable names to [start, end, struct format, dtype] lists,
    in the order the fields should be read. The format string alternates
    pad bytes ('<n>x'), covering the gap between the previous field's end
    and this field's start, with the field's own format code.

    Returns the `unpack_from` method of the compiled struct.Struct.
    '''
    starts, ends, widths, _ = zip(*d.values())
    # The first field is measured from offset 0; every later field is
    # measured from the end of the field before it.
    previous_ends = (0,) + ends[:-1]
    pieces = []
    for begin, prev_end, width in zip(starts, previous_ends, widths):
        pieces.append(f'{begin - prev_end}x')
        pieces.append(width)
    return struct.Struct(''.join(pieces)).unpack_from

def data_file_reader(file, unpacker, dtypes, wgt, min_len=405):
    '''Convert raw monthly file to dataframe.

    file     -- path of the fixed-width data file (opened in binary mode).
    unpacker -- callable that splits one raw line into a tuple of byte
                fields, e.g. the result of struct_unpacker().
    dtypes   -- list of (name, dtype) pairs, one per unpacked field, used
                as the numpy structured dtype.
    wgt      -- name of the field used as a filter; only rows where this
                field is greater than zero are kept.
    min_len  -- lines shorter than this many bytes are skipped. The
                default of 405 is presumably the record length of the
                file this was written for -- confirm before reusing.

    Returns a pandas DataFrame with one column per field. Fields that are
    entirely blank are stored as -1.
    '''
    # Context manager guarantees the file handle is closed once the rows
    # have been read, even if unpacking raises.
    with open(file, 'rb') as raw_data:
        rows = [unpacker(row) for row in raw_data if len(row) >= min_len]
    # Blank fields become -1; everything else must parse as an integer.
    data = [tuple(int(i) if i.strip() else -1 for i in row) for row in rows]
    np_data = np.array(data, dtype=dtypes)
    return pd.DataFrame(np_data[np_data[wgt] > 0])

In [70]:
# Data dictionary file that gives each variable's position in a record
dd_file = 'cps92.ddf'

# Variables to pull from the raw file
var_list = [
    'H-MONTH', 'H-YEAR', 'H-MIS', 'HG-FIPS', 'H-METSTA',
    'H-ID', 'A-LINENO', 'A-AGE', 'A-MARITL', 'A-SEX', 'A-HGA',
    'A-RACE', 'A-MAJACT', 'A-IND', 'A-OCC', 'A-USLFT', 'A-CLSWKR',
    'A-USLHRS', 'A-UNMEM', 'A-FTPT', 'A-REORGN', 'A-LFSR',
    'A-UNTYPE', 'A-NLFREA', 'A-RCOW', 'A-FNLWGT',
]

# Rows are kept only when this variable is greater than zero
filter_wgt = 'A-FNLWGT'

# Locations, struct formats and dtypes for the requested variables
d = data_dict_reader(dd_file, var_list)

# The dtype name is the last item of each location entry
dtypes = [(name, spec[-1]) for name, spec in d.items()]

unpacker = struct_unpacker(d)

# Raw monthly data file to read
file = 'cpsb9205.dat'

df = data_file_reader(file, unpacker, dtypes, filter_wgt)

In [72]:
df.head()

Unnamed: 0,H-MONTH,H-YEAR,H-MIS,HG-FIPS,H-METSTA,H-ID,A-LINENO,A-AGE,A-MARITL,A-SEX,...,A-CLSWKR,A-USLHRS,A-UNMEM,A-FTPT,A-REORGN,A-LFSR,A-UNTYPE,A-NLFREA,A-RCOW,A-FNLWGT
0,5,2,8,39,1,10400730,1,43,1,1,...,1,50,2,-1,8,1,-1,-1,1,146031
1,5,2,8,39,1,10400730,2,44,1,2,...,1,40,2,-1,8,1,-1,-1,1,146989
2,5,2,8,39,1,10400730,3,13,-1,1,...,-1,-1,-1,-1,8,-1,-1,-1,-1,165194
3,5,2,8,39,1,10400730,4,20,7,2,...,-1,-1,-1,1,8,7,-1,1,-1,179107
4,5,2,6,12,1,11659970,1,83,4,2,...,-1,-1,-1,-1,8,7,-1,-1,-1,189912
