# Preparing the NSFG Data

*Elements of Data Science*

Copyright 2021 [Allen B. Downey](https://allendowney.com)

License: [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International](https://creativecommons.org/licenses/by-nc-sa/4.0/)

## Loading

The NSFG 2013–2015 female pregnancy data file and its Stata dictionary were downloaded on November 16, 2018, from:

* https://ftp.cdc.gov/pub/health_statistics/nchs/datasets/NSFG/

* https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NSFG/stata/

In [1]:
import pandas as pd
import numpy as np

In [18]:
from statadict import parse_stata_dict

# Parse the Stata dictionary (.dct) file that describes the data file's layout.
# The result's `names` and `colspecs` attributes are what pd.read_fwf is given
# when the fixed-width data file is read.
stata_dict = parse_stata_dict('data/2013_2015_FemPregSetup.dct')

In [19]:
import gzip

# Read the gzip-compressed fixed-width data file, using the column names and
# column boundaries taken from the parsed Stata dictionary.
# The file is opened in a `with` block so the handle is closed as soon as
# read_fwf has consumed it, instead of staying open for the life of the kernel.
with gzip.open('data/2013_2015_FemPregData.dat.gz') as fp:
    nsfg = pd.read_fwf(fp,
                       names=stata_dict.names,
                       colspecs=stata_dict.colspecs)

In [21]:
# Preview the first rows to check that the columns were parsed as expected.
nsfg.head()

Unnamed: 0,CASEID,PREGORDR,HOWPREG_N,HOWPREG_P,MOSCURRP,NOWPRGDK,PREGEND1,PREGEND2,HOWENDDK,NBRNALIV,...,SECU,SEST,CMINTVW,CMLSTYR,CMJAN3YR,CMJAN4YR,CMJAN5YR,QUARTER,PHASE,INTVWYEAR
0,60418,1,,,,,5.0,,,1.0,...,4,342,1381,1369,1345,1333,1321,14,1,2015
1,60418,2,,,,,5.0,,,1.0,...,4,342,1381,1369,1345,1333,1321,14,1,2015
2,60418,3,,,,,5.0,,,1.0,...,4,342,1381,1369,1345,1333,1321,14,1,2015
3,60419,1,33.0,1.0,8.0,,,,,,...,3,318,1388,1376,1345,1333,1321,16,1,2015
4,60420,1,,,,,6.0,,,1.0,...,1,339,1388,1376,1345,1333,1321,16,1,2015


In [6]:
import utils

# Load the same dictionary/data pair through the project helper; this
# replaces the DataFrame built by the explicit read_fwf call above.
# NOTE(review): the cells below select lowercase column names, whereas the
# read_fwf result shows uppercase ones — presumably utils.read_stata
# lowercases the names; confirm in utils.
nsfg = utils.read_stata('data/2013_2015_FemPregSetup.dct', 
                        'data/2013_2015_FemPregData.dat.gz',
                        compression='gzip')

In [8]:
# Columns to keep, listed in the order they should appear in the reduced frame.
variables = [
    'caseid',
    'outcome',
    'birthwgt_lb1',
    'birthwgt_oz1',
    'prglngth',
    'nbrnaliv',
    'agecon',
    'agepreg',
    'birthord',
    'hpagelb',
    'wgt2013_2015',
]

# Keep only the selected columns, then display (rows, columns) as a check.
nsfg = nsfg[variables]
nsfg.shape

(9358, 11)

In [9]:
# Store the reduced frame in an HDF5 file under the key 'nsfg'.
# `key` is passed by keyword: recent pandas releases deprecate passing it
# positionally and will require the keyword form.
nsfg.to_hdf('nsfg.hdf5', key='nsfg')

In [10]:
# Read the frame back from the HDF5 file; %time reports how long the read takes.
%time nsfg = pd.read_hdf('nsfg.hdf5', 'nsfg')

CPU times: user 8 ms, sys: 4 ms, total: 12 ms
Wall time: 12.1 ms


In [16]:
# Seed NumPy's global random number generator so the resampling is repeatable.
# NOTE(review): this only helps if utils.resample_rows_weighted draws from
# NumPy's global RNG — confirm in utils.
np.random.seed(18)

# Resample the rows using the 'wgt2013_2015' column as the weights
# (presumably the survey sampling weights — verify against the codebook),
# then display the shape of the result.
sample = utils.resample_rows_weighted(nsfg, 'wgt2013_2015')
sample.shape

(9358, 11)

In [17]:
# Store the resampled frame in its own HDF5 file under the key 'nsfg'.
# `key` is passed by keyword: recent pandas releases deprecate passing it
# positionally and will require the keyword form.
sample.to_hdf('nsfg_sample.hdf5', key='nsfg')

### Loading the unsampled data

In [8]:
# Load the frame stored in nsfg.hdf5 (not the resampled nsfg_sample.hdf5);
# %time reports how long the read takes.
%time nsfg = pd.read_hdf('nsfg.hdf5', 'nsfg')

CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 11 ms


In [9]:
# Check what kind of object read_hdf returned.
type(nsfg)

pandas.core.frame.DataFrame

In [10]:
# Number of rows and columns, as a (rows, columns) tuple.
nsfg.shape

(9358, 11)

In [11]:
# Preview the first few rows of the loaded frame.
nsfg.head()

Unnamed: 0,caseid,outcome,birthwgt_lb1,birthwgt_oz1,prglngth,nbrnaliv,agecon,agepreg,birthord,hpagelb,wgt2013_2015
0,60418,1,5.0,4.0,40,1.0,2000,2075.0,1.0,22.0,3554.964843
1,60418,1,4.0,12.0,36,1.0,2291,2358.0,2.0,25.0,3554.964843
2,60418,1,5.0,4.0,36,1.0,3241,3308.0,3.0,52.0,3554.964843
3,60419,6,,,33,,3650,,,,2484.535358
4,60420,1,8.0,13.0,41,1.0,2191,2266.0,1.0,24.0,2903.782914


In [12]:
# Show the column labels as a pandas Index.
nsfg.columns

Index(['caseid', 'outcome', 'birthwgt_lb1', 'birthwgt_oz1', 'prglngth',
       'nbrnaliv', 'agecon', 'agepreg', 'birthord', 'hpagelb', 'wgt2013_2015'],
      dtype='object')

In [13]:
# Iterating over a DataFrame yields its column labels, one per pass,
# so this prints each column name on its own line.
for column in nsfg:
    print(column)

caseid
outcome
birthwgt_lb1
birthwgt_oz1
prglngth
nbrnaliv
agecon
agepreg
birthord
hpagelb
wgt2013_2015
