# Exploratory Data Analysis

Clean the NSFG data

Allen B. Downey

[MIT License](https://en.wikipedia.org/wiki/MIT_License)

In [2]:
# Configure Jupyter so figures appear in the notebook
%matplotlib inline

# Data handling
import pandas as pd
import numpy as np

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's 'white' style to the plotting defaults.
sns.set(style='white')


## Loading and validation

The data files were downloaded on November 16, 2018, from:

* ftp://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NSFG 

* ftp://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NSFG/stata/

In [6]:
import utils

# Setup (.dct) file and gzip-compressed data (.dat.gz) file handed to the reader.
nsfg_dct_path = 'data/2013_2015_FemPregSetup.dct'
nsfg_dat_path = 'data/2013_2015_FemPregData.dat.gz'

nsfg = utils.read_stata(nsfg_dct_path, nsfg_dat_path, compression='gzip')

In [8]:
# Columns to keep; `nsfg` is rebound to just these, in this order.
variables = [
    'caseid', 'outcome',
    'birthwgt_lb1', 'birthwgt_oz1',
    'prglngth', 'nbrnaliv',
    'agecon', 'agepreg',
    'birthord', 'hpagelb',
    'wgt2013_2015',
]

nsfg = nsfg.loc[:, variables]

# Show (rows, columns) as the cell output.
nsfg.shape

(9358, 11)

In [9]:
# Write the frame to 'nsfg.hdf5' under the 'nsfg' key.
# `key` is passed by keyword: pandas 2.1 deprecated positional arguments to
# `to_hdf` other than the path, and later releases make them keyword-only.
nsfg.to_hdf('nsfg.hdf5', key='nsfg')

In [10]:
%time nsfg = pd.read_hdf('nsfg.hdf5', 'nsfg')

CPU times: user 8 ms, sys: 4 ms, total: 12 ms
Wall time: 12.1 ms


In [16]:
# Seed NumPy's global random state before resampling.
# NOTE(review): presumably resample_rows_weighted draws from NumPy's global
# RNG, which is why the seed is set here — confirm in utils.
np.random.seed(18)

# NOTE(review): presumably rows are drawn with probability proportional to
# the 'wgt2013_2015' column — confirm against utils.resample_rows_weighted.
sample = utils.resample_rows_weighted(nsfg, 'wgt2013_2015')
# Show (rows, columns) of the resampled frame as the cell output.
sample.shape

(9358, 11)

In [17]:
# Write the resampled frame to 'nsfg_sample.hdf5' under the 'nsfg' key.
# `key` is passed by keyword: pandas 2.1 deprecated positional arguments to
# `to_hdf` other than the path, and later releases make them keyword-only.
sample.to_hdf('nsfg_sample.hdf5', key='nsfg')

### Loading the original (unresampled) data

In [8]:
%time nsfg = pd.read_hdf('nsfg.hdf5', 'nsfg')

CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 11 ms


In [9]:
# Check what kind of object read_hdf returned.
type(nsfg)

pandas.core.frame.DataFrame

In [10]:
# Dimensions of the frame as (rows, columns).
nsfg.shape

(9358, 11)

In [11]:
# Preview the first few rows.
nsfg.head()

Unnamed: 0,caseid,outcome,birthwgt_lb1,birthwgt_oz1,prglngth,nbrnaliv,agecon,agepreg,birthord,hpagelb,wgt2013_2015
0,60418,1,5.0,4.0,40,1.0,2000,2075.0,1.0,22.0,3554.964843
1,60418,1,4.0,12.0,36,1.0,2291,2358.0,2.0,25.0,3554.964843
2,60418,1,5.0,4.0,36,1.0,3241,3308.0,3.0,52.0,3554.964843
3,60419,6,,,33,,3650,,,,2484.535358
4,60420,1,8.0,13.0,41,1.0,2191,2266.0,1.0,24.0,2903.782914


In [12]:
# Column labels of the frame.
nsfg.columns

Index(['caseid', 'outcome', 'birthwgt_lb1', 'birthwgt_oz1', 'prglngth',
       'nbrnaliv', 'agecon', 'agepreg', 'birthord', 'hpagelb', 'wgt2013_2015'],
      dtype='object')

In [13]:
# Print every column label on its own line, in frame order.
print(*nsfg.columns, sep='\n')

caseid
outcome
birthwgt_lb1
birthwgt_oz1
prglngth
nbrnaliv
agecon
agepreg
birthord
hpagelb
wgt2013_2015
