# Are first babies more likely to be late?

Copyright 2019 Allen Downey

License: [Creative Commons Attribution 4.0 International](http://creativecommons.org/licenses/by/4.0/)

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from utils import read_stata_dict
from utils import values
from utils import resample_rows_weighted

In [2]:
def read_preg_file(dct_file, dat_file, usecols):
    """Loads selected columns of an NSFG pregnancy file.

    dct_file: string name of the Stata dictionary file
    dat_file: string name of the gzip-compressed fixed-width data file
    usecols: column names forwarded to the fixed-width reader

    returns: DataFrame
    """
    stata_dict = read_stata_dict(dct_file, encoding='ISO-8859-1')
    return stata_dict.read_fixed_width(
        dat_file,
        compression='gzip',
        usecols=usecols,
    )

Cycle 10

In [3]:
# Cycle 10 (2015-2017 files): load the columns of interest, then give the
# cycle-specific weight column the common name 'finalwgt' used by later cells.
usecols = ['birthord', 'outcome', 'prglngth', 'pregend1', 'nbrnaliv', 'wgt2015_2017']
df = read_preg_file('2015_2017_FemPregSetup.dct', '2015_2017_FemPregData.dat.gz', usecols)
# rename returns a new frame; reassigning avoids in-place mutation
df = df.rename(columns={'wgt2015_2017': 'finalwgt'})
print(df.shape)

(9553, 6)


In [4]:
# Frequency of each outcome code in Cycle 10; code 1 is the one selected as `live` later on.
values(df['outcome'])

1    6693
2     901
3     120
4    1515
5     123
6     201
Name: outcome, dtype: int64

In [5]:
# Frequency of each prglngth value in Cycle 10 (presumably pregnancy length in weeks -- confirm in the codebook).
values(df['prglngth'])

0        5
1        7
2       46
3       77
4      250
5      142
6      421
7      153
8      282
9      335
10     154
11      76
12     139
13     237
14      38
15      32
16      41
17     100
18      12
19      20
20      27
21      16
22      54
23      13
24      17
25      14
26      69
27      17
28      27
29      11
30      87
31      33
32     104
33      47
34     105
35     191
36     276
37     423
38     776
39    2440
40    1521
41     438
42     215
43      56
44       8
49       1
Name: prglngth, dtype: int64

In [6]:
# Frequency of each pregend1 code in Cycle 10; codes 5 and 6 are selected later as `sec` and `nosec`.
values(df['pregend1'])

1.0    1523
2.0     116
3.0     885
4.0     125
5.0    1865
6.0    4814
8.0      12
9.0       6
Name: pregend1, dtype: int64

In [7]:
# Frequency of each birthord value in Cycle 10.
values(df['birthord'])

1.0     3150
2.0     2107
3.0      941
4.0      325
5.0      104
6.0       38
7.0       13
8.0        5
9.0        2
10.0       2
11.0       2
12.0       2
13.0       1
14.0       1
Name: birthord, dtype: int64

In [8]:
# Summary statistics of the Cycle 10 weight column (renamed to 'finalwgt' when loaded).
df['finalwgt'].describe()

count      9553.000000
mean      13337.425944
std       16138.878271
min        1924.916000
25%        4575.221221
50%        7292.490835
75%       15724.902673
max      106774.400000
Name: finalwgt, dtype: float64

In [9]:
# Keep the Cycle 10 frame under its own name; `df` is reassigned when the next cycle is loaded.
preg10 = df

Cycle 9

In [10]:
# Cycle 9 (2013-2015 files): load the columns of interest, then give the
# cycle-specific weight column the common name 'finalwgt' used by later cells.
usecols = ['birthord', 'outcome', 'prglngth', 'pregend1', 'nbrnaliv', 'wgt2013_2015']
df = read_preg_file('2013_2015_FemPregSetup.dct', '2013_2015_FemPregData.dat.gz', usecols)
# rename returns a new frame; reassigning avoids in-place mutation
df = df.rename(columns={'wgt2013_2015': 'finalwgt'})
print(df.shape)

(9358, 6)


In [11]:
# Frequency of each outcome code in Cycle 9; code 1 is the one selected as `live` later on.
values(df['outcome'])

1    6489
2     947
3      86
4    1469
5     118
6     249
Name: outcome, dtype: int64

In [12]:
# Frequency of each prglngth value in Cycle 9 (presumably pregnancy length in weeks -- confirm in the codebook).
values(df['prglngth'])

0        7
1       11
2       50
3      102
4      274
5      128
6      375
7      185
8      299
9      331
10     125
11      82
12     132
13     222
14      39
15      27
16      40
17      85
18      25
19      25
20      21
21       9
22      76
23       7
24      25
25      13
26      58
27      17
28      39
29      25
30      95
31      15
32     118
33      35
34      73
35     239
36     313
37     432
38     755
39    2384
40    1311
41     422
42     231
43      65
44       9
45       3
46       3
48       1
Name: prglngth, dtype: int64

In [13]:
# Frequency of each pregend1 code in Cycle 9; codes 5 and 6 are selected later as `sec` and `nosec`.
values(df['pregend1'])

1.0    1473
2.0      82
3.0     936
4.0     116
5.0    1842
6.0    4633
8.0      16
9.0       8
Name: pregend1, dtype: int64

In [14]:
# Frequency of each birthord value in Cycle 9.
values(df['birthord'])

1.0     3067
2.0     2002
3.0      937
4.0      322
5.0      106
6.0       32
7.0       14
8.0        6
9.0        2
10.0       1
Name: birthord, dtype: int64

In [15]:
# Summary statistics of the Cycle 9 weight column (renamed to 'finalwgt' when loaded).
df['finalwgt'].describe()

count     9358.000000
mean     10759.200147
std      11400.179222
min       1859.105000
25%       4340.053407
50%       7062.994158
75%      12629.322982
max      75399.410000
Name: finalwgt, dtype: float64

In [16]:
# Keep the Cycle 9 frame under its own name; `df` is reassigned when the next cycle is loaded.
preg09 = df

Cycle 8

In [17]:
# Cycle 8 (2011-2013 files): load the columns of interest, then give the
# cycle-specific weight column the common name 'finalwgt' used by later cells.
usecols = ['birthord', 'outcome', 'prglngth', 'pregend1', 'nbrnaliv', 'wgt2011_2013']
df = read_preg_file('2011_2013_FemPregSetup.dct', '2011_2013_FemPregData.dat.gz', usecols)
# rename returns a new frame; reassigning avoids in-place mutation
df = df.rename(columns={'wgt2011_2013': 'finalwgt'})
print(df.shape)

(9543, 6)


In [18]:
# Frequency of each outcome code in Cycle 8; code 1 is the one selected as `live` later on.
values(df['outcome'])

1    6670
2     993
3      71
4    1451
5     120
6     238
Name: outcome, dtype: int64

In [19]:
# Frequency of each prglngth value in Cycle 8 (presumably pregnancy length in weeks -- confirm in the codebook).
values(df['prglngth'])

0       11
1       10
2       45
3      115
4      318
5      132
6      400
7      127
8      283
9      332
10     111
11      79
12     138
13     226
14      32
15      32
16      32
17      90
18      15
19      24
20      21
21      10
22      89
23      15
24      29
25      12
26      61
27      17
28      25
29      22
30      80
31      35
32     103
33      34
34     104
35     237
36     290
37     418
38     710
39    2590
40    1310
41     426
42     277
43      63
44       8
45       2
46       1
47       2
Name: prglngth, dtype: int64

In [20]:
# Frequency of each pregend1 code in Cycle 8; codes 5 and 6 are selected later as `sec` and `nosec`.
values(df['pregend1'])

1.0    1459
2.0      69
3.0     981
4.0     121
5.0    1861
6.0    4795
8.0       7
9.0      12
Name: pregend1, dtype: int64

In [21]:
# Frequency of each birthord value in Cycle 8.
values(df['birthord'])

1.0    3141
2.0    2076
3.0     957
4.0     327
5.0     113
6.0      34
7.0      16
8.0       5
9.0       1
Name: birthord, dtype: int64

In [22]:
# Summary statistics of the Cycle 8 weight column (renamed to 'finalwgt' when loaded).
df['finalwgt'].describe()

count     9543.000000
mean     11040.049389
std      13033.547986
min       1714.541000
25%       3935.813218
50%       6641.818091
75%      12742.521135
max      85207.950000
Name: finalwgt, dtype: float64

In [23]:
# Keep the Cycle 8 frame under its own name; `df` is reassigned when the next cycle is loaded.
preg08 = df

Cycle 7

In [24]:
# Cycle 7 (2006-2010 files): load the columns of interest, then give the
# cycle-specific weight column the common name 'finalwgt' used by later cells.
usecols = ['birthord', 'outcome', 'prglngth', 'pregend1', 'nbrnaliv', 'wgtq1q16']
df = read_preg_file('2006_2010_FemPregSetup.dct', '2006_2010_FemPreg.dat.gz', usecols)
# rename returns a new frame; reassigning avoids in-place mutation
df = df.rename(columns={'wgtq1q16': 'finalwgt'})
print(df.shape)

(20492, 6)


In [25]:
# Frequency of each outcome code in Cycle 7; code 1 is the one selected as `live` later on.
values(df['outcome'])

1    14292
2     2295
3      166
4     2945
5      278
6      516
Name: outcome, dtype: int64

In [26]:
# Frequency of each prglngth value in Cycle 7; rows with prglngth of 50 or more are excluded from `valid` later.
values(df['prglngth'])

0       11
1       19
2      117
3      219
4      561
5      274
6      896
7      271
8      513
9      770
10     275
11     173
12     248
13     685
14      79
15      88
16      53
17     241
18      36
19      45
20      39
21      24
22     174
23      14
24      45
25      24
26     151
27      29
28      59
29      36
30     248
31      44
32     173
33      69
34     164
35     558
36     636
37     848
38    1311
39    6308
40    2285
41     910
42     553
43     185
44      19
45       3
46       4
48       3
52       1
57       1
Name: prglngth, dtype: int64

In [27]:
# Frequency of each pregend1 code in Cycle 7; codes 5 and 6 are selected later as `sec` and `nosec`.
values(df['pregend1'])

1.0     2958
2.0      165
3.0     2288
4.0      281
5.0     3420
6.0    10849
8.0        2
9.0        5
Name: pregend1, dtype: int64

In [28]:
# Frequency of each birthord value in Cycle 7.
values(df['birthord'])

1.0     6683
2.0     4415
3.0     2030
4.0      734
5.0      269
6.0      102
7.0       36
8.0       13
9.0        7
10.0       2
11.0       1
Name: birthord, dtype: int64

In [29]:
# Summary statistics of the Cycle 7 weight column (renamed to 'finalwgt' when loaded).
df['finalwgt'].describe()

count    20492.000000
mean      5195.422926
std       5909.631184
min         44.023984
25%       1480.501506
50%       2976.610130
75%       6351.218612
max      30226.354508
Name: finalwgt, dtype: float64

In [30]:
# Keep the Cycle 7 frame under its own name; `df` is reassigned when the next cycle is loaded.
preg07 = df

Cycle 6

In [31]:
# Cycle 6 (2002 files): the weight column is already called 'finalwgt',
# so unlike the later cycles no rename is needed after loading.
usecols = ['birthord', 'outcome', 'prglngth', 'pregend1', 'nbrnaliv', 'finalwgt']
df = read_preg_file(
    dct_file='2002FemPreg.dct',
    dat_file='2002FemPreg.dat.gz',
    usecols=usecols,
)
print(df.shape)

(13593, 6)


In [32]:
# Frequency of each outcome code in Cycle 6; code 1 is the one selected as `live` later on.
values(df['outcome'])

1    9148
2    1862
3     120
4    1921
5     190
6     352
Name: outcome, dtype: int64

In [33]:
# Frequency of each prglngth value in Cycle 6; rows with prglngth of 50 or more are excluded from `valid` later.
values(df['prglngth'])

0       15
1        9
2       78
3      151
4      412
5      181
6      543
7      175
8      409
9      594
10     137
11     202
12     170
13     446
14      29
15      39
16      44
17     253
18      17
19      34
20      18
21      37
22     147
23      12
24      31
25      15
26     117
27       8
28      38
29      23
30     198
31      29
32     122
33      50
34      60
35     357
36     329
37     457
38     609
39    4744
40    1120
41     591
42     328
43     148
44      46
45      10
46       1
47       1
48       7
50       2
Name: prglngth, dtype: int64

In [34]:
# Frequency of each pregend1 code in Cycle 6; codes 5 and 6 are selected later as `sec` and `nosec`.
values(df['pregend1'])

1.0    1921
2.0     120
3.0    1831
4.0     187
5.0    2015
6.0    7129
8.0      20
9.0      18
Name: pregend1, dtype: int64

In [35]:
# Frequency of each birthord value in Cycle 6.
values(df['birthord'])

1.0     4413
2.0     2874
3.0     1234
4.0      421
5.0      126
6.0       50
7.0       20
8.0        7
9.0        2
10.0       1
Name: birthord, dtype: int64

In [36]:
# Summary statistics of the Cycle 6 weight column.
df['finalwgt'].describe()

count     13593.000000
mean       8196.422280
std        9325.918114
min         118.656790
25%        3841.375308
50%        6256.592133
75%        9432.360931
max      261879.953864
Name: finalwgt, dtype: float64

In [37]:
# Keep the Cycle 6 frame under its own name; `df` is reassigned by the concatenation in the next cell.
preg06 = df

In [38]:
# Stack the five per-cycle frames (newest first) into one frame with a fresh
# 0..n-1 index; column order follows the first frame because sort=False.
cycles = [preg10, preg09, preg08, preg07, preg06]
df = pd.concat(
    cycles,
    ignore_index=True,
    sort=False,
)
df.shape

(62539, 6)

In [39]:
# Boolean mask of rows with outcome code 1 (treated here as live births);
# summing the mask counts the selected rows.
live = df['outcome'].eq(1)
live.sum()

43292

In [40]:
# Boolean mask of rows where nbrnaliv is greater than 1; the sum counts them.
multiple = df['nbrnaliv'].gt(1)
multiple.sum()

737

In [41]:
# Ratio of the `multiple` count to the `live` count, computed from the masks
# rather than from numbers copied out of earlier cell outputs, so it stays
# correct if the data changes.
multiple.sum() / live.sum()

0.01702393051834057

In [42]:
# Boolean mask of rows with pregend1 code 5; the sum counts them.
sec = df['pregend1'].eq(5)
sec.sum()

11003

In [43]:
# Boolean mask of rows with pregend1 code 6; the sum counts them.
nosec = df['pregend1'].eq(6)
nosec.sum()

32220

In [44]:
# Boolean mask of rows where nbrnaliv is exactly 1; the sum counts them.
single = df['nbrnaliv'].eq(1)
single.sum()

42542

In [45]:
# Rows that satisfy both the `single` and the `nosec` masks.
valid = nosec & single
valid.sum()

31909

In [46]:
# Tighten `valid` further by also requiring prglngth below 50, which drops
# the few rows with larger recorded values.
valid = single & nosec & df['prglngth'].lt(50)
valid.sum()

31906

In [47]:
def resample_by_cycle(dfs, column='finalwgt'):
    """Resamples each DataFrame separately and stacks the results.

    dfs: sequence of DataFrames, one per cycle
    column: string name of the column handed to resample_rows_weighted

    returns: DataFrame with a fresh integer index
    """
    resampled = []
    for cycle_df in dfs:
        # each cycle is resampled on its own, using that cycle's values in `column`
        resampled.append(resample_rows_weighted(cycle_df, column))
    return pd.concat(resampled, sort=False, ignore_index=True)

In [48]:
# Draw one resampling across all five cycles as a check; the result has the
# same shape as the concatenated frame.
cycles = [preg10, preg09, preg08, preg07, preg06]
sample = resample_by_cycle(cycles, column='finalwgt')
sample.shape

(62539, 6)

In [49]:
# Show which columns the resampled frame carries.
sample.columns

Index(['pregend1', 'nbrnaliv', 'prglngth', 'outcome', 'birthord', 'finalwgt'], dtype='object')

In [50]:
# If the samples file written by the next cell already exists, remove it so
# the resamplings are stored in a fresh file. The name must match the one
# passed to to_hdf below.
import os

if os.path.isfile('nsfg_samples.hdf5'):
    # os.remove instead of a shell `rm`, so the cell does not depend on the platform
    os.remove('nsfg_samples.hdf5')

In [51]:
# generate and store 101 resamplings, one per seed, under keys nsfg0 ... nsfg100
for i in range(101):
    # Seed with the loop index so each stored resampling can be regenerated.
    # NOTE(review): this assumes resample_rows_weighted draws from NumPy's
    # global random state -- confirm in utils.
    np.random.seed(i)
    sample = resample_by_cycle(cycles)

    key = f'nsfg{i}'
    # pass key by keyword: positional arguments after the path are deprecated in recent pandas
    sample.to_hdf('nsfg_samples.hdf5', key=key)

In [53]:
# read back one of the resamplings and time it
%time nsfg = pd.read_hdf('nsfg_samples.hdf5', 'nsfg0')
nsfg.shape

CPU times: user 12.4 ms, sys: 16 µs, total: 12.4 ms
Wall time: 11.4 ms


(62539, 6)