Group by age and generation. Join with replicate weights and calculate standard errors.

In [1]:
import pandas as pd
from math import sqrt

from IPython.display import display, HTML

Set options

In [2]:
pd.options.display.float_format = '{:,.0f}'.format

Function for building table of estimates and standard errors

In [3]:
def calc_estimates(group, n_replicates=240, fay_factor=0.5, z_score=1.645):
    """Compute a weighted estimate, its margin of error and its standard error.

    The estimate is the sum of the ``wpfinwgt`` column. The standard error is
    derived from the replicate weights: each replicate column is summed, the
    squared deviations of those sums from the estimate are added up, and the
    total is scaled by ``1 / (n_replicates * fay_factor ** 2)``.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to one group. Must contain a ``wpfinwgt`` column; every
        column whose name starts with ``repwt`` is treated as a replicate
        weight.
    n_replicates : int, default 240
        Number of replicate weights used in the variance constant. It is NOT
        inferred from the columns found in ``group``.
    fay_factor : float, default 0.5
        Value that is squared in the denominator of the variance constant.
    z_score : float, default 1.645
        Multiplier applied to the standard error to obtain the margin of
        error (1.645 is the usual two-sided 90% critical value).

    Returns
    -------
    pandas.Series
        Three values at positions 0, 1, 2: the estimate, the margin of error
        (``z_score`` times the standard error) and the standard error.
    """
    # Constant from Formula (16) in the SIPP Source and Accuracy Statement
    const = 1 / (n_replicates * fay_factor ** 2)

    cweight = group['wpfinwgt'].sum()

    # keep only the replicate-weight columns
    rep_cols = [name for name in group.columns if str(name).startswith("repwt")]

    # one total per replicate column
    rep_totals = group[rep_cols].sum()

    # sum of squared deviations of each replicate total from the estimate
    squared_dev = (rep_totals - cweight) ** 2
    var = const * squared_dev.sum()
    stder = sqrt(var)
    conf = stder * z_score

    # Positional labels 0/1/2 are kept because callers rename them afterwards
    return pd.Series([cweight, conf, stder])


Function for grouping by a given variable (age group or generation)

In [4]:
def group_age(joined, variable):
    """Build a table of estimates and standard errors, one row per group.

    ``joined`` is grouped by the column named ``variable`` and
    ``calc_estimates`` is applied to every group. The grouping key becomes an
    ordinary column and the three positional result columns are given
    descriptive names.
    """
    column_names = {
        0: "estimate",
        1: "conf_interval",
        2: "standard_error",
    }

    per_group = joined.groupby(variable).apply(calc_estimates)
    table = pd.DataFrame(per_group).reset_index()

    return table.rename(columns=column_names)
    

Function for joining with the replicate weights and grouping by both variables (age group and generation)

In [5]:
# takes a df of unique supporters and the replicates file
# returns two dataframes of estimates and standard errors:
# one by age group and one by generation
def group_and_join(supports, replicates):
    """Attach replicate weights to ``supports`` and summarise by both variables.

    ``supports`` is left-joined to ``replicates`` on ``uid``, so every row of
    ``supports`` is kept. Returns a tuple ``(ages, generations)`` produced by
    ``group_age`` for the ``age_group`` and ``generation`` columns.
    """
    joined = supports.merge(replicates, how="left", on="uid")

    by_age_group = group_age(joined, 'age_group')
    by_generation = group_age(joined, 'generation')

    return by_age_group, by_generation
    

Run for each file in Wave 1

In [6]:
wave1_files = ["../output/w1_supports_children.csv", "../output/w1_supports_parents.csv"]
# Read uid as text, exactly like the supports files below, so the merge key
# has the same dtype on both sides of the join.
wave1_replicates = pd.read_csv("../output/w1_replicates.csv", dtype = {"uid": "object"})

for each in wave1_files:
    # label the tables that follow with the bare file name
    print(each.split("/")[-1])

    supports = pd.read_csv(each, dtype = {"uid": "object"})

    # dfs is (age-group table, generation table)
    dfs = group_and_join(supports, wave1_replicates)

    # show both tables, in order
    display(*dfs)
    

w1_supports_children.csv


Unnamed: 0,age_group,estimate,conf_interval,standard_error
0,30-39,145244,49761,30250
1,40-49,565397,100725,61231
2,50-59,910687,122259,74321
3,60 years and over,924824,124207,75506
4,under 30,52046,26253,15959


Unnamed: 0,generation,estimate,conf_interval,standard_error
0,Boomer,1527262,154016,93627
1,Gen Z,1508,2490,1514
2,Generation x,607156,97983,59564
3,Millennials,101292,38155,23195
4,Silent,360980,79200,48146


w1_supports_parents.csv


Unnamed: 0,age_group,estimate,conf_interval,standard_error
0,30-39,1208062,142681,86736
1,40-49,1029450,141781,86189
2,50-59,784984,109440,66529
3,60 years and over,264215,60542,36804
4,under 30,1102351,159549,96990


Unnamed: 0,generation,estimate,conf_interval,standard_error
0,Boomer,1063121,129048,78448
1,Gen Z,108070,39523,24026
2,Generation x,1813100,194923,118494
3,Millennials,1362416,169199,102856
4,Silent,42355,26853,16324


Run for each file in Wave 4

In [7]:
wave4_files = ["../output/w4_supports_children.csv", "../output/w4_supports_parents.csv"]
# Read uid as text, exactly like the supports files below, so the merge key
# has the same dtype on both sides of the join.
wave4_replicates = pd.read_csv("../output/w4_replicates.csv", dtype = {"uid": "object"})

for each in wave4_files:
    # label the tables that follow with the bare file name
    print(each.split("/")[-1])

    supports = pd.read_csv(each, dtype = {"uid": "object"})

    # dfs is (age-group table, generation table)
    dfs = group_and_join(supports, wave4_replicates)

    # show both tables, in order
    display(*dfs)
    

w4_supports_children.csv


Unnamed: 0,age_group,estimate,conf_interval,standard_error
0,30-39,145244,94582,57497
1,40-49,565397,231598,140789
2,50-59,910687,1014765,616878
3,60 years and over,924824,208240,126590
4,under 30,52046,37378,22722


Unnamed: 0,generation,estimate,conf_interval,standard_error
0,Boomer,1527262,1087168,660892
1,Gen Z,1508,8970,5453
2,Generation x,607156,238693,145102
3,Millennials,101292,59185,35979
4,Silent,360980,252808,153683


w4_supports_parents.csv


Unnamed: 0,age_group,estimate,conf_interval,standard_error
0,30-39,1208062,849805,516599
1,40-49,1029450,650319,395330
2,50-59,784984,514167,312563
3,60 years and over,264215,201982,122786
4,under 30,1102351,624566,379675


Unnamed: 0,generation,estimate,conf_interval,standard_error
0,Boomer,1063121,361194,219571
1,Gen Z,108070,130794,79510
2,Generation x,1813100,1168401,710274
3,Millennials,1362416,296563,180282
4,Silent,42355,41064,24963


## Hypothesis test

Testing this statement: "In 2016, more Baby Boomers provided for children outside the home than Millennials provided for parents living outside the home."

In [8]:
# Reload the two Wave 4 supports files, keeping uid as text
w4_parents_main = pd.read_csv("../output/w4_supports_parents.csv", dtype = {"uid": "object"})
w4_children_main = pd.read_csv("../output/w4_supports_children.csv", dtype = {"uid": "object"})

# group_and_join returns (ages, generations); index 1 keeps the generation table
w4_par, w4_chi = (
    group_and_join(frame, wave4_replicates)[1]
    for frame in (w4_parents_main, w4_children_main)
)

In [9]:
# Generation-level estimates computed from w4_supports_parents.csv
w4_par

Unnamed: 0,generation,estimate,conf_interval,standard_error
0,Boomer,1063121,361194,219571
1,Gen Z,108070,130794,79510
2,Generation x,1813100,1168401,710274
3,Millennials,1362416,296563,180282
4,Silent,42355,41064,24963


In [10]:
# Generation-level estimates computed from w4_supports_children.csv
w4_chi

Unnamed: 0,generation,estimate,conf_interval,standard_error
0,Boomer,1527262,1087168,660892
1,Gen Z,1508,8970,5453
2,Generation x,607156,238693,145102
3,Millennials,101292,59185,35979
4,Silent,360980,252808,153683


In [11]:
# Rows being compared: Boomers in the children table, Millennials in the parents table
boomer = w4_chi[w4_chi['generation'] == "Boomer"]
millennial = w4_par[w4_par['generation'] == "Millennials"]

In [12]:
# standard error of the difference between two estimates
def get_sdiff(a, b):
    """Return the root of the summed squares of two standard errors.

    ``a`` and ``b`` are the standard errors of the two estimates. No
    covariance term is included.
    """
    return sqrt(a**2 + b**2)

In [13]:
# test whether the difference between two estimates is statistically significant
def test(a, b):
    """Return True when two estimates differ by more than 1.645 standard errors.

    Parameters
    ----------
    a, b : pandas.DataFrame
        Each must have ``estimate`` and ``standard_error`` columns. Only the
        first row of each frame is used.

    Returns
    -------
    bool
        False when the difference ``a - b`` lies strictly inside
        ``(-1.645 * sdiff, 1.645 * sdiff)``, where ``sdiff`` is the standard
        error of the difference; True otherwise.
    """
    sdiff = get_sdiff(a['standard_error'].array[0], b['standard_error'].array[0])

    diff = a["estimate"].array[0] - b["estimate"].array[0]

    margin = 1.645 * sdiff

    # Use logical `not`, never bitwise `~`: on a plain Python bool `~True` is
    # -2 and `~False` is -1 (both truthy), so `~` only gave the right answer
    # when the comparison happened to yield a numpy bool.
    return not (-margin < diff < margin)

In [14]:
test(boomer, millennial)

False

---
---
---