In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from linearmodels.iv import IV2SLS

### First, Replicate Angrist and Evans (1998)


Main concern with replicating this over time: compliers change a lot, especially since fertility has dropped so much. 
I suppose everyone is a complier for at least 9 months when you use twin births as an instrument (maybe not if abortion?). Very likely not enough power to do this.

Use the sex composition of the first two children (whether they are the same sex) as an instrument for having more than two children. Evaluate effects on:
- Weeks worked
- Hours worked
- Wages
- Labor force participation



In [42]:
# Read the raw extract for the replication into a DataFrame.
df = pd.read_csv(filepath_or_buffer="data/angrist_ipums.csv")

In [43]:
# Tidy the raw extract in a single chain:
#   1. lower-case every column name,
#   2. discard the columns this analysis does not use,
#   3. give the identifier and mother-level columns descriptive names.
#
# Columns not listed in the rename map (e.g. sex, age, race) keep their
# lower-cased names.
#
# "ftotinc_mom" is dropped below, so it has no entry in the rename map: a
# rename key for a column that no longer exists is silently ignored by pandas
# and would only suggest a "total_family_income" column that is never created.
# If family income is needed later, remove "ftotinc_mom" from the drop list
# and map it to "total_family_income" in the rename step.
df = (
    df.rename(columns=str.lower)
    .drop(
        columns=[
            "age_mom2",
            "wkswork1_mom2",
            "hrswork1_mom2",
            "ftotinc_mom2",
            "incwage_mom2",
            "ftotinc_mom",
            "labforce",
            "wkswork1",
            "hrswork1",
            "ftotinc",
            "incwage",
            "race_mom2",
            "labforce_mom2",
            "related_mom2",
            "raced",
        ]
    )
    .rename(
        columns={
            "sample": "sample_id",
            "serial": "household_id",
            "pernum": "person_id",
            "age_mom": "mom_age",
            "race_mom": "mom_race",
            "labforce_mom": "mom_in_labor_force",
            "wkswork1_mom": "mom_weeks_worked",  # in last year
            "hrswork1_mom": "mom_hours_worked",  # in last week
            "incwage_mom": "mom_wage",  # wage and salary income
        }
    )
)

In [44]:
# Rows with a missing mom_age are treated as not matched to a mother:
# report what share of the sample they make up, then remove them.
n_unmatched = df["mom_age"].isna().sum()
pct_unmatched = np.round(100 * n_unmatched / len(df), 2)
print(f"Excluding {pct_unmatched}% of sample that is not matched to a mother.")
df = df.dropna(subset=["mom_age"])

Excluding 7.29% of sample that is not matched to a mother.


In [52]:
# Build a per-mother key of the form "<household_id>_<momloc>".
# NOTE(review): the key does not include sample_id. If several samples are
# ever stacked in one frame, confirm whether household_id alone is still
# unique across them; otherwise mothers from different samples could collide.
df["mom_unique_id"] = (
    df["household_id"].astype(str).str.cat(df["momloc"].astype(str), sep="_")
)

Unnamed: 0,year,sample_id,household_id,person_id,momloc,relate,related,sex,age,race,related_mom,mom_age,mom_race,mom_in_labor_force,mom_weeks_worked,mom_hours_worked,mom_wage
0,1980,198002,2,4,2,3,301,2,16,1,201.0,38.0,1.0,1.0,2.0,0.0,125.0
1,1980,198002,2,5,2,3,301,2,10,1,201.0,38.0,1.0,1.0,2.0,0.0,125.0
2,1980,198002,2,6,2,3,301,2,9,1,201.0,38.0,1.0,1.0,2.0,0.0,125.0
3,1980,198002,2,7,2,3,301,2,2,1,201.0,38.0,1.0,1.0,2.0,0.0,125.0
4,1980,198002,6,3,2,3,301,1,15,1,201.0,38.0,1.0,2.0,51.0,36.0,13005.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
686187,1980,198002,942208,4,2,3,301,1,3,1,201.0,32.0,1.0,2.0,52.0,25.0,0.0
686188,1980,198002,942211,6,5,9,901,1,0,1,401.0,19.0,1.0,1.0,0.0,0.0,0.0
686189,1980,198002,942213,3,2,3,301,2,14,1,201.0,34.0,1.0,2.0,0.0,3.0,0.0
686190,1980,198002,942213,4,2,3,301,2,13,1,201.0,34.0,1.0,2.0,0.0,3.0,0.0


In [None]:
# All children born should be in the sample, i.e. be <= 18 years old
# All moms should be between 21 and 35 years old. Older moms are more likely to have a child who is > 18 years old.
# Including these moms would bias the sample of moms towards moms that have children at an older age.
# TODO: maybe lower 35 threshold if using earlier samples. Pick a threshold where ~95% of moms below that age have no children > 18 years old.