# ASSIGNMENT 7 — OREOPOULOS (2006) PARTIAL REPLICATION
# Regression discontinuity (RD) + instrumental variables (IV) estimates for Great Britain

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from linearmodels.iv import IV2SLS

## Data Preprocessing

In [2]:
# 1. Load data
# The location of the .dta file is machine-specific. Allow it to be overridden
# through the ASSIGNMENT7_DATA environment variable; the default is the
# original path, so nothing changes on the machine the notebook was written on.
DATA_PATH = Path(
    os.environ.get(
        "ASSIGNMENT7_DATA",
        "/Users/macbookpro/Documents/DSDM/Econometrics/assignment_7/assignment7.dta",
    )
)
# Fail early with an actionable message instead of a bare reader error.
if not DATA_PATH.is_file():
    raise FileNotFoundError(
        f"Stata file not found at {DATA_PATH}; "
        "set the ASSIGNMENT7_DATA environment variable to its location."
    )
df = pd.read_stata(DATA_PATH)

In [3]:
# Preview the first rows to check that the columns loaded as expected
df.head()

Unnamed: 0,sex,age,datyear,agelfted,nireland,yearat14,missing_earn,wght,ones,yearat14_2,...,goodhealth,badhealth,unemployed,incsupport,yobirth,drop14,drop15,drop16,educb14,educb15
0,male,63,84.0,14.0,0.0,35.0,0.0,35,1.0,1225.0,...,0.628571,0.0,0.029412,0.028571,21.0,1.0,0.0,0.0,1.0,1.0
1,female,63,84.0,14.0,0.0,35.0,0.0,16,1.0,1225.0,...,0.75,0.0625,0.0,0.0625,21.0,1.0,0.0,0.0,1.0,1.0
2,male,63,84.0,15.0,0.0,35.0,0.0,3,1.0,1225.0,...,0.0,0.333333,0.0,0.0,21.0,1.0,0.0,0.0,0.0,1.0
3,female,63,84.0,15.0,0.0,35.0,0.0,1,1.0,1225.0,...,1.0,0.0,0.0,0.0,21.0,1.0,0.0,0.0,0.0,1.0
4,male,63,84.0,16.0,0.0,35.0,0.0,2,1.0,1225.0,...,1.0,0.0,0.0,0.0,21.0,1.0,0.0,0.0,0.0,0.0


In [4]:
# (rows, columns) of the data as loaded, before any sample restriction
df.shape

(30487, 28)

In [5]:
# Restrict the sample to Great Britain: keep rows whose Northern Ireland
# indicator (nireland) is 0, then report the remaining (rows, columns).
is_great_britain = df["nireland"].eq(0)
df = df.loc[is_great_britain]

df.shape

(22574, 28)

In [6]:
# Sorted distinct values of age and of yearat14, to see the support of each
# variable before applying the sample restrictions.
age_values = np.sort(df["age"].unique())
yearat14_values = np.sort(df["yearat14"].unique())
age_values, yearat14_values

(array([28, 29, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
        48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64],
       dtype=int16),
 array([35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47.,
        48., 49., 50., 51., 52., 53., 54., 55., 56., 57., 58., 59., 60.,
        61., 62., 63., 64., 65.], dtype=float32))

In [7]:
# Sample restrictions (Great Britain rows only at this point):
#   * respondents aged 25-64 at survey time
#   * cohorts that turned 14 between 35 and 65 -- yearat14 appears to be a
#     two-digit calendar year, i.e. 1935-1965 (NOTE: year aged 14, not birth year)
age_in_range = df["age"].between(25, 64)
cohort_in_range = df["yearat14"].between(35, 65)
df = df[age_in_range & cohort_in_range]
df.shape

(22574, 28)

In [8]:
# Count missing values per column, largest first, and show only the columns
# that actually have missing values.
missing_counts = df.isna().sum().sort_values(ascending=False)
nulls_df = missing_counts.reset_index()

nulls_df.loc[nulls_df[0].ne(0)]

Unnamed: 0,index,0
0,linc,12613
1,learn,9456
2,unemployed,7914
3,lhinc,7019
4,badhealth,671
5,goodhealth,671


In [9]:
# Keep only rows where every variable needed downstream is observed:
# learn, agelfted, drop15 and yearat14.
required_cols = ["learn", "agelfted", "drop15", "yearat14"]
df = df.dropna(subset=required_cols)

df.shape

(13118, 28)

In [10]:
# Cohort polynomial for the RD: the running variable is yearat14, centred on
# its sample mean (cc), plus its 2nd-4th powers (cc2, cc3, cc4).
cohort_year = df["yearat14"]
df["cohort"] = cohort_year
cohort_centered = cohort_year - cohort_year.mean()
df["cc"] = cohort_centered
for power in (2, 3, 4):
    df[f"cc{power}"] = cohort_centered ** power

In [11]:
# Age polynomial: age centred on its sample mean (cage) and the 2nd-4th
# powers of the centred value (cage2, cage3, cage4).
age_centered = df["age"] - df["age"].mean()
df["cage"] = age_centered
df["cage2"] = age_centered.pow(2)
df["cage3"] = age_centered.pow(3)
df["cage4"] = age_centered.pow(4)

In [12]:
# Create age dummies
# One indicator column per distinct age ("age_<value>"); drop_first omits the
# first (lowest) age so it serves as the reference category.
# dtype=float pins the indicators to a numeric dtype regardless of the pandas
# default (bool in pandas >= 2.0), so they enter the regressions as ordinary
# numeric columns.
age_dummies = pd.get_dummies(df['age'], prefix='age', drop_first=True, dtype=float)
# Remove any age_* dummies left over from a previous run of this cell before
# concatenating; otherwise re-running the cell would append a second copy of
# every dummy and leave df with duplicate column names.
df = pd.concat(
    [df.drop(columns=age_dummies.columns, errors="ignore"), age_dummies],
    axis=1,
)

In [13]:
# Keep the dummy column names so they can be reused when building a
# regression formula; display them as a check.
age_dummy_cols = age_dummies.columns

age_dummy_cols

Index(['age_29', 'age_33', 'age_34', 'age_35', 'age_36', 'age_37', 'age_38',
       'age_39', 'age_40', 'age_41', 'age_42', 'age_43', 'age_44', 'age_45',
       'age_46', 'age_47', 'age_48', 'age_49', 'age_50', 'age_51', 'age_52',
       'age_53', 'age_54', 'age_55', 'age_56', 'age_57', 'age_58', 'age_59',
       'age_60', 'age_61', 'age_62', 'age_63', 'age_64'],
      dtype='object')

In [14]:
# Re-check missingness on the current frame: per-column missing counts in
# descending order, restricted to columns with at least one missing value.
per_column_missing = df.isnull().sum()
nulls_df = per_column_missing.sort_values(ascending=False).reset_index()

nulls_df.loc[lambda counts: counts[0] != 0]

Unnamed: 0,index,0
0,linc,6686
1,lhinc,3563
2,unemployed,2654
3,goodhealth,2
4,badhealth,2


## RD-IV estimates (Table 2, columns 4–6)

In [15]:


# Column (4): 2SLS with `learn` as the dependent variable, `agelfted` as the
# endogenous regressor instrumented by `drop15`, and an intercept plus a
# quartic in centred cohort as exogenous controls. Observations are weighted
# by `wght`; standard errors are clustered on `cohort`.
formula_4 = "learn ~ 1 + cc + cc2 + cc3 + cc4 + [agelfted ~ drop15]"

model_4 = IV2SLS.from_formula(formula_4, data=df, weights=df["wght"])
res_4 = model_4.fit(cov_type="clustered", clusters=df["cohort"])

print("\n=== TABLE 2 Column (4) ===")
print(res_4.summary)


=== TABLE 2 Column (4) ===
                          IV-2SLS Estimation Summary                          
Dep. Variable:                  learn   R-squared:                      0.1181
Estimator:                    IV-2SLS   Adj. R-squared:                 0.1178
No. Observations:               13118   F-statistic:                    4037.4
Date:                Tue, Nov 25 2025   P-value (F-stat)                0.0000
Time:                        18:56:32   Distribution:                  chi2(5)
Cov. Estimator:             clustered                                         
                                                                              
                             Parameter Estimates                              
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
Intercept      7.1404     0.5439     13.127     0.0000      6.0743      8.2065
cc             0.0169   

In [16]:
# Column (5): same 2SLS specification as column (4) with a quartic in centred
# age added to the exogenous controls. Weighted by `wght`, standard errors
# clustered on `cohort`.
cohort_terms = "cc + cc2 + cc3 + cc4"
age_terms = "cage + cage2 + cage3 + cage4"
formula_5 = f"learn ~ 1 + {cohort_terms} + {age_terms} + [agelfted ~ drop15]"

fit_options_5 = {"cov_type": "clustered", "clusters": df["cohort"]}
model_5 = IV2SLS.from_formula(formula_5, data=df, weights=df["wght"])
res_5 = model_5.fit(**fit_options_5)

print("\n=== TABLE 2 Column (5) ===")
print(res_5.summary)



=== TABLE 2 Column (5) ===
                          IV-2SLS Estimation Summary                          
Dep. Variable:                  learn   R-squared:                      0.1956
Estimator:                    IV-2SLS   Adj. R-squared:                 0.1951
No. Observations:               13118   F-statistic:                    5599.0
Date:                Tue, Nov 25 2025   P-value (F-stat)                0.0000
Time:                        18:56:36   Distribution:                  chi2(9)
Cov. Estimator:             clustered                                         
                                                                              
                             Parameter Estimates                              
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
Intercept      7.1264     0.5539     12.866     0.0000      6.0408      8.2121
cc             0.0346   

In [17]:
# Column (6): 2SLS with the cohort quartic plus a full set of age dummies
# (joined into the formula as "age_a + age_b + ...") in place of the age
# polynomial. Weighted by `wght`, standard errors clustered on `cohort`.
#
# NOTE(review): the summary reports a negative F-statistic for this
# specification. The joint test has 38 restrictions while yearat14 takes at
# most 31 distinct values, so the cluster-robust covariance is presumably
# rank-deficient and the F-test should not be relied on -- TODO confirm.
dummy_part = " + ".join(age_dummy_cols)

formula_6 = f"learn ~ 1 + cc + cc2 + cc3 + cc4 + {dummy_part} + [agelfted ~ drop15]"

model_6 = IV2SLS.from_formula(formula_6, data=df, weights=df["wght"])
res_6 = model_6.fit(cov_type="clustered", clusters=df["cohort"])

print("\n=== TABLE 2 Column (6) ===")
print(res_6.summary)



=== TABLE 2 Column (6) ===
                          IV-2SLS Estimation Summary                          
Dep. Variable:                  learn   R-squared:                      0.1901
Estimator:                    IV-2SLS   Adj. R-squared:                 0.1878
No. Observations:               13118   F-statistic:                -4.407e+14
Date:                Tue, Nov 25 2025   P-value (F-stat)                1.0000
Time:                        18:56:45   Distribution:                 chi2(38)
Cov. Estimator:             clustered                                         
                                                                              
                             Parameter Estimates                              
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
Intercept      4.9527     0.6834     7.2471     0.0000      3.6132      6.2921
cc             0.0343   