In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

In [2]:
cps = pd.read_csv('cps.csv')

# cleanig zero wage
cps = cps[cps['wage']>0]
cps

Unnamed: 0,age,wage,educ
0,22,12000,some college but no degree
1,21,3500,some college but no degree
3,49,30000,some college but no degree
4,31,32000,bachelor's degree
5,42,89630,doctorate degree
...,...,...,...
26111,37,32000,master's degree
26112,50,56000,high school diploma or equivalent
26113,24,9000,high school diploma or equivalent
26114,49,29000,high school diploma or equivalent


In [3]:
# Numerical value for education

#1.Creating a dictionary

educ_dict = {
    "grade 11": 11, 
    "some college but no degree" : 13, 
    "associate's degree, academic program" : 14,
    "grade 10" : 10, 
    "grades 7 or 8" : 8, 
    "grades 1, 2, 3, or 4": 4, 
    "associate's degree, occupational/vocational program": 14, 
    "high school diploma or equivalent": 12, 
    "grade 9": 9, 
    "none or preschool": 0, 
    "doctorate degree": 21,
    "bachelor's degree": 16, 
    "master's degree": 14, 
    "grades 5 or 6": 6, 
    "professional school degree": 14, 
    "12th grade, no diploma": 12
}

#2. transforming cps.educ

cps.replace({'educ': educ_dict}, inplace=True)

#3. log of wage
cps['log_wage'] = np.log(cps['wage'])

cps

Unnamed: 0,age,wage,educ,log_wage
0,22,12000,13,9.392662
1,21,3500,13,8.160518
3,49,30000,13,10.308953
4,31,32000,16,10.373491
5,42,89630,21,11.403445
...,...,...,...,...
26111,37,32000,14,10.373491
26112,50,56000,12,10.933107
26113,24,9000,12,9.104980
26114,49,29000,12,10.275051


# Best Linear Predictor

In [4]:
model1 = smf.ols(formula='log_wage ~ educ',data=cps).fit(cov_type='HC1')

print(model1.summary())

                            OLS Regression Results                            
Dep. Variable:               log_wage   R-squared:                       0.092
Model:                            OLS   Adj. R-squared:                  0.092
Method:                 Least Squares   F-statistic:                     2441.
Date:                Sat, 17 Sep 2022   Prob (F-statistic):               0.00
Time:                        18:12:04   Log-Likelihood:                -29890.
No. Observations:               22715   AIC:                         5.978e+04
Df Residuals:                   22713   BIC:                         5.980e+04
Df Model:                           1                                         
Covariance Type:                  HC1                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      8.5711      0.034    254.650      0.0

In [5]:
model2 = smf.ols(formula='log_wage ~ educ+age',data=cps).fit(cov_type='HC1')

print(model2.summary())

                            OLS Regression Results                            
Dep. Variable:               log_wage   R-squared:                       0.209
Model:                            OLS   Adj. R-squared:                  0.209
Method:                 Least Squares   F-statistic:                     2684.
Date:                Sat, 17 Sep 2022   Prob (F-statistic):               0.00
Time:                        18:12:04   Log-Likelihood:                -28323.
No. Observations:               22715   AIC:                         5.665e+04
Df Residuals:                   22712   BIC:                         5.668e+04
Df Model:                           2                                         
Covariance Type:                  HC1                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      7.4145      0.039    187.891      0.0

# Interval Data

In [6]:
wage_quantiles = np.array(cps['wage'].quantile([0.0, 0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]))
wage_quantiles[-1]+=1
logwage_quantiles = np.log(wage_quantiles)

In [7]:
cps['wage_qt'] = np.floor(cps['wage'].rank(method = 'max', pct=True)*10).astype(int).replace({10:9})
cps['wage_lower'] = wage_quantiles[cps['wage_qt']]
cps['wage_upper'] = wage_quantiles[cps['wage_qt']+1]

cps['logwage_lower'] = logwage_quantiles[cps['wage_qt']]
cps['logwage_upper'] = logwage_quantiles[cps['wage_qt']+1]

In [8]:
cps

Unnamed: 0,age,wage,educ,log_wage,wage_qt,wage_lower,wage_upper,logwage_lower,logwage_upper
0,22,12000,13,9.392662,1,9000.0,15000.0,9.104980,9.615805
1,21,3500,13,8.160518,0,1.0,9000.0,0.000000,9.104980
3,49,30000,13,10.308953,5,30000.0,36000.0,10.308953,10.491274
4,31,32000,16,10.373491,5,30000.0,36000.0,10.308953,10.491274
5,42,89630,21,11.403445,9,71000.0,362303.0,11.170435,12.800236
...,...,...,...,...,...,...,...,...,...
26111,37,32000,14,10.373491,5,30000.0,36000.0,10.308953,10.491274
26112,50,56000,12,10.933107,8,53000.0,71000.0,10.878047,11.170435
26113,24,9000,12,9.104980,1,9000.0,15000.0,9.104980,9.615805
26114,49,29000,12,10.275051,4,25000.0,30000.0,10.126631,10.308953


# Partial Identification

In [9]:
import setBLP
from importlib import reload # reload 
reload(setBLP)

<module 'setBLP' from '/Users/yunyun/Desktop/Bounds/Bounds_Python/setBLP.py'>

In [10]:
r = setBLP.oneDproj(cps.logwage_lower, cps.logwage_upper, cps.educ)

In [11]:
r

[0.001786042221938278, 0.3685793555816805]