In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

# Loading CPS data

In [2]:
cps = pd.read_csv('cps.csv')

# Keep only strictly positive wages. The explicit .copy() makes the filtered
# frame independent of the raw one, so columns can be added or replaced on it
# later without pandas raising SettingWithCopyWarning.
cps = cps[cps['wage'] > 0].copy()
cps

Unnamed: 0,age,wage,educ
0,22,12000,some college but no degree
1,21,3500,some college but no degree
3,49,30000,some college but no degree
4,31,32000,bachelor's degree
5,42,89630,doctorate degree
...,...,...,...
26111,37,32000,master's degree
26112,50,56000,high school diploma or equivalent
26113,24,9000,high school diploma or equivalent
26114,49,29000,high school diploma or equivalent


In [3]:
# Numerical value for education

# 1. Dictionary from the CPS education label to a number of years of schooling.
# NOTE(review): "master's degree" and "professional school degree" are coded as
# 14 years, i.e. below "bachelor's degree" (16). This looks unintended; confirm
# the intended values, since the educ coefficient depends on them.

educ_dict = {
    "grade 11": 11,
    "some college but no degree": 13,
    "associate's degree, academic program": 14,
    "grade 10": 10,
    "grades 7 or 8": 8,
    "grades 1, 2, 3, or 4": 4,
    "associate's degree, occupational/vocational program": 14,
    "high school diploma or equivalent": 12,
    "grade 9": 9,
    "none or preschool": 0,
    "doctorate degree": 21,
    "bachelor's degree": 16,
    "master's degree": 14,
    "grades 5 or 6": 6,
    "professional school degree": 14,
    "12th grade, no diploma": 12
}

# 2. Transform cps.educ into a numeric column.
# Values that are not keys of educ_dict are passed through unchanged, so the
# cell can be re-run safely; pd.to_numeric then raises if any text label is
# left unmapped instead of silently keeping a non-numeric column.
educ_years = cps['educ'].map(lambda label: educ_dict.get(label, label))
cps = cps.assign(educ=pd.to_numeric(educ_years))

# 3. log of wage
cps = cps.assign(log_wage=np.log(cps['wage']))

cps

Unnamed: 0,age,wage,educ,log_wage
0,22,12000,13,9.392662
1,21,3500,13,8.160518
3,49,30000,13,10.308953
4,31,32000,16,10.373491
5,42,89630,21,11.403445
...,...,...,...,...
26111,37,32000,14,10.373491
26112,50,56000,12,10.933107
26113,24,9000,12,9.104980
26114,49,29000,12,10.275051


# Best Linear Predictor

In [4]:
# Regression of log wage on education only, fitted with the HC1 covariance type.
ols1 = (
    smf.ols(formula='log_wage ~ educ', data=cps)
    .fit(cov_type='HC1')
)

print(ols1.summary())

                            OLS Regression Results                            
Dep. Variable:               log_wage   R-squared:                       0.092
Model:                            OLS   Adj. R-squared:                  0.092
Method:                 Least Squares   F-statistic:                     2441.
Date:                Tue, 04 Oct 2022   Prob (F-statistic):               0.00
Time:                        00:05:24   Log-Likelihood:                -29890.
No. Observations:               22715   AIC:                         5.978e+04
Df Residuals:                   22713   BIC:                         5.980e+04
Df Model:                           1                                         
Covariance Type:                  HC1                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      8.5711      0.034    254.650      0.0

In [5]:
# Slope on educ from the first regression. Looked up by name through the public
# `params` Series rather than by position on the private `_results` object, so it
# keeps pointing at educ even if the formula's term order changes.
beta1 = ols1.params['educ']

In [6]:
# Same regression with age added as a second regressor, again with HC1 covariance.
ols2 = (
    smf.ols(formula='log_wage ~ educ+age', data=cps)
    .fit(cov_type='HC1')
)

print(ols2.summary())

                            OLS Regression Results                            
Dep. Variable:               log_wage   R-squared:                       0.209
Model:                            OLS   Adj. R-squared:                  0.209
Method:                 Least Squares   F-statistic:                     2684.
Date:                Tue, 04 Oct 2022   Prob (F-statistic):               0.00
Time:                        00:05:24   Log-Likelihood:                -28323.
No. Observations:               22715   AIC:                         5.665e+04
Df Residuals:                   22712   BIC:                         5.668e+04
Df Model:                           2                                         
Covariance Type:                  HC1                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      7.4145      0.039    187.891      0.0

# Creating Interval Data

In [7]:
def createIntervalData(df, Y, thresholds):
    """Turn the column ``Y`` of ``df`` into interval data defined by ``thresholds``.

    Each value y is assigned to the interval ``[thresholds[i], thresholds[i+1])``
    that contains it. Four columns are written into ``df`` itself (the frame is
    modified in place and the same object is returned):

    * ``Y + '_l'``, ``Y + '_u'``: lower and upper edge of the interval;
    * ``'log' + Y + '_l'``, ``'log' + Y + '_u'``: natural log of those edges.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column ``Y``; it receives the new columns.
    Y : str
        Name of the column to bracket.
    thresholds : array-like
        Interval edges, sorted in ascending order.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the four columns added or overwritten.

    Raises
    ------
    ValueError
        If ``thresholds`` is not sorted ascending, or if some value lies below
        ``thresholds[0]`` (such values used to wrap around silently and get the
        last threshold as their lower edge).
    IndexError
        If some value is at or above ``thresholds[-1]`` (or is NaN), i.e. has no
        upper edge.
    """
    thresholds = np.asarray(thresholds)
    if np.any(np.diff(thresholds) < 0):
        raise ValueError("thresholds must be sorted in ascending order")

    values = df[Y].to_numpy()

    # With side='right', searchsorted returns the number of thresholds <= y;
    # minus one gives the position of the interval's lower edge.
    idx = np.searchsorted(thresholds, values, side='right') - 1

    if (idx < 0).any():
        raise ValueError(
            "some values of %r lie below the first threshold" % Y
        )
    if (idx >= len(thresholds) - 1).any():
        raise IndexError(
            "some values of %r are at or above the last threshold (or are NaN)" % Y
        )

    df[Y+'_l'] = thresholds[idx]
    df[Y+'_u'] = thresholds[idx + 1]

    df['log'+Y+'_l'] = np.log(df[Y+'_l'])
    df['log'+Y+'_u'] = np.log(df[Y+'_u'])

    return df

In [8]:
# wage_quantiles = np.array(cps['wage'].quantile([0.0, 0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]))
# wage_quantiles[-1]+=1
# logwage_quantiles = np.log(wage_quantiles)

# cps['wage_qt'] = np.floor(cps['wage'].rank(method = 'max', pct=True)*10).astype(int).replace({10:9})
# cps['wage_lower'] = wage_quantiles[cps['wage_qt']]
# cps['wage_upper'] = wage_quantiles[cps['wage_qt']+1]

# cps['logwage_lower'] = logwage_quantiles[cps['wage_qt']]
# cps['logwage_upper'] = logwage_quantiles[cps['wage_qt']+1]

In [9]:
# Wage deciles (0%, 10%, ..., 100%) used as interval edges.
wage_quantiles = np.array(
    cps['wage'].quantile(np.linspace(0, 1, 11))
)
# Raise the top edge by one so the maximum wage lies strictly below it and
# falls inside the last interval.
wage_quantiles[-1] = wage_quantiles[-1] + 1
# createIntervalData writes the interval columns into cps and returns it.
interval_cps = createIntervalData(df=cps, Y='wage', thresholds=wage_quantiles)

In [10]:
# Inspect the interval-coded frame (wage_l / wage_u and their logs next to the exact wage).
interval_cps

Unnamed: 0,age,wage,educ,log_wage,wage_l,wage_u,logwage_l,logwage_u
0,22,12000,13,9.392662,9000.0,15000.0,9.104980,9.615805
1,21,3500,13,8.160518,1.0,9000.0,0.000000,9.104980
3,49,30000,13,10.308953,30000.0,36000.0,10.308953,10.491274
4,31,32000,16,10.373491,30000.0,36000.0,10.308953,10.491274
5,42,89630,21,11.403445,71000.0,362303.0,11.170435,12.800236
...,...,...,...,...,...,...,...,...
26111,37,32000,14,10.373491,30000.0,36000.0,10.308953,10.491274
26112,50,56000,12,10.933107,53000.0,71000.0,10.878047,11.170435
26113,24,9000,12,9.104980,9000.0,15000.0,9.104980,9.615805
26114,49,29000,12,10.275051,25000.0,30000.0,10.126631,10.308953


# Partial Identification

In [11]:
# Project-local module; the cells below use setBLP.oneDproj and setBLP.default_options.
import setBLP
from importlib import reload  # lets an already-imported module be re-executed
# Re-import setBLP so edits to setBLP.py are picked up without restarting the kernel.
reload(setBLP)

<module 'setBLP' from '/Users/yunyun/Desktop/Bounds/Bounds_Python/setBLP.py'>

In [12]:
# Apply setBLP.oneDproj to the interval-valued log wage (lower edge, upper edge)
# and education on the full data set.
# NOTE(review): presumably r holds the bounds of the identified set for the
# educ slope; confirm against setBLP.
r = setBLP.oneDproj(
    interval_cps['logwage_l'],
    interval_cps['logwage_u'],
    interval_cps['educ'],
)

In [13]:
# Does the point estimate beta1 lie between the smallest and largest element of r?
min(r) <= beta1 <= max(r)

True

# Simulations

## Parameters

In [14]:
# Dedicated generator for the sub-sampling below: Mersenne Twister seeded with 15217.
rng = np.random.Generator(np.random.MT19937(15217))
# Also seed NumPy's legacy global RNG with setBLP's default seed.
# NOTE(review): presumably setBLP draws from the global RNG; confirm.
np.random.seed(setBLP.default_options.seed)

In [15]:
# Number of rows in the cleaned CPS frame, treated as the population to sub-sample from.
popSize = len(cps)

In [16]:
# Size of each sub-sample drawn in a simulation round.
Nobs = 100
# Number of simulation rounds.
Nsim = 5000

In [17]:
# Number of intervals in the survey (not referenced by the cells visible in this notebook).
Nintervals = 8

## Using quantiles

In [18]:
# Monte Carlo check: c counts the sub-samples for which beta1 lies between the
# smallest and largest element of the oneDproj result.
c = 0
for i in range(Nsim):
    # Nobs row positions drawn independently (so with replacement) from the full data
    indx = rng.integers(low=0, high=popSize, size=Nobs)
    sample = interval_cps.iloc[indx, :]
    r = setBLP.oneDproj(sample['logwage_l'], sample['logwage_u'], sample['educ'])
    if min(r) <= beta1 <= max(r):
        c += 1

In [19]:
c/Nsim

0.9926

## Using fixed intervals, ver 1

In [20]:
# Ten equally spaced edges from 1 to (largest wage + 1), i.e. nine equal-width intervals.
thresholds = np.linspace(start=1, stop=cps.wage.max() + 1, num=10)
thresholds

array([1.00000000e+00, 4.02567778e+04, 8.05125556e+04, 1.20768333e+05,
       1.61024111e+05, 2.01279889e+05, 2.41535667e+05, 2.81791444e+05,
       3.22047222e+05, 3.62303000e+05])

In [21]:
# Re-code wage with the equal-width edges. createIntervalData writes into cps,
# so the interval columns from the previous design are overwritten.
interval_cps = createIntervalData(df=cps, Y='wage', thresholds=thresholds)
interval_cps.head(15)

Unnamed: 0,age,wage,educ,log_wage,wage_l,wage_u,logwage_l,logwage_u
0,22,12000,13,9.392662,1.0,40256.777778,0.0,10.603034
1,21,3500,13,8.160518,1.0,40256.777778,0.0,10.603034
3,49,30000,13,10.308953,1.0,40256.777778,0.0,10.603034
4,31,32000,16,10.373491,1.0,40256.777778,0.0,10.603034
5,42,89630,21,11.403445,80512.555556,120768.333333,11.296168,11.701629
6,35,229339,21,12.342957,201279.888889,241535.666667,12.212452,12.394772
7,42,39000,12,10.571317,1.0,40256.777778,0.0,10.603034
8,48,50000,12,10.819778,40256.777778,80512.555556,10.603034,11.296168
9,41,37500,13,10.532096,1.0,40256.777778,0.0,10.603034
10,41,52000,14,10.858999,40256.777778,80512.555556,10.603034,11.296168


In [22]:
# Monte Carlo check: c counts the sub-samples for which beta1 lies between the
# smallest and largest element of the oneDproj result.
c = 0
for i in range(Nsim):
    # Nobs row positions drawn independently (so with replacement) from the full data
    indx = rng.integers(low=0, high=popSize, size=Nobs)
    sample = interval_cps.iloc[indx, :]
    r = setBLP.oneDproj(sample['logwage_l'], sample['logwage_u'], sample['educ'])
    if min(r) <= beta1 <= max(r):
        c += 1

In [23]:
c/Nsim

1.0

## Using fixed intervals, ver 2

In [24]:
# Largest wage in the data, to check that the brackets chosen below cover it.
cps.wage.max()

362302

In [25]:
# Hand-picked wage brackets. The lowest edge is 1 rather than 0 so that its
# logarithm is finite.
thresholds = np.array([
    1,
    10_000,
    20_000,
    40_000,
    75_000,
    100_000,
    200_000,
    300_000,
    500_000,
])

In [26]:
# Re-code wage with the hand-picked brackets. createIntervalData writes into cps,
# so the interval columns from the previous design are overwritten.
interval_cps = createIntervalData(df=cps, Y='wage', thresholds=thresholds)
interval_cps.head(15)

Unnamed: 0,age,wage,educ,log_wage,wage_l,wage_u,logwage_l,logwage_u
0,22,12000,13,9.392662,10000,20000,9.21034,9.903488
1,21,3500,13,8.160518,1,10000,0.0,9.21034
3,49,30000,13,10.308953,20000,40000,9.903488,10.596635
4,31,32000,16,10.373491,20000,40000,9.903488,10.596635
5,42,89630,21,11.403445,75000,100000,11.225243,11.512925
6,35,229339,21,12.342957,200000,300000,12.206073,12.611538
7,42,39000,12,10.571317,20000,40000,9.903488,10.596635
8,48,50000,12,10.819778,40000,75000,10.596635,11.225243
9,41,37500,13,10.532096,20000,40000,9.903488,10.596635
10,41,52000,14,10.858999,40000,75000,10.596635,11.225243


In [27]:
# Monte Carlo check: c counts the sub-samples for which beta1 lies between the
# smallest and largest element of the oneDproj result.
c = 0
for i in range(Nsim):
    # Nobs row positions drawn independently (so with replacement) from the full data
    indx = rng.integers(low=0, high=popSize, size=Nobs)
    sample = interval_cps.iloc[indx, :]
    r = setBLP.oneDproj(sample['logwage_l'], sample['logwage_u'], sample['educ'])
    if min(r) <= beta1 <= max(r):
        c += 1

In [28]:
c/Nsim

0.9998

### Conclusion:  
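The identification interval is so wide that the true $\beta_1$ almost always lies inside the computed identification set: the simulated coverage is 0.9926 with quantile intervals, 1.0 with equal-width intervals, and 0.9998 with the hand-picked brackets.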