In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.api as sm

# Loading CPS data

In [2]:
# Read the CPS extract; the path is relative to the notebook's working directory.
cps = pd.read_csv('cps_test.csv')

# Best Linear Predictor

In [3]:
# Short regression of log wage on education, fitted with the HC1 covariance estimator.
ols1 = smf.ols(formula='log_wage ~ educ',data=cps).fit(cov_type='HC1')

print(ols1.summary())

                            OLS Regression Results                            
Dep. Variable:               log_wage   R-squared:                       0.092
Model:                            OLS   Adj. R-squared:                  0.092
Method:                 Least Squares   F-statistic:                     2441.
Date:                Thu, 20 Jul 2023   Prob (F-statistic):               0.00
Time:                        20:51:40   Log-Likelihood:                -29890.
No. Observations:               22715   AIC:                         5.978e+04
Df Residuals:                   22713   BIC:                         5.980e+04
Df Model:                           1                                         
Covariance Type:                  HC1                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      8.5711      0.034    254.650      0.0

In [4]:
# Slope on `educ` from the short regression; the simulation cells below check
# whether this value falls inside the estimated bounds.
# Read by name from the public `params` Series instead of by position from the
# private `_results` object, so the lookup survives a change in the formula.
beta1 = ols1.params['educ']

In [5]:
# Regression of log wage on education and age, fitted with the HC1 covariance estimator.
ols2 = smf.ols(formula='log_wage ~ educ+age',data=cps).fit(cov_type='HC1')

print(ols2.summary())

                            OLS Regression Results                            
Dep. Variable:               log_wage   R-squared:                       0.209
Model:                            OLS   Adj. R-squared:                  0.209
Method:                 Least Squares   F-statistic:                     2684.
Date:                Thu, 20 Jul 2023   Prob (F-statistic):               0.00
Time:                        20:51:40   Log-Likelihood:                -28323.
No. Observations:               22715   AIC:                         5.665e+04
Df Residuals:                   22712   BIC:                         5.668e+04
Df Model:                           2                                         
Covariance Type:                  HC1                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      7.4145      0.039    187.891      0.0

In [6]:
# Coefficients of the two-regressor model, read by name from the public
# `params` Series rather than by position from the private `_results` object.
beta2_0 = ols2.params['Intercept']
beta2_1 = ols2.params['educ']
beta2_2 = ols2.params['age']

# Creating Interval Data

In [7]:
def createIntervalData(df, Y, thresholds):
    """Turn the point-valued column ``Y`` of ``df`` into interval data.

    Each value ``y`` of ``df[Y]`` is assigned to the bin
    ``[thresholds[k], thresholds[k+1])`` that contains it.

    The frame is modified IN PLACE (and also returned): four columns are
    added, or overwritten if they already exist:

    * ``Y + '_l'``, ``Y + '_u'``              -- lower / upper bin edge
    * ``'log' + Y + '_l'``, ``'log' + Y + '_u'`` -- natural log of those edges

    Pass ``df.copy()`` if the original frame must stay untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the column ``Y``.
    Y : str
        Name of the column to discretise.
    thresholds : array-like
        One-dimensional, non-decreasing bin edges (at least two).  Every value
        of ``df[Y]`` must satisfy ``thresholds[0] <= y < thresholds[-1]``.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``, with the four interval columns set.

    Raises
    ------
    ValueError
        If ``thresholds`` is not a sorted 1-D array with at least two edges,
        or if any value of ``df[Y]`` (including NaN) lies outside
        ``[thresholds[0], thresholds[-1])``.
    """
    thresholds = np.array(thresholds)
    if thresholds.ndim != 1 or thresholds.size < 2:
        raise ValueError("thresholds must be a 1-D sequence with at least two edges")
    if np.any(np.diff(thresholds) < 0):
        raise ValueError("thresholds must be sorted in non-decreasing order")

    # Number of thresholds <= y, minus one, is the index of the bin's lower edge.
    values = df[Y].to_numpy()
    idx = np.searchsorted(thresholds, values, side='right') - 1

    # idx == -1 would silently wrap around to the last threshold, and
    # idx == len-1 has no upper edge; reject both explicitly (NaN lands here too).
    out_of_range = (idx < 0) | (idx >= thresholds.size - 1)
    if out_of_range.any():
        raise ValueError(
            "%d value(s) of '%s' fall outside [thresholds[0], thresholds[-1])"
            % (int(out_of_range.sum()), Y)
        )

    df[Y+'_l'] = thresholds[idx]
    df[Y+'_u'] = thresholds[idx + 1]

    df['log'+Y+'_l'] = np.log(df[Y+'_l'])
    df['log'+Y+'_u'] = np.log(df[Y+'_u'])

    return df

In [8]:
# wage_quantiles = np.array(cps['wage'].quantile([0.0, 0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]))
# wage_quantiles[-1]+=1
# logwage_quantiles = np.log(wage_quantiles)

# cps['wage_qt'] = np.floor(cps['wage'].rank(method = 'max', pct=True)*10).astype(int).replace({10:9})
# cps['wage_lower'] = wage_quantiles[cps['wage_qt']]
# cps['wage_upper'] = wage_quantiles[cps['wage_qt']+1]

# cps['logwage_lower'] = logwage_quantiles[cps['wage_qt']]
# cps['logwage_upper'] = logwage_quantiles[cps['wage_qt']+1]

In [9]:
# Decile edges of the wage distribution (0%, 10%, ..., 100%).
wage_quantiles = cps['wage'].quantile(np.linspace(0, 1, 11)).to_numpy(copy=True)
# Push the top edge just above the maximum so the largest wage has an upper bound.
wage_quantiles[-1] += 1
cps = createIntervalData(cps, 'wage', wage_quantiles)

In [10]:
cps

Unnamed: 0.1,Unnamed: 0,age,wage,educ,log_wage,wage_l,wage_u,logwage_l,logwage_u
0,0,22,12000,13,9.392662,9000.0,15000.0,9.104980,9.615805
1,1,21,3500,13,8.160518,1.0,9000.0,0.000000,9.104980
2,3,49,30000,13,10.308953,30000.0,36000.0,10.308953,10.491274
3,4,31,32000,16,10.373491,30000.0,36000.0,10.308953,10.491274
4,5,42,89630,21,11.403445,71000.0,362303.0,11.170435,12.800236
...,...,...,...,...,...,...,...,...,...
22710,26111,37,32000,14,10.373491,30000.0,36000.0,10.308953,10.491274
22711,26112,50,56000,12,10.933107,53000.0,71000.0,10.878047,11.170435
22712,26113,24,9000,12,9.104980,9000.0,15000.0,9.104980,9.615805
22713,26114,49,29000,12,10.275051,25000.0,30000.0,10.126631,10.308953


# Partial Identification

In [11]:
import setBLP

In [12]:
# Vector and matrix views of the data used by the setBLP calls below.

# Lower / upper bounds of log wage.
yl = cps['logwage_l']
yu = cps['logwage_u']
# Single regressors and the two-column regressor frame.
x1 = cps['age']
x2 = cps['educ']
x = cps[['age', 'educ']]

## Testing

In [13]:
setBLP.oneDproj(yl,yu,x1)

[-0.004887858329797926, 0.14865963017398875]

In [14]:
setBLP.oneDproj(yl,yu,x2)

[0.001786042221938278, 0.3685793555816805]

In [15]:
setBLP.oneDproj(yl,yu,x,0)

[-0.009051777215653997, 0.14432389300936818]

In [16]:
setBLP.oneDproj(yl,yu,x,1)

[-0.025567837808786964, 0.3456382284304264]

In [17]:
setBLP.oneDproj(yl,yu,x)

[[-0.009051777215653997, 0.14432389300936818],
 [-0.025567837808786964, 0.3456382284304264]]

In [18]:
setBLP.oneDproj(yl, 'logwage_u', 'age', data = cps)

[-0.004887858329797926, 0.14865963017398875]

In [19]:
setBLP.oneDproj(yl, yu, 'educ', data = cps)

[0.001786042221938278, 0.3685793555816805]

In [20]:
setBLP.oneDproj(yl, yu, ['age','educ'], 0, data = cps)

[-0.009051777215653997, 0.14432389300936818]

In [21]:
setBLP.oneDproj('logwage_l', 'logwage_u', ['age','educ'], 1, data = cps)

[-0.025567837808786964, 0.3456382284304264]

In [22]:
setBLP.oneDproj(yl, yu, ['age','educ'], data = cps)

[[-0.009051777215653997, 0.14432389300936818],
 [-0.025567837808786964, 0.3456382284304264]]

# Testing CI1d  

In [23]:
H0 = [0, 0.15]

In [30]:
result = setBLP.CI1d(yl, yu, x1, H0)

In [31]:
result.bound

[-0.004887858329797926, 0.14865963017398875]

In [34]:
result.Htest.destStat 

0.7366733861557335

In [35]:
result.Htest.criticalVal 

0.7455607394764423

In [36]:
result.Htest.ConfidenceInterval

[9.291387124570699, 10.53487842690386]

In [37]:
result.dHtest.destStat 

0.2020137884130314

In [38]:
result.dHtest.criticalVal 

0.6138535541937418

In [39]:
result.dHtest.ConfidenceInterval

[9.292261007182418, 10.534004544292142]

# Simulations

## Parameters

In [25]:
# Explicitly seeded Mersenne Twister generator used for the resampling draws below.
rng = np.random.Generator(np.random.MT19937(15217))
# Also seed NumPy's legacy global RNG with setBLP's default seed
# (presumably setBLP draws from the global RNG internally; confirm against the module).
np.random.seed(setBLP.default_options.seed)

In [26]:
popSize = cps.shape[0]

In [27]:
Nobs = 100   # size of each subsample
Nsim = 5000  # number of simulations

In [28]:
# Number of intervals in the survey.
# NOTE(review): not referenced by any of the cells shown here; confirm it is still needed.
Nintervals = 8

## Using quantiles

In [29]:
# Resampling experiment on the interval columns currently stored in `cps`:
# draw Nobs rows, estimate the bounds for the educ slope, and record
# (a) whether beta1 lies inside them and (b) how wide they are.
c = 0      # number of draws whose bounds contain beta1
width = 0  # running sum of bound widths, averaged after the loop
for i in range(Nsim):
    indx = rng.integers(low=0, high=popSize, size=Nobs)  # row positions, drawn with replacement
    sample = cps.iloc[indx, :]
    r = setBLP.oneDproj(sample.logwage_l, sample.logwage_u, sample.educ)
    width += max(r) - min(r)
    c += int(min(r) <= beta1 <= max(r))

width /= Nsim

In [55]:
# One additional draw, so a single set of estimated bounds can be inspected in the next cell.
i = 0
indx = rng.integers(low=0, high=popSize, size=Nobs)
sample = cps.iloc[indx]
r = setBLP.oneDproj(sample['logwage_l'], sample['logwage_u'], sample['educ'])

In [56]:
r

[-0.14332789145653219, 0.16148943332841725]

In [57]:
c/Nsim

0.9926

In [58]:
width

0.38923085100548893

## Using fixed intervals, ver 1

In [59]:
# Ten equally spaced cut points from 1 to one above the largest wage (nine equal-width bins).
thresholds = np.linspace(1, cps['wage'].max() + 1, 10)
thresholds

array([1.00000000e+00, 4.02567778e+04, 8.05125556e+04, 1.20768333e+05,
       1.61024111e+05, 2.01279889e+05, 2.41535667e+05, 2.81791444e+05,
       3.22047222e+05, 3.62303000e+05])

In [60]:
# createIntervalData writes its columns into the frame it receives, so give it
# a copy: otherwise `interval_cps` is just another name for `cps` and the
# quantile-based interval columns built earlier are silently overwritten.
interval_cps = createIntervalData(cps.copy(), 'wage', thresholds)
interval_cps.head(15)

Unnamed: 0.1,Unnamed: 0,age,wage,educ,log_wage,wage_l,wage_u,logwage_l,logwage_u
0,0,22,12000,13,9.392662,1.0,40256.777778,0.0,10.603034
1,1,21,3500,13,8.160518,1.0,40256.777778,0.0,10.603034
2,3,49,30000,13,10.308953,1.0,40256.777778,0.0,10.603034
3,4,31,32000,16,10.373491,1.0,40256.777778,0.0,10.603034
4,5,42,89630,21,11.403445,80512.555556,120768.333333,11.296168,11.701629
5,6,35,229339,21,12.342957,201279.888889,241535.666667,12.212452,12.394772
6,7,42,39000,12,10.571317,1.0,40256.777778,0.0,10.603034
7,8,48,50000,12,10.819778,40256.777778,80512.555556,10.603034,11.296168
8,9,41,37500,13,10.532096,1.0,40256.777778,0.0,10.603034
9,10,41,52000,14,10.858999,40256.777778,80512.555556,10.603034,11.296168


In [61]:
# Same resampling experiment as above, run on `interval_cps`.
c = 0      # number of draws whose bounds contain beta1
width = 0  # running sum of bound widths, averaged after the loop
for i in range(Nsim):
    indx = rng.integers(low=0, high=popSize, size=Nobs)  # row positions, drawn with replacement
    sample = interval_cps.iloc[indx, :]
    r = setBLP.oneDproj(sample.logwage_l, sample.logwage_u, sample.educ)
    width += max(r) - min(r)
    c += int(min(r) <= beta1 <= max(r))
width /= Nsim

In [62]:
c/Nsim

1.0

In [63]:
width

2.260491454846311

## Using fixed intervals, ver 2

In [64]:
max(cps.wage)

362302

In [65]:
# Hand-picked wage brackets; the lowest edge is 1 rather than 0.
thresholds = np.array([1, 10_000, 20_000, 40_000, 75_000, 100_000, 200_000, 300_000, 500_000])

In [66]:
# createIntervalData writes its columns into the frame it receives, so give it
# a copy: otherwise `interval_cps` is just another name for `cps` and the
# interval columns already stored in `cps` are silently overwritten.
interval_cps = createIntervalData(cps.copy(), 'wage', thresholds)
interval_cps.head(15)

Unnamed: 0.1,Unnamed: 0,age,wage,educ,log_wage,wage_l,wage_u,logwage_l,logwage_u
0,0,22,12000,13,9.392662,10000,20000,9.21034,9.903488
1,1,21,3500,13,8.160518,1,10000,0.0,9.21034
2,3,49,30000,13,10.308953,20000,40000,9.903488,10.596635
3,4,31,32000,16,10.373491,20000,40000,9.903488,10.596635
4,5,42,89630,21,11.403445,75000,100000,11.225243,11.512925
5,6,35,229339,21,12.342957,200000,300000,12.206073,12.611538
6,7,42,39000,12,10.571317,20000,40000,9.903488,10.596635
7,8,48,50000,12,10.819778,40000,75000,10.596635,11.225243
8,9,41,37500,13,10.532096,20000,40000,9.903488,10.596635
9,10,41,52000,14,10.858999,40000,75000,10.596635,11.225243


In [None]:
# Same resampling experiment, run on the hand-picked brackets in `interval_cps`.
c = 0      # number of draws whose bounds contain beta1
width = 0  # running sum of bound widths, averaged after the loop
for i in range(Nsim):
    indx = rng.integers(low=0, high=popSize, size=Nobs)  # row positions, drawn with replacement
    sample = interval_cps.iloc[indx, :]
    r = setBLP.oneDproj(sample.logwage_l, sample.logwage_u, sample.educ)
    width += max(r) - min(r)
    c += int(min(r) <= beta1 <= max(r))

width /= Nsim

In [None]:
c/Nsim

In [None]:
width

### Conclusion:  
The identification interval is so wide that there is essentially no chance that the true $\beta_1$ lies outside the computed identification set.

(age, wage) pair