In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.api as sm

# Loading CPS data

In [2]:
# Read the CPS extract from the CSV file located in the working directory.
cps = pd.read_csv(filepath_or_buffer='cps_test.csv')

# Best Linear Predictor

In [3]:
# Linear projection of log_wage on educ and age, reported with HC1 standard errors.
ols1 = smf.ols(
    formula='log_wage ~ educ+age',
    data=cps,
).fit(cov_type='HC1')

print(ols1.summary())

                            OLS Regression Results                            
Dep. Variable:               log_wage   R-squared:                       0.209
Model:                            OLS   Adj. R-squared:                  0.209
Method:                 Least Squares   F-statistic:                     2684.
Date:                Thu, 06 Jul 2023   Prob (F-statistic):               0.00
Time:                        19:21:12   Log-Likelihood:                -28323.
No. Observations:               22715   AIC:                         5.665e+04
Df Residuals:                   22712   BIC:                         5.668e+04
Df Model:                           2                                         
Covariance Type:                  HC1                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      7.4145      0.039    187.891      0.0

In [4]:
# Coefficient vector (Intercept, educ, age) as a plain NumPy array, taken from
# the public `params` attribute rather than the private `_results` object.
beta1 = ols1.params.to_numpy()

# Creating Interval Data

In [5]:
def createIntervalData(df, Y, thresholds):
    """Turn the column ``df[Y]`` into interval data defined by ``thresholds``.

    Every value ``y`` is assigned to the half-open bracket
    ``[thresholds[i], thresholds[i + 1])`` that contains it. Four columns are
    added to ``df`` (in place): ``Y + '_l'`` and ``Y + '_u'`` hold the lower
    and upper end points of the bracket, ``'log' + Y + '_l'`` and
    ``'log' + Y + '_u'`` hold their natural logarithms.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the column ``Y``. It is modified in place.
    Y : str
        Name of the column to bracket.
    thresholds : array-like
        One-dimensional, non-decreasing bracket end points (at least two).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the four columns added.

    Raises
    ------
    ValueError
        If ``thresholds`` is not a non-decreasing 1-D sequence with at least
        two entries, or if any value of ``df[Y]`` lies outside
        ``[thresholds[0], thresholds[-1])`` (this includes NaN).
    """
    thresholds = np.asarray(thresholds)
    if thresholds.ndim != 1 or thresholds.size < 2:
        raise ValueError("thresholds must be a 1-D sequence with at least two values")
    if np.any(np.diff(thresholds) < 0):
        raise ValueError("thresholds must be sorted in non-decreasing order")

    # Number of thresholds <= y, minus one, is the index of the lower end point.
    values = df[Y].to_numpy()
    idx = np.searchsorted(thresholds, values, side='right') - 1

    # Validate before touching df: a negative index would silently wrap around
    # to the last threshold, and idx + 1 must stay a valid position.
    out_of_range = (idx < 0) | (idx >= thresholds.size - 1)
    if out_of_range.any():
        raise ValueError(
            "values of '{}' must lie in [thresholds[0], thresholds[-1]); "
            "{} value(s) fall outside".format(Y, int(out_of_range.sum()))
        )

    df[Y+'_l'] = thresholds[idx]
    df[Y+'_u'] = thresholds[idx + 1]

    df['log'+Y+'_l'] = np.log(df[Y+'_l'])
    df['log'+Y+'_u'] = np.log(df[Y+'_u'])

    return df

In [6]:
# Eleven evenly spaced probabilities (0, 0.1, ..., 1) give the decile cut points of wage.
wage_quantiles = np.array(cps['wage'].quantile(np.linspace(0, 1, 11)))
# Move the top cut point above the maximum wage so the maximum has an upper end point.
wage_quantiles[-1] = wage_quantiles[-1] + 1
# createIntervalData returns the frame it was given, so interval_cps and cps are the same object.
interval_cps = createIntervalData(df=cps, Y='wage', thresholds=wage_quantiles)

## Add an education dummy (educ > 12)

In [7]:
# Boolean indicator: True where educ is strictly greater than 12.
interval_cps['educ_d'] = interval_cps['educ'].gt(12)
interval_cps

Unnamed: 0.1,Unnamed: 0,age,wage,educ,log_wage,wage_l,wage_u,logwage_l,logwage_u,educ_d
0,0,22,12000,13,9.392662,9000.0,15000.0,9.104980,9.615805,True
1,1,21,3500,13,8.160518,1.0,9000.0,0.000000,9.104980,True
2,3,49,30000,13,10.308953,30000.0,36000.0,10.308953,10.491274,True
3,4,31,32000,16,10.373491,30000.0,36000.0,10.308953,10.491274,True
4,5,42,89630,21,11.403445,71000.0,362303.0,11.170435,12.800236,True
...,...,...,...,...,...,...,...,...,...,...
22710,26111,37,32000,14,10.373491,30000.0,36000.0,10.308953,10.491274,True
22711,26112,50,56000,12,10.933107,53000.0,71000.0,10.878047,11.170435,False
22712,26113,24,9000,12,9.104980,9000.0,15000.0,9.104980,9.615805,False
22713,26114,49,29000,12,10.275051,25000.0,30000.0,10.126631,10.308953,False


# OLS using dummy education instead of education

In [8]:
# Same specification as ols1, with the educ_d indicator in place of educ.
ols2 = smf.ols(
    formula='log_wage ~ educ_d+age',
    data=interval_cps,
).fit(cov_type='HC1')

print(ols2.summary())

                            OLS Regression Results                            
Dep. Variable:               log_wage   R-squared:                       0.180
Model:                            OLS   Adj. R-squared:                  0.179
Method:                 Least Squares   F-statistic:                     2341.
Date:                Thu, 06 Jul 2023   Prob (F-statistic):               0.00
Time:                        19:21:12   Log-Likelihood:                -28745.
No. Observations:               22715   AIC:                         5.750e+04
Df Residuals:                   22712   BIC:                         5.752e+04
Df Model:                           2                                         
Covariance Type:                  HC1                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          8.5907      0.027    318.

In [9]:
# Coefficient vector (Intercept, educ_d, age) as a plain NumPy array, taken from
# the public `params` attribute rather than the private `_results` object.
beta2 = ols2.params.to_numpy()

# OneDproj

In [10]:
# setBLP is a local module (setBLP.py); its oneDproj function is used in the cells below.
import setBLP
from importlib import reload # re-executes the module so edits to setBLP.py take effect without a kernel restart
reload(setBLP)

<module 'setBLP' from '/Users/yunyun/Desktop/Bounds/Bounds_Python/setBLP.py'>

In [11]:
# Bounds from setBLP.oneDproj using the log-wage brackets and educ.
r = setBLP.oneDproj(
    interval_cps['logwage_l'],
    interval_cps['logwage_u'],
    interval_cps['educ'],
)
r

[0.001786042221938278, 0.3685793555816805]

In [12]:
# Is the OLS coefficient on educ (beta1[1]) inside the interval r?
min(r) <= beta1[1] <= max(r)

True

In [13]:
# Bounds from setBLP.oneDproj using the log-wage brackets and age.
r = setBLP.oneDproj(
    interval_cps['logwage_l'],
    interval_cps['logwage_u'],
    interval_cps['age'],
)
r

[-0.004887858329797926, 0.14865963017398875]

# OneDproj_V2

In [14]:
def oneDproj_v2(yl:list, yu:list, x:pd.DataFrame, j: int):
    """Bounds for the coefficient on column ``j`` of ``x`` with an interval-valued outcome.

    Column ``j`` of ``x`` is regressed (OLS) on the remaining columns of ``x``
    and the residual ``r`` is kept. The returned bounds are

        [ sum(min(r*yl, r*yu)) / r'r ,  sum(max(r*yl, r*yu)) / r'r ].

    Parameters
    ----------
    yl, yu : array-like
        Lower and upper end points of the outcome, one entry per row of ``x``.
    x : pandas.DataFrame
        Regressor matrix. No constant is added here, so include one if wanted.
    j : int
        Positional index of the column of ``x`` whose coefficient is bounded.

    Returns
    -------
    list
        ``[lower, upper]``.
    """
    # linear projection of x_j on x_(-j)
    # Use the argument `x`; a global named `X` must not be required.
    r = sm.OLS(x.iloc[:, j], x.drop(x.columns[j], axis = 1)).fit().resid

    M1 = np.multiply(r, yl)
    M2 = np.multiply(r, yu)
    s = np.dot(r,r)
    # Convert to ndarray before summing so NaNs propagate exactly as they do
    # with the builtin sum (a pandas Series sum would skip them).
    lower = np.sum(np.asarray(np.minimum(M1, M2))) / s
    upper = np.sum(np.asarray(np.maximum(M1, M2))) / s
    return [lower, upper]

## x is education

In [15]:
# Design matrix: constant (position 0), educ (position 1), age (position 2).
X = sm.add_constant(interval_cps.loc[:, ['educ', 'age']])

In [16]:
# Bounds for the coefficient on column 1 of X (educ).
r = oneDproj_v2(
    yl=interval_cps['logwage_l'],
    yu=interval_cps['logwage_u'],
    x=X,
    j=1,
)
r

[-0.025567837808631744, 0.3456382284305771]

In [17]:
# Is the OLS coefficient on educ (beta1[1]) inside the interval r?
min(r) <= beta1[1] <= max(r)

True

## x is age

In [18]:
# Bounds for the coefficient on column 2 of X (age).
r = oneDproj_v2(
    yl=interval_cps['logwage_l'],
    yu=interval_cps['logwage_u'],
    x=X,
    j=2,
)
r

[-0.009051777215647453, 0.14432389300937354]

In [19]:
# Is the OLS coefficient on age (beta1[2]) inside the interval r?
min(r) <= beta1[2] <= max(r)

True

## x is the constant

In [20]:
# Bounds for the coefficient on column 0 of X (the constant).
r = oneDproj_v2(
    yl=interval_cps['logwage_l'],
    yu=interval_cps['logwage_u'],
    x=X,
    j=0,
)
r

[1.4596840360564338, 9.379493860118757]

In [21]:
# Is the OLS intercept (beta1[0]) inside the interval r?
min(r) <= beta1[0] <= max(r)

True

## x is dummy education

In [22]:
# Rebuild the design matrix with the educ_d indicator: constant, educ_d, age.
X = sm.add_constant(interval_cps.loc[:, ['educ_d', 'age']])
# Bounds for the coefficient on column 1 of X (educ_d).
r = oneDproj_v2(
    yl=interval_cps['logwage_l'],
    yu=interval_cps['logwage_u'],
    x=X,
    j=1,
)
r

[-0.7590053281491242, 1.7318699827566628]

In [23]:
# Is the OLS coefficient on educ_d (beta2[1]) inside the interval r?
min(r) <= beta2[1] <= max(r)

True

In [24]:
# With only a constant and educ as regressors, evaluate the bounds for educ (column 1).
X = sm.add_constant(interval_cps['educ'])
oneDproj_v2(
    yl=interval_cps['logwage_l'],
    yu=interval_cps['logwage_u'],
    x=X,
    j=1,
)

[0.0017860422218549156, 0.3685793555816234]

# Combined as OneDproj

In [25]:
def oneDproj(df, yl, yu, x, j=0):
    """Bounds for the coefficient on one regressor with an interval-valued outcome.

    A constant is prepended to the regressors ``df[x]``; the regressor of
    interest is regressed (OLS) on the constant and the remaining regressors,
    and with ``r`` the residual of that regression the bounds are

        [ sum(min(r*yl, r*yu)) / r'r ,  sum(max(r*yl, r*yu)) / r'r ].

    Parameters
    ----------
    df : pandas.DataFrame
        The data.
    yl, yu : str or array-like
        Lower / upper end points of the outcome, given either as values or as
        the name of a column of ``df``.
    x : str or list of str
        Name(s) of the independent variable(s); must be columns of ``df`` and
        convertible to float.
    j : int, default 0
        Index (within ``x``) of the independent variable of interest.

    Returns
    -------
    list
        ``[lower, upper]``.
    """
    if isinstance(yl, str):
        yl = df[yl]
    if isinstance(yu, str):
        yu = df[yu]

    # Cast the regressors to float so that boolean dummies (e.g. educ_d) can
    # also act as controls: a bool column next to the float constant would
    # otherwise produce an object-dtype regressor matrix.
    x = sm.add_constant(df[x].astype(float))

    # linear projection of x_j on x_(-j)
    # The constant sits at position 0, so the j-th name in x is column j + 1.
    col = j + 1
    r = sm.OLS(x.iloc[:, col], x.drop(x.columns[col], axis = 1)).fit().resid

    M1 = np.multiply(r, yl)
    M2 = np.multiply(r, yu)
    s = np.dot(r,r)
    # Convert to ndarray before summing so NaNs propagate exactly as they do
    # with the builtin sum (a pandas Series sum would skip them).
    lower = np.sum(np.asarray(np.minimum(M1, M2))) / s
    upper = np.sum(np.asarray(np.maximum(M1, M2))) / s
    return [lower, upper]

In [26]:
# Bounds for educ_d (index 0 in x), controlling for age.
oneDproj(
    df=interval_cps,
    yl='logwage_l',
    yu='logwage_u',
    x=['educ_d', 'age'],
    j=0,
)

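## TODO: problem if x=['educ_d','age'] and j=1 (presumably because the boolean `educ_d` column then enters the regressor matrix; to be confirmed)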

[-0.7590053281491242, 1.7318699827566628]