# Packages

In [1]:
using CSV, DataFrames
using Statistics

In [19]:
using Random

# Uploading CPS data

In [2]:
cps = DataFrame(CSV.File("c:\\data\\Bounds\\cps.csv")) # <-change this to the right directory

## Cleaning zero wage

cps = cps[cps.wage .> 0, :]
first(cps,5)

Unnamed: 0_level_0,age,wage,educ
Unnamed: 0_level_1,Int64,Int64,String
1,22,12000,some college but no degree
2,21,3500,some college but no degree
3,49,30000,some college but no degree
4,31,32000,bachelor's degree
5,42,89630,doctorate degree


In [3]:
# Numerical value for education

#1.Creating a dictionary

educ_dict = Dict( 
    "grade 11" => 11, 
    "some college but no degree" => 13, 
    "associate's degree, academic program" => 14,
    "grade 10" => 10, 
    "grades 7 or 8" => 8, 
    "grades 1, 2, 3, or 4" => 4, 
    "associate's degree, occupational/vocational program" => 14, 
    "high school diploma or equivalent" => 12, 
    "grade 9" => 9, 
    "none or preschool" => 0, 
    "doctorate degree" => 21,
    "bachelor's degree" => 16, 
    "master's degree" => 14, 
    "grades 5 or 6" => 6, 
    "professional school degree" => 14, 
    "12th grade, no diploma" => 12
    )

Dict{String, Int64} with 16 entries:
  "grade 11"                                            => 11
  "some college but no degree"                          => 13
  "associate's degree, academic program"                => 14
  "none or preschool"                                   => 0
  "doctorate degree"                                    => 21
  "grade 10"                                            => 10
  "bachelor's degree"                                   => 16
  "master's degree"                                     => 14
  "grades 5 or 6"                                       => 6
  "grades 7 or 8"                                       => 8
  "grades 1, 2, 3, or 4"                                => 4
  "associate's degree, occupational/vocational program" => 14
  "professional school degree"                          => 14
  "high school diploma or equivalent"                   => 12
  "grade 9"                                             => 9
  "12th grade, no diploma"            

In [4]:
#2. transforming cps.educ
f(x) = educ_dict[x]
cps.educ_num = f.(cps.educ)

#3. log of wage
cps.log_wage = log.(cps.wage)
first(cps,10)

Unnamed: 0_level_0,age,wage,educ,educ_num,log_wage
Unnamed: 0_level_1,Int64,Int64,String,Int64,Float64
1,22,12000,some college but no degree,13,9.39266
2,21,3500,some college but no degree,13,8.16052
3,49,30000,some college but no degree,13,10.309
4,31,32000,bachelor's degree,16,10.3735
5,42,89630,doctorate degree,21,11.4034
6,35,229339,doctorate degree,21,12.343
7,42,39000,high school diploma or equivalent,12,10.5713
8,48,50000,high school diploma or equivalent,12,10.8198
9,41,37500,some college but no degree,13,10.5321
10,41,52000,"associate's degree, occupational/vocational program",14,10.859


In [5]:
Nobs, = size(cps)

(22715, 5)

# Best Linear Predictor 

In [6]:
using FixedEffectModels

In [7]:
ols1 = reg(cps, @formula(log_wage ~ educ_num ), Vcov.robust())

                             Linear Model                             
Number of obs:               22715  Degrees of freedom:              1
R2:                          0.092  R2 Adjusted:                 0.092
F-Stat:                    2440.69  p-value:                     0.000
log_wage    | Estimate  Std.Error t value Pr(>|t|) Lower 95% Upper 95%
----------------------------------------------------------------------
educ_num    | 0.124936 0.00252889 49.4034    0.000  0.119979  0.129893
(Intercept) |  8.57109  0.0336583  254.65    0.000   8.50512   8.63707


In [8]:
β₁ = ols1.coef[2]

0.12493581688862578

In [9]:
reg(cps, @formula(log_wage ~ educ_num + age ), Vcov.robust())

                              Linear Model                              
Number of obs:                22715  Degrees of freedom:               2
R2:                           0.209  R2 Adjusted:                  0.209
F-Stat:                     2684.15  p-value:                      0.000
log_wage    |  Estimate   Std.Error t value Pr(>|t|) Lower 95% Upper 95%
------------------------------------------------------------------------
educ_num    |  0.110902   0.0024148  45.926    0.000  0.106169  0.115635
age         | 0.0377451 0.000712846 52.9498    0.000 0.0363478 0.0391423
(Intercept) |   7.41447   0.0394616 187.891    0.000   7.33712   7.49182


# Creating Interval Data

In [10]:
function createIntervalData(df::DataFrame,
                            Y::Symbol,
                            X::Vector{Symbol},
                            thresholds::Vector{<:Real})
    # The function accepts a dataframe and make interval data from the Y variable. It creates a new dataframe
    # which contains the lower and upper values for Y (based on the thresholds) and the covariates.
    
    data = DataFrame()  
    ql = x -> thresholds[sum(thresholds .<=x)]
    qu = x -> thresholds[sum(thresholds .<=x)+1]
    
    data.yl = ql.(df[!,Y])
    data.yu = qu.(df[!,Y])
    
    data.lyl = log.(data.yl)
    data.lyu = log.(data.yu)
    
    for s in X
        data[:,s] = df[:,s]
    end
    
    return data
end
    

createIntervalData (generic function with 1 method)

In [11]:
wage_quantiles = quantile(cps[!,:wage],[0.0, 0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0], sorted=false);

wage_quantiles[end] +=1

362303.0

In [12]:
interval_cps = createIntervalData(cps,:wage,[:educ_num,:age],wage_quantiles)

first(interval_cps,5)

Unnamed: 0_level_0,yl,yu,lyl,lyu,educ_num,age
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Int64,Int64
1,9000.0,15000.0,9.10498,9.61581,13,22
2,1.0,9000.0,0.0,9.10498,13,21
3,30000.0,36000.0,10.309,10.4913,13,49
4,30000.0,36000.0,10.309,10.4913,16,31
5,71000.0,362303.0,11.1704,12.8002,21,42


# Partial Indentification

In [13]:
include("C:\\Users\\The Group Leader\\Notebooks\\Research\\Bounds.jl-1\\src\\setBLP.jl")

Main.setBLP

In [14]:
using Main.setBLP

In [15]:
r = oneDproj(interval_cps.lyl,interval_cps.lyu,interval_cps.educ_num)

1×2 Matrix{Float64}:
 0.00178604  0.368579

In [16]:
import Base

In [17]:
#Overloading the function just in case

function Base.:(∈)(x::Real,v::Vector{<:Real})
    return minimum(v) <= x <=maximum(v)
end

function Base.:(∈)(x::Real,v::Matrix{<:Real})
    return minimum(v) <= x <=maximum(v)
end

In [18]:
β₁ ∈ r

true

## Simulations

#### parameter

In [21]:
rng = MersenneTwister(15217);

In [32]:
popSize, =size(interval_cps) #number of observations in the "population" (i.e. the whole cps sample)

(22715, 6)

In [44]:
Nobs = 100; #size of sub sample
Nsim = 5000; #number of simulations

In [67]:
Nintervals = 8; #number of intervals in the survey

### Using quantiles

In [45]:
c = 0

for i in 1:Nsim
    indx = rand(1:popSize,Nobs)
    sample = interval_cps[indx,:]
    r = oneDproj(sample.lyl,sample.lyu,sample.educ_num)
    c += β₁ ∈ r
end

In [47]:
c/Nsim

0.9944

### Using fixed intervals, ver 1

In [60]:
thresholds = convert(Vector,range(1,stop=maximum(cps.wage)+1,length=10))

10-element Vector{Float64}:
      1.0
  40256.77777777778
  80512.55555555556
 120768.33333333333
 161024.11111111112
 201279.88888888888
 241535.66666666666
 281791.44444444444
 322047.22222222225
 362303.0

In [62]:
interval_cps = createIntervalData(cps,:wage,[:educ_num,:age],thresholds)

first(interval_cps,15)

Unnamed: 0_level_0,yl,yu,lyl,lyu,educ_num,age
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Int64,Int64
1,1.0,40256.8,0.0,10.603,13,22
2,1.0,40256.8,0.0,10.603,13,21
3,1.0,40256.8,0.0,10.603,13,49
4,1.0,40256.8,0.0,10.603,16,31
5,80512.6,120768.0,11.2962,11.7016,21,42
6,201280.0,241536.0,12.2125,12.3948,21,35
7,1.0,40256.8,0.0,10.603,12,42
8,40256.8,80512.6,10.603,11.2962,12,48
9,1.0,40256.8,0.0,10.603,13,41
10,40256.8,80512.6,10.603,11.2962,14,41


In [63]:
c = 0

for i in 1:Nsim
    indx = rand(1:popSize,Nobs)
    sample = interval_cps[indx,:]
    r = oneDproj(sample.lyl,sample.lyu,sample.educ_num)
    c += β₁ ∈ r
end

In [64]:
c/Nsim 

1.0

### Using fixed intervals, ver 2

In [66]:
maximum(cps.wage)

36230.2

In [72]:
thresholds = [0, 10.0, 20, 40, 75, 100, 200, 300, 500 ] *1000
thresholds[1] =1.0

1.0

In [73]:
interval_cps = createIntervalData(cps,:wage,[:educ_num,:age],thresholds)

first(interval_cps,15)

Unnamed: 0_level_0,yl,yu,lyl,lyu,educ_num,age
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Int64,Int64
1,10000.0,20000.0,9.21034,9.90349,13,22
2,1.0,10000.0,0.0,9.21034,13,21
3,20000.0,40000.0,9.90349,10.5966,13,49
4,20000.0,40000.0,9.90349,10.5966,16,31
5,75000.0,100000.0,11.2252,11.5129,21,42
6,200000.0,300000.0,12.2061,12.6115,21,35
7,20000.0,40000.0,9.90349,10.5966,12,42
8,40000.0,75000.0,10.5966,11.2252,12,48
9,20000.0,40000.0,9.90349,10.5966,13,41
10,40000.0,75000.0,10.5966,11.2252,14,41


In [74]:
c = 0

for i in 1:Nsim
    indx = rand(1:popSize,Nobs)
    sample = interval_cps[indx,:]
    r = oneDproj(sample.lyl,sample.lyu,sample.educ_num)
    c += β₁ ∈ r
end

In [75]:
c/Nsim 

0.9994

**Conclusion:** The id interval is so wide that there is no chance that true $\beta_1$ is not in the computed identification set.