In [1]:
## Preamble: Package Loading
import numpy as np
import ipywidgets as ipw
from IPython.display import display,display_html
import matplotlib.pyplot as plt
from matplotlib import gridspec
import itertools as itr
import pandas as pd
import json
import kernel as kr
import psc_dbl_sumdisp_pres as psd 

In [2]:
%%html
<style>
/* Disabled rule, kept for reference (CSS comments must use slash-star syntax; a leading '#' is not a comment):
   .cell.selected~.unselected { display: none; } */
/* Hide the input area of every code cell that is not currently selected. */
 .cell.code_cell.unselected .input { display: none; }
</style>

<a id='index'></a>

<h1> Selection of Heterogeneous Instruments in Partially Linear Fixed Effects Panel Regression </h1>
<h2> Monte Carlo Results </h2>
<h3> By: Eric Penner </h3>

<h2> Summary </h2>

The following notebook contains the results of a Monte Carlo exercise conducted on the estimator detailed in 'psc.ipynb' and 'psc_proposal.pdf', using data sets generated by 'psc_dgp.ipynb' (see that notebook for details of the DGP). 

Important features of each of the following trials are presented here



* The number of datasets used from each component of each trial is 'nds = 500'


<h2> Index </h2>
<ul>
    <li> <a href='#var_desc'> Variables Description Table </a> <br><br>
    <li> <a href='#trial_1'> Trial Set 1:  Estimator comparison varying the total number of instruments</a> <br>
        <br>
    <ul> 
        <li> <a href='#trial_11'> Trial Set 1.1:  Estimator Comparison when $t_{inst} = 15$ </a> <br>
        <br>
        <li> <a href='#trial_12'> Trial Set 1.2:  Estimator Comparison when $t_{inst} = 30$ </a> <br>
        <br>
        <li> <a href='#trial_13'> Trial Set 1.3:  Estimator Comparison when $t_{inst} = 45$ </a> <br>
        <br>
        <li> <a href='#trial_14'> Trial Set 1.4:   Lasso Comparison where $t_{inst} \in \{15,30,45,100\}$ </a> <br>
        <br>
    </ul> 
     <li> <a href='#trial_2'> Trial Set 2.0: Properties of Lasso Estimator, Increasing Number of Cross Sections </a> <br>
       <br>
     <li> <a href='#trial_3'> Trial Set 3.0: Properties of Lasso Estimator, Increasing Number of Time Periods </a> <br>
       <br>
</ul>
<br><br><br><br><br><br><br><br><br><br>

In [3]:
# Results files for each trial set. The outer position is the trial index used
# by every later cell; the inner order matches the labels in `line_nms`.
inpt_filenames = [
    # Trial 1.1: 0
    ['pscout_9_4_1347.json', 'pscout_9_4_1505.json', 'pscout_9_4_1913.json', 'pscout_9_4_1816.json'],
    # Trial 1.2: 1
    ['pscout_9_4_1232.json', 'pscout_9_4_1837.json', 'pscout_9_4_1969.json', 'pscout_9_4_1655.json'],
    # Trial 1.3: 2
    ['pscout_9_4_1636.json', 'pscout_9_4_1191.json', 'pscout_9_4_1510.json'],
    # Trial 1.4: 3
    ['pscout_9_4_1816.json', 'pscout_9_4_1655.json', 'pscout_9_4_1510.json', 'pscout_9_5_1624.json'],
    # Trial 2.0: 4
    ['pscout_9_5_1624.json', 'pscout_9_6_1331.json', 'pscout_9_6_1606.json'],
    # Trial 3.0: 5
    ['pscout_9_5_1624.json', 'pscout_9_6_1537.json', 'pscout_9_6_1725.json'],
]

# Plot-line labels, parallel to `inpt_filenames` (same outer and inner ordering).
line_nms = [
    ['Oracle', 'Known', 'Unknown', 'Lasso'],                    # Trial 1.1
    ['Oracle', 'Known', 'Unknown', 'Lasso'],                    # Trial 1.2
    ['Oracle', 'Known', 'Lasso'],                               # Trial 1.3
    ['t_inst=15', 't_inst=30', 't_inst=45', 't_inst=100'],      # Trial 1.4
    ['ncs = 10', 'ncs = 25', 'ncs = 40'],                       # Trial 2.0
    ['ntp = 50', 'ntp = 100', 'ntp = 150'],                     # Trial 3.0
]

In [4]:
# Load every results file, one inner list per trial set. The number of trials
# processed is governed by `line_nms`, as in the rest of the notebook.
res_out = [[psd.psc_load(fname) for fname in inpt_filenames[k]]
           for k in range(len(line_nms))]

# Element 0 of each loaded result: the estimator's input dictionary.
estin_dcts = [[run[0] for run in trial] for trial in res_out]

# The DGP summary file shares the data file's name with 'pscdata' -> 'pscsum'.
dgp_sum_filenames = [[est['input_filename'].replace('pscdata','pscsum') for est in trial]
                     for trial in estin_dcts]

# Load the DGP summary that belongs to each results file.
dgp_dicts = [[psd.pscsum_load(sum_name) for sum_name in trial]
             for trial in dgp_sum_filenames]

# Element 0 of each DGP summary: the DGP's input dictionary.
dgpin_dcts = [[dgp[0] for dgp in trial] for trial in dgp_dicts]

# Combine estimator and DGP inputs; on duplicate keys the DGP value wins.
merged_dcts = [[{**est, **dgp_in} for est, dgp_in in zip(est_trial, dgp_trial)]
               for est_trial, dgp_trial in zip(estin_dcts, dgpin_dcts)]

# Elements 1 and 2 of each DGP summary: true primary / secondary coefficients.
true_bcoeffs = [[dgp[1] for dgp in trial] for trial in dgp_dicts]
true_acoeffs = [[dgp[2] for dgp in trial] for trial in dgp_dicts]

# Elements 1 and 3 of each result: primary / secondary coefficient estimates.
bcoeff = [[run[1] for run in trial] for trial in res_out]
acoeff = [[run[3] for run in trial] for trial in res_out]

# Elements 2 and 4 of each result: the tables paired with bcoeff / acoeff.
btables = [[run[2] for run in trial] for trial in res_out]
atables = [[run[4] for run in trial] for trial in res_out]

<a id = 'var_desc'></a>
<h3> Variable Description Table </h3>

A number of variables are used below, here are their descriptions. Refer back to 'psc.ipynb' or 'psc_dgp.ipynb' for more details.

Variable Name  |  Description  
--|--
k_H| Kernel number used for H function Estimation  
c_H |  Plug in bandwidth constant for H function Estimation
k_mvd  | Kernel number used for multivariate d>2 density estimation
c_mvd|  Plug in bandwidth constant for multivariate d>2 density estimation
k_uvd  |  Kernel number used for bivariate density  estimation 
c_uvd |  Plug in bandwidth used for bivariate density estimation
dep_nm|  Variable name of the dependent variable
en_nm |  Variable names of each endogenous variable
ex_nm |  Variable names of each exogenous variable
in_nm |  Variable names of instruments relevant to each cross section
err_vpro|  Vector of covariances used to construct the error cov matrix
ex_vpro|  Vector of covariances used to construct the exog variable cov matrix
inst_vpro | Vector of covariances used to construct the instrument cov matrix
frc |  Indicator for whether the functional form of control function is forced
input_filename|  Filename of dataset used to generate the results. 
kwnsub  | Indicator for whether the subset of instruments relevant to each cross section is known
n_end  |  Number of endogenous variables 
n_exo|  Number of exogenous variables
ncs  |  Number cross sections
nds  |  Number of dgp data sets
ntp |  Number of time periods
orcl |  Indicator for whether residuals $V$ are observed (=1) or not
r_seed|  Random number generator seed used to generate the data set
sec_pan|  Indicator for whether the secondary eqn data is panel or not
c_inst  |  Number of instruments relevant to each cross section   
t_inst|  Total number of instruments
inc | List of instruments relevant to at least one cross section
tin  |  Variable name of the time period index
cin  |  Variable name of the cross section index 
lasso | Indicator for lasso estimation
alph | lasso penalty value
epsil | Threshold for averaging "non zero" coefficients

<a href='#index'>Index </a>,<a href='#sl1'> Next </a>


<br><br><br><br><br><br><br><br><br>

<a id='trial_1'></a>

<h2> Trial Set 1: Estimator comparison by varying the total number of instruments </h2>

<a id='sl1'></a>
<a id='trial_11'></a>

<h3> Trial Set 1.1: Estimator Comparison when $t_{inst} = 15$ </h3> 


* Number of Cross Sections: 10


* Number of Time Periods: 50


* Number of Endogenous Regressors: 1


* Number of Exogenous Regressors: 1


* Total Number of Instruments: 15


* Number of Instruments Relevant to Each Cross Section: 3

<h3> Trial Set 1.1: Merged DGP and Estimator Function Input Dictionary Comparison </h3> 


In [35]:
# Trial Set 1.1 (index 0): render the merged estimator/DGP input dictionaries, one per results file.
# NOTE(review): the second argument (1) is a psd.indict_dsp option whose meaning is not visible here; confirm in psc_dbl_sumdisp_pres.
psd.indict_dsp(merged_dcts[0],1)

<a href='#index'>Index </a>,<a href='#sl3'> Next </a>,<a href='#var_desc'> Back </a>
<a id='sl2'></a><br><br><br><br><br><br><br><br><br>

<a id='sl3'></a><a id='sl4'></a>
<h3> Trial Set 1.1: True Secondary Equation Coefficients Comparison </h3> 

Here I interactively display the coefficient vectors $\alpha_{1jd}$ used to generate each data set (rows indicate cross section and equation), ordered by the position of the corresponding file name in 'inpt_filenames[0]' above. Here they should also be identical across data sets. 

**Note:** 

1.) Since 'sec_pan = 1' above, the secondary equations are panel type, so all nonzero coefficients in a column should be identical. 

2.) A zero coefficient in the following matrix means that the instrument it multiplies is not relevant to that cross section. 

3.) In accordance with the description above they should be identical across results data sets.

4.) The density of the secondary regression coefficient matrix is **25%**


In [6]:
# Trial Set 1.1 (index 0): render the true secondary-equation coefficients read from each DGP summary file.
# NOTE(review): the second argument (2) is a psd.indict_dsp option whose meaning is not visible here; confirm in psc_dbl_sumdisp_pres.
psd.indict_dsp(true_acoeffs[0],2)

<a href='#index'>Index </a>,<a href='#sl5'> Next </a>,<a href='#sl1'> Back </a>
<br><br><br><br><br><br><br><br><br>

<h3> Trial Set 1.1: Secondary Function Coefficient Estimates </h3>

Here I interactively show the sampling distribution of the elements of $\hat{\alpha}_{dj}$.  

In [7]:
# Trial Set 1.1 (index 0): show the secondary-equation coefficient estimates (acoeff) with their tables (atables), labelled by line_nms[0].
# NOTE(review): positional arguments 2 and 5 are passed straight to psd.cfs_dsp; their meaning is not visible here, confirm in psc_dbl_sumdisp_pres.
display(psd.cfs_dsp(acoeff[0],atables[0],2,5,line_nms[0]))

<br><br><br><br><br><br><br><br><br>

<h3> Trial Set 1.1: True Primary Equations Coefficients Comparison </h3>

Here I interactively display the coefficient vector $\beta_1$ used to generate each data set, ordered by the position of the corresponding file name in 'inpt_filenames[0]' above. Here they should be identical. 

In [8]:
# Trial Set 1.1 (index 0): render the true primary-equation coefficients read from each DGP summary file.
# NOTE(review): the second argument (1) is a psd.indict_dsp option whose meaning is not visible here; confirm in psc_dbl_sumdisp_pres.
psd.indict_dsp(true_bcoeffs[0],1)

<a id='sl5'></a>
<h3> Trial Set 1.1: Primary Function Coefficient Estimates </h3>


In [9]:
# Trial Set 1.1 (index 0): show the primary-equation coefficient estimates (bcoeff) with their tables (btables), labelled by line_nms[0].
# NOTE(review): positional arguments 1, 9 and the trailing 1 are passed straight to psd.cfs_dsp; their meaning is not visible here, confirm in psc_dbl_sumdisp_pres.
display(psd.cfs_dsp(bcoeff[0],btables[0],1,9,line_nms[0],1))

<a href='#index'>Index </a>,<a href='#sl6'> Next </a>,<a href='#sl4'> Back </a><br><br><br><br><br><br><br><br><br><br><br><br>

<a id='sl6'></a>
<a id='trial_12'></a>
<h2> Trial Set 1.2: Estimator Comparison when $t_{inst} = 30$ </h2> 

Here we examine the sampling distribution of $\hat{\beta}_1, \hat{\alpha}_{1}$


<ul> 
    <li> Number of Cross Sections: 10 <br> <br>
    <li> Number of Time Periods: 50 <br> <br>
    <li> Number of Endogenous Regressors: 1    <br><br>
    <li> Number of Exogenous Regressors: 1    <br><br>
    <li> Total Number of Instruments: 30    <br><br>
    <li> Number of Instruments Relevant to Each Cross Section: 3    <br><br>
</ul>

<h3> Trial Set 1.2: Merged DGP and Estimator Function Input Dictionary Comparison </h3> 


In [10]:
# Trial Set 1.2 (index 1): render the merged estimator/DGP input dictionaries, one per results file.
# NOTE(review): the second argument (1) is a psd.indict_dsp option whose meaning is not visible here; confirm in psc_dbl_sumdisp_pres.
psd.indict_dsp(merged_dcts[1],1)

<a href='#index'>Index </a>,<a href='#sl7'> Next </a>,<a href='#sl5'> Back </a>
<br><br><br><br><br><br><br><br><br><br><br><br>

<a id='sl7'></a>
<h3> Trial Set 1.2: True Secondary Equation Coefficients Comparison </h3> 

Here I interactively display the coefficient vectors $\alpha_{1jd}$ used to generate each data set (rows indicate cross section and equation), ordered by the position of the corresponding file name in 'inpt_filenames[1]' above. Here they should also be identical across data sets. 

**Note:** 

1.) Since 'sec_pan = 1' above, the secondary equations are panel type, so all nonzero coefficients in a column should be identical. 

2.) A zero coefficient in the following matrix means that the instrument it multiplies is not relevant to that cross section. 

3.) In accordance with the description above they should be identical across results data sets.


4.) The density of the secondary regression coefficient matrix is **13%**


In [11]:
# Trial Set 1.2 (index 1): render the true secondary-equation coefficients read from each DGP summary file.
# NOTE(review): the second argument (2) is a psd.indict_dsp option whose meaning is not visible here; confirm in psc_dbl_sumdisp_pres.
psd.indict_dsp(true_acoeffs[1],2)

<a href='#index'>Index </a>,<a href='#sl8'> Next </a>,<a href='#sl6'> Back </a>
<br><br><br><br><br><br><br><br><br><br><br><br>

<h3> Trial Set 1.2: Secondary Function Coefficient Estimates </h3>

Here I interactively show the sampling distribution of the elements of $\hat{\alpha}_{dj}$.  

In [12]:
# Trial Set 1.2 (index 1): show the secondary-equation coefficient estimates (acoeff) with their tables (atables), labelled by line_nms[1].
# NOTE(review): positional arguments 2 and 5 are passed straight to psd.cfs_dsp; their meaning is not visible here, confirm in psc_dbl_sumdisp_pres.
display(psd.cfs_dsp(acoeff[1],atables[1],2,5,line_nms[1]))

<br><br><br><br><br><br><br><br><br>

<h3> Trial Set 1.2: True Primary Equations Coefficients Comparison </h3>

Here I interactively display the coefficient vector $\beta_1$ used to generate each data set, ordered by the position of the corresponding file name in 'inpt_filenames[1]' above. Here they should be identical. 

In [13]:
# Trial Set 1.2 (index 1): render the true primary-equation coefficients read from each DGP summary file.
# NOTE(review): the second argument (1) is a psd.indict_dsp option whose meaning is not visible here; confirm in psc_dbl_sumdisp_pres.
psd.indict_dsp(true_bcoeffs[1],1)

<a id='sl8'></a>
<h3> Trial Set 1.2: Primary Function Coefficient Estimates </h3>

In [14]:
# Trial Set 1.2 (index 1): show the primary-equation coefficient estimates (bcoeff) with their tables (btables), labelled by line_nms[1].
# NOTE(review): positional arguments 1, 9 and the trailing 1 are passed straight to psd.cfs_dsp; their meaning is not visible here, confirm in psc_dbl_sumdisp_pres.
display(psd.cfs_dsp(bcoeff[1],btables[1],1,9,line_nms[1],1))

<a href='#index'>Index </a>,<a href='#sl9'> Next </a>,<a href='#sl7'> Back </a>
<br><br><br><br><br><br><br><br><br><br><br><br>

<a id='sl9'></a>
<a id='trial_13'></a>
<h2>Trial Set 1.3: Estimator Comparison when $t_{inst} = 45$ </h2> 

* Number of Cross Sections: 10


* Number of Endogenous Regressors: 1


* Number of Exogenous Regressors: 1


* Number of Time Periods: 50


* Total Number of Instruments: 45


* Number of Instruments Relevant to Each Cross Section: 3

<h3> Trial Set 1.3: Merged DGP and Estimator Function Input Dictionary Comparison </h3> 



In [15]:
# Trial Set 1.3 (index 2): render the merged estimator/DGP input dictionaries, one per results file.
# NOTE(review): the second argument (1) is a psd.indict_dsp option whose meaning is not visible here; confirm in psc_dbl_sumdisp_pres.
psd.indict_dsp(merged_dcts[2],1)

<a href='#index'>Index </a>,<a href='#sl10'> Next </a>,<a href='#sl8'> Back </a>
<br><br><br><br><br><br><br><br><br><br><br><br>

<a id = 'sl10'></a>
<h3> Trial Set 1.3: True Secondary Equation Coefficients Comparison </h3> 

Here I interactively display the coefficient vectors $\alpha_{1jd}$ used to generate each data set (rows indicate cross section and equation), ordered by the position of the corresponding file name in 'inpt_filenames[2]' above. Here they should also be identical across data sets. 

**Note:** 

1.) Since 'sec_pan = 1' above, the secondary equations are panel type, so all nonzero coefficients in a column should be identical. 

2.) A zero coefficient in the following matrix means that the instrument it multiplies is not relevant to that cross section. 

3.) In accordance with the description above they should be identical across results data sets.

4.) The density of the secondary regression coefficient matrix is **8%**


In [16]:
# Trial Set 1.3 (index 2): render the true secondary-equation coefficients read from each DGP summary file.
# NOTE(review): the second argument (2) is a psd.indict_dsp option whose meaning is not visible here; confirm in psc_dbl_sumdisp_pres.
psd.indict_dsp(true_acoeffs[2],2)

<a href='#index'>Index </a>,<a href='#sl11'> Next </a>,<a href='#sl9'> Back </a>

<br><br><br><br><br><br><br><br><br><br><br><br>

<h3> Trial Set 1.3: Secondary Function Coefficient Estimates </h3>

Here I interactively show the sampling distribution of the elements of $\hat{\alpha}_{dj}$.  

In [17]:
# Trial Set 1.3 (index 2): show the secondary-equation coefficient estimates (acoeff) with their tables (atables), labelled by line_nms[2].
# NOTE(review): positional arguments 2 and 5 are passed straight to psd.cfs_dsp; their meaning is not visible here, confirm in psc_dbl_sumdisp_pres.
display(psd.cfs_dsp(acoeff[2],atables[2],2,5,line_nms[2]))

<br><br><br><br><br><br><br><br><br><br><br>

<h3> Trial Set 1.3: True Primary Equations Coefficients Comparison </h3>

Here I interactively display the coefficient vector $\beta_1$ used to generate each data set, ordered by the position of the corresponding file name in 'inpt_filenames[2]' above. Here they should be identical. 

In [18]:
# Trial Set 1.3 (index 2): render the true primary-equation coefficients read from each DGP summary file.
# NOTE(review): the second argument (1) is a psd.indict_dsp option whose meaning is not visible here; confirm in psc_dbl_sumdisp_pres.
psd.indict_dsp(true_bcoeffs[2],1)

<a id = 'sl11'></a>
<h3> Trial Set 1.3: Primary Function Coefficient Estimates </h3>

Here I show the sampling distribution of the elements of $\hat{\beta}_1$.  

In [19]:
# Trial Set 1.3 (index 2): show the primary-equation coefficient estimates (bcoeff) with their tables (btables), labelled by line_nms[2].
# NOTE(review): positional arguments 1, 8.5 and the trailing 1 are passed straight to psd.cfs_dsp; their meaning is not visible here, confirm in psc_dbl_sumdisp_pres.
display(psd.cfs_dsp(bcoeff[2],btables[2],1,8.5,line_nms[2],1))

<a href='#index'>Index </a>,<a href='#sl12'> Next </a>,<a href='#sl10'> Back </a> <br><br><br><br><br><br><br><br><br><br><br><br>

<a id = 'sl12'> </a>
<a id='trial_14'></a>
<h2>Trial Set 1.4: Lasso Comparison where $t_{inst} \in \{15,30,45,100\}$ </h2> 

Here we examine the sampling distribution of $\hat{\beta}_1, \hat{\alpha}_{1}$.

* Number of Cross Sections: 10


* Number of time periods: 50


* Number of Endogenous Regressors: 1


* Number of Exogenous Regressors: 1


* Number of Instruments Relevant to Each Cross Section: 3

<h3> Trial Set 1.4: Merged DGP and Estimator Function Input Dictionary Comparison </h3> 


In [20]:
# Trial Set 1.4 (index 3): render the merged estimator/DGP input dictionaries, one per results file.
# NOTE(review): the second argument (1) is a psd.indict_dsp option whose meaning is not visible here; confirm in psc_dbl_sumdisp_pres.
psd.indict_dsp(merged_dcts[3],1)

<a href='#index'>Index </a>,<a href='#sl14'> Next </a>,<a href='#sl11'> Back </a>
<br><br><br><br><br><br><br><br><br><br><br><br>

<h3> Trial Set 1.4: True Secondary Equation Coefficients Comparison </h3> 

Here I interactively display the coefficient vectors $\alpha_{1jd}$ used to generate each data set (rows indicate cross section and equation), ordered by the position of the corresponding file name in 'inpt_filenames[3]' above. Because the total number of instruments differs across these data sets, the coefficient matrices have different dimensions and are not expected to be identical. 

**Note:** 

1.) Since 'sec_pan = 1' above, the secondary equations are panel type, so all nonzero coefficients in a column should be identical. 

2.) A zero coefficient in the following matrix means that the instrument it multiplies is not relevant to that cross section. 

3.) Because $t_{inst}$ differs across the data sets compared here, the coefficient matrices are not expected to be identical across results data sets.

4.) The densities of the secondary regression coefficient matrices are **26%, 13%, 8%, 3%**


In [21]:
# Trial Set 1.4 (index 3): render the true secondary-equation coefficients read from each DGP summary file.
# NOTE(review): the second argument (2) is a psd.indict_dsp option whose meaning is not visible here; confirm in psc_dbl_sumdisp_pres.
psd.indict_dsp(true_acoeffs[3],2)

<a href='#index'>Index </a>,<a href='#sl14'> Next </a>,<a href='#sl12'> Back </a> <br><br><br><br><br><br><br><br><br><br>

<h3> Trial Set 1.4: Secondary Function Coefficient Estimates </h3>

Here I interactively show the sampling distribution of the elements of $\hat{\alpha}_{dj}$.  

In [22]:
# Trial Set 1.4 (index 3): show the secondary-equation coefficient estimates (acoeff) with their tables (atables), labelled by line_nms[3].
# NOTE(review): positional arguments 2 and 5 are passed straight to psd.cfs_dsp; their meaning is not visible here, confirm in psc_dbl_sumdisp_pres.
display(psd.cfs_dsp(acoeff[3],atables[3],2,5,line_nms[3]))

<br><br><br><br><br><br><br><br><br><br><br>

<h3> Trial Set 1.4: True Primary Equations Coefficients Comparison </h3>

In [23]:
# Trial Set 1.4 (index 3): render the true primary-equation coefficients read from each DGP summary file.
# NOTE(review): the second argument (1) is a psd.indict_dsp option whose meaning is not visible here; confirm in psc_dbl_sumdisp_pres.
psd.indict_dsp(true_bcoeffs[3],1)

<a id ='sl14'> </a>

<h3> Trial Set 1.4: Primary Function Coefficient Estimates </h3>

In [24]:
# Trial Set 1.4 (index 3): show the primary-equation coefficient estimates (bcoeff) with their tables (btables), labelled by line_nms[3].
# NOTE(review): positional arguments 1, 8 and the trailing 1 are passed straight to psd.cfs_dsp; their meaning is not visible here, confirm in psc_dbl_sumdisp_pres.
display(psd.cfs_dsp(bcoeff[3],btables[3],1,8,line_nms[3],1))

<a href='#index'>Index </a>,<a href='#sl15'> Next </a>,<a href='#sl12'>Back </a>
<br><br><br><br><br><br><br><br><br><br><br><br>

<a id = 'sl15'> </a>
<a id='trial_2'></a>
<h2>Trial Set 2.0:  Properties of Lasso Estimator, Increasing Number of Cross Sections  </h2> 

Here we examine the sampling distribution of $\hat{\beta}_1$ and $\hat{\alpha}_{1}$ as the number of cross sections increases.

* Number of Cross Sections: 10,25,40


* Number of Time Periods: 50


* Number of Endogenous Regressors: 1


* Number of Exogenous Regressors: 1


* Total Number of Instruments: 100


* Number of Instruments Relevant to Each Cross Section: 3

<h3> Trial Set 2.0: Merged DGP and Estimator Function Input Dictionary Comparison </h3> 

In [25]:
# Trial Set 2.0 (index 4): render the merged estimator/DGP input dictionaries, one per results file.
# NOTE(review): the second argument (1) is a psd.indict_dsp option whose meaning is not visible here; confirm in psc_dbl_sumdisp_pres.
psd.indict_dsp(merged_dcts[4],1)

<a href='#index'>Index </a>,<a href='#sl16'> Next </a>,<a href='#sl14'>Back </a>
<br><br><br><br><br><br><br><br><br><br><br><br>

<h3> Trial Set 2.0: True Secondary Equation Coefficients Comparison </h3> 

Here I interactively display the coefficient vectors $\alpha_{1jd}$ used to generate each data set (rows indicate cross section and equation), ordered by the position of the corresponding file name in 'inpt_filenames[4]' above. Because the number of cross sections differs across these data sets, the coefficient matrices have different dimensions and are not expected to be identical. 

**Note:** 

1.) Since 'sec_pan = 1' above, the secondary equations are panel type, so all nonzero coefficients in a column should be identical. 

2.) A zero coefficient in the following matrix means that the instrument it multiplies is not relevant to that cross section. 


In [26]:
# Trial Set 2.0 (index 4): render the true secondary-equation coefficients read from each DGP summary file.
# NOTE(review): the second argument (2) is a psd.indict_dsp option whose meaning is not visible here; confirm in psc_dbl_sumdisp_pres.
psd.indict_dsp(true_acoeffs[4],2)

<br><br><br><br><br><br><br><br><br>

<h3> Trial Set 2.0: Secondary Function Coefficient Estimates </h3>

Here I interactively show the sampling distribution of the elements of $\hat{\alpha}_{dj}$.  

In [27]:
# Trial Set 2.0 (index 4): show the secondary-equation coefficient estimates (acoeff) with their tables (atables), labelled by line_nms[4].
# NOTE(review): positional arguments 2 and 5 are passed straight to psd.cfs_dsp; their meaning is not visible here, confirm in psc_dbl_sumdisp_pres.
display(psd.cfs_dsp(acoeff[4],atables[4],2,5,line_nms[4]))

<br><br><br><br><br><br><br><br><br><br><br>

<h3> Trial Set 2.0: True Primary Equations Coefficients Comparison </h3>

In [28]:
# Trial Set 2.0 (index 4): render the true primary-equation coefficients read from each DGP summary file.
# NOTE(review): the second argument (1) is a psd.indict_dsp option whose meaning is not visible here; confirm in psc_dbl_sumdisp_pres.
psd.indict_dsp(true_bcoeffs[4],1)

<a id ='sl16'> </a>

<h3> Trial Set 2.0: Primary Function Coefficients </h3>

Here I show the sampling distribution of the elements of $\hat{\beta}_1$.  

In [29]:
# Trial Set 2.0 (index 4): show the primary-equation coefficient estimates (bcoeff) with their tables (btables), labelled by line_nms[4].
# NOTE(review): positional arguments 1, 11 and the trailing 1 are passed straight to psd.cfs_dsp; their meaning is not visible here, confirm in psc_dbl_sumdisp_pres.
display(psd.cfs_dsp(bcoeff[4],btables[4],1,11,line_nms[4],1))

<a href='#index'>Index </a>,<a href='#sl17'> Next </a>,<a href='#sl15'>Back </a>
<br><br><br><br><br><br><br><br><br><br><br><br>

<a href='#index'> Back to Index </a>
<a id='trial_3'></a>

<a id = 'sl17'> </a>

<h2>Trial Set 3.0: Properties of Lasso Estimator, Increasing Number of Time Periods </h2> 

Here we examine the sampling distribution of $\hat{\beta}_1, \hat{\alpha}_{1}$.

* Number of Cross Sections: 10


* Number of Time Periods: 50,100,150


* Number of Endogenous Regressors: 1


* Number of Exogenous Regressors: 1


* Total Number of Instruments: 100


* Number of Instruments Relevant to Each Cross Section: 3 

<h3> Trial Set 3.0: Merged DGP and Estimator Function Input Dictionary Comparison </h3> 

In [30]:
# Trial Set 3.0 (index 5): render the merged estimator/DGP input dictionaries, one per results file.
# NOTE(review): the second argument (1) is a psd.indict_dsp option whose meaning is not visible here; confirm in psc_dbl_sumdisp_pres.
psd.indict_dsp(merged_dcts[5],1)

<a href='#index'>Index </a>,<a href='#sl18'> Next </a>,<a href='#sl16'>Back </a>
<br><br><br><br><br><br><br><br><br><br><br><br>

<h3> Trial Set 3.0: True Secondary Equation Coefficients Comparison </h3> 

Here I interactively display the coefficient vectors $\alpha_{1jd}$ used to generate each data set (rows indicate cross section and equation), ordered by the position of the corresponding file name in 'inpt_filenames[5]' above. Here they should also be identical across data sets. 

**Note:** 

1.) Since 'sec_pan = 1' above, the secondary equations are panel type, so all nonzero coefficients in a column should be identical. 

2.) A zero coefficient in the following matrix means that the instrument it multiplies is not relevant to that cross section. 



In [31]:
# Trial Set 3.0 (index 5): render the true secondary-equation coefficients read from each DGP summary file.
# NOTE(review): the second argument (2) is a psd.indict_dsp option whose meaning is not visible here; confirm in psc_dbl_sumdisp_pres.
psd.indict_dsp(true_acoeffs[5],2)

<br><br><br><br><br><br><br><br><br>

<h3> Trial Set 3.0: Secondary Function Coefficient Estimates </h3>

Here I interactively show the sampling distribution of the elements of $\hat{\alpha}_{dj}$.  

In [32]:
# Trial Set 3.0 (index 5): show the secondary-equation coefficient estimates (acoeff) with their tables (atables), labelled by line_nms[5].
# NOTE(review): positional arguments 2 and 7 are passed straight to psd.cfs_dsp; their meaning is not visible here, confirm in psc_dbl_sumdisp_pres.
display(psd.cfs_dsp(acoeff[5],atables[5],2,7,line_nms[5]))

<br><br><br><br><br><br><br><br><br><br><br>

<h3> Trial Set 3.0: True Primary Equations Coefficients Comparison </h3>

Here I interactively display the coefficient vector $\beta_1$ used to generate each data set, ordered by the position of the corresponding file name in 'inpt_filenames[5]' above. Here they should be identical. 

In [33]:
# Trial Set 3.0 (index 5): render the true primary-equation coefficients read from each DGP summary file.
# NOTE(review): the second argument (1) is a psd.indict_dsp option whose meaning is not visible here; confirm in psc_dbl_sumdisp_pres.
psd.indict_dsp(true_bcoeffs[5],1)

<a id = 'sl18'> </a>


<h3> Trial Set 3.0: Primary Function Coefficient Estimates </h3>

Here I show the sampling distribution of the elements of $\hat{\beta}_1$.  

In [34]:
# Trial Set 3.0 (index 5): show the primary-equation coefficient estimates (bcoeff) with their tables (btables), labelled by line_nms[5].
# NOTE(review): positional arguments 1, 9 and the trailing 1 are passed straight to psd.cfs_dsp; their meaning is not visible here, confirm in psc_dbl_sumdisp_pres.
display(psd.cfs_dsp(bcoeff[5],btables[5],1,9,line_nms[5],1))

<a href='#index'>Index </a>, <a href='#sl17'> Back </a>
<br><br><br><br><br><br><br><br><br><br><br><br>