In [1]:
## Preamble: Package Loading
import numpy as np
from sklearn import linear_model
from IPython.display import display
import matplotlib.pyplot as plt
from matplotlib import gridspec
import pandas as pd
import itertools as iter
import os
import datetime as dt
import json
import kernel as kr
import psc_sumdisp as psd 
# Preamble working directory retreival
wkng_folder = os.getcwd()

<h2> 2 Required Functions </h2>

<h3> 2.1 Merging and Differencing Function </h3>

In [2]:
def mrg_dff(dp_in,di_in,inpt):
    """
INPUTS 
  dp_in                  (df) panel dataframe 
  di_in                  (df) instruments dataframe
  inpt                   (dict) Containing the following elements
    inpt['tin']          (str) time index column name
    inpt['cin']          (str) cross section index column name
    inpt['ncs']          (int) number of cross sections
    inpt['inc_vec']       (lst) of list of binary vectors 
      inpt['inc_vec'][i]  (lst) of binary vectors listing which instruments in inc are relevant to ith crs
    
OUTPUTS
  out[0]             (df) dp_in and di_in merged on tin for each cin and 1st differenced
  out[1]             (df) dp_in and di_in merged on tin for each cin 
    """
    # Extracting variables from input dictionary
    tin = inpt['tin']
    cin = inpt['cin']
    ncs = inpt['ncs']
    inst_incl_vec = inpt['inc_vec']
    
    # Looping over each cross section
    for i in range(ncs):
        # Merging panel and instrument df on tin for ith cross section
        di_no_tin = di_in.drop(inpt['tin'],axis=1).copy()
        di_no_tin_relev = di_no_tin.values.dot(np.diag(inst_incl_vec[i]))
        tin_as_ndarray = di_in.loc[:,inpt['tin']].values.reshape(di_in.shape[0],1)
        di_relev = pd.DataFrame(np.hstack((tin_as_ndarray,di_no_tin_relev)),columns = di_in.columns)
        dpdi_relev = pd.merge(dp_in.loc[dp_in[cin] == i+1,:],di_relev,how = 'inner', on = tin)
        # Initializing the temp difference matrix 
        dpdi_relev_dff = dpdi_relev.loc[1:,[tin]+[cin]] 
        # Looping over all the column names in b1
        for nm in dpdi_relev.columns[2:].tolist():
            # 1st Differencing nm column of b1
            dpdi_relev_dff[''.join(['D',nm])] = (dpdi_relev.loc[:,nm].values 
                                                 - dpdi_relev.loc[:,nm].shift(1).values)[1:]
        if i == 0:
            # Initializing final matrix out
            dpdi_relev_all = dpdi_relev
            dpdi_relev_dff_all = dpdi_relev_dff
        elif i > 0:
            # Concatenating b2 to out
            dpdi_relev_all = pd.concat([dpdi_relev_all,dpdi_relev],axis =0)
            dpdi_relev_dff_all = pd.concat([dpdi_relev_dff_all,dpdi_relev_dff],axis =0)
            out = [dpdi_relev_dff_all, dpdi_relev_all]
    return out

<h3> 2.2 Cross Validation Partition Generation Function </h3>

In [3]:
def k_subs(k,ntp,ncs):
    """
INPUTS
 k       (int) number of partition memebers
 ntp     (int) number of time periods

OUTPUTS
 out                    (list) containing the following elements
   out[0]               (list of tuples) index numbers for instrument partitions
     out[0][i]          (tuples of list) ith instrument partition
         out[0][i][0]   (list) of row numbers in the ith part. training set for instruments
         out[0][i][1]   (list) of row numbers in the ith part. test set for instruments
   out[1]               (list of tuples) index numbers for panel data partitions
     out[1][i]          (tuples of list) ith panel data partition
         out[1][i][0]   (list) of row numbers in the ith part. training set for panel data
         out[1][i][1]   (list) of row numbers in the ith part. test set for panel data
    """
    # List of row index numbers for each cross section
    a0 = [i for i in range(ntp)]
    # Length of the first k-1 partition list
    n_sl = int(np.floor(ntp)/k)
    # Generating the k-1 partitions as a lists of disjoint exhaustive lists 
    k_grp = [[a0.pop(np.random.randint(1,ntp-i-j*n_sl)) for i in range(n_sl)] 
                                                                for j in range(k-1)]
    # Adding in the last partition
    k_grp.append(a0)
    # Inintializing list of train / test tuples 
    in_indx = []
    pan_indx = []
    for i in range(k):
        # Initializing the ith train list
        a2 = []
        # Initializing the list of partitions whose union is a training set
        a3 = list(range(k))
        # Removing the test set index
        del a3[i]
        for j in a3:
            # Taking the union of all training set partition lists
            a2 = sorted(a2 + k_grp[j])
        # Creating the ith (training,test) tuple
        a4 = (a2,sorted(k_grp[i]))
        # Appending to full list
        in_indx.append(a4)
        a5 = a2
        a6 = sorted(k_grp[i])
        for j in range(1,ncs):
            a5 = a5 + (np.array(a2)+j*ntp).tolist()
            a6 = a6 + (np.array(sorted(k_grp[i]))+j*ntp).tolist()
        a7 = (a5,a6)
        pan_indx.append(a7)
        
    return([in_indx,pan_indx])    

<h3> 2.3 Panel Lasso Estimator </h3>

In [4]:
def pan_lasso(l_dp,l_inpt):
    """
INPUTS
l_dp                 (df) all panel regression variables 
l_inpt               (dict) Containing the following elements
  l_inpt['dep']        (str) name of dependent variable
  l_inpt['ex_nm']      (str) name of all exogenous variables
  l_inpt['ins_nm']     (str) name of all instruments
  l_inpt['alph']       (flt) lasso penalty parameter
  l_inpt['ncs']        (int) number of cross sections
  l_inpt['epsil']      (flt) threshold for averaging "non zero" coefficients
  
OUTPUTS
out            (lst) with the following elements
 out[0]        (lst) Estimated coefficients
 out[1]        (lst) containing the following elements
   out[1][j]   (lst) binary vector indicating the non zero ceofficients for each crs
    """
    
# Extracting variables from input dictionary
    l_dep_nm = l_inpt['dep']
    l_ex_nm = l_inpt['ex_nm']
    l_ins_nm = l_inpt['ins_nm']
    l_alph = l_inpt['alph']
    l_epsil = l_inpt['epsil']
    n_exo = len(l_ex_nm)
    ncs = l_inpt['ncs']
    t_inst = len(l_ins_nm)
    # Initializing the coefficient list
    excf1 = []

    # Estimating the coefficients on the exogenous regressors
    for k in range(inpt['ncs']):
        # Extracting the kth cross sections data 
        dsl = l_dp.loc[l_dp['crs']==k+1,:]
        #Initializing the regression model
        lin_reg = linear_model.LinearRegression()
        #Fitting the regression model
        lin_reg.fit(dsl.loc[:,l_ex_nm+l_ins_nm].values,
                    dsl.loc[:,l_dep_nm].values.reshape(dsl.shape[0],1))
        # Appending the est coeff for exogenous regressors to coeff list
        excf1.append([lin_reg.coef_[0][i] for i in range(n_exo)])

    # Averaging coefficient values over cross sections  
    excf = [np.mean([excf1[i][j] for i in range(len(excf1))]) for j in range(n_exo)]
    # Generating the name of the modified dependent variable
    adep_nm = ''.join(['a',l_dep_nm])
    # Generating the modified dependent variable
    l_dp[adep_nm] = (l_dp.loc[:,l_dep_nm].values.reshape(l_dp.shape[0],1)
                     -l_dp.loc[:,l_ex_nm].values.dot(np.array(excf).reshape(n_exo,1)))

    # Initialzing the list of estimated lasso coefficients                 
    ain_cf1 = []
    for k in range(ncs): 
        # Extracting modified dep and indep regressor for crs k 
        ds2 = l_dp.loc[l_dp['crs']==k+1,:]
        # Initializing lasso regression model
        lasso_reg = linear_model.Lasso(alpha = l_inpt['alph'])
        # Fitting the regression model
        lasso_reg.fit(ds2.loc[:,l_ins_nm].values,
                      ds2.loc[:,adep_nm].values.reshape(ds2.shape[0],1))
        # Appending lasso coefficient to list
        ain_cf1.append(list(lasso_reg.coef_))

    # Initializing set of estiamated coeff with those one the exogenous variables
    ain_cf = excf
    # generating the final averged values of each coefficient
    for j in range(len(ain_cf1[0])): 
        # Collecting all coeff estimated on jth inst greater than threshold l_epsil
        a = [ain_cf1[i][j] for i in range(len(ain_cf1)) if np.abs(ain_cf1[i][j]) > l_epsil]
        if not a:
            # If a is an empty list inst not selected in any crs so append a zero
            ain_cf.append(0)
        else:
            # Averging if a is non empty
            ain_cf.append(np.mean(a))    

    # Generating a list of lists where ain_cf_rm[j][i]=1 if the |coeff| on the ith inst
    # for the jth cross section is greater than l_epsil
    ain_cf_rm = [[1,1] + [int(np.abs(ain_cf1[j][i])>l_epsil) 
                            for i in range(len(ain_cf1[0]))] 
                            for j in range(len(ain_cf1))]
    # Function output
    out = [ain_cf, ain_cf_rm]
    return out

<h3> 2.4 Crossvalidated Panel Lasso Estimator </h3>

In [5]:
def pan_lasso_cv(data_pan_mr,data_pan_mrdf,inpt_ls):
    
    """
INPUTS
  data_pan_mr            (df) panel where crs regs are merged with insts
  data_pan_mrdf          (df) panel where crs regs are merged with insts and 1st differenced
  inpt_ls                (dict) of the following elements
    inpt_ls['cv']        (int) indicator for whether cross validation is done
    inpt_ls['dep']       (str) names of differenced dependent variable in lasso reg
    inpt_ls['odep']      (str) names of non differenced dependent variable
    inpt_ls['ex_nm']     (lst) names of differenced exogenous variables
    inpt_ls['oex_nm']    (lst) names of non differenced exogenous variables
    inpt_ls['ins_nm']    (lst) names of differenced instruments
    inpt_ls['oins_nm']   (lst) names of non difference instruments
    inpt_ls['alph']      (flt) initial alpha tuning parameter
    inpt_ls['epsil']     (flt) threshold for averaging "non zero" coefficients
    inpt_ls['cin']       (str) name of crs section variable
    inpt_ls['tin']       (str) name of time variable
    inpt_ls['n_alphs']   (int) number of different alphas tried
    inpt_ls['n_parts']   (int) number of partition elements 
    inpt_ls['ntp']       (int) number of time periods
    inpt_ls['ncs']       (int) number of cross sections

OUPUTS
    out             (lst) with the following elements
      out[0]        (lst) Estimated coefficients
      out[1]        (lst) containing the following elements
        out[1][j]   (lst) binary vector indicating the non zero ceofficients for each crs
      out[2]        (lst) list of mean meansquared errors 
      out[3]        (lst) cross validated alpha tuning parameter
    """
                    
    #Extracting info from dictionary
    n_alphs = inpt_ls['n_alphs']
    oins_nm = inpt_ls['oins_nm']
    oex_nm = inpt_ls['oex_nm']
    odep = inpt_ls['odep']
    n_parts = inpt_ls['n_parts']
    n_alphs = inpt_ls['n_alphs'] 
    cin = inpt_ls['cin']
    tin = inpt_ls['tin']
    ncs = inpt_ls['ncs']
    ntp = inpt_ls['ntp'] 
    cv = inpt_ls['cv']

    if cv == 0:
        # Estimating lasso regression with orginal input alpha
        ls_soln = pan_lasso(data_pan_mrdf,inpt_ls)
        out = ls_soln + ['none'] + ['none']

    elif cv == 1:  
        # Generating partitions
        indx = k_subs(n_parts,ntp-1,ncs)

        # Initializing carrier lists
        all_msr = []
        trl_alph = []

        # For each value of lasso tuning parameter
        for k in range(5,n_alphs+6):
            # Initializing mean squared error list for each partition
            msr = []
            # Setting the value of the lasso tuning parameter
            inpt_ls['alph'] = k/n_alphs
            # Appending the current lasso tuning parameter
            trl_alph.append(k/n_alphs)
            # For each of the training testing set combinations
            for part in range(n_parts):
                # Merged and differenced training set
                trn_mrdf = data_pan_mrdf.iloc[indx[1][part][0],:].copy()
                # Merged and differenced testing set
                tst_mrdf = data_pan_mrdf.iloc[indx[1][part][1],:].copy()
                # Merged only testing set
                tst_mr = data_pan_mr.iloc[indx[1][part][1],:].copy()
                # lasso estimation on training data set
                lcf = pan_lasso(trn_mrdf,inpt_ls)
                # Initializng list of allresiduals for each cross section 
                all_res = []
                # for each cross section
                for i in range(ncs):
                    # Extracting the un difference test regressor matrix for crs i
                    c1 = tst_mr.loc[tst_mr[cin]==i+1, oex_nm + oins_nm ].values
                    # Creating the diagonal selection matrix for crs i
                    c2 = np.diag(lcf[1][i])
                    # Reshaping Estimated lasso coefficient matrix
                    c3 = np.array(lcf[0]).reshape(len(oex_nm + oins_nm),1)
                    # Computing partial estimated values (no constant term included)
                    c4 = c1.dot(c2).dot(c3)
                    # Computing non centered residuals (no constant term included)
                    res1 = tst_mr.loc[tst_mr[cin]==i+1,odep].values.reshape(len(c4),1) - c4
                    # Computing centered squared residuals
                    cres = (res1 - np.mean(res1))**2
                    # Concatenating sqr residuals of ith cross section to all_res
                    all_res = all_res + cres.tolist()
                # Appending the mean sqrd residuals for the npart th trn and test set 
                msr.append(np.mean(all_res))
            # Appending the mean of mean sqrd residuals for all partitions   
            all_msr.append(np.mean(msr))

        # Index of smallest mean msr values
        ax = all_msr.index(min(all_msr))
        # alpha value that produces smallest mean_msr
        cv_alph = trl_alph[ax]

        # Running Lasso with cv alpha
        inpt_ls['alpha'] = cv_alph
        # Estimating lasso regression with cros validated alpha
        ls_soln = pan_lasso(data_pan_mrdf,inpt_ls)
        out = ls_soln
        out.append(all_msr)
        out.append(cv_alph)

    return out

<h3> 2.5 Double Panel Estimator </h3> 

In [54]:
def panel_dbl_est(dp_in,di_in,inpt_p):
    
    """
INPUTS
dp_in                              (df) panel variables
di_in                              (df) of instrments
inpt_p                             (dict) composed of the following elements
  inpt_p['dep']                    (str) name of dependent variable 
  inpt_p['reg']                    (str) names of exogenous regressors
  inpt_p['in_nm']                  (lst) of lists of 
    inpt_p['in_nm'][i]               (lst) of names of inst relevant to ith crs
  inpt_p['n_alphs']                (int) number of alphas to try in cv routine
  inpt_p['n_parts']                (int) number of cv partition elements
  inpt_p['cin']                    (str) cross section index name
  inpt_p['tin']                    (str) time index name
  inpt_p['ncs']                    (int) number of cross sections
  inpt_p['ntp']                    (int) number of time periods
  inpt_p['cv']                     (int) indicator for cross validated lasso parameter
  inpt_p['lasso']                  (int) indicator for lasso estimation
  inpt_p['alph']                   (flt) initial lasso tuning parameter 
  inpt_p['epsil']                  (flt) threshold for averaging "non zero" coefficients
  inpt_p['inst_partition']         (lst) of the following
    inpt_p['inst_partition'][0]      (list) of row numbers in the training set for instruments
    inpt_p['inst_partition'][1]      (list) of row numbers in the testing set for instruments
  inpt_p['panel_partition']        (lst) of the following
    inpt_p['panel_partition'][0]     (list) of row numbers in the training set for panel data
    inpt_p['panel_partition'][1]     (list) of row numbers in the testing set for panel data
  
OUTPUTS
out                     (lst) of the following elemements
  out[0]                (lst) of coefficients estimated with training set
  out[1]                (lst) of the following
    out[1][i]           (lst) vector indc the regs and insts relevant to ith cross section 
  out[2]                (lst) of the following
    out[2][i]           (lst) of training set residuals for ith cross section
  out[3]                (lst) of the following
    out[3][i]           (lst) of testing set residuals for ith cross section
  out[4]                (flt) final cross validated tuning parameter from training set
    """

    # Extracting data from input dict
    dep = inpt_p['dep']
    reg = inpt_p['reg']
    in_nm = inpt_p['in_nm']
    cin = inpt_p['cin']
    tin = inpt_p['tin']
    ncs = inpt_p['ncs']
    lasso = inpt_p['lasso']
    n_parts = inpt_p['n_parts']
    n_alphs = inpt_p['n_alphs']
    inst_trn_part = inpt_p['inst_partition'][0]
    inst_tst_part = inpt_p['inst_partition'][1]
    panel_trn_part = inpt_p['panel_partition'][0]
    panel_tst_part = inpt_p['panel_partition'][1]
    trn_ntp = len(inst_trn_part)
    tst_ntp = len(inst_tst_part)

    # Input data
    di  = di_in
    dp  = dp_in
    # Training Sets
    dp_trn = dp.loc[panel_trn_part,:]
    di_trn = di.loc[inst_trn_part,:]
    # Testing Sets
    dp_tst = dp.loc[panel_tst_part,:]
    di_tst = di.loc[inst_tst_part,:]

    # Initializing the set of all included instruments as the 1st set relevant inst
    inst_incl = set(in_nm[0])
    # Collecting rest of relevant instruments
    for i in range(len(in_nm)):
        # Union of inc and ith set of relevant instruments
        inst_incl = inst_incl|set(in_nm[i])
    # All included Instruments listed in order index order
    inst_incl = [''.join(['W',str(i)]) for i in range(1,di.shape[1])
                                       if ''.join(['W',str(i)]) in inst_incl]

    # df with time index and all included instruments
    di_incl = di.loc[:,[tin] + inst_incl]
    di_trn_incl = di_trn.loc[:,[tin] + inst_incl]
    di_tst_incl = di_tst.loc[:,[tin] + inst_incl]

    # List of logical vectors (as list) of which instruments in inc are relevant to ith crs
    inst_incl_vec = [[ 1 if inst_incl[i] in in_nm[j] else 0 
                                         for i in range(len(inst_incl))]
                                         for j in range(ncs)]

    # Merging and differencing input dictionary
    inpt_m = {'tin' : tin , 'cin': cin , 'ncs': ncs, 'inc_vec': inst_incl_vec}

    # Merging and differencing function call
    trn_mrg_dff = mrg_dff(dp_trn,di_trn_incl,inpt_m)
    tst_mrg_dff = mrg_dff(dp_tst,di_tst_incl,inpt_m)

    # Merged and differenced panel and instrument df
    dpdi_trn_mrg_dff = trn_mrg_dff[0]
    dpdi_tst_mrg_dff = tst_mrg_dff[0]

    # Merged panel and instrument df
    dpdi_trn_mrg = trn_mrg_dff[1]
    dpdi_tst_mrg = tst_mrg_dff[1]

    # Panel regression variable names (appending a 'D')
    exog_regr_dnames = [''.join(['D',reg[i]]) for i in range(len(reg))]
    inst_incl_dnames = [''.join(['D',inst_incl[i]]) for i in range(len(inst_incl))]
    dep_var_dname = ''.join(['D',dep])

    # List of all relev + irrelev instrument names
    all_inst_names = di_in.drop([tin],axis=1).columns.tolist()
    # All LHS non panel variable names
    all_reg_inst_names =  reg + all_inst_names

    if lasso == 0:
        # Initializing and fitting trainging data OLS regression
        trn_ols_reg = linear_model.LinearRegression()
        # Fitting an OLS regression to the training data
        trn_ols_reg.fit(dpdi_trn_mrg_dff.loc[:,exog_regr_dnames + inst_incl_dnames],
                    dpdi_trn_mrg_dff.loc[:,dep_var_dname])
        # Extracting estimated regression coefficients
        trn_ols_out = trn_ols_reg.coef_.tolist()
        # Initializing est ceofficient vector with coeffs on exogenous (reg)  variables
        trn_est_coeff = trn_ols_out[:len(reg)]
        for i in range(len(all_inst_names)):
            if inst_incl.count(all_inst_names[i]) > 0:
                # If the ith inst in di is in inst_incl append the estimate to est_coeff
                trn_est_coeff.append(trn_ols_out[inst_incl.index(all_inst_names[i])+len(reg)])
            elif inst_incl.count(all_inst_names[i])==0:
                # If the ith inst in di is not in inst_incl append a zero
                trn_est_coeff.append(0)  
        # List of lists of indicator of relevant ex and inst to each cross section      
        trn_relev_regr_vec = [ [1]*len(reg) + [int(in_nm[i].count(all_inst_names[j]) > 0) 
                                  for j in range(len(all_inst_names))] 
                                  for i in range(len(in_nm))]
        # Setting cv param to null
        trn_cv_param = 'NA'

    elif lasso == 1:
        # Input dictionary for Lasso Estimation
        inpt_ls_cv = {'dep': dep_var_dname,
                  'odep': dep ,
                  'ex_nm': exog_regr_dnames ,
                  'oex_nm': reg,
                  'ins_nm': inst_incl_dnames,
                  'oins_nm': inst_incl, 
                  'alph': inpt_p['alph'],'epsil':inpt_p['epsil'],'cin': cin,
                  'tin' : tin, 'n_alphs': inpt_p['n_alphs'], 'n_parts': inpt_p['n_parts'],
                  'ntp': trn_ntp ,'ncs':ncs, 'cv': inpt_p['cv']
                     }
        # Estimation by panel lasso
        trn_ls_cv_out = pan_lasso_cv(dpdi_trn_mrg,dpdi_trn_mrg_dff,inpt_ls_cv)
        # Extracting estimated coefficients
        trn_est_coeff = trn_ls_cv_out[0]
        # Relevant regressor matrix (non zero est_coeff)
        trn_relev_regr_vec = trn_ls_cv_out[1]
        # Cross validated parameter value
        trn_cv_param = trn_ls_cv_out[3]

    # Constucting a panel df of estimated errors Vj,i
    for i in range(1,ncs+1):
        # np.array of dep variable values for ith crs
        trn_dep_vals = dpdi_trn_mrg.loc[dpdi_trn_mrg[cin] == i,[dep]].values
        tst_dep_vals = dpdi_tst_mrg.loc[dpdi_tst_mrg[cin] == i,[dep]].values
        # Regressor values for ith crs
        trn_regr_vals = dpdi_trn_mrg.loc[dpdi_trn_mrg[cin] == i,all_reg_inst_names].values
        tst_regr_vals = dpdi_tst_mrg.loc[dpdi_tst_mrg[cin] == i,all_reg_inst_names].values
        # Relevant Regressor values for ith crs
        trn_relev_regr_vals = trn_regr_vals.dot(np.diag(trn_relev_regr_vec[i-1]))
        tst_relev_regr_vals = tst_regr_vals.dot(np.diag(trn_relev_regr_vec[i-1]))
        # Non Centered residuals
        trn_non_center_resids = trn_dep_vals - trn_relev_regr_vals.dot(
                                        np.array(trn_est_coeff).reshape(len(trn_est_coeff),1))
        tst_non_center_resids = tst_dep_vals - tst_relev_regr_vals.dot(
                                        np.array(trn_est_coeff).reshape(len(trn_est_coeff),1))
        # Centered residuals
        trn_center_resids = trn_non_center_resids - np.mean(trn_non_center_resids)
        # NOTE here I am centering on the mean of the training residuals
        tst_center_resids = tst_non_center_resids - np.mean(trn_non_center_resids)
        if i == 1:
            # if i = 1 initialize panel df
            trn_center_resids_full = [list(trn_center_resids.T[0])]
            tst_center_resids_full = [list(tst_center_resids.T[0])]
        elif i > 1:
            # if i > 1 add onto p_res
            trn_center_resids_full.append(list(trn_center_resids.T[0]))
            tst_center_resids_full.append(list(tst_center_resids.T[0]))

    # Function output
    out = [trn_est_coeff,trn_relev_regr_vec,trn_center_resids_full,tst_center_resids_full,trn_cv_param] 
    
    return out

<h3> 3.1 Data Loading </h3>

In [6]:
input_filename = 'pscdata_7_17_1126.json'
data_file = '/Users/ericpenner/Google_Drive/Research/pan_sel_cntrl/data'
input_file_full = ''.join([data_file,'/',input_filename])
with open(input_file_full) as f_obj: 
    pscdata = json.load(f_obj)
inpt = pscdata[0][0].copy()    

<h3> 3.2 Input Dictionary Setup </h3>

In [49]:
# Indicator for  whether in this run the subset of instrument relvant to each crs is known.
inpt['kwnsub'] = 0

# Indicator for whether residuals are observed
inpt['orcl'] = 0
inpt['lasso'] = 0
inpt['dep'] = inpt['en_nm'][0]
inpt['reg'] = inpt['ex_nm']
inpt['alph'] = 0.4
inpt['epsil'] = 0.09
inpt['cv'] = 0
inpt['n_parts'] = 4
inpt['n_alphs'] = 20

# List of list with the names of the relevant instruments for each crossection
in_nm=[]
for i in range(inpt['ncs']):
    # If the subset is known then list of relevant inst. for each crs is supplied to estimator
    if inpt['kwnsub'] == 1:
        a=[ True if pscdata[0][1]['coeff'][0][i][k]!=0 else False 
            for k in range(inpt['n_exo'],inpt['n_exo']+inpt['t_inst'])]
        in_nm.append(np.array(pscdata[0][1]['Dins_nms'][1:])[a].tolist())
    # If the subset is unknown then list of all inst. will be supplied to est. for each crs
    else:
        in_nm.append(pscdata[0][1]['Dins_nms'][1:])
    
inpt['in_nm'] = in_nm
inpt_l = inpt.copy()
inpt_l['lasso'] = 1

<h3> 3.3 Data Set Extraction and Function Call </h3>

In [50]:
k=100
data_err = pd.DataFrame(pscdata[k][0]['err_df'], columns = pscdata[0][1]['Derr_nms']) 
data_inst = pd.DataFrame(pscdata[k][0]['inst_df'], columns = pscdata[0][1]['Dins_nms'])
data_pan = pd.DataFrame(pscdata[k][0]['prim_df'], columns = pscdata[0][1]['Dlng_nms'])

k_parts = 4
k_partitions = k_subs(k_parts,inpt['ntp'],inpt['ncs'])
current_partition = 0
inpt['inst_partition'] = k_partitions[0][current_partition]
inpt['panel_partition'] = k_partitions[1][current_partition]

In [51]:
k_parts = 4
k_partitions = k_subs(k_parts,inpt['ntp'],inpt['ncs'])
current_partition = 0

In [52]:
inpt_pan = {}
inpt_pan['dep'] = 'Z1,1'
inpt_pan['reg'] = inpt['ex_nm']
inpt_pan['in_nm'] = in_nm
inpt_pan['n_alphs'] = inpt['n_alphs']
inpt_pan['n_parts'] = inpt['n_parts']
inpt_pan['cin'] = inpt['cin']
inpt_pan['tin'] = inpt['tin']
inpt_pan['ncs'] = inpt['ncs']
inpt_pan['ntp'] = inpt['ntp']
inpt_pan['cv'] = inpt['cv']
inpt_pan['lasso'] = inpt['lasso']
inpt_pan['alph'] = inpt['alph']
inpt_pan['epsil'] = inpt['epsil']
inpt_pan['inst_partition'] = k_partitions[0][current_partition]
inpt_pan['panel_partition'] = k_partitions[1][current_partition]

In [55]:
?panel_dbl_est
panel_dbl_est(data_pan,data_inst,inpt_pan)[0]

[-0.7684241640719013,
 -1.1362501504368219,
 -0.3860717582666008,
 -0.2539792895094912,
 -0.04035185813221634,
 0.17847681497280674,
 -0.4398429567085668,
 0.02337682924411728,
 0.05979953308235618,
 -0.14796460461081412,
 0.02849951545189444,
 0.29923405688312976,
 -0.2952675301173442,
 -0.18435987718884073,
 0.4019040870650189,
 -0.28431817125718617,
 0.4213078365176789,
 0.35987828905427444,
 -0.3581731451946868,
 -0.2560852416328804,
 -0.36097783007588546,
 0.1484090848850677]

In [39]:
inpt['cv']

1

In [None]:
inpt_p_lasso_cv = inpt_p.copy()

inpt_p_lasso = inpt_p.copy()
inpt_p_lasso['cv'] = 0

c_lasso_cv = panel_est(data_pan,data_inst,inpt_p_lasso_cv)
c_lasso = panel_est(data_pan,data_inst,inpt_p_lasso)
c_lasso_cv[3]
i=7
h_c = 1.5
ker = 9

v_lass_cv = np.array(c_lasso_cv[0][i])
p_lass_cv = np.linspace(np.min(v_lass_cv),np.max(v_lass_cv),100)
h_lass_cv = h_c*v_lass_cv.shape[0]**(-1/5)*np.std(v_lass_cv)
v_lass_cv_den = kr.mvden(v_lass_cv,p_lass_cv,h_lass_cv,ker)

v_lass = np.array(c_lasso[0][i])
p_lass = np.linspace(np.min(v_lass),np.max(v_lass),100)
h_lass = h_c*v_lass.shape[0]**(-1/5)*np.std(v_lass)
v_lass_den = kr.mvden(v_lass,p_lass,h_lass,ker)

v_true = data_err.loc[data_err['crs']==i+1,'V1'].values
p_true = np.linspace(np.min(v_true),np.max(v_true),100)
h_true = h_c*v_true.shape[0]**(-1/5)*np.std(v_true)
v_true_den = kr.mvden(v_true,p_true,h_true,ker) 

x_lm = [-5,5]
y_lm = 1

f,ax = plt.subplots()
f.set_figheight(7)
f.set_figwidth(15)
ax.set_xlim((x_lm[0],x_lm[1]))
ax.set_ylim((0,y_lm))
ax.plot(p_lass_cv,v_lass_cv_den)
ax.plot(p_lass,v_lass_den)
ax.plot(p_true,v_true_den)
ax.legend(["Lasso CV","Lasso","True"])
ax.grid(which = 'both')
#ax.set_title(''.join(['Distribution of Estimated ',coeff[0].columns[w]]))
plt.show()

In [None]:
c_lasso_cv[3]

In [None]:
inpt_ls = {}
inpt_ls['dep'] = 'DZ1,1'
inpt_ls['odep'] = 'Z1,1'
inpt_ls['ex_nm'] = [''.join(['DZ2,',str(i)]) for i in range(1,inpt['n_exo']+1)]
inpt_ls['oex_nm'] = [''.join(['Z2,',str(i)]) for i in range(1,inpt['n_exo']+1)]
inpt_ls['ins_nm'] = [''.join(['DW',str(i)]) for i in range(1,inpt['t_inst']+1)]
inpt_ls['oins_nm'] = [''.join(['W',str(i)]) for i in range(1,inpt['t_inst']+1)]
inpt_ls['alph'] = 0.45
inpt_ls['epsil'] = 0.1
inpt_ls['cin'] = inpt['cin']
inpt_ls['tin'] = inpt['tin']
inpt_ls['n_alphs'] = 20
inpt_ls['n_parts'] = 4
inpt_ls['ntp'] = inpt['ntp']
inpt_ls['ncs'] = inpt['ncs']
inpt_ls['cv'] = 0