In [13]:
## Preamble: Package Loading
import numpy as np
from sklearn import linear_model
from IPython.display import display
import matplotlib.pyplot as plt
from matplotlib import gridspec
import pandas as pd
import itertools as iter
import os
import datetime as dt
import json
import kernel as kr
import psc_sumdisp as psd 
# Preamble working directory retreival
wkng_folder = os.getcwd()

<h2> 2 Required Functions </h2>

<h3> 2.1 Merging and Differencing Function </h3>

In [100]:
def mrg_dff(dp_in,di_in,inpt):
    """
INPUTS 
  dp_in                  (df) panel dataframe 
  di_in                  (df) instruments dataframe
  inpt                   (dict) Containing the following elements
    inpt['tin']          (str) time index column name
    inpt['cin']          (str) cross section index column name
    inpt['ncs']          (int) number of cross sections
    inpt['inc_vec']       (lst) of list of binary vectors 
      inpt['inc_vec'][i]  (lst) of binary vectors listing which instruments in inc are relevant to ith crs
    
OUTPUTS
  out[0]             (df) dp_in and di_in merged on tin for each cin and 1st differenced
  out[1]             (df) dp_in and di_in merged on tin for each cin 
    """
    # Extracting variables from input dictionary
    tin = inpt['tin']
    cin = inpt['cin']
    ncs = inpt['ncs']
    inst_incl_vec = inpt['inc_vec']
    
    # Looping over each cross section
    for i in range(ncs):
        # Merging panel and instrument df on tin for ith cross section
        inst_no_tin = di_in.drop(inpt['tin'],axis=1).copy()
        inc_vec_for_i = inst_incl_vec[i]
        inst_only_relev_for_i = pd.DataFrame(inst_no_tin.values.dot(np.diag(inc_vec_for_i)))
        inst_only_relev_for_i.columns = inst_no_tin.columns 
        inst_only_relev_for_i = pd.concat([data_inst.loc[:,inpt['tin']] ,
                                           inst_only_relev_for_i] , axis=1)
        b1 = pd.merge(dp_in.loc[dp_in[cin]== i+1,:],inst_only_relev_for_i,how = 'inner', on = tin)
        # Initializing the temp difference matrix 
        b2 = b1.loc[1:,[tin]+[cin]] 
        # Looping over all the column names in b1
        for nm in b1.columns[2:].tolist():
            # 1st Differencing nm column of b1
            b2[''.join(['D',nm])] = (b1.loc[:,nm].values - b1.loc[:,nm].shift(1).values)[1:]
        if i == 0:
            # Initializing final matrix out
            out1 = b2
            out2 = b1
        elif i > 0:
            # Concatenating b2 to out
            out1 = pd.concat([out1,b2],axis =0)
            out2 = pd.concat([out2,b1],axis =0)
            out = [out1 , out2]
    return out

<h3> 2.2 Cross Validation Partition Generation Function </h3>

In [16]:
def k_subs(k,ntp,ncs):
    """
INPUTS
 k       (int) number of partition memebers
 ntp     (int) number of time periods

OUTPUTS
 out                    (list) containing the following elements
   out[0]               (list of tuples) index numbers for instrument partitions
     out[0][i]          (tuples of list) ith instrument partition
         out[0][i][0]   (list) of row numbers in the ith part. training set for instruments
         out[0][i][1]   (list) of row numbers in the ith part. test set for instruments
   out[1]               (list of tuples) index numbers for panel data partitions
     out[1][i]          (tuples of list) ith panel data partition
         out[1][i][0]   (list) of row numbers in the ith part. training set for panel data
         out[1][i][1]   (list) of row numbers in the ith part. test set for panel data
    """
    # List of row index numbers for each cross section
    a0 = [i for i in range(ntp)]
    # Length of the first k-1 partition list
    n_sl = int(np.floor(ntp)/k)
    # Generating the k-1 partitions as a lists of disjoint exhaustive lists 
    k_grp = [[a0.pop(np.random.randint(1,ntp-i-j*n_sl)) for i in range(n_sl)] 
                                                                for j in range(k-1)]
    # Adding in the last partition
    k_grp.append(a0)
    # Inintializing list of train / test tuples 
    in_indx = []
    pan_indx = []
    for i in range(k):
        # Initializing the ith train list
        a2 = []
        # Initializing the list of partitions whose union is a training set
        a3 = list(range(k))
        # Removing the test set index
        del a3[i]
        for j in a3:
            # Taking the union of all training set partition lists
            a2 = sorted(a2 + k_grp[j])
        # Creating the ith (training,test) tuple
        a4 = (a2,sorted(k_grp[i]))
        # Appending to full list
        in_indx.append(a4)
        a5 = a2
        a6 = sorted(k_grp[i])
        for j in range(1,ncs):
            a5 = a5 + (np.array(a2)+j*ntp).tolist()
            a6 = a6 + (np.array(sorted(k_grp[i]))+j*ntp).tolist()
        a7 = (a5,a6)
        pan_indx.append(a7)
        
    return([in_indx,pan_indx])    

<h3> 2.3 OLS function </h3>

In [17]:
def ols(df,inpt):
    """
INPUTS
df                (pandas df) Data Frame with all regressors
inpt              (dict) Dictionary with the following
  inpt['dep']     (string) Name of dependent variable contained in df
  inpt['reg']     (list of strings) names of regressors in df 
  inpt['cons']    (0,1) Indicator for whether a constant should be included

OUTPUTS 
out               (list of lists) List of the following
  out[0]          (list) Estimated coefficients
  out[1]          (list) Residuals
  out[2]          (list) Estimated conditional expectation
    """
    # Extracting input variables
    dep = inpt['dep']
    reg = inpt['reg']
    cons = inpt['cons']
    # Determining length of df (number of obs)
    n = df.shape[0]
    # Extracting Dependent Variable from df
    Y = df.loc[:,dep].values.reshape(n,1)
    # Extracting Regressors from df
    if len(reg) == 1:
        X = df.loc[:,reg].values.reshape(n,1)
    elif len(reg) > 1: 
        X = df.loc[:,reg].values
    # Adding column of ones if a constant is included
    if cons == 1: 
        X = np.hstack((np.ones((n,1)),X))
    # Estimated regression coefficients
    alpha = np.linalg.inv(X.T.dot(X)).dot(X.T.dot(Y))
    # Estimated Conditional Expectation
    Yhat = X.dot(alpha)
    # Residuals of the regression
    res = Y - X.dot(alpha)
    # Constructing output list of lists
    out = [alpha.T.tolist()[0],res.T.tolist()[0],Yhat.T.tolist()[0]]
    return out

<h3> 2.4 Panel Lasso Estimator </h3>

In [18]:
def pan_lasso(l_dp,l_inpt):
    """
INPUTS
l_dp                 (df) all panel regression variables 
l_inpt               (dict) Containing the following elements
  l_inpt['dep']        (str) name of dependent variable
  l_inpt['ex_nm']      (str) name of all exogenous variables
  l_inpt['ins_nm']     (str) name of all instruments
  l_inpt['alph']       (flt) lasso penalty parameter
  l_inpt['ncs']        (int) number of cross sections
  l_inpt['epsil']      (flt) threshold for averaging "non zero" coefficients
  
OUTPUTS
out            (lst) with the following elements
 out[0]        (lst) Estimated coefficients
 out[1]        (lst) containing the following elements
   out[1][j]   (lst) binary vector indicating the non zero ceofficients for each crs
    """
    
# Extracting variables from input dictionary
    l_dep_nm = l_inpt['dep']
    l_ex_nm = l_inpt['ex_nm']
    l_ins_nm = l_inpt['ins_nm']
    l_alph = l_inpt['alph']
    l_epsil = l_inpt['epsil']
    n_exo = len(l_ex_nm)
    ncs = l_inpt['ncs']
    t_inst = len(l_ins_nm)
    # Initializing the coefficient list
    excf1 = []

    # Estimating the coefficients on the exogenous regressors
    for k in range(inpt['ncs']):
        # Extracting the kth cross sections data 
        dsl = l_dp.loc[l_dp['crs']==k+1,:]
        #Initializing the regression model
        lin_reg = linear_model.LinearRegression()
        #Fitting the regression model
        lin_reg.fit(dsl.loc[:,l_ex_nm+l_ins_nm].values,
                    dsl.loc[:,l_dep_nm].values.reshape(dsl.shape[0],1))
        # Appending the est coeff for exogenous regressors to coeff list
        excf1.append([lin_reg.coef_[0][i] for i in range(n_exo)])

    # Averaging coefficient values over cross sections  
    excf = [np.mean([excf1[i][j] for i in range(len(excf1))]) for j in range(n_exo)]
    # Generating the name of the modified dependent variable
    adep_nm = ''.join(['a',l_dep_nm])
    # Generating the modified dependent variable
    l_dp[adep_nm] = (l_dp.loc[:,l_dep_nm].values.reshape(l_dp.shape[0],1)
                     -l_dp.loc[:,l_ex_nm].values.dot(np.array(excf).reshape(n_exo,1)))

    # Initialzing the list of estimated lasso coefficients                 
    ain_cf1 = []
    for k in range(ncs): 
        # Extracting modified dep and indep regressor for crs k 
        ds2 = l_dp.loc[l_dp['crs']==k+1,:]
        # Initializing lasso regression model
        lasso_reg = linear_model.Lasso(alpha = l_inpt['alph'])
        # Fitting the regression model
        lasso_reg.fit(ds2.loc[:,l_ins_nm].values,
                      ds2.loc[:,adep_nm].values.reshape(ds2.shape[0],1))
        # Appending lasso coefficient to list
        ain_cf1.append(list(lasso_reg.coef_))

    # Initializing set of estiamated coeff with those one the exogenous variables
    ain_cf = excf
    # generating the final averged values of each coefficient
    for j in range(len(ain_cf1[0])): 
        # Collecting all coeff estimated on jth inst greater than threshold l_epsil
        a = [ain_cf1[i][j] for i in range(len(ain_cf1)) if np.abs(ain_cf1[i][j]) > l_epsil]
        if not a:
            # If a is an empty list inst not selected in any crs so append a zero
            ain_cf.append(0)
        else:
            # Averging if a is non empty
            ain_cf.append(np.mean(a))    

    # Generating a list of lists where ain_cf_rm[j][i]=1 if the |coeff| on the ith inst
    # for the jth cross section is greater than l_epsil
    ain_cf_rm = [[1,1] + [int(np.abs(ain_cf1[j][i])>l_epsil) 
                            for i in range(len(ain_cf1[0]))] 
                            for j in range(len(ain_cf1))]
    # Function output
    out = [ain_cf, ain_cf_rm]
    return out

<h3> 2.5 Crossvalidated Panel Lasso Estimator </h3>

In [19]:
def pan_lasso_cv(data_pan_mr,data_pan_mrdf,inpt_ls):
    
    """
INPUTS
  data_pan_mr            (df) panel where crs regs are merged with insts
  data_pan_mrdf          (df) panel where crs regs are merged with insts and 1st differenced
  inpt_ls                (dict) of the following elements
    inpt_ls['cv']        (int) indicator for whether cross validation is done
    inpt_ls['dep']       (str) names of differenced dependent variable in lasso reg
    inpt_ls['odep']      (str) names of non differenced dependent variable
    inpt_ls['ex_nm']     (lst) names of differenced exogenous variables
    inpt_ls['oex_nm']    (lst) names of non differenced exogenous variables
    inpt_ls['ins_nm']    (lst) names of differenced instruments
    inpt_ls['oins_nm']   (lst) names of non difference instruments
    inpt_ls['alph']      (flt) initial alpha tuning parameter
    inpt_ls['epsil']     (flt) threshold for averaging "non zero" coefficients
    inpt_ls['cin']       (str) name of crs section variable
    inpt_ls['tin']       (str) name of time variable
    inpt_ls['n_alphs']   (int) number of different alphas tried
    inpt_ls['n_parts']   (int) number of partition elements 
    inpt_ls['ntp']       (int) number of time periods
    inpt_ls['ncs']       (int) number of cross sections

OUPUTS
    out             (lst) with the following elements
      out[0]        (lst) Estimated coefficients
      out[1]        (lst) containing the following elements
        out[1][j]   (lst) binary vector indicating the non zero ceofficients for each crs
      out[2]        (lst) list of mean meansquared errors 
      out[3]        (lst) cross validated alpha tuning parameter
    """
                    
    #Extracting info from dictionary
    n_alphs = inpt_ls['n_alphs']
    oins_nm = inpt_ls['oins_nm']
    oex_nm = inpt_ls['oex_nm']
    odep = inpt_ls['odep']
    n_parts = inpt_ls['n_parts']
    n_alphs = inpt_ls['n_alphs'] 
    cin = inpt_ls['cin']
    tin = inpt_ls['tin']
    ncs = inpt_ls['ncs']
    ntp = inpt_ls['ntp'] 
    cv = inpt_ls['cv']

    if cv == 0:
        # Estimating lasso regression with orginal input alpha
        ls_soln = pan_lasso(data_pan_mrdf,inpt_ls)
        out = ls_soln + ['none'] + ['none']

    elif cv == 1:  
        # Generating partitions
        indx = k_subs(n_parts,ntp-1,ncs)

        # Initializing carrier lists
        all_msr = []
        trl_alph = []

        # For each value of lasso tuning parameter
        for k in range(5,n_alphs+6):
            # Initializing mean squared error list for each partition
            msr = []
            # Setting the value of the lasso tuning parameter
            inpt_ls['alph'] = k/n_alphs
            # Appending the current lasso tuning parameter
            trl_alph.append(k/n_alphs)
            # For each of the training testing set combinations
            for part in range(n_parts):
                # Merged and differenced training set
                trn_mrdf = data_pan_mrdf.iloc[indx[1][part][0],:].copy()
                # Merged and differenced testing set
                tst_mrdf = data_pan_mrdf.iloc[indx[1][part][1],:].copy()
                # Merged only testing set
                tst_mr = data_pan_mr.iloc[indx[1][part][1],:].copy()
                # lasso estimation on training data set
                lcf = pan_lasso(trn_mrdf,inpt_ls)
                # Initializng list of allresiduals for each cross section 
                all_res = []
                # for each cross section
                for i in range(ncs):
                    # Extracting the un difference test regressor matrix for crs i
                    c1 = tst_mr.loc[tst_mr[cin]==i+1, oex_nm + oins_nm ].values
                    # Creating the diagonal selection matrix for crs i
                    c2 = np.diag(lcf[1][i])
                    # Reshaping Estimated lasso coefficient matrix
                    c3 = np.array(lcf[0]).reshape(len(oex_nm + oins_nm),1)
                    # Computing partial estimated values (no constant term included)
                    c4 = c1.dot(c2).dot(c3)
                    # Computing non centered residuals (no constant term included)
                    res1 = tst_mr.loc[tst_mr[cin]==i+1,odep].values.reshape(len(c4),1) - c4
                    # Computing centered squared residuals
                    cres = (res1 - np.mean(res1))**2
                    # Concatenating sqr residuals of ith cross section to all_res
                    all_res = all_res + cres.tolist()
                # Appending the mean sqrd residuals for the npart th trn and test set 
                msr.append(np.mean(all_res))
            # Appending the mean of mean sqrd residuals for all partitions   
            all_msr.append(np.mean(msr))

        # Index of smallest mean msr values
        ax = all_msr.index(min(all_msr))
        # alpha value that produces smallest mean_msr
        cv_alph = trl_alph[ax]

        # Running Lasso with cv alpha
        inpt_ls['alpha'] = cv_alph
        # Estimating lasso regression with cros validated alpha
        ls_soln = pan_lasso(data_pan_mrdf,inpt_ls)
        out = ls_soln
        out.append(all_msr)
        out.append(cv_alph)

    return out

<h3> 2.6 Panel Data Estimator </h3>

In [22]:
def panel_fe(dp_in,di_in,inpt):
    """
INPUTS
dp_in                     (pandas df) df with dependent var. and all exogenous regs
di_in                     (pandas df) df with all instruments
inpt                      (dict) Dictionary with the following
  inpt['dep']               (string) Name of dependent variable contained in dp
  inpt['reg']               (list of strings) names of exogenous regressors
  inpt['cin']               (string) name of crossection index in dp
  inpt['tin']               (string) name of time index in dp
  inpt['ncs']               (int) number of crossections
  inpt['alph']              (flt) penalty value for lasso estimation
  inpt['epsil']             (flt)  threshold for averaging "non zero" coefficients
  inpt['lasso']             (int) indicator for whether subset selection with lasso is done
  inpt['in_nm']             (list of lists of strings) with the following components
    input['in_nm'][i-1]       (list of names) names of instruments relevant to crs i 
  

OUTPUTS 
out               (list of lists) List of the following
  out[0]          (list) Estimated coefficients
  out[1]          (list) list of all relevant instrument
  out[2]          (list) Estimated error terms Vi_j
  """

    ## Extracting Variables from inpt dictionary
    dep = inpt['dep']
    reg = inpt['reg']
    in_nm = inpt['in_nm']
    cin = inpt['cin']
    tin = inpt['tin']
    ncs = inpt['ncs']
    lasso = inpt['lasso']
    di  = di_in
    dp  = dp_in

    
    
    
    ## Constructing a df of all instrument relevant to at least 1 crossection
    # Initializing the set of all included instruments as the 1st set relevant inst
    inc = set(in_nm[0])
    # Collecting rest of relevant instruments
    for i in range(len(in_nm)):
        # Union of inc and ith set of relevant instruments
        inc = inc|set(in_nm[i])
    # All included Instruments listed in order index order
    inc = [''.join(['W',str(i)]) for i in range(1,di.shape[1]) if ''.join(['W',str(i)]) in inc]
    # df with time index and all included instruments
    di_inc = di.loc[:,[tin] + inc]

    # List of logical vectors (as list) of which instruments in inc are relevant to ith crs
    in_vec = [[ 1 if inc[i] in in_nm[j] else 0 for i in range(len(inc))]
                  for j in range(ncs)]

    # First differenced included instrument df
    Di = di_inc.loc[:,[tin]]
    for j in range(0,len(inc)):
        # Adding D to name of jth instrument in inc
        D_nm = ''.join(['D', di_inc.columns[j+1]])
        # First difference of jth instrument in inc
        Di[D_nm] = (di_inc.loc[:,di_inc.columns[j+1]].values 
                    - di_inc.loc[:,di_inc.columns[j+1]].shift(1).values)

    ## Constructing Panel Version of relevant instrument data
    for i in range(ncs):
        # Initializing differenced panel template df
        a1 = Di.loc[:,tin].copy()
        # Adding the crossection variable
        a1 = pd.concat([a1,pd.DataFrame(np.ones((di_inc.shape[0],1))*(i+1)
                                        ,columns = [cin])],axis = 1)
        # Initializing the panel template df
        b1 = a1
        # Product of Di with in_vec[i-1] s.t. inst. not relevant to i are zero 
        a2 = pd.DataFrame(Di.iloc[:,1:].values.dot(np.diag(in_vec[i]))
                          ,columns = Di.columns[1:])
        # Product of di_inc with in_vec[i-1] s.t. inst. not relevant to i are zero
        b2 = pd.DataFrame(di_inc.iloc[:,1:].values.dot(np.diag(in_vec[i]))
                          ,columns = di_inc.columns[1:])
        # Concatenating a2 onto panel template adding [cin] and [tin]
        a2 = pd.concat([a1,a2],axis = 1)
        # Concatenating b2 onto panel template adding [cin] and [tin]
        b2 = pd.concat([b1,b2],axis = 1)
        if i == 0:
            # if  i = 0 initialize final differenced panel df
            Ddi_pan = a2.iloc[1:,:]
            # if i = 0 initialize final panel df
            di_pan = b2
        elif i > 0: 
            # if i > 0 add ith crosssection rows onto final
            Ddi_pan = pd.concat([Ddi_pan,a2.iloc[1:,:]], axis = 0)
            # if i > 0 add ith crosssection rows onto final
            di_pan =  pd.concat([di_pan,b2], axis = 0)

    ## First Difference dependent and exogenous regressor matrix..
    for i in range(1,ncs+1):
        # Initializing panel template df
        c1 = dp.loc[dp[cin]== i,[cin]+[tin]].copy() 
        # First differencing al relevant variables for i crs
        for j in [inpt['dep']] + inpt['reg']:
            c1[''.join(['D',j])] = (dp.loc[dp[cin]== i,j].values 
                        - dp.loc[dp[cin]== i,j].shift(1).values)
        if i == 1:
            # If i = 1 initialize final panel df
            c2 = c1.iloc[1:,:]  
        elif i > 1:
            # If i > 1 add onto final panel df
            c2 = pd.concat([c2,c1.iloc[1:,:]],axis = 0)

    ## OLS estimation
    # Merging all differenced panel df's together
    Ddi_pan = pd.merge(c2,Ddi_pan,on=[cin,tin],how = 'inner')
    # List of all regressor names in Ddi_pan for use in ols()
    Dregs = ([''.join(['D',reg[i]]) for i in range(len(reg))] 
            + [''.join(['D',inc[i]]) for i in range(len(inc))])

    if lasso == 0:
        # Initializing ols object
        ols_reg =linear_model.LinearRegression()
        # Fitting the OLS regression
        ols_reg.fit(Ddi_pan.drop([cin]+[tin]+[''.join(['D',dep])],axis=1),
                    Ddi_pan.loc[:,''.join(['D',dep])])
        # Extracting estimated regression coefficients
        ols_out = ols_reg.coef_.tolist()
        # Initializing the ceofficient vector with zero's for regressors irrelevant to all crs
        ex_cf = ols_out[:len(reg)]
        # List of ex regressor column names in di
        fins_nm = di_in.columns.tolist()[len(reg)-1:]
        fregs = reg + fins_nm
        for i in range(len(fins_nm)):
            if inc.count(fins_nm[i]) > 0:
                # If the ith inst in di is in inc append the corr. coefficient to ex_cf
                ex_cf.append(ols_out[inc.index(fins_nm[i])+len(reg)])
            elif inc.count(fins_nm[i])==0:
                # If the ith inst in di is not in inc append a zero
                ex_cf.append(0)  
        # List of lists of indicator of relevant ex and inst to each cross section      
        rev_cf = [[1]*len(reg) + [in_nm[i].count(fins_nm[j]) 
                                  for j in range(len(fins_nm))] 
                                  for i in range(len(in_nm))]

    elif lasso == 1:
        # Input dictionary for Lasso Estimation
        l_inpt = {'dep': ''.join(['D',dep]),
                  'odep': inpt['dep'],
                  'ex_nm': [''.join(['D',reg[i]]) for i in range(len(reg))],
                  'oex_nm': reg,
                  'ins_nm': [''.join(['D',inc[i]]) for i in range(len(inc))],
                  'oins_nm': inpt['in_nm'][i], 
                  'alph': inpt['alph'],'epsil':inpt['epsil'],'cin': inpt['cin'],
                  'tin' : inpt['tin'], 'n_alphs': inpt['n_alphs'], 'n_parts': inpt['n_parts'],
                  'ntp': inpt['ntp'],'ncs':inpt['ncs']
                     }
        # Estimation by panel lasso
        out = pan_lasso_cv(Ddi_pan,l_inpt)
        # Extracting estimated coefficients
        ex_cf = np.array(out[0]).reshape(len(out[0]),1)
        # Relevant instrument matrix
        rev_cf = out[1]

    # Constucting a panel df of estimated errors Vj,i
    for i in range(1,ncs+1):
        # np.array of dep variable values for ith crs
        d1 = dp.loc[dp[cin] == i,[dep]].values
        # df of exogenous regressor values for ith crs
        d21 = dp.loc[dp[cin] == i ,[tin]+reg]
        # merging d21 and d22 making a df with all RHS regressors 
        d23 = pd.merge(d21,di,on = [tin], how = 'inner')
        # d2
        d2 = d23.drop(tin,axis=1).values.dot(np.diag(rev_cf[i-1])).dot(ex_cf)
        # Residual of time varying component 
        d3 = d1.reshape(d1.shape[0],1) - d2.reshape(d2.shape[0],1)
        # Centered residula of time varying component
        d3 = d3 - np.mean(d3)
        if i == 1:
            # if i = 1 initialize panel df
            p_res = [list(d3.T[0])]
        elif i > 1:
            # if i > 1 add onto p_res
            p_res.append(list(d3.T[0]))

    # Output of the function
    pan_out = [ex_cf,inc , p_res, rev_cf]

    return pan_out

<h3> 3.1 Data Loading </h3>

In [23]:
input_filename = 'pscdata_7_17_1126.json'
data_file = '/Users/ericpenner/Google_Drive/Research/pan_sel_cntrl/data'
input_file_full = ''.join([data_file,'/',input_filename])
with open(input_file_full) as f_obj: 
    pscdata = json.load(f_obj)
inpt = pscdata[0][0].copy()    

<h3> 3.2 Input Dictionary Setup </h3>

In [109]:
# Indicator for  whether in this run the subset of instrument relvant to each crs is known.
inpt['kwnsub'] = 0

# Indicator for whether residuals are observed
inpt['orcl'] = 0
inpt['lasso'] = 0
inpt['dep'] = inpt['en_nm'][0]
inpt['reg'] = inpt['ex_nm']
inpt['alph'] = 0.4
inpt['epsil'] = 0.09

# List of list with the names of the relevant instruments for each crossection
in_nm=[]
for i in range(inpt['ncs']):
    # If the subset is known then list of relevant inst. for each crs is supplied to estimator
    if inpt['kwnsub'] == 1:
        a=[ True if pscdata[0][1]['coeff'][0][i][k]!=0 else False 
            for k in range(inpt['n_exo'],inpt['n_exo']+inpt['t_inst'])]
        in_nm.append(np.array(pscdata[0][1]['Dins_nms'][1:])[a].tolist())
    # If the subset is unknown then list of all inst. will be supplied to est. for each crs
    else:
        in_nm.append(pscdata[0][1]['Dins_nms'][1:])
        
inpt['in_nm'] = in_nm
inpt_l = inpt.copy()
inpt_l['lasso'] = 1

<h3> 3.3 Data Set Extraction and Function Call </h3>

In [81]:
k=100
data_err = pd.DataFrame(pscdata[k][0]['err_df'], columns = pscdata[0][1]['Derr_nms'])  
data_inst = pd.DataFrame(pscdata[k][0]['inst_df'], columns = pscdata[0][1]['Dins_nms'])
data_pan = pd.DataFrame(pscdata[k][0]['prim_df'], columns = pscdata[0][1]['Dlng_nms'])

m_inpt = {'tin' :inpt['tin'], 'cin' : inpt['cin'] , 'ncs': inpt['ncs']}
data_pan_trns = mrg_dff(data_pan,data_inst,m_inpt)
data_pan_mrdf = data_pan_trns[0]
data_pan_mr   = data_pan_trns[1]

KeyError: 'in_vec'

In [26]:
inpt_ls = {}
inpt_ls['dep'] = 'DZ1,1'
inpt_ls['odep'] = 'Z1,1'
inpt_ls['ex_nm'] = [''.join(['DZ2,',str(i)]) for i in range(1,inpt['n_exo']+1)]
inpt_ls['oex_nm'] = [''.join(['Z2,',str(i)]) for i in range(1,inpt['n_exo']+1)]
inpt_ls['ins_nm'] = [''.join(['DW',str(i)]) for i in range(1,inpt['t_inst']+1)]
inpt_ls['oins_nm'] = [''.join(['W',str(i)]) for i in range(1,inpt['t_inst']+1)]
inpt_ls['alph'] = 0.45
inpt_ls['epsil'] = 0.1
inpt_ls['cin'] = inpt['cin']
inpt_ls['tin'] = inpt['tin']
inpt_ls['n_alphs'] = 20
inpt_ls['n_parts'] = 4
inpt_ls['ntp'] = inpt['ntp']
inpt_ls['ncs'] = inpt['ncs']
inpt_ls['cv'] = 0

In [27]:
pan_lasso_cv(data_pan_mr,data_pan_mrdf,inpt_ls)

[[-0.9909308494453454,
  -1.6301911468671897,
  -0.6645609811126932,
  -0.6742084982696253,
  0.21170726977964782,
  0.38894954585279334,
  0.09738059227056454,
  -0.4990077713403004,
  -0.2767514622919788,
  0.6661966228575743,
  0.2948102744377507,
  0.22509961234715717,
  0.8359338409898158,
  -0.5899525334780039,
  0.1342343817991136,
  0,
  0.33254725567232046,
  -0.07024602179817724,
  -0.5262111025110953,
  -0.3079532944759627,
  -0.5245828064034781,
  0],
 [[1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0],
  [1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0],
  [1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0],
  [1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
  [1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0],
  [1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],
  [1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
  [1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 

In [112]:
in_nm =inpt['in_nm']
di = data_inst
dp = data_pan
tin = inpt['tin']
cin = inpt['cin']
ncs = inpt['ncs']

# Initializing the set of all included instruments as the 1st set relevant inst
inst_incl = set(in_nm[0])
# Collecting rest of relevant instruments
for i in range(len(in_nm)):
    # Union of inc and ith set of relevant instruments
    inst_incl = inst_incl|set(in_nm[i])
# All included Instruments listed in order index order
inst_incl = [''.join(['W',str(i)]) for i in range(1,di.shape[1])
                                   if ''.join(['W',str(i)]) in inst_incl]
# df with time index and all included instruments
di_inst_incl = di.loc[:,[tin] + inst_incl]

# List of logical vectors (as list) of which instruments in inc are relevant to ith crs
inst_incl_vec = [[ 1 if inst_incl[i] in in_nm[j] else 0 
                                     for i in range(len(inst_incl))]
                                     for j in range(ncs)]

# Merging and differencing input dictionary
inpt_m = {'tin' : inpt['tin'] , 'cin': inpt['cin'] , 'ncs':inpt['ncs'], 'inc_vec': inst_incl_vec}

mrg_dff_all = mrg_dff(dp,di_inst_incl,inpt_m)

In [111]:
out[1]

Unnamed: 0,crs,t,Y,"Z1,1","Z1,2","Z2,1","Z2,2",W1,W2,W3,...,W11,W12,W13,W14,W15,W16,W17,W18,W19,W20
0,1.0,1.0,-3.924229,-2.578403,-0.284306,-1.048638,0.470765,-0.078605,1.738557,2.360978,...,-0.452217,-1.136770,-0.787915,-0.765736,0.954967,0.380810,1.423767,0.463397,1.015128,0.002755
1,1.0,2.0,1.006836,0.240972,-0.448595,-0.511588,0.994277,-0.739561,2.090662,2.191009,...,0.094380,-0.979826,0.081898,-0.832128,0.061729,0.548189,1.311751,0.270630,0.903003,0.329718
2,1.0,3.0,-6.948308,-4.521171,3.405121,0.694359,1.244344,-1.457118,-0.944059,-0.003173,...,-0.855382,-1.084337,-0.714253,0.010887,-0.593122,0.006763,0.807016,1.574511,0.937282,1.002653
3,1.0,4.0,1.665970,-0.354932,0.109004,0.204962,0.667624,2.629474,2.313243,-0.052996,...,0.083130,-0.969294,0.567911,-0.056712,0.223911,-0.435826,-0.019588,0.763981,0.504710,-0.944181
4,1.0,5.0,2.990035,1.171214,-1.510673,-0.106903,-0.482262,1.237314,2.100079,1.947957,...,-0.534396,-2.199505,-0.266083,-0.282358,-0.154866,-0.406745,0.149752,0.267440,-1.977043,-0.949975
5,1.0,6.0,-2.675885,-2.220305,0.585516,0.020751,0.491623,0.565855,-0.814858,0.209497,...,-1.456165,-1.183619,-0.258061,-0.085194,0.078530,1.017445,-0.246911,-1.247165,-1.178318,0.281473
6,1.0,7.0,5.314198,3.238669,-1.382146,-2.014601,-1.119426,1.107656,0.207745,1.200986,...,0.339762,-0.680512,-0.023180,0.955176,1.618961,1.007389,0.885207,0.602239,-0.320691,-0.783235
7,1.0,8.0,-3.279937,-3.542601,-1.372393,0.212034,1.502228,0.572843,0.094981,0.271423,...,0.477310,0.974428,0.532956,-0.275130,-1.675751,-0.280238,-0.333676,0.189417,-0.774663,-0.280602
8,1.0,9.0,1.758682,0.306618,-1.684354,0.016498,0.309873,0.139070,0.847215,-0.352955,...,1.143821,-0.291755,0.173656,-0.677726,0.764472,-1.114028,-1.155893,-0.555331,-0.544770,0.101377
9,1.0,10.0,3.618896,0.342171,-2.465280,0.648629,0.019507,1.914634,1.733819,1.619416,...,1.600728,1.289376,0.988559,0.119528,-0.165038,-0.498267,0.314297,0.669848,-0.198763,-0.058850


In [84]:
?mrg_dff

In [None]:
c = panel_fe(data_pan,data_inst,inpt)
c_l = panel_fe(data_pan,data_inst,inpt_l)

i=1
h_c = 1.5
ker = 9

# V_knw = np.array(c_k[2][i-1])
# p_knw = np.linspace(np.min(V_knw),np.max(V_knw),100)
# h_knw = h_c*V_unk.shape[0]**(-1/5)*np.std(V_knw)
# V_knw_den = kr.mvden(V_knw,p_knw,h_knw,ker)

V_unk = np.array(c[2][i])
p_unk = np.linspace(np.min(V_unk),np.max(V_unk),100)
h_unk = h_c*V_unk.shape[0]**(-1/5)*np.std(V_unk)
V_unk_den = kr.mvden(V_unk,p_unk,h_unk,ker)

V_lass = np.array(c_l[2][i])
p_lass = np.linspace(np.min(V_lass),np.max(V_lass),100)
h_lass = h_c*V_lass.shape[0]**(-1/5)*np.std(V_lass)
V_lass_den = kr.mvden(V_lass,p_lass,h_lass,ker)

V_true = data_err.loc[data_err['crs']==i+1,'V1'].values
p_true = np.linspace(np.min(V_true),np.max(V_true),100)
h_true = h_c*V_true.shape[0]**(-1/5)*np.std(V_true)
V_true_den = kr.mvden(V_true,p_true,h_true,ker) 

x_lm = [-5,5]
y_lm = 1

f,ax = plt.subplots()
f.set_figheight(7)
f.set_figwidth(15)
ax.set_xlim((x_lm[0],x_lm[1]))
ax.set_ylim((0,y_lm))
ax.plot(p_unk,V_unk_den)
ax.plot(p_lass,V_lass_den)
ax.plot(p_true,V_true_den)
ax.legend(["Unknown","Lasso","True"])
ax.grid(which = 'both')
#ax.set_title(''.join(['Distribution of Estimated ',coeff[0].columns[w]]))
plt.show()