Alex Kappes <br>
Problem Set 6 <br>
EconS 512

**Problem 1**

**(a)**

In [29]:
import numpy as np
from numpy.linalg import inv
import pandas as pd
import sympy as sym
sym.init_printing(use_latex="mathjax")

# Read the problem-set data. NOTE: the location below is an absolute,
# machine-specific path; adjust it when running elsewhere.
merit_csv_path = '/home/akappes/WSU/512_MetricsII/merit_data.csv'
df = pd.read_csv(merit_csv_path)

class estimate:
    """Least-squares estimates with several standard-error estimators.

    Parameters
    ----------
    dep : array-like of length n
        Dependent variable (1-D; it is stored as an n x 1 column matrix).
    indep : array-like of shape (n, k)
        Regressor matrix, including the constant column if one is wanted.
    df : pandas.DataFrame
        Frame holding the grouping columns used by the clustered
        standard-error methods. Its rows are matched to the rows of
        ``indep`` by position, so it must have n rows in the same order.
    """

    def __init__(self, dep, indep, df):
        self.dep = np.asmatrix(dep).T
        self.indep = np.asmatrix(indep)
        self.df = df

    # parameters
    def beta(self):
        """Return the OLS coefficients (X'X)^-1 X'y as a k x 1 matrix."""
        return inv(self.indep.T * self.indep) * self.indep.T * self.dep

    # residuals
    def error(self, beta):
        """Return the residuals y - X*beta as an n x 1 matrix."""
        return self.dep - self.indep * beta

    # Whites HC covariance estimator
    def robust(self, e):
        """White heteroskedasticity-consistent standard errors.

        Computes sqrt(diag((X'X)^-1 [sum_i e_i^2 x_i x_i'] (X'X)^-1)).

        Parameters
        ----------
        e : array-like of n residuals (n x 1 or 1-D).

        Returns
        -------
        numpy.ndarray of shape (k,)
        """
        X = np.asarray(self.indep, dtype=float)
        e_sq = np.asarray(e, dtype=float).reshape(-1) ** 2

        # Sum over *every* observation, including row 0. Scaling the rows of
        # X by e_i^2 gives the same sum as looping over x_i x_i' terms.
        meat = X.T @ (X * e_sq[:, None])
        bread = inv(X.T @ X)
        return np.sqrt(np.diag(bread @ meat @ bread))

    def _groupwise_se(self, group_cols, e):
        """Standard errors from sqrt(diag((X' W X)^-1)) with group weights.

        W is diagonal; the entry for an observation is 1 / (mean squared
        residual of the group that observation belongs to). Groups are the
        distinct value combinations of ``group_cols`` in ``self.df``.
        """
        X = np.asarray(self.indep, dtype=float)
        e_sq = pd.Series(np.asarray(e, dtype=float).reshape(-1) ** 2)

        # Positional group keys: row i of self.df labels row i of X and e.
        keys = [np.asarray(self.df[col]) for col in group_cols]
        sigsq = e_sq.groupby(keys).transform('mean').to_numpy()

        # A group whose residuals are all exactly zero gives an infinite weight.
        weights = 1.0 / sigsq

        # X' W X without materialising the n x n diagonal matrix W.
        covmat = inv(X.T @ (X * weights[:, None]))
        return np.sqrt(np.diag(covmat))

    # construction of se clustered by state
    def se_cluster_s(self, g, e):
        """Standard errors using one residual variance per group of ``g``.

        Each observation is weighted by the inverse of its group's mean
        squared residual and the result is sqrt(diag((X' W X)^-1)).

        Note from the original author: Greene's grouped formulation (sum
        over groups of sigma_g^2 (X_g'X_g)^-1) was not used because the
        per-group X_g'X_g matrices are singular for this binary
        fixed-effects data; weighting every observation by 1 / sigma_g^2
        inside a single X' W X keeps the matrix invertible.

        NOTE(review): this is a groupwise-weighted covariance, not the
        sandwich-form cluster-robust estimator -- confirm it is what the
        exercise intends.

        Parameters
        ----------
        g : column label in ``self.df`` identifying the group.
        e : array-like of n residuals.

        Returns
        -------
        numpy.ndarray of shape (k,)
        """
        return self._groupwise_se([g], e)

    # construction of se clustered by state and year
    def se_cluster_multi(self, g1, g2, e):
        """Same estimator as ``se_cluster_s`` with groups defined by (g1, g2).

        Only combinations of ``g1`` and ``g2`` that actually occur in
        ``self.df`` form a group, so empty cells are never evaluated.

        Returns
        -------
        numpy.ndarray of shape (k,)
        """
        return self._groupwise_se([g1, g2], e)


### data construction ###
# Indicator (dummy) frames for state and year. The last level returned by
# unique() is left out of each frame, so it acts as the omitted category.
state_levels = df['state'].unique().tolist()[:-1]
year_levels = df['year'].unique().tolist()[:-1]
bin_state = pd.DataFrame(0, columns=state_levels, index=df.index)
bin_year = pd.DataFrame(0, columns=year_levels, index=df.index)

# Switch on the indicator for every row whose state / year equals the level.
for source_col, dummies in (('state', bin_state), ('year', bin_year)):
    for level in dummies.columns:
        dummies.loc[df[source_col] == level, level] = 1

# Constant term goes first, then the dummy columns are appended on the right.
df.insert(loc=0, column='ones', value=1)
df = pd.concat([df, bin_state, bin_year], axis=1)

# Regressors: everything except the outcome and the raw grouping columns.
X = df.drop(columns=['coll', 'year', 'state', 'chst'])
x_vars = X.columns
y = df['coll']

### (a) estimation ###

# OLS fit, then the three standard-error variants on the same residuals.
est_a = estimate(y, X, df)
params_a = est_a.beta()
resids_a = est_a.error(params_a)
whites_hce_a = est_a.robust(resids_a)
clustered_state_se_a = est_a.se_cluster_s('state', resids_a)
clustered_stateyear_se_a = est_a.se_cluster_multi('state', 'year', resids_a)

print('X variables (state binaries appear in numeric categorical form): \n', x_vars.tolist())

X variables (state binaries appear in numeric categorical form): 
 ['ones', 'merit', 'male', 'black', 'asian', 11, 12, 13, 14, 15, 16, 21, 22, 23, 31, 32, 33, 34, 35, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53, 54, 55, 56, 57, 58, 59, 61, 62, 63, 64, 71, 72, 73, 74, 81, 82, 83, 84, 85, 86, 87, 88, 91, 92, 93, 94, 1993, 1997, 1992, 1989, 1991, 1999, 1994, 1998, 2000, 1995, 1990]


In [30]:
# Part (a) coefficients as one row, rounded to three decimals for display.
print('Estimated parameters: \n', np.array(params_a.T).round(3))

Estimated parameters: 
 [[ 0.427  0.034 -0.079 -0.15   0.169  0.033  0.08   0.037  0.173  0.115
   0.191  0.125  0.175  0.098  0.092  0.031  0.152  0.122  0.122  0.066
   0.133  0.082  0.163  0.043  0.132  0.135  0.098  0.17   0.095  0.166
   0.027  0.105  0.103  0.008  0.02  -0.001 -0.02   0.092  0.097 -0.045
   0.064  0.061  0.008  0.053 -0.014  0.038 -0.    -0.029 -0.001  0.047
  -0.063  0.042  0.04   0.059  0.004  0.002 -0.009 -0.008 -0.019 -0.01
  -0.017 -0.003  0.006 -0.007 -0.022 -0.027]]


In [31]:
# Part (a) White standard errors, rounded to five decimals for display.
print("White's HC covariance estimator standard errors: \n", whites_hce_a.round(5))

White's HC covariance estimator standard errors: 
 [0.02917 0.01451 0.00476 0.00732 0.01344 0.03699 0.03843 0.03899 0.03123
 0.03867 0.03706 0.02926 0.03039 0.0302  0.03015 0.03468 0.03015 0.03011
 0.03456 0.03496 0.03471 0.03612 0.03396 0.03339 0.03409 0.03451 0.03778
 0.03813 0.03822 0.03422 0.0343  0.0307  0.03507 0.03509 0.03057 0.03501
 0.03459 0.03414 0.03495 0.03608 0.03441 0.03472 0.02963 0.03436 0.03357
 0.03481 0.03579 0.03473 0.03465 0.03334 0.03598 0.03605 0.03515 0.0285
 0.03455 0.01195 0.01229 0.01191 0.01142 0.01175 0.01222 0.01202 0.01224
 0.0125  0.01216 0.0116 ]


In [32]:
# Part (a) state-grouped standard errors, rounded to five decimals.
print('Clustered state standard errors: \n', clustered_state_se_a.round(5))

Clustered state standard errors: 
 [0.02918 0.01449 0.00476 0.00753 0.01364 0.03699 0.03841 0.03897 0.03118
 0.03867 0.03705 0.02926 0.03042 0.0302  0.03014 0.03467 0.03015 0.03012
 0.03454 0.03493 0.03474 0.03612 0.03395 0.03337 0.0341  0.03447 0.03779
 0.03816 0.03825 0.03425 0.03428 0.03068 0.03505 0.0351  0.03055 0.03499
 0.03457 0.03412 0.03493 0.03612 0.0344  0.0347  0.02962 0.03435 0.03357
 0.03479 0.03576 0.03472 0.03464 0.03333 0.03596 0.03605 0.03516 0.02852
 0.03452 0.01197 0.01231 0.01196 0.01147 0.01178 0.01225 0.01207 0.01226
 0.01252 0.01223 0.01162]


In [33]:
# Part (a) state-by-year grouped standard errors, rounded to five decimals.
print('Clustered state and year standard errors: \n', clustered_stateyear_se_a.round(5))

Clustered state and year standard errors: 
 [0.02912 0.01447 0.00476 0.00752 0.01364 0.0369  0.03835 0.03889 0.03115
 0.03863 0.037   0.02922 0.03038 0.03017 0.0301  0.03462 0.03011 0.03009
 0.0345  0.03489 0.0347  0.03608 0.03392 0.03332 0.03406 0.03443 0.03769
 0.0381  0.03806 0.03421 0.03421 0.03065 0.03501 0.03499 0.03051 0.03493
 0.03444 0.03408 0.03487 0.03598 0.03434 0.03465 0.02958 0.03431 0.0335
 0.03474 0.03566 0.03462 0.03456 0.03329 0.0358  0.03599 0.03509 0.02848
 0.03447 0.01192 0.01226 0.01188 0.0114  0.01173 0.0122  0.012   0.01221
 0.01247 0.01213 0.01157]


**(b)**

In [34]:
years = df['year'].unique().tolist()

# Per-year weight = (distinct states observed that year) / (observations that
# year), together with the row labels belonging to each year.
weights = []
year_idx = []
for yr in years:
    in_year = df['year'] == yr
    states_in_year = df.loc[in_year, 'state']
    weights.append(len(states_in_year.unique()) / len(states_in_year))
    year_idx.append(df.index[in_year].tolist())

# Float-initialised so the scaled rows can be written without relying on a
# silent int -> float upcast of the target columns.
X_weighted = pd.DataFrame(0.0, columns=X.columns, index=X.index)

# Pair each year's rows with that year's weight explicitly.
# (`range(a) and range(b)` just evaluates to one of the two ranges.)
for rows, weight in zip(year_idx, weights):
    X_weighted.loc[rows] = weight * X.loc[rows]

# The constant column is restored to 1 after scaling.
X_weighted['ones'] = 1

# NOTE(review): only the regressors are scaled; y is passed unweighted, so
# the slope estimates are rescaled by the weights rather than being
# weighted-least-squares estimates -- confirm this is intended.
est_b = estimate(y, X_weighted, df)
params_b = est_b.beta()

print('Estimated state-weighted parameters: \n', np.round(np.array(params_b.T), 3))

Estimated state-weighted parameters: 
 [[  0.43    2.735  -5.511 -10.193  11.463   2.079   5.231   1.978  11.806
    7.884  13.033   8.218  11.893   6.689   5.991   2.082  10.207   7.991
    8.054   4.054   8.716   5.336  10.855   2.737   8.608   8.807   6.609
   11.367   5.899  11.024   1.7     6.871   6.57    0.045   0.803  -0.354
   -1.589   6.073   6.167  -3.557   3.928   3.976   0.326   3.146  -1.044
    2.287  -0.595  -2.433  -0.359   2.753  -4.46    2.489   2.384   4.069
   -0.268   0.279  -0.607  -0.38   -1.072  -0.447  -1.065  -0.083   0.364
   -0.531  -1.391  -1.738]]


**(c)**

In [38]:
s_group = df['state'].unique().tolist()
y_group = df['year'].unique().tolist()

# Row labels of every (state, year) cell, states outermost. Combinations
# with no observations give an empty index.
sy_idx = [df[(df['state'] == s) & (df['year'] == t)].index
          for s in s_group for t in y_group]

# Column means of X within each cell (NaN entries for an empty cell).
mean_l = [X.loc[cell_rows].mean().tolist() for cell_rows in sy_idx]

# Float-initialised so the products can be written without relying on a
# silent int -> float upcast of the target columns.
X_means = pd.DataFrame(0.0, columns=X.columns, index=X.index)

# Pair each cell's rows with that cell's means explicitly.
# (`range(a) and range(b)` just evaluates to one of the two ranges.)
for cell_rows, cell_means in zip(sy_idx, mean_l):
    if len(cell_rows) == 0:
        continue  # nothing to write for an unobserved (state, year) cell
    # Each column of the cell's rows is multiplied by that column's cell mean.
    X_means.loc[cell_rows] = X.loc[cell_rows].mul(cell_means, axis='columns')

# NOTE(review): the regressors are multiplied by their (state, year) means
# rather than replaced by them, and y is left as-is -- confirm this is the
# intended "mean-weighted" specification.
est_c = estimate(y, X_means, df)
params_c = est_c.beta()
resids_c = est_c.error(params_c)
whites_hce_c = est_c.robust(resids_c)
clustered_state_se_c = est_c.se_cluster_s('state', resids_c)

print('Estimated (state x year) mean-weighted parameters: \n', np.round(np.array(params_c.T), 3))

Estimated (state x year) mean-weighted parameters: 
 [[ 3.170000e-01  2.016740e+02 -7.286500e+02 -2.382109e+03  1.667099e+03
   5.741920e+02  7.924350e+02  5.664590e+02  1.194774e+03  9.505090e+02
   1.271796e+03  9.666010e+02  1.215587e+03  8.557630e+02  7.757240e+02
   5.602240e+02  1.083194e+03  9.254110e+02  9.443200e+02  6.901210e+02
   9.951590e+02  7.316940e+02  1.137718e+03  6.094040e+02  9.678900e+02
   9.951500e+02  8.969850e+02  1.217197e+03  1.834347e+03  1.183627e+03
   5.281470e+02  8.686050e+02  9.484310e+02  5.057440e+02  4.272010e+02
   3.565220e+02  2.979280e+02  8.977770e+02  1.101635e+03  1.406430e+02
   7.617850e+02  6.583680e+02  4.172040e+02  6.267600e+02  3.660290e+02
   5.801750e+02  3.452650e+02  2.489950e+02  3.889630e+02  5.909040e+02
   1.253920e+02  5.883480e+02  5.762670e+02  7.476260e+02  3.831300e+02
   1.468620e+02 -2.308800e+01  1.165770e+02  3.316340e+02  1.913070e+02
  -1.380200e+01  8.200200e+01  6.314800e+01 -4.252000e+01 -5.303000e+01
   1.829350

In [39]:
# Part (c) White standard errors, rounded to five decimals for display.
print("White's HC covariance estimator standard errors: \n", whites_hce_c.round(5))

White's HC covariance estimator standard errors: 
 [6.2620000e-02 6.4301800e+01 4.3308570e+01 1.3471729e+02 3.2220800e+02
 2.6023704e+02 2.6363435e+02 2.6488781e+02 2.4476876e+02 2.6504064e+02
 2.6044492e+02 2.3863566e+02 2.4174925e+02 2.4138771e+02 2.4138534e+02
 2.5399565e+02 2.4103414e+02 2.4111243e+02 2.5329701e+02 2.5386791e+02
 2.5438483e+02 2.5809131e+02 2.5149564e+02 2.4987721e+02 2.5207115e+02
 2.5255471e+02 2.6221290e+02 2.6485295e+02 2.7485278e+02 2.5223473e+02
 2.5233095e+02 2.4301413e+02 2.5668061e+02 2.5564741e+02 2.4258909e+02
 2.5427307e+02 2.5423148e+02 2.5214267e+02 2.5806394e+02 2.5796465e+02
 2.5250138e+02 2.5344753e+02 2.3984207e+02 2.5203111e+02 2.4998790e+02
 2.5280041e+02 2.5547182e+02 2.5322769e+02 2.5244132e+02 2.4894690e+02
 2.5683468e+02 2.5652804e+02 2.5472409e+02 2.3277237e+02 2.5316340e+02
 8.8058160e+01 4.8350030e+01 9.1844690e+01 2.3410741e+02 1.3423027e+02
 5.7705990e+01 7.2403220e+01 5.4884210e+01 4.7453890e+01 5.2855820e+01
 1.8212392e+02]


In [40]:
# Part (c) state-grouped standard errors, rounded to five decimals.
print('Clustered state standard errors: \n', clustered_state_se_c.round(5))

Clustered state standard errors: 
 [5.9110000e-02 6.4092300e+01 4.3332580e+01 1.3557542e+02 3.0985688e+02
 2.4805087e+02 2.5200061e+02 2.5343218e+02 2.3186460e+02 2.5329354e+02
 2.4861775e+02 2.2546024e+02 2.2886645e+02 2.2828352e+02 2.2830400e+02
 2.4135520e+02 2.2797661e+02 2.2806300e+02 2.4080076e+02 2.4164640e+02
 2.4206577e+02 2.4589210e+02 2.3898122e+02 2.3725837e+02 2.3955652e+02
 2.4014810e+02 2.4984618e+02 2.5278945e+02 2.6437064e+02 2.3999341e+02
 2.3981718e+02 2.3007415e+02 2.4461253e+02 2.4332085e+02 2.2963633e+02
 2.4199163e+02 2.4162668e+02 2.3961069e+02 2.4567066e+02 2.4573346e+02
 2.4007647e+02 2.4089005e+02 2.2661701e+02 2.3964533e+02 2.3722680e+02
 2.4033362e+02 2.4357934e+02 2.4090305e+02 2.4002581e+02 2.3638181e+02
 2.4458904e+02 2.4443411e+02 2.4225752e+02 2.1986941e+02 2.4092765e+02
 8.5301380e+01 4.8464280e+01 8.8866740e+01 2.2260001e+02 1.2835904e+02
 5.7288320e+01 7.1054660e+01 5.4548320e+01 4.7487640e+01 5.2819210e+01
 1.7363649e+02]
