In [68]:
## Problem Set 1
## ABM
## October 5th, 2022

In [69]:
import pandas as pd

In [70]:
df = pd.read_stata("../data/PS1_Data.dta")

### Question 1
Report sample statistics for the full sample, the balanced sub-panel, and the exiters

In [71]:
# Full Sample
df[['Y', 'L', 'I', 'K', 'A']].describe()

Unnamed: 0,Y,L,I,K,A
count,39569.0,39569.0,39569.0,39569.0,39569.0
mean,13.487732,5.000947,5.034981,8.986055,8.53921
std,1.726792,1.00151,0.999631,1.872697,3.213174
min,5.9139,0.62485,1.1288,2.0878,1.0
25%,12.424,4.3289,4.3677,7.993,6.0
50%,13.591,5.007,5.031,9.2913,9.0
75%,14.664,5.6784,5.7125,10.289,11.0
max,19.164,8.8615,9.3403,14.572,17.0


In [72]:
# Creating a balanced sub-panel 
df['firm'].value_counts() # Can already see that some firms do not show up every period
df_balanced = df.groupby('firm').filter(lambda x : len(x) == 10)
df_balanced[['Y', 'L', 'I', 'K', 'A']].describe()

Unnamed: 0,Y,L,I,K,A
count,21800.0,21800.0,21800.0,21800.0,21800.0
mean,13.407925,4.993716,5.044564,9.157427,7.320183
std,1.722635,1.003712,1.003588,1.804151,3.232118
min,5.9139,1.1026,1.1288,2.2374,1.0
25%,12.357,4.321175,4.3673,8.259625,5.0
50%,13.515,5.00205,5.0419,9.4318,7.0
75%,14.57125,5.6704,5.726675,10.39,10.0
max,18.872,8.8615,9.3403,14.335,16.0


In [73]:
# Exiters panel 
df_exiters = df.groupby('firm').filter(lambda x : len(x) < 10)
df_exiters[['Y', 'L', 'I', 'K', 'A']].describe()


Unnamed: 0,Y,L,I,K,A
count,17769.0,17769.0,17769.0,17769.0,17769.0
mean,13.585644,5.009822,5.023223,8.775805,10.03478
std,1.726896,0.998759,0.994656,1.932915,2.472889
min,6.7063,0.62485,1.3426,2.0878,1.0
25%,12.51,4.341,4.3683,7.6603,8.0
50%,13.686,5.0127,5.0154,9.1051,10.0
75%,14.772,5.6894,5.6996,10.151,12.0
max,19.164,8.5965,8.8701,14.572,17.0


I can see that mean output and labor are slightly higher for exiters. Age of the firm is significantly higher for exiters. Capital, on the other hand, is higher for survivers. 

### Question 2
Using only the balanced sub-panel, compute total, between and within and random effects estimators for the equation below. Perform a Hausman test of RE vs FE.  

$$y_{it} = \beta_0 + \beta_a a_{it} + \beta_k k_{it} + \beta_l l_{it} + \gamma_t + \omega_{it} + \eta_{it}$$

In [74]:
# Adjust dataset
import statsmodels.api as sm

year = pd.Categorical(df_balanced.year)
df_balanced = df_balanced.set_index(['firm', 'year'])
df_balanced["year"] = year

In [75]:
# Pooled OLS
from linearmodels import PooledOLS

exog_vars = ["A", "K", "L", "year"] 
exog = sm.add_constant(df_balanced[exog_vars]) # this adds a constant 

mod = PooledOLS(df_balanced.Y, exog) # this automatically excludes dummy year = 1
pooled_res = mod.fit()
print(pooled_res)

                          PooledOLS Estimation Summary                          
Dep. Variable:                      Y   R-squared:                        0.5270
Estimator:                  PooledOLS   R-squared (Between):              0.4762
No. Observations:               21800   R-squared (Within):               0.5343
Date:                Fri, Oct 07 2022   R-squared (Overall):              0.5270
Time:                        14:22:52   Log-likelihood                -3.463e+04
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      2023.0
Entities:                        2180   P-value                           0.0000
Avg Obs:                      10.0000   Distribution:                F(12,21787)
Min Obs:                      10.0000                                           
Max Obs:                      10.0000   F-statistic (robust):             2023.0
                            

In [76]:
# Random Effects 
from linearmodels.panel import RandomEffects

exog_vars = ["A", "K", "L", "year"] 
exog = sm.add_constant(df_balanced[exog_vars]) # this adds a constant 

mod = RandomEffects(df_balanced.Y, exog) # this automatically excludes dummy year = 1
re_res = mod.fit()
print(re_res)

                        RandomEffects Estimation Summary                        
Dep. Variable:                      Y   R-squared:                        0.5291
Estimator:              RandomEffects   R-squared (Between):              0.4738
No. Observations:               21800   R-squared (Within):               0.5346
Date:                Fri, Oct 07 2022   R-squared (Overall):              0.5270
Time:                        14:22:52   Log-likelihood                -3.416e+04
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      2039.6
Entities:                        2180   P-value                           0.0000
Avg Obs:                      10.0000   Distribution:                F(12,21787)
Min Obs:                      10.0000                                           
Max Obs:                      10.0000   F-statistic (robust):             2039.6
                            

In [77]:
# Within estimator 
from linearmodels import PanelOLS

exog_vars = ["A", "K", "L", "year"]

exog = sm.add_constant(df_balanced[exog_vars])

mod = PanelOLS(df_balanced.Y, exog, entity_effects = True, drop_absorbed=True)
fe_res = mod.fit()
print(fe_res)

                          PanelOLS Estimation Summary                           
Dep. Variable:                      Y   R-squared:                        0.5350
Estimator:                   PanelOLS   R-squared (Between):              0.4481
No. Observations:               21800   R-squared (Within):               0.5350
Date:                Fri, Oct 07 2022   R-squared (Overall):              0.5241
Time:                        14:22:53   Log-likelihood                -3.297e+04
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      2051.1
Entities:                        2180   P-value                           0.0000
Avg Obs:                      10.0000   Distribution:                F(11,19609)
Min Obs:                      10.0000                                           
Max Obs:                      10.0000   F-statistic (robust):             2051.1
                            

Variables have been fully absorbed and have removed from the regression:

year.10

  fe_res = mod.fit()


In [78]:
# Between estimator 

from linearmodels.panel import BetweenOLS

exog_vars = ["A", "K", "L", "year"] 
exog = sm.add_constant(df_balanced[exog_vars])

mod = BetweenOLS(df_balanced.Y, exog)
be_res = mod.fit()
print(be_res)

                         BetweenOLS Estimation Summary                          
Dep. Variable:                      Y   R-squared:                        0.4898
Estimator:                 BetweenOLS   R-squared (Between):              0.4898
No. Observations:                2180   R-squared (Within):               0.4995
Date:                Fri, Oct 07 2022   R-squared (Overall):              0.4983
Time:                        14:22:53   Log-likelihood                   -1286.6
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      173.39
Entities:                        2180   P-value                           0.0000
Avg Obs:                      10.0000   Distribution:                 F(12,2167)
Min Obs:                      10.0000                                           
Max Obs:                      10.0000   F-statistic (robust):             173.39
                            

  return Series(np.sqrt(np.diag(self.cov)), self._var_names, name="std_error")
  return Series(np.sqrt(np.diag(self.cov)), self._var_names, name="std_error")
  return Series(np.sqrt(np.diag(self.cov)), self._var_names, name="std_error")
  return Series(np.sqrt(np.diag(self.cov)), self._var_names, name="std_error")


In [79]:
# Comparing the 4 estimators 
from linearmodels.panel import compare

print(compare({"Between": be_res, "RE": re_res, "Pooled": pooled_res, "Within": fe_res}))

                                   Model Comparison                                   
                               Between                RE         Pooled         Within
--------------------------------------------------------------------------------------
Dep. Variable                        Y                 Y              Y              Y
Estimator                   BetweenOLS     RandomEffects      PooledOLS       PanelOLS
No. Observations                  2180             21800          21800          21800
Cov. Est.                   Unadjusted        Unadjusted     Unadjusted     Unadjusted
R-squared                       0.4898            0.5291         0.5270         0.5350
R-Squared (Within)              0.4995            0.5346         0.5343         0.5350
R-Squared (Between)             0.4898            0.4738         0.4762         0.4481
R-Squared (Overall)             0.4983            0.5270         0.5270         0.5241
F-statistic                     173.39     

  return Series(np.sqrt(np.diag(self.cov)), self._var_names, name="std_error")


In [80]:
# Hausman test of random effects vs fixed effects 
# Compare within and random effects


### Question 3
Using the balanced sub-panel compute difference estimators of the above equation. Report results from first, second and third differences. 

In [81]:
df_balanced = df_balanced.drop(columns = ["year"])
df_balanced = df_balanced.reset_index()

In [82]:
# create a differenced dataframe 
# I create a dictionary where I store the three new dataframes (first difference, second difference, third difference)
diffs = {}
for i in range(1, 4):
    df = df_balanced[["Y", "A", "K", "L", "year"]].diff(i)
    df["firm"] = df_balanced["firm"]
    year = pd.Categorical(df.year)
    df = df.set_index(['firm', 'year'])
    df["year"] = year
    diffs[i] = df

In [83]:
# First difference estimator 
diffs_estimator = {}
for i in range(1, 4):
    exog_vars = ["A", "K", "L", "year"]
    exog = sm.add_constant(diffs[i][exog_vars])
    mod = PanelOLS(diffs[i].Y, exog)
    diffs_estimator[i] = mod.fit()


Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)
Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)
Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


In [85]:
print(compare({"First Diff": diffs_estimator[1], "Second Diff": diffs_estimator[2], "Third Diff": diffs_estimator[3]}))

                      Model Comparison                      
                        First Diff  Second Diff   Third Diff
------------------------------------------------------------
Dep. Variable                    Y            Y            Y
Estimator                 PanelOLS     PanelOLS     PanelOLS
No. Observations             21799        21798        21797
Cov. Est.               Unadjusted   Unadjusted   Unadjusted
R-squared                   0.4808       0.5222       0.5389
R-Squared (Within)          0.4817       0.5234       0.5408
R-Squared (Between)         0.3662       0.4466       0.4710
R-Squared (Overall)         0.4808       0.5222       0.5389
F-statistic                 5045.1       5954.1       6367.4
P-value (F-stat)            0.0000       0.0000       0.0000
const                      -0.4633       0.2337       0.2590
                         (-2.9356)     (2.2684)     (3.4598)
A                           0.1522       0.1451       0.1376
                        