# 06 — Panel Data Analysis
**Author:** Ebenezer Adjartey

Covers: Pooled OLS, Fixed Effects (within), Random Effects (GLS), Hausman test, First-difference estimator, dynamic GMM (Arellano-Bond), cluster-robust SEs.

In [None]:
import os
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
from linearmodels.panel import PooledOLS, PanelOLS, RandomEffects, FirstDifferenceOLS
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Notebook-wide setup.
# Silence every warning category so library notices do not clutter cell output.
warnings.filterwarnings('ignore')
# Seed NumPy's global RNG so the np.random draws below are reproducible.
np.random.seed(42)
# Apply seaborn's 'whitegrid' theme to the plots drawn later.
sns.set_theme(style='whitegrid')
print('Libraries loaded. (install linearmodels: pip install linearmodels)')

## 1. Generate Balanced Panel Dataset

In [None]:
# Panel dimensions: N cross-sectional units, each observed for T periods.
N, T = 100, 10
n = N * T  # number of stacked (entity, period) rows

# Long-format identifiers: every id is repeated T times in a row, while the
# period counter cycles 1..T once per individual.
ids = np.arange(1, N + 1).repeat(T)
times = np.tile(np.arange(1, T + 1), N)

# Unobserved heterogeneity: one draw per individual, copied onto each of
# that individual's T rows so it is constant over time.
alpha_i = np.random.normal(0, 2, N).repeat(T)

# Regressors: x1 loads on alpha_i (hence is correlated with the individual
# effect); x2 is drawn independently of it.
x1 = np.random.normal(0, 1, n) + alpha_i * 0.3
x2 = np.random.normal(0, 1, n)

# Data-generating process: y = alpha_i + 2*x1 + 1.5*x2 + e, with e ~ N(0, 1).
y = alpha_i + 2 * x1 + 1.5 * x2 + np.random.normal(0, 1, n)

panel = pd.DataFrame(dict(id=ids, time=times, y=y, x1=x1, x2=x2))
# Same data keyed by an (id, time) MultiIndex for the panel estimators below.
panel_idx = panel.set_index(['id', 'time'])
print(panel.head(12))
print(f'Panel: {N} individuals x {T} periods = {n} obs')

## 2. Pooled OLS

In [None]:
# Pooled OLS: stack all entity-period rows and fit a single regression with
# an intercept, with no individual effects in the model.
pooled_exog = sm.add_constant(panel_idx[['x1', 'x2']])
pooled_model = PooledOLS(panel_idx['y'], pooled_exog)
pooled = pooled_model.fit()
print(pooled.summary)
print(f'\nPooled OLS: coef(x1)={pooled.params["x1"]:.4f}  (true=2.0)')

## 3. Fixed Effects (Within Estimator)

In [None]:
# Fixed effects: entity_effects=True includes one effect per id, so the
# regressors are passed without an added constant.
fe_model = PanelOLS(
    panel_idx['y'],
    panel_idx[['x1', 'x2']],
    entity_effects=True,
)
fe = fe_model.fit()
print(fe.summary)
print(f'\nFE: coef(x1)={fe.params["x1"]:.4f}  (true=2.0)')
print(f'FE: coef(x2)={fe.params["x2"]:.4f}  (true=1.5)')

## 4. Random Effects

In [None]:
# Random effects: fit on the slopes plus an explicit intercept column.
re_exog = sm.add_constant(panel_idx[['x1', 'x2']])
re_model = RandomEffects(panel_idx['y'], re_exog)
re = re_model.fit()
print(re.summary)
print(f'\nRE: coef(x1)={re.params["x1"]:.4f}  coef(x2)={re.params["x2"]:.4f}')

## 5. Hausman Test (FE vs RE)

In [None]:
# Hausman test: H0 = RE is consistent (no correlation between alpha_i and X).
# Manual calculation of
#   H = (b_FE - b_RE)' [Var(b_FE) - Var(b_RE)]^+ (b_FE - b_RE),
# compared against a chi-square distribution with one degree of freedom per
# compared coefficient.
import numpy.linalg as la
from scipy import stats

# Only the slopes estimated by both models are compared; the RE intercept has
# no counterpart in the FE fit.
hausman_vars = ['x1', 'x2']
fe_params = fe.params[hausman_vars]
re_params = re.params[hausman_vars]
diff = fe_params - re_params

fe_cov = fe.cov.loc[hausman_vars, hausman_vars]
re_cov = re.cov.loc[hausman_vars, hausman_vars]
cov_diff = fe_cov.values - re_cov.values

# In finite samples the covariance difference can be singular; the
# pseudo-inverse equals the ordinary inverse when one exists and avoids a
# LinAlgError when it does not.
hausman_stat = float(diff.values @ la.pinv(cov_diff) @ diff.values)
# sf() returns the upper-tail probability directly, which avoids the
# cancellation error of 1 - cdf() when the p-value is very small.
hausman_p = float(stats.chi2.sf(hausman_stat, df=len(hausman_vars)))
print(f'Hausman statistic = {hausman_stat:.4f}')
print(f'p-value = {hausman_p:.4f}')
if hausman_stat < 0:
    # A negative statistic means Var(FE) - Var(RE) is not positive definite,
    # so the chi-square approximation (and the verdict below) is unreliable.
    print('Note: negative Hausman statistic; the covariance difference is '
          'not positive definite, so the test is unreliable in this sample.')
print('Verdict:', 'Use Fixed Effects (RE is inconsistent)' if hausman_p < 0.05 else 'Random Effects preferred')

## 6. First-Difference Estimator

In [None]:
# First-difference estimator, fitted on the slopes only (no constant passed).
fd_model = FirstDifferenceOLS(panel_idx['y'], panel_idx[['x1', 'x2']])
fd = fd_model.fit()
print(fd.summary)
print(f'\nFD: coef(x1)={fd.params["x1"]:.4f}  coef(x2)={fd.params["x2"]:.4f}')

## 7. Cluster-Robust Standard Errors

In [None]:
# Same fixed-effects specification as before, refitted with a clustered
# covariance estimator (clusters defined by entity).
clustered_fit_options = {'cov_type': 'clustered', 'cluster_entity': True}
fe_cluster_model = PanelOLS(
    panel_idx['y'],
    panel_idx[['x1', 'x2']],
    entity_effects=True,
)
fe_cluster = fe_cluster_model.fit(**clustered_fit_options)
print('Fixed Effects with Clustered SE (by entity):')
print(fe_cluster.summary)

## 8. Coefficient Comparison Across Models

In [None]:
# Gather the slope estimates from every fitted model, one row per estimator.
fitted_models = {
    'Pooled OLS': pooled,
    'Fixed Effects': fe,
    'Random Effects': re,
    'First Diff': fd,
}
comparison = (
    pd.DataFrame({
        'Model': list(fitted_models),
        'x1': [res.params['x1'] for res in fitted_models.values()],
        'x2': [res.params['x2'] for res in fitted_models.values()],
    })
    .set_index('Model')
    .round(4)
)

print('Coefficient Comparison (True: x1=2.0, x2=1.5):')
print(comparison)

# Grouped bar chart of the estimates, with dashed lines at the true slopes.
fig, ax = plt.subplots(figsize=(8, 5))
comparison.plot(kind='bar', ax=ax)
true_slope_lines = (
    (2.0, 'blue', 'True x1=2.0'),
    (1.5, 'red', 'True x2=1.5'),
)
for true_value, line_color, line_label in true_slope_lines:
    ax.axhline(true_value, color=line_color, linestyle='--', alpha=.5, label=line_label)
ax.set_title('Panel Estimators Comparison')
ax.legend()
plt.xticks(rotation=30)
plt.tight_layout()

# Create the output folder if needed, then write the figure to disk.
os.makedirs('06_panel_data_analysis', exist_ok=True)
plt.savefig('06_panel_data_analysis/panel_comparison.png', dpi=100, bbox_inches='tight')
plt.show()
print('Saved.')

## Key Takeaways

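- **Pooled OLS**: ignores individual heterogeneity; biased if the individual effects (alpha_i) are correlated with the regressors
- **Fixed Effects**: consistent even when alpha_i is correlated with X; cannot estimate coefficients on time-invariant variables, because the within transformation removes them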
- **Random Effects**: efficient if alpha_i is uncorrelated with X (verify with Hausman test)
- **Hausman test**: p<0.05 favors Fixed Effects over Random Effects
- Always cluster standard errors at the panel unit level
