# 04 — Regression Analysis
**Author:** Ebenezer Adjartey

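Covers: Simple & multiple OLS, diagnostics (VIF, heteroskedasticity, robust standard errors), logistic regression, probit, Poisson, Negative Binomial, quantile regression. (Tobit and IV/2SLS are listed for this topic but are not implemented in the sections below.)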

In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LogisticRegression
from statsmodels.stats.diagnostic import het_breuschpagan, het_white
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Notebook-wide configuration: fixed RNG seed so the simulated data below is
# reproducible, all warnings silenced, and a white-grid seaborn theme.
np.random.seed(42)
warnings.filterwarnings('ignore')
sns.set_theme(style='whitegrid')

print('Libraries loaded.')

## 1. Data Generation

In [None]:
# Simulate one cross-section of n observations used by every later section.
# NOTE: the draws below consume the global NumPy RNG in this exact order;
# reordering any line changes the simulated data.
n = 300
# Continuous variables
educ    = np.random.randint(8, 20, n)              # years of education, integers 8..19 (upper bound exclusive)
exper   = np.random.randint(0, 30, n)              # years of experience, integers 0..29
iq      = np.random.normal(100, 15, n)             # IQ score, mean 100, sd 15
female  = np.random.binomial(1, 0.5, n)            # gender dummy (1 with probability 0.5)

# True DGP (linear index): wage_log = 10 + 2*educ + 0.5*exper + 0.05*iq - 5*female + e,  e ~ N(0, 20^2)
wage_log = 10 + 2*educ + 0.5*exper + 0.05*iq - 5*female + np.random.normal(0, 20, n)
# wage = exp(wage_log / 40), i.e. log(wage) = wage_log / 40: the DGP is linear
# in log(wage), not in the wage level that the OLS sections regress on.
wage     = np.exp(wage_log / 40)  # rescale

# Binary outcome: latent-index model with standard normal error;
# employed = 1 when the latent index is positive. female does not enter.
latent   = -5 + 0.3*educ + 0.05*exper + np.random.normal(0, 1, n)
employed = (latent > 0).astype(int)

# Count outcome: Poisson draws whose rate depends on educ only and carries
# log-normal noise, so the counts are overdispersed relative to a pure Poisson.
pub_rate = np.exp(0.5 + 0.1*educ + np.random.normal(0, 0.3, n))
publications = np.random.poisson(pub_rate)

# Assemble the analysis frame and print a preview plus summary statistics.
df = pd.DataFrame({'wage':wage,'educ':educ,'exper':exper,'iq':iq,
                   'female':female,'employed':employed,'publications':publications})
print(df.head()); print(df.describe().round(2))

## 2. Simple OLS Regression

In [None]:
# Bivariate specification: wage = b0 + b1*educ + e
# add_constant supplies the intercept column ('const') that sm.OLS does not add itself.
X_simple = sm.add_constant(df['educ'])
simple_spec = sm.OLS(endog=df['wage'], exog=X_simple)
ols_simple = simple_spec.fit()
print(ols_simple.summary())

## 3. Multiple OLS Regression

In [None]:
# Multiple regression of wage on education, experience, IQ and the gender
# dummy, with an explicit intercept column.
X_multi = sm.add_constant(df[['educ','exper','iq','female']])
multi_spec = sm.OLS(endog=df['wage'], exog=X_multi)
ols_multi = multi_spec.fit()
print(ols_multi.summary())

## 4. OLS Diagnostics

In [None]:
# 4a. Variance Inflation Factor (multicollinearity)
# variance_inflation_factor regresses each column on the others without adding
# an intercept of its own, so the design matrix must already contain a
# constant; otherwise the VIFs are uncentered and inflated by the variables'
# non-zero means. The constant's own VIF is not meaningful and is not reported.
vif_vars = ['educ', 'exper', 'iq', 'female']
X_vif = sm.add_constant(df[vif_vars])
vif_data = pd.DataFrame({
    'Variable': vif_vars,
    'VIF': [
        variance_inflation_factor(X_vif.values, X_vif.columns.get_loc(name))
        for name in vif_vars
    ],
})
print('VIF (>10 = multicollinearity concern):')
print(vif_data)

# 4b. Breusch-Pagan heteroskedasticity test
# Returns (LM stat, LM p-value, F stat, F p-value); only the LM version is used.
# The exog passed in is the fitted model's design matrix, which includes the constant.
bp_stat, bp_p, _, _ = het_breuschpagan(ols_multi.resid, ols_multi.model.exog)
print(f'\nBreusch-Pagan test: stat={bp_stat:.4f}, p={bp_p:.4f}')
print('Verdict:', 'Heteroskedasticity present' if bp_p < 0.05 else 'Homoskedastic')

# 4c. Robust standard errors (HC3)
# Same point estimates as ols_multi; only the covariance estimator changes.
ols_robust = sm.OLS(df['wage'], X_multi).fit(cov_type='HC3')
print('\nRobust (HC3) coefficient table:')
print(ols_robust.summary2().tables[1].round(4))

## 5. Logistic Regression (Binary)

In [None]:
# Logit for the binary employment outcome on educ, exper and female
# (with intercept). disp=False silences the optimizer's iteration log.
X_logit = sm.add_constant(df[['educ','exper','female']])
logit_model = sm.Logit(df['employed'], X_logit).fit(disp=False)
print(logit_model.summary())

# Marginal effects at the mean
# get_margeff() defaults to at='overall' (average marginal effects); request
# at='mean' explicitly so the numbers match the label printed below.
print('\nMarginal effects at mean (dP/dx):')
print(logit_model.get_margeff(at='mean').summary())

## 6. Probit Model

In [None]:
# Probit counterpart of the logit, fitted on the same design matrix
# (X_logit: const, educ, exper, female).
probit_spec = sm.Probit(endog=df['employed'], exog=X_logit)
probit_model = probit_spec.fit(disp=False)
print(probit_model.summary())

# Marginal effects using get_margeff's default settings.
probit_margeff = probit_model.get_margeff()
print('\nProbit Marginal Effects:')
print(probit_margeff.summary())

## 7. Poisson Regression (Count Data)

In [None]:
# Poisson regression of the publication count on educ and exper (with intercept).
X_count = sm.add_constant(df[['educ','exper']])
poisson_spec = sm.Poisson(endog=df['publications'], exog=X_count)
poisson_model = poisson_spec.fit(disp=False)
print(poisson_model.summary())

# Incidence rate ratios: exponentiated coefficients, i.e. the multiplicative
# change in the expected count for a one-unit increase in the regressor.
irr = np.exp(poisson_model.params)
print('\nIncidence Rate Ratios (exp(beta)):')
print(irr.round(4))

## 8. Negative Binomial Regression

In [None]:
# Negative binomial on the same count design matrix as the Poisson model;
# it adds one dispersion parameter (alpha) on top of the Poisson mean.
nb_model = sm.NegativeBinomial(df['publications'], X_count).fit(disp=False)
print(nb_model.summary())

# Compare Poisson vs NB: likelihood ratio test for overdispersion
# H0: alpha = 0 (Poisson) against H1: alpha > 0. Because alpha = 0 sits on the
# boundary of the parameter space, the LR statistic is asymptotically a 50:50
# mixture of a point mass at 0 and chi2(1), so the p-value is half the chi2(1)
# tail probability. The statistic is floored at 0 to guard against a slightly
# negative value caused by optimizer noise.
lr_stat = max(2 * (nb_model.llf - poisson_model.llf), 0.0)
lr_p    = 0.5 * stats.chi2.sf(lr_stat, df=1) if lr_stat > 0 else 1.0
print(f'\nLR test (overdispersion): stat={lr_stat:.4f}, p={lr_p:.4f}')
print('Prefer NB over Poisson if p < 0.05 (overdispersion present)')

## 9. Quantile Regression

In [None]:
# Fit the same wage equation at the lower quartile, median and upper quartile,
# keeping each fitted result keyed by its quantile for the plotting section.
QR_FORMULA = 'wage ~ educ + exper + female'
quantile_models = {}
for q in (0.25, 0.50, 0.75):
    fitted_q = smf.quantreg(QR_FORMULA, df).fit(q=q)
    quantile_models[q] = fitted_q
    print(f'\nQuantile = {q}:')
    print(fitted_q.params.round(4))

## 10. Regression Plots & Diagnostics

In [None]:
# 2x2 diagnostic figure: (a) simple-OLS fit, (b) residuals vs fitted,
# (c) normal Q-Q plot of residuals, (d) educ coefficient across quantiles.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Scatter + regression line
axes[0,0].scatter(df['educ'], df['wage'], alpha=.3, s=15)
x_line = np.linspace(df['educ'].min(), df['educ'].max(), 100)
y_line = ols_simple.params['const'] + ols_simple.params['educ'] * x_line
axes[0,0].plot(x_line, y_line, 'r-', lw=2)
axes[0,0].set_title(f'Simple OLS: wage ~ educ\nR2={ols_simple.rsquared:.3f}')
axes[0,0].set_xlabel('Education'); axes[0,0].set_ylabel('Wage')

# Residuals vs fitted (multiple-regression model)
axes[0,1].scatter(ols_multi.fittedvalues, ols_multi.resid, alpha=.3, s=15)
axes[0,1].axhline(0, color='red', linestyle='--')
axes[0,1].set_title('Residuals vs Fitted')
axes[0,1].set_xlabel('Fitted Values'); axes[0,1].set_ylabel('Residuals')

# QQ plot of residuals
stats.probplot(ols_multi.resid, plot=axes[1,0])
axes[1,0].set_title('Q-Q Plot of Residuals')

# Quantile regression coefficients for 'educ'
# Take the quantiles from the fitted models rather than repeating the list,
# so this panel stays in sync with whatever quantiles were actually estimated.
qs = sorted(quantile_models)
coefs = [quantile_models[q].params['educ'] for q in qs]
axes[1,1].plot(qs, coefs, 'bo-', lw=2, ms=8)
# NOTE: the OLS reference line comes from ols_multi, which also controls for
# iq; the quantile regressions do not, so the specifications differ slightly.
axes[1,1].axhline(ols_multi.params['educ'], color='red', linestyle='--', label='OLS')
axes[1,1].set_title('Quantile Regression: educ Coefficient')
axes[1,1].set_xlabel('Quantile'); axes[1,1].set_ylabel('Coefficient')
axes[1,1].legend()

plt.tight_layout()
# Save before plt.show(); the output directory is created if it is missing.
out_dir = '04_regression_analysis'
os.makedirs(out_dir, exist_ok=True)
plt.savefig(os.path.join(out_dir, 'regression_plots.png'), dpi=100, bbox_inches='tight')
plt.show(); print('Saved.')

## Key Takeaways

- **OLS**: best linear unbiased estimator (BLUE) under the Gauss-Markov assumptions; use robust SEs (e.g. HC3) when heteroskedasticity is present
- **VIF > 10**: multicollinearity concern
- **Logit/Probit**: for binary outcomes; report marginal effects
- **Poisson**: for counts; test for overdispersion; use NB if present
- **Quantile regression**: robust to outliers in the response; models conditional quantiles, giving a fuller picture of the conditional distribution than the mean alone
