<a href="https://colab.research.google.com/github/Bhawana1009/Regression-Analysis/blob/main/Regression_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Econometrics and Data Analysis for Accounting and Finance (MN52080)
# Empirical Project 1 – Individual Work

import os
import pandas as pd
import numpy as np
import statsmodels.stats.api as sms
import statsmodels.formula.api as smf
from statsmodels.compat import lzip
import matplotlib.pyplot as plt
import pickle
from statsmodels.stats.diagnostic import het_white
from statsmodels.stats.outliers_influence import reset_ramsey
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Load the Goyal-Welch workbook; its first column becomes the index.
data = pd.read_excel("C:\\econonometrics\\econometrics\\Goyal_Welch_data.xlsx" , index_col=0)
print(data.head())

# Parse the index as year-month dates BEFORE slicing: the partial date
# strings '1970-01' / '2020-12' are only interpreted as dates once the index
# is a DatetimeIndex. Slicing the raw index with them does not select the
# intended sample window.
data.index = pd.to_datetime(data.index, format='%Y%m')
data = data.loc['1970-01':'2020-12']
print(data)

# Carry the last available observation forward over gaps.
# (DataFrame.ffill() replaces the deprecated fillna(method='ffill').)
data = data.ffill()
print(data.head())

# DATAFRAME
# Replace the raw table with the regression variables derived from it.
data = pd.DataFrame({
    # Log excess return: log(1 + market return) - log(1 + risk-free rate).
    # Taking log() of the raw return itself is undefined for every month with
    # a negative return (NaN) and -inf for a zero return, which silently
    # removes those months from all later regressions.
    # NOTE(review): assumes CRSP_SPvw and Rfree are simple (net) returns in
    # decimal form - confirm against the workbook.
    'Equity_Premium': np.log1p(data['CRSP_SPvw']) - np.log1p(data['Rfree']),
    'tms': data['lty'] - data['tbl'],      # long-term yield minus T-bill rate
    'dfy': data['BAA'] - data['AAA'],      # BAA minus AAA
    'dfr': data['corpr'] - data['ltr'],    # corpr minus ltr
    'tbl': data['tbl'],
    'ltr': data['ltr'],
    'Infl': data['infl'],
})
print(data.head())

# MULTIPLE REGRESSION
# Turn +/-inf into NaN before estimating, then regress Equity_Premium on the
# six predictors and print the estimation summary.
data = data.replace([np.inf, -np.inf], np.nan)
formula = 'Equity_Premium ~ ltr + tbl + tms  + dfr + dfy + Infl'
baseline_model = smf.ols(formula, data=data)
results = baseline_model.fit()
print(results.summary())


# F Test
# Joint restriction: every slope coefficient of the fitted model equals zero.
hypotheses = ' tms= ltr = tbl = dfr = dfy = Infl = 0'
f_test = results.f_test(r_matrix=hypotheses)
print(f_test)

# NORMAL DISTRIBUTION
# JARQUE BERA
# Test the regression residuals for normality.
residuals = results.resid
name = ['Jarque-Bera', 'Chi^2 two-tail prob.', 'Skew', 'Kurtosis']
test = sms.jarque_bera(residuals)
# A bare expression statement discards its value, so print the labelled
# statistics explicitly.
print(lzip(name, test))

# RESET RAMSEY
# Re-fit the same six regressors (listed in a different order) and run the
# RESET test with degree = 4.
formula = 'Equity_Premium ~ tms + ltr + tbl  + dfy + dfr + Infl'
results = smf.ols(formula, data).fit()
# Print the result; as a bare expression it would be computed and discarded.
print(reset_ramsey(results, degree=4))

# Histogram of the residuals, 20 bins.
residuals = results.resid
plt.figure(1, dpi=100)
plt.hist(residuals, 20, edgecolor='black', linewidth=1.2)
plt.xlabel('Residuals')
# plt.hist draws bin counts by default (density=False), so the axis shows
# frequencies rather than a density.
plt.ylabel('Frequency')
plt.show()



# AUTOCORRELATION
# NOTE(review): unpickling can execute arbitrary code, so 'macro.pickle' must
# come from a trusted source. `loaded_data` is not referenced by any later
# statement visible in this script.
with open('macro.pickle', 'rb') as pickle_file:
    loaded_data = pickle.load(pickle_file).dropna()

# DURBIN WATSON TEST
# Durbin-Watson statistic of the current model's residuals.
residuals = results.resid
durbin_watson_stat = sms.durbin_watson(residuals)
print(durbin_watson_stat)

# BREUSCH GODFREY TEST
# Breusch-Godfrey test on the fitted model with 12 lags.
name = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
results1 = sms.acorr_breusch_godfrey(results, nlags=12)
# Print the labelled output; a bare lzip(...) expression is discarded.
print(lzip(name, results1))

# Keep only complete rows, make sure the index is datetime, and re-estimate
# the six-regressor model on the cleaned sample.
data = data.dropna()
data.index = pd.to_datetime(data.index)
formula = 'Equity_Premium ~ tms  + tbl  + dfy  + dfr + ltr + Infl'
ols_model = smf.ols(formula, data)
results = ols_model.fit()

# OBTAIN THE RESIDUALS AND PLOT OUT RESIDUALS OVER TIME
residual_series = results.resid
plt.figure(1)
plt.plot(residual_series)
plt.grid(True)
plt.ylabel('Residuals')
plt.xlabel('Date')
plt.show()

# BREUSCH PAGAN
# Heteroskedasticity test using the model's residuals and design matrix.
name = ['Lagrange multiplier statistic', 'Lagrange multiplier p-value', 'F-statistic', 'F-statistic p-value']
breuschpagan_test = sms.het_breuschpagan(results.resid, results.model.exog)
# Print the labelled output; a bare lzip(...) expression is discarded.
print(lzip(name, breuschpagan_test))

# WHITE TEST
# Run White's test on the current model, report its four statistics, then
# state the decision at the 5% level based on the LM p-value.
white_test = het_white(results.resid, results.model.exog)
lm, lm_pvalue, fvalue, f_pvalue = white_test
white_labels = (
    "Lagrange multiplier statistic:",
    "Lagrange multiplier p-value:",
    "F-statistic:",
    "F-statistic p-value:",
)
for white_label, white_value in zip(white_labels, white_test):
    print(white_label, white_value)
white_verdict = (
    "Rejected the null hypothesis. Heteroscedasticity detected"
    if lm_pvalue < 0.05
    else "Failed to rejected the null hypothesis. No heteroscedasticity detected"
)
print(white_verdict)

 # GOLDFELD QUANDT TEST
# Dependent variable and regressors (plus an intercept column) for the test.
# `explanatory_var` is reused further down for the correlation matrix and VIFs.
y = data['Equity_Premium']
explanatory_var = data[['tms','tbl','ltr','dfy','dfr', 'Infl']]
x = sm.add_constant(explanatory_var)
# het_goldfeldquandt returns (F statistic, p-value, ordering); print it with
# labels, since a bare call expression would be discarded.
gq_name = ['F-statistic', 'p-value', 'ordering']
gq_test = sm.stats.diagnostic.het_goldfeldquandt(y, x, drop=0.25)
print(lzip(gq_name, gq_test))

# WHITE MODIFIED STANDARD ERROR
# Same specification as `formula`, estimated with the HC1 covariance type.
white_model = smf.ols(formula, data)
results_White = white_model.fit(cov_type='HC1')
print(results_White.summary())

# NEWEY WEST PROCEDURE
# Same specification as `formula`, estimated with a HAC covariance using
# 6 lags and use_correction enabled.
hac_options = {'maxlags': 6, 'use_correction': True}
results_NW = smf.ols(formula, data).fit(cov_type='HAC', cov_kwds=hac_options)
print(results_NW.summary())

# CORRELATION MATRIX AND VIF
# Pairwise correlations between the regressors (printed; a bare expression
# would be discarded).
print(explanatory_var.corr())

# variance_inflation_factor regresses one column on all the others and does
# NOT add an intercept itself. Compute the VIFs on a design matrix that
# includes the constant, and report them only for the actual regressors.
vif_design = sm.add_constant(explanatory_var)
vif_data = pd.DataFrame({
    "feature": explanatory_var.columns,
    "VIF": [
        variance_inflation_factor(vif_design.values,
                                  vif_design.columns.get_loc(column))
        for column in explanatory_var.columns
    ],
})

print(vif_data)


import seaborn as sns

# Heat map of the correlation matrix of every column in `data`.
# Draw it on its own figure and show it right away: otherwise the heat map
# stays on the current figure, and later plotting code that selects that
# figure draws on top of it.
plt.figure()
sns.heatmap(data.corr())
plt.show()

# Re-estimate the model and keep its fitted values and residuals.
formula_1 ='Equity_Premium ~ tms + dfr  + tbl  + dfy + ltr + Infl '
results_1 = smf.ols(formula_1, data).fit()
y_fitted = results_1.fittedvalues
residuals = results_1.resid

# Plot residuals and fitted values together over time.
plt.figure(1, dpi=100)
plt.plot(residuals, label='resid')
plt.plot(y_fitted, label='linear prediction')
plt.xlabel('Date')
plt.ylabel('Residuals')
plt.grid(True)
plt.legend()
plt.show()
# The two most negative residuals. Printed explicitly, because a bare
# expression is discarded.
print(residuals.nsmallest(2))

# DUMMY VARIABLE
# JANDUM flags every January observation; the other two dummies each flag a
# single month (1973-11 and 1975-03).
# NOTE(review): the names DEC00DUM / JUN00DUM do not match the dates they
# flag - confirm whether the names or the dates are intended.
data['JANDUM'] = np.where(data.index.month == 1, 1, 0)
for dummy_name, flagged_date in (('DEC00DUM', '1973-11-1'),
                                 ('JUN00DUM', '1975-3-1')):
    data[dummy_name] = np.where(data.index == flagged_date, 1, 0)

# Add the three dummies to the six-regressor specification.
formula_2 = 'Equity_Premium ~ tms + ltr + tbl  + dfy + dfr + Infl + DEC00DUM + JUN00DUM + JANDUM'
dummy_model = smf.ols(formula_2, data)
results_2 = dummy_model.fit()
print(results_2.summary())


# T TEST
# Two-sided critical values of Student's t at the 1%, 5% and 10% levels.
from scipy.stats import t

# NOTE(review): degrees of freedom are hard-coded - confirm they match the
# residual degrees of freedom of the model being tested.
df = 334

alpha_1, alpha_5, alpha_10 = 0.01, 0.05, 0.1

# Two-sided test: put alpha/2 in each tail.
t_critical_1, t_critical_5, t_critical_10 = (
    t.ppf(1 - alpha / 2, df) for alpha in (alpha_1, alpha_5, alpha_10)
)

for level_label, critical_value in (("1%", t_critical_1),
                                    ("5%", t_critical_5),
                                    ("10%", t_critical_10)):
    print(f"{level_label} Critical Value: {critical_value:.4f}")


