In [None]:
"""
Forecasting Model for Financial Index Returns (Monthly)
-------------------------------------------------------

This script performs multiple linear regression forecasting on
financial/economic data. It includes model evaluation, residual diagnostics,
and feature importance visualization.

How to use:
1. Place your CSV file inside a folder named `data/` (the default `data_path` below expects `data/monthly_data.csv`).
2. Ensure it contains a target variable column (e.g., 'Target_Return').
3. Update `target_column` below with your column name.

Dependencies:
pandas, numpy, matplotlib, seaborn, statsmodels, scikit-learn, scipy
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.stattools import durbin_watson
from scipy.stats import shapiro
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# --- Configuration ---
# Everything a user is expected to tune lives in these three names.
data_path = "data/monthly_data.csv"   # CSV file read below
target_column = "Target_Return"       # dependent-variable column; change to match your data
train_ratio = 0.8                     # share of rows (taken from the top) used for training

sns.set_theme(style="whitegrid")

# --- Load data ---
df = pd.read_csv(data_path)

# Stop immediately, with a readable message, if the configured target is absent.
target_present = target_column in df.columns
if not target_present:
    raise ValueError(f"Target column '{target_column}' not found in data.")

# The target becomes y; every remaining column is used as a regressor.
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Positional (unshuffled) split: the first `train_size` rows train the model,
# the remaining rows are held out for evaluation.
train_size = int(len(df) * train_ratio)
X_train, y_train = X.iloc[:train_size], y.iloc[:train_size]
X_test, y_test = X.iloc[train_size:], y.iloc[train_size:]

# Fit a linear regression on the training rows only.
model = LinearRegression().fit(X_train, y_train)

# Predictions on both partitions, then the matching R² scores.
y_train_pred = model.predict(X_train)
train_r2 = r2_score(y_train, y_train_pred)

y_test_pred = model.predict(X_test)
test_r2 = r2_score(y_test, y_test_pred)

print(f"\nTrain R²: {train_r2:.4f}")
print(f"Test R²: {test_r2:.4f}")

In [None]:
# Line chart: held-out actual values against the model's predictions,
# both drawn against their position in the test set.
fig, ax = plt.subplots(figsize=(16, 8))
ax.plot(y_test.values, label="Actual", marker='o', markersize=7)
ax.plot(y_test_pred, label="Predicted", marker='x', linestyle='--', markersize=7)
ax.set_title("Actual vs Predicted - Monthly Return", fontsize=14)
ax.legend(fontsize=12)
ax.grid(True)
plt.show()

# Grouped bar chart for the first 15 held-out observations
# (fewer if the test set is shorter than that).
n_bars = min(15, len(y_test))
bar_width = 0.35
positions = np.arange(n_bars)

fig, ax = plt.subplots(figsize=(16, 8))
# Offset each series by half a bar width so the pair sits side by side.
ax.bar(positions - bar_width / 2, y_test.iloc[:n_bars], bar_width,
       label="Actual", color="skyblue")
ax.bar(positions + bar_width / 2, y_test_pred[:n_bars], bar_width,
       label="Predicted", color="orange", alpha=0.7)
ax.set_title("Actual vs Predicted - First 15 Months", fontsize=14)
ax.legend(fontsize=12)
ax.grid(True)
plt.show()

In [None]:
# Statsmodels OLS on the training rows, used here for its full summary table.
# sm.OLS does not add an intercept on its own, so a constant column is added first.
X_train_ols = sm.add_constant(X_train)
ols_spec = sm.OLS(y_train, X_train_ols)
ols_model = ols_spec.fit()

print("\nOLS Summary:", ols_model.summary(), sep="\n")

# Feature Importance
# Pair each fitted slope with the name of the column it belongs to.
coefficients = pd.DataFrame(model.coef_, index=X.columns, columns=["Coefficient"])

# Rank by absolute magnitude rather than signed value: a large negative slope
# moves the target as much as an equally large positive one, so sorting on the
# signed value would push the strongest negative drivers to the bottom and
# leave them out of the "Top 5" listing. The signed values themselves are kept.
# NOTE(review): slopes are in the raw units of each feature, so magnitudes are
# only directly comparable if the features share a common scale - confirm.
magnitude_order = np.argsort(-coefficients["Coefficient"].abs().to_numpy(), kind="stable")
coefficients = coefficients.iloc[magnitude_order]

plt.figure(figsize=(14, 8))
sns.barplot(x=coefficients["Coefficient"], y=coefficients.index, palette="coolwarm")
plt.title("Feature Importance - Regression Coefficients", fontsize=14)
plt.grid(True)
plt.show()

print("\nTop 5 Features driving target variable:")
print(coefficients.head())

In [None]:
# Forecast: feed the last row of X back through the fitted model.
# NOTE(review): this is the model's prediction for the final observed row; it is
# a forecast for the *next* month only if the features are already lagged
# relative to the target - confirm against how the CSV was built.
latest_features = X.iloc[[-1]]          # double brackets keep a one-row DataFrame
next_forecast = model.predict(latest_features)[0]
print(f"\nNext Month Forecast: {next_forecast:.4f}")

# Residuals on the held-out rows: actual minus predicted.
residuals = y_test.values - y_test_pred

# Scatter of residuals against predictions, with a dashed zero line for reference.
fig, ax = plt.subplots(figsize=(14, 8))
ax.scatter(y_test_pred, residuals, s=50, alpha=0.7, edgecolor='k')
ax.axhline(0, color='red', linestyle='--')
ax.set_title("Residual Plot", fontsize=14)
ax.grid(True)
plt.show()

In [None]:
# Diagnostic tests, all run on the held-out residuals.
significance = 0.05   # p-value cut-off shared by the two hypothesis tests below

# Normality (Shapiro-Wilk): a p-value above the cut-off is reported as "Normal".
shapiro_test = shapiro(residuals)
if shapiro_test.pvalue > significance:
    normality_result = "Normal"
else:
    normality_result = "Not Normal"

# Homoskedasticity (Breusch-Pagan): run against the test regressors plus a constant;
# element [1] of the returned tuple is the p-value used for the verdict.
X_test_ols = sm.add_constant(X_test)
bp_test = het_breuschpagan(residuals, X_test_ols)
if bp_test[1] > significance:
    homoskedasticity_result = "Homoskedastic"
else:
    homoskedasticity_result = "Heteroskedastic"

# Autocorrelation (Durbin-Watson): values strictly between 1.5 and 2.5 are
# treated as showing no strong autocorrelation.
dw_stat = durbin_watson(residuals)
if 1.5 < dw_stat < 2.5:
    autocorr_result = "No Strong Autocorrelation"
else:
    autocorr_result = "Possible Autocorrelation"

print(f"\nNormality Test (Shapiro-Wilk): p={shapiro_test.pvalue:.4f} → {normality_result}")
print(f"Homoskedasticity Test (Breusch-Pagan): p={bp_test[1]:.4f} → {homoskedasticity_result}")
print(f"Durbin-Watson: {dw_stat:.2f} → {autocorr_result}")


# Multicollinearity (VIF)
# variance_inflation_factor regresses one column of the matrix it is handed on
# all the other columns and does not add an intercept of its own. Passing the
# bare feature matrix therefore yields uncentered VIFs, which are typically
# inflated and can flag multicollinearity that is not there. An explicit
# constant is prepended (has_constant="add" forces it even if a feature happens
# to be constant, so the column offset below is always valid) and then skipped
# when reporting, so every feature still gets exactly one VIF.
X_vif = sm.add_constant(X, has_constant="add")   # intercept lands in column 0
vif_matrix = X_vif.to_numpy(dtype=float)         # built once, not once per feature

vif_data = pd.DataFrame({
    "Feature": X.columns,
    # Start at 1 to skip the intercept column.
    "VIF": [variance_inflation_factor(vif_matrix, i) for i in range(1, vif_matrix.shape[1])]
})
print("\nVIF Analysis:")
print(vif_data)

# Features whose VIF exceeds 5 are reported as highly collinear.
high_vif = vif_data[vif_data["VIF"] > 5]
multicollinearity_result = "Detected" if not high_vif.empty else "Not Detected"
if not high_vif.empty:
    print("\nHigh Multicollinearity in:")
    print(high_vif)

# Heatmap of pairwise correlations between the regressors.
feature_corr = X.corr()

fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(
    feature_corr,
    cmap="coolwarm",
    annot=False,      # cell values are not drawn, so `fmt` has no visible effect
    fmt=".2f",
    linewidths=0.5,
    square=True,
    ax=ax,
)
ax.set_title("Feature Correlation Heatmap", fontsize=14)
plt.show()

In [None]:
# Report summary: restate the headline metrics and diagnostic verdicts in one place.
summary_lines = (
    "\n===== Interpretation Summary =====",
    f"Train R²: {train_r2:.4f}",
    f"Test R²: {test_r2:.4f}",
    f"Next Month Forecast: {next_forecast:.4f}",
    f"Normality: Residuals are {normality_result}",
    f"Homoskedasticity: Residuals are {homoskedasticity_result}",
    f"Autocorrelation: {autocorr_result}",
    f"Multicollinearity: {multicollinearity_result}",
)
print(*summary_lines, sep="\n")

# Name the offending features only when at least one crossed the VIF threshold.
if len(high_vif) > 0:
    print(f"Features with High VIF: {', '.join(high_vif['Feature'])}")

print("\nTop 5 Features:")
print(coefficients.head())

# Side-by-side view of the first ten held-out observations.
actual_vs_pred = pd.DataFrame({
    "Actual": y_test.iloc[:10].values,
    "Predicted": y_test_pred[:10],
})
print("\nFirst 10 Actual vs Predicted:")
print(actual_vs_pred)