In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import xgboost as xgb
import warnings
warnings.filterwarnings("ignore")

# Create directory for saving plots.
# exist_ok=True makes this a no-op when 'figures' already exists, so the cell can be re-run safely.
os.makedirs('figures', exist_ok=True)

# Load and preprocess data
def load_data(
    source='https://raw.githubusercontent.com/AZFARHAD24511/datasets/refs/heads/main/financial_dataset.csv',
    sic2_codes=('20', '21', '22', '23', '24'),
    min_revenue=1e6,
):
    """Load the quarterly financial panel and build the modelling matrices.

    Steps, in order:
      1. Read the CSV and derive ``ReportDate`` from ``Year`` and ``Quarter``.
      2. Replace +/-inf with NaN and drop every row containing any NaN.
      3. Sort by firm (``name``) and ``ReportDate`` and compute per-firm
         features: one-row lag of revenue, 4-row rolling means of ``ROA`` and
         ``ProfitMargin``, and the row-over-row revenue percentage change.
      4. Keep only rows whose two-character SIC prefix is in ``sic2_codes``
         and whose ``Revenues`` exceed ``min_revenue``.
      5. Drop rows where any model feature or the target is missing.

    Parameters
    ----------
    source : str, path or file-like, optional
        Anything accepted by ``pandas.read_csv``. Defaults to the hosted
        dataset URL the notebook originally hard-coded.
    sic2_codes : iterable of str, optional
        Two-character SIC prefixes to keep.
    min_revenue : float, optional
        Rows with ``Revenues`` less than or equal to this value are dropped.

    Returns
    -------
    X : pandas.DataFrame
        Feature matrix (11 columns, see ``features`` below).
    y : pandas.Series
        ``log1p`` of ``Revenues``.
    report_dates : pandas.Series
        ``ReportDate`` for each row of ``X``, sharing its index.

    Notes
    -----
    NOTE(review): ``Revenues_lag1`` and ``Revenues_pct_change`` are both built
    from the previous row of the same firm, so together they reconstruct the
    target exactly (``Revenues == Revenues_lag1 * (1 + Revenues_pct_change)``).
    Tree models can exploit this; consider lagging or removing one of them.

    NOTE(review): lags and rolling windows are computed after ``dropna()``, so
    "previous row" is the previous *surviving* row for the firm, which is not
    necessarily the previous calendar quarter.
    """
    df = pd.read_csv(source)

    # Strings such as '2019Q3' are parsed to a timestamp for that quarter.
    df['ReportDate'] = pd.to_datetime(df['Year'].astype(str) + 'Q' + df['Quarter'].astype(str))
    df = df.replace([np.inf, -np.inf], np.nan).dropna()
    df = df.sort_values(by=['name', 'ReportDate'])

    df['SIC2'] = df['sic'].astype(str).str[:2]
    df['quarter'] = df['Quarter']

    # Per-firm time-series features; rows are already in date order within each firm.
    df['Revenues_lag1'] = df.groupby('name')['Revenues'].shift(1)
    for column in ('ROA', 'ProfitMargin'):
        rolling_mean = df.groupby('name')[column].rolling(4).mean()
        # Drop the 'name' index level so the result aligns with df's own index.
        df[f'{column}_rolling4'] = rolling_mean.reset_index(0, drop=True)
    df['Revenues_pct_change'] = df.groupby('name')['Revenues'].pct_change()

    # Filters are applied after feature engineering so lags may draw on rows
    # that are themselves filtered out here.
    df = df[df['SIC2'].isin(list(sic2_codes))]
    df = df[df['Revenues'] > min_revenue]

    features = [
        'Assets', 'Liabilities', 'NetIncomeLoss', 'ROA', 'ProfitMargin', 'DebtRatio',
        'Revenues_lag1', 'ROA_rolling4', 'ProfitMargin_rolling4', 'Revenues_pct_change',
        'quarter'
    ]
    df_model = df.dropna(subset=features + ['Revenues'])
    X = df_model[features]
    y = np.log1p(df_model['Revenues'])
    report_dates = df_model['ReportDate']

    return X, y, report_dates

# Train-test split based on time
def time_split(X, y, dates, cutoff='2019-01-01'):
    """Split features and target chronologically at ``cutoff``.

    Rows whose date is strictly before ``cutoff`` go to the training set;
    rows dated on or after ``cutoff`` go to the test set.

    Returns ``(X_train, X_test, y_train, y_test)``.
    """
    before_cutoff = dates < cutoff
    from_cutoff = dates >= cutoff
    return X[before_cutoff], X[from_cutoff], y[before_cutoff], y[from_cutoff]

# Evaluation metrics
def evaluate(y_true, y_pred, label='Test'):
    """Score predictions on the original (un-logged) scale.

    Both inputs are passed through ``np.expm1`` before the metrics are
    computed. ``label`` is accepted but not used by this function.

    Returns a tuple ``(rmse, mae, mape, r2)`` where ``mape`` is a percentage.
    """
    actual = np.expm1(y_true)
    predicted = np.expm1(y_pred)
    abs_pct_error = np.abs((actual - predicted) / actual)
    return (
        np.sqrt(mean_squared_error(actual, predicted)),
        mean_absolute_error(actual, predicted),
        np.mean(abs_pct_error) * 100,
        r2_score(actual, predicted),
    )

# General model runner
def run_model(model, name, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training data and print its scores.

    The model is fitted in place, then used to predict both splits. All four
    test metrics and the train R^2 (from ``evaluate``) are printed under a
    header containing ``name``. Nothing is returned.
    """
    model.fit(X_train, y_train)
    test_predictions = model.predict(X_test)
    train_predictions = model.predict(X_train)

    rmse_t, mae_t, mape_t, r2_t = evaluate(y_test, test_predictions)
    # Only R^2 is reported for the training split.
    *_, r2_train = evaluate(y_train, train_predictions)

    report_lines = [
        f"===== {name} =====",
        f"Test RMSE: {rmse_t:.2f}",
        f"Test MAE: {mae_t:.2f}",
        f"Test MAPE: {mape_t:.2f}%",
        f"Test R^2: {r2_t:.3f}",
        f"Train R^2: {r2_train:.3f}\n",
    ]
    for line in report_lines:
        print(line)

# Build the modelling dataset and split it chronologically
X, y, report_dates = load_data()
X_train, X_test, y_train, y_test = time_split(X, y, report_dates)


def _scaled(step_name, estimator):
    """Return a pipeline that standardizes the features before ``estimator``."""
    return Pipeline([('scaler', StandardScaler()), (step_name, estimator)])


# Candidate regressors keyed by display name.
# XGBoost is used directly; every other estimator sits behind a StandardScaler.
models = {
    "XGBoost": xgb.XGBRegressor(
        n_estimators=100,
        max_depth=3,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.1,
        reg_lambda=1,
        random_state=0,
    ),
    "Linear Regression": _scaled('lr', LinearRegression()),
    "Lasso Regression": _scaled('lasso', LassoCV(cv=5, max_iter=10000)),
    "Ridge Regression": _scaled('ridge', RidgeCV(cv=5)),
    "Random Forest": _scaled('rf', RandomForestRegressor(n_estimators=100, random_state=0)),
}

# Fit and report every model, in the order defined above
for name, model in models.items():
    run_model(model, name, X_train, X_test, y_train, y_test)


===== XGBoost =====
Test RMSE: 22375643180.11
Test MAE: 7901908929.08
Test MAPE: 43.94%
Test R^2: 0.847
Train R^2: 0.980

===== Linear Regression =====
Test RMSE: 30110479996185534816845824.00
Test MAE: 1540586178642369411284992.00
Test MAPE: 105399618970893025280.00%
Test R^2: -276321501421102068420461985792.000
Train R^2: -15.862

===== Lasso Regression =====
Test RMSE: 3629333315946385315463168.00
Test MAE: 185692846641539803250688.00
Test MAPE: 12704226191929739264.00%
Test R^2: -4014514522794469655532535808.000
Train R^2: -15.971

===== Ridge Regression =====
Test RMSE: 83914917990977019779743744.00
Test MAE: 4293460710529587866501120.00
Test MAPE: 293738272632756731904.00%
Test R^2: -2146138134300493277456544825344.000
Train R^2: -7.679

===== Random Forest =====
Test RMSE: 22992405232.14
Test MAE: 8114171197.26
Test MAPE: 42.51%
Test R^2: 0.839
Train R^2: 0.978

