In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load dataset
df = pd.read_csv("financial_dataset.csv")

# Create a datetime feature from year and quarter
df['ReportDate'] = pd.to_datetime(df['Year'].astype(str) + 'Q' + df['Quarter'].astype(str))

# Clean the data
df = df.replace([np.inf, -np.inf], np.nan).dropna()

# Process categorical features
df['SIC2'] = df['sic'].astype(str).str[:2].astype(int)               # Use 2-digit SIC code
df['stprinc'] = df['stprinc'].astype('category').cat.codes          # Encode state as categorical integer

# Drop unused columns
df = df.drop(columns=['Label', 'name'])                             # Drop company name and label

# Sort by time for time series split
df = df.sort_values('ReportDate')

# Define features and target
features = [col for col in df.columns if col not in ['NetIncomeLoss', 'Year', 'Quarter', 'ReportDate']]
X = df[features]
y = df['NetIncomeLoss']
dates = df['ReportDate']

# Time-based train-test split (train before 2023, test from 2023)
cutoff = pd.to_datetime('2023-01-01')
X_train, X_test = X[dates < cutoff], X[dates >= cutoff]
y_train, y_test = y[dates < cutoff], y[dates >= cutoff]

# Evaluation function
def evaluate(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / (np.abs(y_true) + 1e-6))) * 100
    r2 = r2_score(y_true, y_pred)
    return rmse, mae, mape, r2

# Define models
models = {
    "Linear Regression": Pipeline([
        ('scaler', StandardScaler()),
        ('model', LinearRegression())
    ]),
    "Ridge Regression": Pipeline([
        ('scaler', StandardScaler()),
        ('model', Ridge(alpha=1.0))
    ]),
    "Random Forest": RandomForestRegressor(
        n_estimators=50,         # fewer trees to save memory
        max_depth=10,            # limit tree depth
        max_features='sqrt',     # subset of features at each split
        random_state=0,
        n_jobs=-1                # use all CPU cores
    )
}

# Train and evaluate all models
results = []
for name, model in models.items():
    print(f"Training {name}...")
    start = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - start
    y_pred = model.predict(X_test)
    rmse, mae, mape, r2 = evaluate(y_test, y_pred)
    results.append({
        'Model': name,
        'RMSE': rmse,
        'MAE': mae,
        'MAPE (%)': mape,
        'R^2': r2,
        'Training Time (s)': train_time
    })

# Show results as a sorted table
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='R^2', ascending=False).reset_index(drop=True)
print(results_df)


  df['ReportDate'] = pd.to_datetime(df['Year'].astype(str) + 'Q' + df['Quarter'].astype(str))


Training Linear Regression...
Training Ridge Regression...
Training Random Forest...
               Model          RMSE           MAE      MAPE (%)       R^2  \
0      Random Forest  3.069004e+09  3.811884e+08  1.896758e+12  0.896275   
1  Linear Regression  7.383362e+09  1.233551e+09  4.371369e+13  0.399659   
2   Ridge Regression  7.383415e+09  1.233548e+09  4.371754e+13  0.399650   

   Training Time (s)  
0           3.739271  
1           0.084827  
2           0.048892  
