In [None]:
import math
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Directory that holds the saved prediction CSVs; change it to match your setup.
public_dir = os.path.abspath('public_price')

# Candidate prediction files, listed in order of preference.
candidates = [
    os.path.join(public_dir, file_name)
    for file_name in (
        'predictions_price_raw_latest.csv',
        'val_predictions_latest.csv',
        'predictions_price_raw_epoch005.csv',
    )
]

# Use the first candidate that actually exists on disk (None when none do).
pred_path = next((path for path in candidates if os.path.exists(path)), None)
if pred_path is None:
    raise FileNotFoundError(f'No prediction CSV found in {public_dir}; expected one of: {candidates}')

print('Loading', pred_path)

# Read only the header row first so that 'Date' is parsed as a datetime
# column when, and only when, the file actually contains it.
header_cols = pd.read_csv(pred_path, nrows=0).columns
if 'Date' in header_cols:
    df = pd.read_csv(pred_path, parse_dates=['Date'])
else:
    df = pd.read_csv(pred_path, parse_dates=None)
df.head()

## Per-company time series plot
Select a company below by name (the first company in sorted order is used by default). The notebook plots RawTrue vs RawPred over time if the raw columns are available, and ScaledTrue vs ScaledPred otherwise.

In [None]:
# List the companies present in the prediction CSV. Rows with a missing
# company are ignored: NaN cannot be ordered against names by sorted(), and a
# NaN company would never match any row in the equality filter below.
companies = sorted(df['Company'].dropna().unique().tolist())
print('Found companies (example):', companies[:10])

# Choose a company to visualize (defaults to the first one in sorted order).
if not companies:
    raise RuntimeError('No companies found in prediction CSV')
company = companies[0]

# Restrict the predictions to the selected company.
c_df = df[df['Company'] == company].copy()

# Prefer the raw columns; fall back to the scaled ones otherwise.
if 'RawTrue' in c_df.columns and 'RawPred' in c_df.columns:
    y_true_col, y_pred_col = 'RawTrue', 'RawPred'
else:
    y_true_col, y_pred_col = 'ScaledTrue', 'ScaledPred'

# Fail early with a readable message instead of an opaque KeyError raised
# from inside the plotting calls when neither column pair is present.
missing_cols = [col for col in (y_true_col, y_pred_col) if col not in c_df.columns]
if missing_cols:
    raise KeyError(
        'Prediction CSV has neither RawTrue/RawPred nor ScaledTrue/ScaledPred; '
        f'missing columns: {missing_cols}'
    )

# Order chronologically when a Date column exists; otherwise keep the file
# order and plot against a plain 0..n-1 sample index.
has_date = 'Date' in c_df.columns
c_df = c_df.sort_values('Date') if has_date else c_df.reset_index(drop=True)
x_vals = c_df['Date'] if has_date else np.arange(len(c_df))

fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(x_vals, c_df[y_true_col], label='True')
ax.plot(x_vals, c_df[y_pred_col], label='Pred')
ax.set_title(f'Company {company}: {y_true_col} vs {y_pred_col}')
ax.set_xlabel('Date' if has_date else 'Sample index')
ax.set_ylabel(f'{y_true_col} / {y_pred_col}')
ax.legend()
ax.grid(True)
plt.show()

## Residuals and error statistics
Plot residuals (Pred - True) and compute MAE/RMSE for the selected company.

In [None]:
# Residuals for the selected company, defined as prediction minus truth.
pred_vals = c_df[y_pred_col].values
true_vals = c_df[y_true_col].values
resid = pred_vals - true_vals

# Summary error statistics over all rows of the selected company.
mae = np.abs(resid).mean()
rmse = np.sqrt((resid ** 2).mean())
print(f'Company {company} MAE={mae:.6f}, RMSE={rmse:.6f}')

# Plot against dates when the CSV has them, otherwise against the row position.
if 'Date' in c_df.columns:
    resid_x = c_df['Date']
else:
    resid_x = np.arange(len(c_df))

plt.figure(figsize=(12,3))
plt.plot(resid_x, resid, label='Residual')
# Dashed zero line as the reference for over- vs under-prediction.
plt.axhline(0, color='k', linestyle='--', alpha=0.6)
plt.title(f'Company {company} Residuals (Pred - True)')
plt.grid(True)
plt.show()

## Inspect errors across all companies
A quick horizontal bar plot of per-company RMSE, computed on the raw columns when they are available and on the scaled columns otherwise.

In [None]:
# Per-company RMSE, computed on the true/pred column pair selected earlier in
# the notebook (y_true_col / y_pred_col).
grouped = df.groupby('Company')

# Every group shares the columns of df, so the availability check is done once
# instead of being repeated for each company.
have_cols = y_true_col in df.columns and y_pred_col in df.columns

errs = []
for comp, g in grouped:
    if have_cols:
        r = np.sqrt(((g[y_pred_col].values - g[y_true_col].values) ** 2).mean())
    else:
        r = math.nan
    errs.append((comp, r))

# Sort ascending by RMSE; NaN entries are pushed to the end by mapping them to
# +inf, which can never collide with a real (finite) RMSE value.
errs_sorted = sorted(errs, key=lambda e: math.inf if math.isnan(e[1]) else e[1])
comps = [e[0] for e in errs_sorted]
rvals = [e[1] for e in errs_sorted]

# Label bars with strings so matplotlib treats the y axis as categorical.
# Numeric company ids would otherwise be used as y positions, which throws
# away the RMSE ordering established above.
comp_labels = [str(c) for c in comps]

fig, ax = plt.subplots(figsize=(10, 6))
# barh draws the first entry at the bottom, so RMSE increases bottom to top.
ax.barh(comp_labels, rvals)
ax.set_xlabel('RMSE')
ax.set_ylabel('Company')
ax.set_title('Per-company RMSE (lowest -> highest)')
plt.show()