In [None]:
# Function definitions
# Execute this cell before doing anything else

import os
import json
import time
import numpy as np
import pandas as pd
import altair as alt
# Remove Altair's cap on the number of rows a chart dataset may contain,
# so the metric frames built below can be charted in full.
alt.data_transformers.disable_max_rows()

def _div_safe(dividend: float, divisor: float) -> float:
    try:
        return dividend / divisor
    except:
        return 0.0

def transform_data(stock: str, date: list[str], reality: list[float], prediction: list[float]) -> list[dict]:
    """Build one wide-form record per observation, including running evaluation metrics.

    Args:
        stock: name of the stock; stored in the "Stock" field of every record.
        date: dates of the observations.
        reality: real prices, positionally matching ``date``.
        prediction: predicted prices, positionally matching ``date``.

    Returns:
        A list of dicts, one for each observation index ``i`` from 1 to
        ``len(date) - 1`` (index 0 has no predecessor and gets no record),
        ready to be turned into a wide-form DataFrame.

    Notes:
        * Price deltas are reported as *previous minus current*.
        * The cumulative errors, correlations, MSE/RMSE/RMSLE and R-Squared of
          record ``i`` are computed over observations ``0 .. i-1``; the current
          observation is not part of that window. The correlation of deltas
          uses the first ``i`` one-step changes.
        * RMSLE uses base-10 logarithms, so it differs from a natural-log
          RMSLE by a constant factor.
    """
    # Plain lists make every index below positional, even if a caller hands in
    # a pandas Series, and keep both Series built below on the same default index.
    date = list(date)
    reality = list(reality)
    prediction = list(prediction)

    data = []
    if len(date) < 2:
        return data

    reality_series = pd.Series(reality)
    prediction_series = pd.Series(prediction)
    # diff().shift(-1) stores the change from position k to k+1 at position k.
    reality_diff_series = reality_series.diff().shift(-1)
    prediction_diff_series = prediction_series.diff().shift(-1)

    # Running totals replace re-summing the whole history for every record.
    error_total = 0
    abs_error_total = 0
    for i in range(1, len(date)):
        reality_series_head = reality_series.head(i)
        reality_diff_series_head = reality_diff_series.head(i)
        prediction_series_head = prediction_series.head(i)
        prediction_diff_series_head = prediction_diff_series.head(i)

        # Fold in the error of observation i-1, so the totals cover 0 .. i-1.
        previous_error = prediction[i-1] - reality[i-1]
        error_total += previous_error
        abs_error_total += abs(previous_error)

        predicted_delta = prediction[i-1] - prediction[i]
        reality_delta = reality[i-1] - reality[i]
        error = prediction[i] - reality[i]

        item = {"Date": date[i], "Stock": stock, "Prediction": prediction[i], "Reality": reality[i]}
        item["Price Delta - Predicted"] = predicted_delta
        item["Price Delta - Reality"] = reality_delta
        # Both percentage columns use the guarded division so a zero previous
        # price yields 0 instead of an exception or an infinite value.
        item["Price Delta (%) - Predicted"] = 100 * _div_safe(predicted_delta, prediction[i-1])
        item["Price Delta (%) - Reality"] = 100 * _div_safe(reality_delta, reality[i-1])
        item["Prediction Error"] = error
        item["Prediction Error (Abs)"] = abs(error)
        item["Prediction Error (Cummulative)"] = error_total
        item["Prediction Error (Abs Cummulative)"] = abs_error_total
        # Correlations are only reported once the window holds at least three points.
        item["Correlation"] = reality_series_head.corr(prediction_series_head) if i > 2 else None
        item["Correlation - of Deltas"] = reality_diff_series_head.corr(prediction_diff_series_head) if i > 2 else None

        mse_model = ((prediction_series_head - reality_series_head) ** 2).mean()
        item["MSE"] = mse_model
        item["RMSE"] = mse_model ** 0.5
        # Root of the mean squared log error (the root was previously missing).
        msle = ((np.log10(1 + prediction_series_head) - np.log10(1 + reality_series_head)) ** 2).mean()
        item["RMSLE"] = msle ** 0.5

        # R-Squared compares the model against always predicting the window mean;
        # it is undefined when the real prices in the window do not vary.
        mse_baseline = ((reality_series_head.mean() - reality_series_head) ** 2).mean()
        item["R-Squared"] = 1 - mse_model / mse_baseline if i > 2 and mse_baseline != 0 else None

        data.append(item)
    return data

def make_plot(wide_df):
    """Build the interactive Altair dashboard for a wide-form metrics DataFrame.

    Args:
        wide_df: DataFrame with "Date" and "Stock" columns; every other column
            is treated as a metric and melted into "Metric"/"Value" pairs.

    Returns:
        A composed Altair chart: a stock selector on top, a metric selector
        next to the line plot of the selected series, and box plots of the
        selected series at the bottom.
    """
    # Use the frame that was passed in (previously the global `df` was read,
    # so the argument was ignored). melt() already returns a new frame.
    melted_df = wide_df.melt(["Date", "Stock"], var_name = "Metric", value_name = "Value")
    melted_df["color_id"] = melted_df["Stock"] + " - " + melted_df["Metric"]
    chart_base = alt.Chart(melted_df)

    # Clickable bar list of metrics; the selection filters the plots below.
    selector_class = alt.selection_point(fields=['Metric'])
    chart_selector_class = chart_base.mark_bar().encode(
            y = "Metric:N",
            color = alt.condition(selector_class, alt.value("red"), alt.value("lightgray"))
        ).add_params(selector_class)

    # Clickable bar list of stocks; the selection filters the plots below.
    selector_stock = alt.selection_point(fields=['Stock'])
    chart_selector_stock = chart_base.mark_bar().encode(
            x = "Stock:N",
            color = alt.condition(selector_stock, alt.value("red"), alt.value("lightgray"))
        ).add_params(selector_stock)

    # One line per stock/metric combination, restricted to the current selections.
    chart_plot = chart_base.mark_line().encode(
            x = alt.X("Date"),
            y = alt.Y("Value").scale(zero=False),
            color = alt.Color("color_id", legend = None),
            tooltip = ["Stock", "Metric", "Date", "Value"]
        ).transform_filter(selector_class).transform_filter(selector_stock).properties(width = 2048).interactive()

    # Distribution of the selected values, one column of box plots per metric.
    chart_box = chart_base.mark_boxplot().encode(
            x = alt.X("Stock"),
            column = alt.Column("Metric"),
            y = alt.Y("Value").scale(zero=False),
            color = alt.Color("color_id", legend = None),
        ).transform_filter(selector_class).transform_filter(selector_stock)

    return (chart_selector_stock & (chart_selector_class | chart_plot) & chart_box).configure(background='#BBBBBB')

In [None]:
# Load JSE predictions data
# Execute this cell to load and transform the JSE data into a wide-form DataFrame
start = time.time()
data = []
jse_dir = "../datasets/jse"
# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of the resulting DataFrame reproducible between runs and machines.
for filename in sorted(os.listdir(jse_dir)):
    if not filename.endswith("predicted"):
        continue
    path = os.path.join(jse_dir, filename)
    with open(path, "r", encoding="utf-8") as infile:
        loaded = json.load(infile)
    # The file is closed before the (slower) metric computation starts.
    # The first five characters of the file name are used as the stock label
    # (assumes that prefix identifies the stock; TODO confirm naming scheme).
    data.extend(transform_data(filename[:5], loaded["Date"], loaded["Reality"], loaded["Prediction"]))
df = pd.DataFrame(data)
print(f"Loaded {len(data)} records in time {time.time() - start}")

In [None]:
# Load JSE Ridge predictions data
# Execute this cell to load and transform the JSE Ridge data into a wide-form DataFrame
start = time.time()
data = []
jse_dir = "../datasets/jse"
# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of the resulting DataFrame reproducible between runs and machines.
for filename in sorted(os.listdir(jse_dir)):
    if not filename.endswith("predicted-ridge"):
        continue
    path = os.path.join(jse_dir, filename)
    with open(path, "r", encoding="utf-8") as infile:
        loaded = json.load(infile)
    # The file is closed before the (slower) metric computation starts.
    # The first five characters of the file name are used as the stock label
    # (assumes that prefix identifies the stock; TODO confirm naming scheme).
    data.extend(transform_data(filename[:5], loaded["Date"], loaded["Reality"], loaded["Prediction"]))
df = pd.DataFrame(data)
print(f"Loaded {len(data)} records in time {time.time() - start}")

In [None]:
# Load dummy models data
# Execute this cell to load and transform dummy models into a wide-form DataFrame
#    Note: this cell should be mutually-exclusive with Load Data cell (or custom data loading cell)
import random as rnd

def predict_keep(reality, inflation):
    """Predict each price as the previous real price scaled by ``1 + inflation``.

    The first prediction has no predecessor and is based on a placeholder
    previous price of 0.
    """
    # Shift the real prices one step to the right, padding the front with 0.
    previous_prices = [0]
    previous_prices.extend(reality)
    return [price * (1 + inflation) for price in previous_prices[:-1]]

def predict_keep_momentum(reality, momentum):
    """Predict each price as the previous real price plus the recent average change.

    The average is taken over the last ``momentum`` one-step changes
    (current minus previous), so a rising series is extrapolated upwards.
    Previously the changes were stored with the opposite sign, which made the
    model predict against the recent trend.

    Args:
        reality: real prices in chronological order.
        momentum: number of most recent changes to average; must be >= 1.

    Returns:
        List of predictions, one per element of ``reality``.

    Raises:
        ValueError: if ``momentum`` is smaller than 1.

    Note:
        The first prediction is 0, and the first stored change is measured
        against a placeholder previous price of 0, so the first ``momentum``
        predictions after the initial one are warm-up values.
    """
    if momentum < 1:
        raise ValueError("momentum must be at least 1")
    prediction = []
    last = 0
    deltas = [0] * momentum
    for item in reality:
        prediction.append(last + sum(deltas)/momentum)
        # Slide the window: drop the oldest change, add the newest one.
        deltas = deltas[1:]
        deltas.append(item - last)
        last = item
    return prediction

def predict_up_momentum(reality, momentum):
    """Predict each price as the previous real price plus the magnitude of the recent average change.

    The average covers the last ``momentum`` stored changes and only its
    absolute value is added, so every prediction is at or above the previous
    real price. The first prediction is based on a placeholder previous price
    of 0.
    """
    window = [0] * momentum
    forecasts = []
    previous = 0
    for observed in reality:
        step = abs(sum(window) / momentum)
        forecasts.append(previous + step)
        # Drop the oldest change and append the newest one.
        window = window[1:] + [previous - observed]
        previous = observed
    return forecasts

def predict_rnd_momentum(reality, momentum):
    """Predict each price as the previous real price plus a randomly scaled recent average change.

    The magnitude of the average over the last ``momentum`` stored changes is
    multiplied by ``1 - 2 * rnd.random()`` and added to the previous real
    price. One random number is drawn per element of ``reality``. The first
    prediction is based on a placeholder previous price of 0.
    """
    window = [0] * momentum
    forecasts = []
    previous = 0
    for observed in reality:
        magnitude = abs(sum(window) / momentum)
        forecasts.append(previous + magnitude * (1 - 2*rnd.random()))
        # Drop the oldest change and append the newest one.
        window = window[1:] + [previous - observed]
        previous = observed
    return forecasts

def predict_random(items, min_val, max_val):
    """Return ``items`` values of the form ``min_val + (max_val - min_val) * rnd.random()``."""
    samples = []
    for _ in range(items):
        samples.append(min_val + (max_val - min_val) * rnd.random())
    return samples

start = time.time()
data = []

# Fixed seed so the random baselines (and every metric derived from them)
# are identical on each run of this cell.
RANDOM_SEED = 42
rnd.seed(RANDOM_SEED)

# Number of leading records dropped per model; equals the largest momentum
# window used below, whose predictions are warm-up values until it has filled.
WARMUP_ROWS = 20

sample_df = pd.read_csv("sample_prices.csv")

reality = sample_df["Adj Close"]
date = sample_df["Date"]

# (label, predictions) pairs, generated in a fixed order.
dummy_models = [
    ("Keep", predict_keep(reality, 0)),
    ("Inflate 0.05%", predict_keep(reality, 0.0005)),
]
for label, predictor in (
    ("Keep_Momentum", predict_keep_momentum),
    ("Up_Momentum", predict_up_momentum),
    ("Rnd_Momentum", predict_rnd_momentum),
):
    for momentum in (1, 5, 20):
        dummy_models.append((f"{label}_{momentum}", predictor(reality, momentum)))
dummy_models.append(("Random", predict_random(len(reality), min(reality), max(reality))))

for label, prediction in dummy_models:
    data.extend(transform_data(label, date, reality, prediction))

# Trim the warm-up records of EVERY model. A plain tail(-20) on the
# concatenated frame only removed the first 20 records of the first model.
df_all = pd.DataFrame(data)
df = df_all[df_all.groupby("Stock").cumcount() >= WARMUP_ROWS].copy()
print(f"Loaded {len(data)} records in time {time.time() - start}")

In [None]:
# Execute this cell to make an interactive Altair graph with all the metrics for the current wide-form DataFrame
make_plot(df)

In [None]:
# Execute this cell to show final statistics for all metrics for the current wide-form DataFrame
# assign() creates a new frame, so `df` itself keeps only the metric columns;
# adding the helper column to `df` in place would make a later make_plot(df)
# treat "FormattedDate" as one more metric.
df_dated = df.assign(FormattedDate=pd.to_datetime(df["Date"]))
# Row label of the latest observation of every stock.
idx = df_dated.groupby("Stock")["FormattedDate"].idxmax()
df_max = df_dated.loc[idx]
df_max.describe()