# Build and compress index files and metadata

### *Notes on data transformation*  
Raw values can be misleading since we focus on patterns.  
There are some possible ways of transforming data:  
1. Divide by first value - $x_t^* = \frac{x_t}{x_1}$
2. Standardize - $x_t^* = \frac{x_t-\bar x}{\sigma}$
3. Scale to range (0, 1) - $x_t^* = \frac{x_t-x_{min}}{x_{max}-x_{min}}$

This project uses **3**

### *Notes on similarity search*
Some metrics to calculate similarity:
1. Euclidean distance - $\|\mathbf{x}-\mathbf{y}\|_2$
2. Cosine similarity - $\frac{\mathbf{x}\cdot\mathbf{y}}{\|\mathbf{x}\|_2\|\mathbf{y}\|_2}$
3. Dynamic time warping

This project uses **1**, with additional processing to lower memory usage and speed up searching

## ===== Part 1: Stock data processing =====

In [None]:
# Notebook-wide settings read by the cells below.
# NUM_TICKERS: how many ticker symbols to sample; None keeps every symbol.
NUM_TICKERS = 200  # int or None for all tickers
# DAYS_WINDOW: window length passed to the sliding-window builder and used to
# slice the query series; it is also embedded in the archive file name.
DAYS_WINDOW = 50
# PERIOD: history span handed to yfinance's `history(period=...)`.
PERIOD = "2y"  # one of 1y, 2y, 5y, 10y, max

In [None]:
import random

import numpy as np

# Seed both Python's and NumPy's global RNGs with a fixed value so that
# randomised steps (e.g. the ticker sampling below) repeat between runs.
random.seed(42)
np.random.seed(42)

In [None]:
from stock_gone_wrong.simularity.ticker import get_us_tickers

# Fetch the candidate symbols from the project helper.
ticker_symbols = get_us_tickers()
if NUM_TICKERS is not None:
    # Sample WITHOUT replacement: random.choices() draws with replacement and
    # can return the same symbol several times, which silently reduces the
    # number of distinct tickers that end up in the index.
    # Cap k at the population size so that asking for more tickers than exist
    # keeps them all instead of raising ValueError.
    sample_size = min(NUM_TICKERS, len(ticker_symbols))
    # list(...) guarantees a real sequence, which random.sample() requires.
    ticker_symbols = random.sample(list(ticker_symbols), k=sample_size)
print(len(ticker_symbols), "tickers")

In [None]:
from typing import cast

import pandas as pd
import yfinance as yf

# Request the price history of every selected symbol in one yfinance call.
# The symbols are passed as a single comma-separated string.
tickers = yf.Tickers(",".join(ticker_symbols))
raw_history = tickers.history(period=PERIOD)
# cast() only informs the type checker; nothing is converted at runtime.
history_df = cast(pd.DataFrame, raw_history)
print(len(history_df), "days")

In [None]:
from stock_gone_wrong.simularity.ticker import process_history, shorlist_history

# Clean the raw download with the project helper.
df = process_history(history_df)
if NUM_TICKERS is None:
    # Only applied when the whole ticker universe was downloaded.
    # NOTE(review): 5000 is presumably a cap on what is kept — confirm
    # against shorlist_history.
    df = shorlist_history(df, 5000)

# The first level of the column MultiIndex supplies the metric names.
metrics: list[str] = list(df.columns.levels[0].values)

if {"Close", "Volume"}.issubset(metrics):
    # some cells may have a NaN Close but a float Volume: blank the Volume
    # wherever Close is missing
    close_present = df["Close"].notna()
    df["Volume"] = df["Volume"].where(close_present, other=pd.NA)

print(df.columns.levels[0], df.shape)
display(df.head())

# each metric should have the same nan masks
nan_masks = [df[name].isna().to_numpy() for name in metrics]
reference_mask = nan_masks[0]
# Comparing the list with an ndarray makes NumPy stack the masks and
# broadcast the reference mask against each of them.
assert np.all(nan_masks == reference_mask)

## ===== Part 2: Data indexing =====

In [None]:
# Archive file that the data pack is written to and read back from; the
# window length is part of the name.
DATA_FILE = f"us_stock_{DAYS_WINDOW}.zip"

In [None]:
from sklearn.preprocessing import minmax_scale

from stock_gone_wrong.simularity.preprocess import sliding_metrics_series_view

# to make sure there are data after the window
# Build the per-metric window arrays. The second DAYS_WINDOW argument is there
# to make sure there are data after the window
# (NOTE(review): presumably a look-ahead length — confirm against
# sliding_metrics_series_view).
metrics_series, non_nans = sliding_metrics_series_view(df, DAYS_WINDOW, DAYS_WINDOW)
for metric_name in metrics_series:
    windows = metrics_series[metric_name]
    # Rescale every window (axis=1, i.e. row by row) into [0, 1] so windows
    # are compared on a common scale rather than on their raw values.
    metrics_series[metric_name] = minmax_scale(windows, feature_range=(0, 1), axis=1)
print(non_nans.shape, metrics_series[metrics[0]].shape)

In [None]:
from stock_gone_wrong.simularity.indexing import (create_index,
                                                  create_index_meta)

# One search index per metric, built from that metric's scaled windows.
metric_indices = {}
for metric_name, windows in metrics_series.items():
    metric_indices[metric_name] = create_index(windows)

# Metadata for the indexed windows, built from the non-NaN information plus
# the ticker columns and the date index of the history table.
ticker_columns = df["Close"].columns
meta_df = create_index_meta(non_nans, ticker_columns, df.index)

In [None]:
from stock_gone_wrong.simularity.indexing import DataPack

# Bundle the history table, the per-metric indices and the metadata, then
# write the bundle to DATA_FILE.
pack_to_archive = DataPack(df, metric_indices, meta_df)
pack_to_archive.archive(DATA_FILE)
# Load it straight back so the query cells work from the archived copy.
data_pack = DataPack.extract(DATA_FILE)

In [None]:
%%timeit
# Timed statement: measures how long loading the archive takes.
# NOTE(review): assignments made inside a %%timeit cell are not expected to
# persist, so the `data_pack` used later should be the one loaded in the
# previous cell — confirm.
data_pack = DataPack.extract(DATA_FILE)

## ===== Part 3: Query =====

In [None]:
# Query settings: which metric's index to search, which ticker to use as the
# query, and whether to draw the mean / prediction-interval overlay.
METRICS = "Close"
TICKER = "TSLA"
SHOW_FORECAST = True

if SHOW_FORECAST:
    import warnings

    # Adjacent literals are joined into the single message shown to the user.
    forecast_caveat = (
        "SHOW_FORECAST is set to True. But keep in mind that the calculations "
        "are based on the scaled data, which will lead to underestimating the "
        "uncertainty. The results have little predictive power."
    )
    warnings.warn(forecast_caveat)

In [None]:
# Download six months of history for the query ticker and clean it with the
# same helper used for the indexed data.
query_df = yf.Tickers(TICKER).history("6mo")
query_df = process_history(query_df)
# Only one ticker was requested, so drop the ticker level and address the
# columns by metric name alone.
query_df.columns = query_df.columns.droplevel(1)

# Use the MOST RECENT window. The plot marks the end of the query as the
# "Last record" and shows what followed the similar windows, which only makes
# sense when the query ends at the latest available row; slicing from the
# front would query the oldest part of the download instead.
raw_query_data = query_df[METRICS].tail(DAYS_WINDOW).to_numpy()
if len(raw_query_data) < DAYS_WINDOW:
    # A shorter query cannot be compared with the DAYS_WINDOW-long windows
    # stored in the index, so fail here with a readable message.
    raise ValueError(
        f"Only {len(raw_query_data)} rows of {METRICS!r} data for {TICKER!r}; "
        f"{DAYS_WINDOW} are required."
    )
# Rescale the query into [0, 1], matching the scaling of the indexed windows.
query_data = minmax_scale(raw_query_data, feature_range=(0, 1))
# The index search takes a batch of queries: shape (1, DAYS_WINDOW).
query_data = query_data.reshape((1, -1))
print(query_df.shape, query_data.shape)

In [None]:
# Look up the 20 closest entries to the query in the index of the chosen
# metric. The search returns one row of results per query row.
all_distances, all_positions = data_pack.indices[METRICS].search(query_data, 20)
# Only one query was submitted, so keep just the first row of each result.
dist = all_distances[0]
idx = all_positions[0]
print(dist)
print(idx)

In [None]:
from matplotlib import pyplot as plt

from stock_gone_wrong.simularity.preprocess import extended_minmax_scale
from stock_gone_wrong.simularity.visual import calculate_PI, format_plot

# Report which tickers the matched windows belong to (each listed once).
similar_tickers = [data_pack.meta.loc[i, "Ticker"] for i in idx]
print(sorted(set(similar_tickers)))

# x axis: the matched window occupies [-DAYS_WINDOW, -1] and the rows that
# followed it occupy [0, DAYS_WINDOW - 1].
x_days = np.arange(-DAYS_WINDOW, DAYS_WINDOW)

# For every match, fetch twice the window length and scale it using only the
# first DAYS_WINDOW points as the fitting range.
scaled_series = [
    extended_minmax_scale(
        data_pack.get_series(i, METRICS, DAYS_WINDOW * 2),
        (0, 1),
        fit_window=slice(0, DAYS_WINDOW),
    )
    for i in idx
]
scaled_data = np.stack(scaled_series)

# Faint grey line per matched series.
for matched in scaled_series:
    plt.plot(x_days, matched, color="grey", alpha=0.3)

if SHOW_FORECAST:
    # Overlay the point-wise mean of the matches and the interval computed by
    # the project helper.
    series_mean = scaled_data.mean(axis=0)
    plt.plot(x_days, series_mean, label="Mean")
    pi_lower, pi_upper = calculate_PI(scaled_data)
    plt.fill_between(
        x_days, pi_lower, pi_upper, alpha=0.2, label="95% Prediction Interval"
    )

# Mark the final point of the query window, then draw the query itself over
# the first half of the axis.
plt.axvline(-1, color="red", alpha=0.5, linestyle="--", label="Last record")
plt.plot(x_days[:DAYS_WINDOW], query_data[0], label=TICKER)

format_plot(TICKER, METRICS)
plt.show()