Imports & helper defs 

In [10]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt, math, random, os
from pathlib import Path
import yfinance as yf, mplfinance as mpf, plotly.express as px
from ts2vec import TS2Vec
from sklearn.cluster import DBSCAN
import hdbscan, torch, ipywidgets as w
from sklearn.preprocessing import StandardScaler

from ipywidgets import Layout 

def make_windows_multich(arr2d, size=256, stride=1):
    T, C = arr2d.shape
    if T < size:
        # return an “empty” windows array
        return np.zeros((0, size, C), dtype=arr2d.dtype)
    n = (T - size)//stride + 1
    return np.stack([arr2d[i*stride:i*stride+size] for i in range(n)])

def standardise_ohlcv(df, log_volume=True):
    df = df.copy()
    if log_volume:
        df['Volume'] = np.log1p(df['Volume'])
    return (df - df.mean()) / df.std()

def candle_panel(members, ax=None):
    """
    Plot the average window in full candlestick + volume style.
    `members` is an array of shape (n_windows, win_len, 5).
    """
    # 1) compute prototype
    proto = members.mean(axis=0)  # shape (win_len, 5)

    # 2) build a DataFrame with a dummy minute-based index
    df = pd.DataFrame(
        proto,
        columns=['Open','High','Low','Close','Volume'],
        index=pd.date_range('2000-01-01', periods=len(proto), freq='min')
    )
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.dropna(inplace=True)
    if df.empty:
        if ax: ax.axis('off')
        return

    # 3) plot candlestick only
    own_fig = False
    if ax is None:
        fig, ax = plt.subplots(figsize=(6,4))
        own_fig = True

    mpf.plot(
        df,
        type='candle',
        style='charles',
        volume=False,   # disable built-in volume
        ax=ax,
        tight_layout=True
    )

    # 4) overlay volume bars on a twin axis
    ax2 = ax.twinx()
    ax2.bar(df.index, df['Volume'], color='#bbb', alpha=.3, width=1)
    ax2.axis('off')
    ax.axis('off')

    if own_fig:
        plt.show()

def gallery_panel(members, ax=None):
    proto = members.mean(0)
    df = pd.DataFrame(proto,
                      columns=['Open','High','Low','Close','Volume'],
                      index=pd.date_range('2000-01-01', periods=len(proto)))
    df.replace([np.inf,-np.inf], np.nan, inplace=True)
    df.dropna(inplace=True)
    if df.empty:
        if ax: ax.axis('off'); return
    own = False
    if ax is None:
        fig, ax = plt.subplots(figsize=(6,3)); own=True
    mpf.plot(df, ax=ax, type='line', style='charles',
             volume=False, tight_layout=True)
    ax2 = ax.twinx()
    ax2.bar(df.index, df['Volume'], color='#bbb', alpha=.3, width=1)
    ax2.axis('off'); ax.axis('off')
    if own: plt.show()

def inspect_cluster(windows3, labels, cid, n=6):
    idx = np.where(labels==cid)[0]
    if len(idx)==0:
        print('Cluster empty'); return
    samp = np.random.choice(idx, min(n,len(idx)), replace=False)
    rows = math.ceil(len(samp)/3)
    fig, axs = plt.subplots(rows,3, figsize=(12,2.5*rows))
    for ax,i in zip(axs.ravel(), samp):
        gallery_panel(windows3[i:i+1], ax=ax)
        ax.set_title(f'id {i}', fontsize=8)
    for ax in axs.ravel()[len(samp):]: ax.axis('off')
    plt.tight_layout(); plt.show()

def save_model(enc, path):
    # use the model’s own save method
    enc.save(path)

def load_model(path, input_dims, device='cpu'):
    # re-instantiate and then call its load() method
    enc = TS2Vec(input_dims=input_dims, device=device)
    enc.load(path)
    return enc

In [9]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")

print(torch.version.cuda)
print(torch.backends.cudnn.version())

True
1
NVIDIA GeForce RTX 4080 Laptop GPU
12.1
90100


Interactive config panel

In [27]:
import pandas as pd
import ipywidgets as w
from ipywidgets import Layout

# --- existing widgets ---
TICKERS   = w.SelectMultiple(options=['AAPL','MSFT','AMZN','NVDA','GOOGL','TSLA'],
                             value=('AAPL','MSFT'), description='Tickers')
dates     = pd.date_range('2000-01-01','2025-12-31',freq='MS')
DATE_RANGE= w.SelectionRangeSlider(options=dates,
                                   value=(dates[0], dates[-1]),
                                   description='Dates',
                                   layout=Layout(width='700px'),
                                   continuous_update=False)
WIN_LENS  = w.SelectMultiple(options=[8,16,32,64,128,256], value=(64,256),
                             description='Win lens')
LOG_VOL   = w.Checkbox(value=True, description='log1p Volume')
EPS       = w.FloatSlider(value=0.9, min=0.2, max=2, step=0.1,
                          description='eps')
MIN_SAMP  = w.IntSlider(value=5, min=2, max=15, step=1,
                        description='min_samples')
MIN_SIZE  = w.IntSlider(value=5, min=1, max=30, step=1,
                        description='MIN_SIZE')

# --- new TS2Vec hyper-params ---
N_EPOCHS = w.IntSlider(
    value=50,            # default 50 epochs
    min=1,
    max=200,
    step=1,
    description='n_epochs'
)
OUTPUT_DIMS = w.IntText(value=512, description='output_dims')
HIDDEN_DIMS = w.IntText(value=128, description='hidden_dims')
DEPTH       = w.IntSlider(value=12, min=1, max=20, step=1,
                          description='depth')
LR          = w.FloatLogSlider(value=0.001, base=10,
                               min=-5, max=0, step=0.1,
                               description='lr')
BATCH_SIZE  = w.IntSlider(value=16, min=1, max=128, step=1,
                          description='batch_size')

# --- display everything together ---
display(w.VBox([
    TICKERS,
    DATE_RANGE,
    WIN_LENS,
    LOG_VOL,
    EPS,
    MIN_SAMP,
    MIN_SIZE,
    # TS2Vec block:
    OUTPUT_DIMS,
    HIDDEN_DIMS,
    DEPTH,
    LR,
    BATCH_SIZE,
    N_EPOCHS
]))

VBox(children=(SelectMultiple(description='Tickers', index=(0, 1), options=('AAPL', 'MSFT', 'AMZN', 'NVDA', 'G…

Download & standardise

In [18]:
cols = ['Open','High','Low','Close','Volume']
start = DATE_RANGE.value[0].strftime('%Y-%m-%d')
end   = DATE_RANGE.value[1].strftime('%Y-%m-%d')

dfs = []
for tkr in TICKERS.value:
    df = yf.download(tkr, start=start, end=end,
                     auto_adjust=False, progress=False)[cols]
    df['Ticker'] = tkr
    dfs.append(df)
raw_df = pd.concat(dfs).dropna()
print(raw_df.groupby('Ticker').size())

std_df = standardise_ohlcv(raw_df[cols], log_volume=LOG_VOL.value)
data_np = std_df.values

Ticker
AAPL    6416
dtype: int64


Windowing

In [19]:
windows_dict = {}
for L in WIN_LENS.value:
    windows_dict[L] = make_windows_multich(data_np, size=L, stride=1)
    print(f'Len {L}:', windows_dict[L].shape)

Len 8: (6409, 8, 5)
Len 64: (6353, 64, 5)


Train or load TS2Vec

In [20]:
device     = 'cuda' if torch.cuda.is_available() else 'cpu'
input_dims = 5
print(device)
encoders = {}

for L, win3 in windows_dict.items():
    n_wins = win3.shape[0]
    PATH  = f'ts2vec_ohlcv_L{L}.pt'

    # 1) Skip any window‐length that produced zero windows
    if n_wins == 0:
        print(f"⚠️  Skipping L={L}: no windows (data length < {L})")
        continue

    # 2) Load or train as before
    if Path(PATH).exists():
        enc = load_model(PATH, input_dims, device=device)
        print(f'✅ Loaded model for window length {L}.')
    else:
        enc = encoder = TS2Vec(
            input_dims   = input_dims,
            output_dims  = OUTPUT_DIMS.value,
            hidden_dims  = HIDDEN_DIMS.value,
            depth        = DEPTH.value,
            device       = device,
            lr           = LR.value,
            batch_size   = BATCH_SIZE.value)
        enc.fit(win3, n_epochs=N_EPOCHS.value, verbose=True)
        save_model(enc, PATH)
        print(f'💾 Model trained & saved → {PATH}')

    encoders[L] = enc
   

cuda
Epoch #0: loss=1.8524624693393708
Epoch #1: loss=1.4741976998746396
Epoch #2: loss=1.3329044197499753
Epoch #3: loss=1.1706568786501885
Epoch #4: loss=1.1516100457310676
Epoch #5: loss=1.086480716392398
Epoch #6: loss=1.0637135127186774
Epoch #7: loss=1.0341109657287597
Epoch #8: loss=1.003539227321744
Epoch #9: loss=0.9728141734004021
Epoch #10: loss=0.9672667769342661
Epoch #11: loss=0.9422362433373928
Epoch #12: loss=0.951987733617425
Epoch #13: loss=0.8960464334487915
Epoch #14: loss=0.8942458010837435
Epoch #15: loss=0.8830837276205421
Epoch #16: loss=0.9204633899405599
Epoch #17: loss=0.8684426309540868
Epoch #18: loss=0.8431382766738534
Epoch #19: loss=0.8496761270612478
Epoch #20: loss=0.848962961807847
Epoch #21: loss=0.8130769573524594
Epoch #22: loss=0.8353477125242352
Epoch #23: loss=0.7962501405552029
Epoch #24: loss=0.8373217950016261
Epoch #25: loss=0.8516824279725551
Epoch #26: loss=0.8302527353912592
Epoch #27: loss=0.8041453368216753
Epoch #28: loss=0.77343189468

mbedding + clustering

In [30]:
results = {}
for L, win3 in windows_dict.items():
    Z = encoder.encode(win3).mean(axis=1)
    Z_scaled = StandardScaler().fit_transform(Z)
    labels = DBSCAN(eps=EPS.value, min_samples=MIN_SAMP.value).fit_predict(Z_scaled)
    if (labels==-1).all():  # fallback
        labels = hdbscan.HDBSCAN(min_cluster_size=MIN_SAMP.value,
                                 min_samples=MIN_SAMP.value).fit_predict(Z_scaled)
    results[L] = dict(labels=labels, Z=Z_scaled)
    print(f'L={L}:', dict(zip(*np.unique(labels, return_counts=True))))



L=8: {np.int64(-1): np.int64(241), np.int64(0): np.int64(1157), np.int64(1): np.int64(21), np.int64(2): np.int64(4199), np.int64(3): np.int64(791)}




L=64: {np.int64(-1): np.int64(926), np.int64(0): np.int64(2143), np.int64(1): np.int64(14), np.int64(2): np.int64(80), np.int64(3): np.int64(37), np.int64(4): np.int64(27), np.int64(5): np.int64(38), np.int64(6): np.int64(26), np.int64(7): np.int64(11), np.int64(8): np.int64(59), np.int64(9): np.int64(13), np.int64(10): np.int64(51), np.int64(11): np.int64(55), np.int64(12): np.int64(170), np.int64(13): np.int64(19), np.int64(14): np.int64(59), np.int64(15): np.int64(14), np.int64(16): np.int64(13), np.int64(17): np.int64(30), np.int64(18): np.int64(70), np.int64(19): np.int64(11), np.int64(20): np.int64(31), np.int64(21): np.int64(59), np.int64(22): np.int64(24), np.int64(23): np.int64(16), np.int64(24): np.int64(57), np.int64(25): np.int64(67), np.int64(26): np.int64(14), np.int64(27): np.int64(46), np.int64(28): np.int64(37), np.int64(29): np.int64(14), np.int64(30): np.int64(25), np.int64(31): np.int64(73), np.int64(32): np.int64(15), np.int64(33): np.int64(40), np.int64(34): np.in

Interactive pattern gallery

In [None]:
def show_gallery(win_len):
    labels = results[win_len]['labels']
    win3   = windows_dict[win_len]
    ids = [cid for cid,cnt in zip(*np.unique(labels, return_counts=True))
           if cid!=-1 and cnt>=MIN_SIZE.value]
    if not ids:
        print('No clusters ≥', MIN_SIZE.value); return
    cols=3; rows=math.ceil(len(ids)/cols)
    fig,axs=plt.subplots(rows,cols,figsize=(5*cols,3*rows), squeeze=False)
    for ax,cid in zip(axs.ravel(), ids):
        candle_panel(win3[labels==cid], ax=ax)
        ax.set_title(f'CID {cid} • {sum(labels==cid)}', fontsize=8)
    for ax in axs.ravel()[len(ids):]: ax.axis('off')
    plt.tight_layout(); plt.show()



w.interact(show_gallery, win_len=w.Dropdown(options=WIN_LENS.value, description='Win len'))

Raw-window drill-down

In [None]:
def drill_nonoverlap(win_len, cid, n):
    windows3 = windows_dict[win_len]
    labels   = results[win_len]['labels']
    idxs     = np.where(labels == cid)[0]
    if len(idxs) == 0:
        print(f"Cluster {cid} empty")
        return

    # pick up to n non-overlapping start indices
    non_overlap = []
    last_end    = -win_len
    for i in sorted(idxs):
        if i >= last_end + win_len:
            non_overlap.append(i)
            last_end = i
        if len(non_overlap) == n:
            break

    rows = math.ceil(len(non_overlap) / 3)
    fig, axs = plt.subplots(rows, 3, figsize=(5*3, 2.5*rows), squeeze=False)
    for ax, start in zip(axs.ravel(), non_overlap):
        # plot the single window at [start:start+win_len]
        candle_panel(windows3[start:start+1], ax=ax)
        ax.set_title(f"id {start}", fontsize=8)

    # turn off unused axes
    for ax in axs.ravel()[len(non_overlap):]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()




# hook up the interact UI
w.interact(
    drill_nonoverlap,
    win_len=w.Dropdown(options=WIN_LENS.value, description='Win len'),
    cid    =w.IntText(value=0, description='Cluster ID'),
    n      =w.IntSlider(value=6, min=1, max=12, description='Samples')
)

interactive(children=(Dropdown(description='Win len', options=(64, 256), value=64), IntText(value=0, descripti…

<function __main__.drill_nonoverlap(win_len, cid, n)>

Save embeddings & labels

In [None]:
for L, res in results.items():
    np.savez_compressed(f'embeddings_L{L}.npz',
                        Z=res['Z'], labels=res['labels'])
print('All embeddings saved.')

Next steps