In [None]:
import logging
import os
import shutil
import numpy as np
import pandas as pd
import torch
from chronos import ChronosPipeline
from sklearn.cluster import KMeans
from pathlib import Path
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [None]:
# Chronos pipelines that have already been loaded, keyed by checkpoint name.
# Loading a checkpoint is expensive, so it is done once per process rather
# than once per ticker.
_CHRONOS_PIPELINES = {}


def _load_chronos_pipeline(model_name):
    """Return the Chronos pipeline for *model_name*, loading it on first use."""
    if model_name not in _CHRONOS_PIPELINES:
        _CHRONOS_PIPELINES[model_name] = ChronosPipeline.from_pretrained(
            model_name,
            device_map="auto",
            torch_dtype=torch.bfloat16,
        )
    return _CHRONOS_PIPELINES[model_name]


def get_embeddings(path, ticker, model_name="amazon/chronos-t5-small"):
    """Compute Chronos embeddings for the ``Close`` column of one parquet file.

    Args:
        path: Location of the parquet file; passed to ``pd.read_parquet``.
        ticker: Label used only in log and error messages.
        model_name: Checkpoint identifier handed to
            ``ChronosPipeline.from_pretrained``. Defaults to the checkpoint
            this function has always used.
            NOTE(review): the original code carried a "Use tiny" reminder --
            presumably the smaller "amazon/chronos-t5-tiny" checkpoint; pass
            it here to try it.

    Returns:
        The ``(embeddings, tokenizer_state)`` pair exactly as returned by
        ``pipeline.embed``.

    Raises:
        ImportError: If loading the pipeline raises ``ImportError``.
        ValueError: If reading the file or embedding raises ``ValueError``.
        KeyError: Propagated unchanged when the file has no ``Close`` column.
    """
    try:
        # Cached: repeated calls (one per ticker) reuse the same model.
        pipeline = _load_chronos_pipeline(model_name)
    except ImportError as e:
        logging.error(f"❌ Error loading the Chronos model for {ticker}: {e}")
        raise ImportError(f"❌ Error loading the Chronos model for {ticker}: {e}") from e

    try:
        df = pd.read_parquet(path)
        context = torch.tensor(df["Close"].values)
        embeddings, tokenizer_state = pipeline.embed(context)
    except ValueError as e:
        logging.error(f"❌ Error extracting embeddings for {ticker}: {e}")
        raise ValueError(f"❌ Error extracting embeddings for {ticker}: {e}") from e

    logging.info(f"✅ Successfully extracted embeddings for {ticker}.")
    return embeddings, tokenizer_state

# Use Ilyaas function
def extract_all_embeddings(data_dir):
    """Embed every ``*.parquet`` file in *data_dir* and stack the results.

    Each file's stem is used as its ticker. Files are processed in sorted
    order so that the row order of the returned array is the same on every
    run and every filesystem.

    Args:
        data_dir: Directory (``str`` or ``Path``) holding one parquet file
            per ticker.

    Returns:
        A ``(tickers, embeddings)`` pair: the list of file stems and a NumPy
        array built from one embedding per ticker, in the same order. Both
        are empty when the directory contains no parquet files.

    Raises:
        FileNotFoundError: If *data_dir* is not an existing directory.
    """
    data_dir = Path(data_dir)
    if not data_dir.is_dir():
        # glob() on a missing directory just yields nothing, so the problem
        # has to be detected explicitly instead of via an except clause.
        message = f"❌ Directory {data_dir} not found."
        logging.error(message)
        raise FileNotFoundError(message)

    # Sorted: glob order is filesystem-dependent, and downstream seeded
    # clustering depends on row order.
    files = sorted(data_dir.glob("*.parquet"))
    if not files:
        logging.warning(f"No .parquet files found in {data_dir}.")

    tickers = []
    embedding_list = []

    for file_path in files:
        ticker = file_path.stem
        embeddings, _ = get_embeddings(file_path, ticker)

        # Drop a singleton leading (batch) dimension: (1, T, D) -> (T, D).
        if embeddings.ndim == 3 and embeddings.shape[0] == 1:
            embeddings = embeddings.squeeze(0)

        # A 2-D result is averaged over its first axis to get one vector per
        # ticker; anything else is converted as-is.
        if embeddings.ndim == 2:
            embedding_vector = embeddings.mean(dim=0).float().numpy()
        else:
            embedding_vector = embeddings.float().numpy()

        tickers.append(ticker)
        embedding_list.append(embedding_vector)

    logging.info(f"✅ Successfully extracted all embeddings for time series in {data_dir}.")
    return tickers, np.array(embedding_list)


def cluster_embeddings(data_dir, n_clusters=5):
    """Assign each ticker in *data_dir* to a K-Means cluster.

    Embeddings are obtained from ``extract_all_embeddings`` and clustered
    with ``KMeans`` using a fixed ``random_state`` of 42.

    Args:
        data_dir: Directory passed through to ``extract_all_embeddings``.
        n_clusters: Number of clusters requested from ``KMeans``.

    Returns:
        A dict mapping each ticker to the label ``fit_predict`` gave it.
    """
    names, features = extract_all_embeddings(data_dir)
    model = KMeans(n_clusters=n_clusters, random_state=42)
    labels = model.fit_predict(features)
    return dict(zip(names, labels))


def save_stocks_to_cluster_dirs(clusters, data_directory, output_directory):
    """Copy each stock's parquet file into a folder named after its cluster.

    One ``cluster_<label>`` folder is created under *output_directory* for
    every distinct label in *clusters*. Each ``<stock>.parquet`` found in
    *data_directory* is then copied into its folder; missing source files
    are logged as warnings and skipped.

    Args:
        clusters: Mapping of stock name to cluster label.
        data_directory: Folder containing the source ``<stock>.parquet`` files.
        output_directory: Root folder under which cluster folders are created.

    Returns:
        None. A ``FileNotFoundError`` raised along the way is logged rather
        than propagated.
    """
    try:
        # Work out every destination folder first, then create them all.
        folders = {}
        for cluster in set(clusters.values()):
            folders[cluster] = os.path.join(output_directory, f"cluster_{cluster}")
        for folder in folders.values():
            os.makedirs(folder, exist_ok=True)

        for stock, cluster in clusters.items():
            filename = f"{stock}.parquet"
            source_file = os.path.join(data_directory, filename)
            target = os.path.join(folders[cluster], filename)

            if not os.path.exists(source_file):
                logging.warning(f"❌ File {source_file} not found!")
                continue

            shutil.copy(source_file, target)
            logging.info(f"Copied {stock}.parquet to cluster_{cluster} directory.")
    except FileNotFoundError as e:
        logging.error(f"❌ Directory {data_directory} not found: {e}")


if __name__ == "__main__":
    # Script entry point: cluster every ticker under the data directory, then
    # copy each ticker's parquet file into its cluster's output folder.

    # Number of K-Means clusters; adjust based on your analysis.
    n_clusters = 5

    # Input folder of per-ticker parquet files, and the root folder that
    # receives the cluster_<label> sub-folders.
    data_directory = "/Users/akramchakrouni/Projects/time-series-forecasting-cluserting/data/chronos/"
    output_directory = "/Users/akramchakrouni/Projects/time-series-forecasting-cluserting/clusters/embeddings"

    clusters_dict = cluster_embeddings(data_dir=data_directory, n_clusters=n_clusters)
    save_stocks_to_cluster_dirs(
        clusters=clusters_dict,
        data_directory=data_directory,
        output_directory=output_directory,
    )