### Install libraries

In [None]:
!pip install -r ../dev-requirements.txt

### Set up variables for MLflow

In [None]:
# Load .env file if it exists
# Don't use dotenv
!pip install python-dotenv

from dotenv import load_dotenv
load_dotenv('../.env')

### Load Data

In [None]:
import pandas as pd
import os

# Read every CSV file in the assets folder into its own dataframe.
folder = "../airflow/assets"
dfs = []
# os.listdir returns entries in arbitrary order; sorting keeps the dataframe
# order (and therefore the column order of the merged frame built later)
# reproducible between runs and machines.
for file in sorted(os.listdir(folder)):
    if not file.endswith(".csv"):
        continue
    # skiprows=1 skips the first line of each file before the header is read.
    dfs.append(pd.read_csv(os.path.join(folder, file), skiprows=1, parse_dates=['date']))
print(len(dfs))

#### Merge all data into one dataframe

In [None]:
import pandas as pd

# Step 1: Convert the "date" column to datetime in all dataframes.
# Values that do not match the format become NaT (errors="coerce").
for df in dfs:
    df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d %H:%M:%S', errors="coerce")

# Step 2: Find the oldest and newest dates across all dataframes.
# Series.min()/max() skip NaT, so the timestamps do not have to be flattened
# into a Python list and filtered one by one.
all_dates = pd.concat([df['date'] for df in dfs], ignore_index=True)
oldest_date = all_dates.min()
newest_date = all_dates.max()

# Step 3: Create a new dataframe with the date range
date_range = pd.date_range(start=oldest_date, end=newest_date, freq='H')  # Hourly frequency
merged_df = pd.DataFrame({'date': date_range})

# Step 4: Add one "close_{symbol}" column per dataframe to merged_df
for df in dfs:
    # The first row's "symbol" value names the column for the whole dataframe.
    ticker = df['symbol'].iloc[0]
    close_col_name = f'close_{ticker}'
    try:
        # Index by timestamp, keep the first row of any duplicated timestamp,
        # and align to the hourly grid, forward-filling missing hours.
        indexed = df.set_index('date').sort_index()
        aligned = indexed[~indexed.index.duplicated(keep='first')].reindex(date_range, method='ffill')

        # After reindexing, the index is exactly date_range, so no further
        # filtering is needed before selecting the "close" column.
        close_data = aligned[['close']].rename(columns={'close': close_col_name})

        # Merge the "close_data" into the "merged_df"
        merged_df = pd.merge(merged_df, close_data, left_on='date', right_index=True, how='left')
    except ValueError as e:
        # Best effort: report the failing coin and continue with the others.
        print(f'Error on coin {ticker}: {e}')


# Now, merged_df contains the desired data with the date range and "close_{ticker}" columns, with missing hours filled.


### Clustering Analysis

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from fastdtw import fastdtw
from tqdm import tqdm
import mlflow
import mlflow.sklearn
from sklearn.metrics import silhouette_score
from sklearn_extra.cluster import KMedoids

def preprocess_data(
    merged_df,
    slice_data=True,
    scale_data=None,
    drop_columns=None,
    percentage_change=False,
    slice_start='2021-06-01',
):
    """Turn the merged price frame into a (series x time) array for clustering.

    Args:
        merged_df: DataFrame with a 'date' column followed by one column per
            series. The first column is dropped positionally, so 'date' is
            assumed to be the first column.
        slice_data: When true, keep only rows whose 'date' is on or after
            ``slice_start``; otherwise use a copy of the whole frame.
        scale_data: ``"min_max"`` rescales with the minimum and maximum taken
            over the whole array (all series together, not per series);
            ``"log"`` applies the natural logarithm; any other value leaves
            the data unscaled.
        drop_columns: Optional list of column names to remove.
        percentage_change: When true, replace each series by its relative
            change between consecutive time steps, which shortens every
            series by one element.
        slice_start: First date kept when ``slice_data`` is true.

    Returns:
        A tuple ``(time_series_data, df)``: the array with one row per series
        and one column per time step, and the backfilled, filtered frame
        (including 'date') that the array was taken from.
    """
    # Optionally slice data based on the date
    if slice_data:
        frame = merged_df[merged_df['date'] >= slice_start]
    else:
        frame = merged_df.copy()

    # Backfill missing values (DataFrame.bfill replaces the deprecated
    # fillna(method='bfill') spelling).
    frame = frame.bfill()

    # Optionally filter out columns
    if drop_columns:
        frame = frame.drop(drop_columns, axis=1)

    # Keep the frame for the caller and take every column except the first
    # ('date') as the numeric data; rows are time steps, columns are series.
    df = frame.copy()
    values = frame.iloc[:, 1:].values

    # Optionally scale data (e.g., min-max scaling or log transformation)
    if scale_data == "min_max":
        lowest = values.min()
        highest = values.max()
        values = (values - lowest) / (highest - lowest)
    elif scale_data == "log":
        values = np.log(values)

    # Optionally calculate percentage change. Time runs along axis 0 here
    # (the transpose happens below), so the difference must be taken along
    # axis 0; axis 1 would subtract neighbouring series from each other.
    if percentage_change:
        values = np.diff(values, axis=0) / values[:-1]

    # Transpose the data to have time series in rows
    return values.T, df

_SUPPORTED_CLUSTERING_ALGORITHMS = ("KMeans", "KMedoids")


def _dtw_distance_matrix(time_series_data):
    """Return the symmetric matrix of pairwise fastdtw distances between rows."""
    n_series = time_series_data.shape[0]
    distances = np.zeros((n_series, n_series))
    for i in tqdm(range(n_series), desc="Calculating DTW Distances"):
        # Only the upper triangle is computed; each value is mirrored.
        for j in range(i, n_series):
            distance, _ = fastdtw(time_series_data[i], time_series_data[j])
            distances[i, j] = distance
            distances[j, i] = distance
    return distances


def _log_experiment_params(
    n_clusters,
    clustering_algorithm,
    slice_data,
    scale_data,
    filter_data,
    percentage_change,
):
    """Log the experiment configuration as parameters of the active MLflow run."""
    mlflow.log_param("N_clusters", str(n_clusters))
    mlflow.log_param("Clustering_Algorithm", clustering_algorithm)
    mlflow.log_param("Slice_Data", slice_data)
    mlflow.log_param("Scale_Data", scale_data)
    mlflow.log_param("Filter_Data", filter_data)
    mlflow.log_param("Percentage_Change", percentage_change)


def _log_score_plot(cluster_counts, scores, title, ylabel, plot_path):
    """Plot one score per cluster count, save the figure and log it to MLflow."""
    # The figure is deliberately left open: it is never shown explicitly, so
    # closing it here would keep it from being displayed later.
    plt.figure(figsize=(8, 4))
    plt.plot(cluster_counts, scores, marker='o')
    plt.title(title)
    plt.xlabel("Cluster Count")
    plt.ylabel(ylabel)
    plt.tight_layout()
    plt.savefig(plot_path)
    mlflow.log_artifact(plot_path)


def do_clustering(
    merged_df,
    cluster_range=range(2, 8),
    clustering_algorithm="KMeans",
    slice_data=True,
    scale_data=None,
    filter_data=None,
    percentage_change=False
):
    """Cluster the series of ``merged_df`` for several cluster counts and log to MLflow.

    The frame is preprocessed with ``preprocess_data``, a pairwise fastdtw
    distance matrix is computed once, and one MLflow run is recorded per
    cluster count (parameters, silhouette and inertia metrics, and a plot of
    the series in each cluster). A final run named "Scores_logging" records
    the score lists and the score-versus-cluster-count plots. Plot files are
    written to the current working directory.

    Args:
        merged_df: DataFrame passed to ``preprocess_data``.
        cluster_range: Iterable of cluster counts to evaluate.
        clustering_algorithm: ``"KMeans"`` or ``"KMedoids"``. KMedoids receives
            the distance matrix with ``metric="precomputed"``; KMeans has no
            such option and is fitted on the rows of the distance matrix as
            feature vectors.
        slice_data: Forwarded to ``preprocess_data``.
        scale_data: Forwarded to ``preprocess_data``.
        filter_data: Columns to drop, forwarded to ``preprocess_data`` as
            ``drop_columns``.
        percentage_change: Forwarded to ``preprocess_data``.

    Raises:
        ValueError: If ``clustering_algorithm`` is not supported or
            ``cluster_range`` is empty.
    """
    # Validate before the expensive distance computation; previously an
    # unknown algorithm only surfaced afterwards as a NameError on `labels`.
    if clustering_algorithm not in _SUPPORTED_CLUSTERING_ALGORITHMS:
        raise ValueError(
            f"Unsupported clustering_algorithm {clustering_algorithm!r}; "
            f"expected one of {_SUPPORTED_CLUSTERING_ALGORITHMS}"
        )
    # Materialise once so that one-shot iterables can be reused for the plots.
    cluster_counts = list(cluster_range)
    if not cluster_counts:
        raise ValueError("cluster_range must contain at least one cluster count")

    mlflow.set_experiment("Clustering")

    time_series_data, df = preprocess_data(merged_df, slice_data, scale_data, filter_data, percentage_change)

    distances = _dtw_distance_matrix(time_series_data)

    # Scores collected per cluster count for the summary run
    silhouette_scores = []
    inertia_scores = []

    # One colour per series, shared by every plot
    palette = sns.color_palette("husl", len(time_series_data))

    for n_clusters in cluster_counts:

        with mlflow.start_run():

            _log_experiment_params(
                n_clusters, clustering_algorithm, slice_data, scale_data, filter_data, percentage_change
            )

            print(f"Running experiment for {n_clusters} clusters...")

            if clustering_algorithm == "KMeans":
                model = KMeans(n_clusters=n_clusters, random_state=0).fit(distances)
                print("K-Means clustering completed.")
            else:
                model = KMedoids(n_clusters=n_clusters, metric="precomputed", random_state=0).fit(distances)
                print("K-Medoids clustering completed.")

            labels = model.labels_
            inertia_score = model.inertia_

            # Log clustering results and distances to MLflow
            mlflow.log_param("Cluster_Count", n_clusters)
            mlflow.log_param("Distance_Matrix_Shape", distances.shape)
            mlflow.log_param("Cluster_Labels", labels.tolist())

            print("Clustering results logged to MLflow.")

            # Calculate Silhouette Score and Inertia
            print(distances.shape)
            print(distances)

            silhouette = silhouette_score(distances, labels, metric="precomputed")

            silhouette_scores.append(silhouette)
            inertia_scores.append(inertia_score)

            mlflow.log_metric("Silhouette_Score", silhouette)
            mlflow.log_metric("Inertia_Score", inertia_score)

            # One subplot per cluster; squeeze=False keeps `axes` two-dimensional
            # so it can be indexed the same way for any cluster count.
            fig, axes = plt.subplots(n_clusters, 1, figsize=(12, 6 * n_clusters), squeeze=False)

            for cluster_id in range(n_clusters):
                ax = axes[cluster_id, 0]
                for series_idx, label in enumerate(labels):
                    if label == cluster_id:
                        # +1 skips the 'date' column, which is not part of the series array
                        series_name = df.columns[series_idx + 1]
                        sns.lineplot(data=time_series_data[series_idx], color=palette[series_idx], label=series_name, ax=ax)

                ax.annotate(f"Inertia: {inertia_score:.2f}", xy=(0.05, 0.85), xycoords='axes fraction', fontsize=10)
                ax.annotate(f"Silhouette: {silhouette:.2f}", xy=(0.05, 0.75), xycoords='axes fraction', fontsize=10)

                ax.set_title(f"Cluster {cluster_id + 1}")
                ax.legend(loc='upper right')  # Add legends for series in the cluster

            plt.tight_layout()

            # Save the plot to a file and log it to MLflow
            plot_path = f"cluster_plots_{n_clusters}_clusters.png"
            plt.savefig(plot_path)
            mlflow.log_artifact(plot_path)

            print("Cluster visualization plot saved and logged to MLflow.")

            plt.show()
            # Release the figure once it has been shown, so repeated experiments
            # do not accumulate open figures.
            plt.close(fig)

    with mlflow.start_run(run_name="Scores_logging"):

        # "N_clusters" holds the last evaluated cluster count, as before.
        _log_experiment_params(
            cluster_counts[-1], clustering_algorithm, slice_data, scale_data, filter_data, percentage_change
        )

        # Log the lists of Silhouette and Inertia scores
        mlflow.log_param("Silhouette_Scores", silhouette_scores)
        mlflow.log_param("Inertia_Scores", inertia_scores)

        _log_score_plot(
            cluster_counts,
            silhouette_scores,
            "Silhouette Score vs. Cluster Count",
            "Silhouette Score",
            "silhouette_scores_plot.png",
        )
        _log_score_plot(
            cluster_counts,
            inertia_scores,
            "Inertia Score vs. Cluster Count",
            "Inertia Score",
            "inertia_scores_plot.png",
        )


#### Clustering Analysis (on Raw Series)

In [None]:

# Experiments 1-5 run once per algorithm, KMeans first and KMedoids second.
both_algorithms = ("KMeans", "KMedoids")

# Settings of the experiments that work directly on merged_df.
experiment_settings = [
    # Experiment 1: Raw Data
    {"slice_data": False, "scale_data": None, "filter_data": None},
    # Experiment 2: Data from 2021-06
    {"slice_data": True, "scale_data": None, "filter_data": None},
    # Experiment 3: Data from 2021-06 with Custom Min-Max Scaling
    {"slice_data": True, "scale_data": "min_max", "filter_data": None},
    # Experiment 4: Data from 2021-06 with Column Filtering and Custom Min-Max Scaling
    {
        "slice_data": True,
        "scale_data": "min_max",
        "filter_data": ['close_XLM/USD', 'close_RSR/USD', 'close_ICP/USD', 'close_LUNA/USD'],
    },
]

for settings in experiment_settings:
    for algorithm in both_algorithms:
        do_clustering(
            merged_df,
            clustering_algorithm=algorithm,
            percentage_change=False,
            **settings
        )

# Experiment 5: Custom Min-Max Scaling with all Columns (runs on a copy of merged_df)
time_series_data = merged_df.copy()
for algorithm in both_algorithms:
    do_clustering(
        time_series_data,
        clustering_algorithm=algorithm,
        slice_data=False,
        scale_data="min_max",
        filter_data=None,
        percentage_change=False
    )

# Experiment 6: Data from 2021-06 with Log Transformation (KMeans only)
do_clustering(
    merged_df,
    clustering_algorithm="KMeans",
    slice_data=True,
    scale_data="log",
    filter_data=None,
    percentage_change=False
)