In [None]:
!pip install -r ../dev-requirements.txt

In [None]:
# Load .env file if it exists
# Don't use dotenv
!pip install python-dotenv

from dotenv import load_dotenv
load_dotenv('../.env')

### LOAD Experiment from MLFLOW

Run the following in a terminal to start the MLflow server:

```bash
cd mlflow
mlflow server
```


In [None]:
import mlflow
import json

experiment_id = "508906627986939289"
run_id = "639b59a8bb2b476eb8a353a5ca4b6a66"

# search_runs is documented to take a list of experiment ids.
experiments = mlflow.search_runs(experiment_ids=[experiment_id])

# Keep only the row of the search result whose run_id equals `run_id`.
experiment = experiments.loc[experiments['run_id'] == run_id]
if experiment.empty:
    raise ValueError(f"Run {run_id} not found in experiment {experiment_id}")

# Read the parameters by position: the filtered frame keeps the row label it had in
# `experiments`, so a label lookup such as `[1]` only works when the run happens to
# sit at row label 1 of the search result.
cluster_lables = json.loads(experiment["params.Cluster_Labels"].iloc[0])
# Single quotes are swapped for double quotes so the value parses as JSON
# (presumably the list was logged as a Python repr — confirm).
criptos = json.loads(experiment["params.Criptocurrencies"].iloc[0].replace("'", '"'))

# Run without mlflow: uncomment the following lines and comment the previous ones
# criptos = ['close_ADA/USD', 'close_BCH/USD', 'close_BTC/USD', 'close_DOGE/USD', 'close_DOT/USD', 'close_EOS/USD', 'close_ETC/USD', 'close_ETH/USD', 'close_LTC/USD', 'close_XRP/USD']
# cluster_lables = [1, 0, 0, 1, 0, 0, 0, 0, 0, 1]

# Group the column names by cluster label in a single pass.
# Keys appear in order of first occurrence; members keep their original order.
cripto_clusters = {}
for i, label in enumerate(cluster_lables):
    cripto_clusters.setdefault(label, []).append(criptos[i])

for cluster in cripto_clusters:
    print(f'Cluster {cluster}: {cripto_clusters[cluster]}')

In [None]:
# Display the row(s) of the search result whose run_id matches the selected run.
experiments[experiments['run_id'].eq(run_id)]

### LOAD DATA

In [None]:
import pandas as pd
import os

# Directory that is scanned for CSV files.
folder = "../airflow/assets"

# Read every CSV in `folder` into its own dataframe.
# os.listdir returns names in arbitrary order; sorting keeps the order of `dfs`
# (and therefore of the merged columns built from it) stable across runs and machines.
dfs = []
for file in sorted(os.listdir(folder)):
    if file.endswith(".csv"):
        # skiprows=1 skips the first line of each file — presumably a banner line
        # above the real header row; confirm against the files.
        dfs.append(pd.read_csv(os.path.join(folder, file), skiprows=1, parse_dates=['date']))
print(len(dfs))

#### Merge everything into one dataframe

In [None]:
import pandas as pd

# Step 1: Convert the "date" column to datetime in all dataframes.
# Values that do not match the format become NaT (errors="coerce").
for df in dfs:
    df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d %H:%M:%S', errors="coerce")

# Step 2: Find the oldest and newest dates across all dataframes.
# Series.min()/max() skip NaT and avoid materialising every timestamp in a Python list.
all_dates = [df['date'] for df in dfs]
date_bounds = [(dates.min(), dates.max()) for dates in all_dates if dates.notna().any()]
if not date_bounds:
    raise ValueError("No valid dates found in the loaded dataframes")

oldest_date = min(bounds[0] for bounds in date_bounds)
newest_date = max(bounds[1] for bounds in date_bounds)

# Step 3: Create a new dataframe with the hourly date range.
# A Timedelta is passed as the frequency because the 'H' alias is deprecated since pandas 2.2.
date_range = pd.date_range(start=oldest_date, end=newest_date, freq=pd.Timedelta(hours=1))
merged_df = pd.DataFrame({'date': date_range})

# Step 4: Add one "close_{ticker}" column per dataframe to merged_df.
for df in dfs:
    if df.empty:
        print('Skipping an empty dataframe')
        continue

    # The ticker is read from the first row of the "symbol" column
    # (each file is assumed to hold a single symbol).
    ticker = df['symbol'].iloc[0]
    close_col_name = f'close_{ticker}'

    try:
        # Rows whose date could not be parsed are dropped first: NaT labels can leave the
        # index non-monotonic, which makes reindex(method='ffill') reject the whole coin.
        prices = df.dropna(subset=['date']).set_index('date').sort_index()
        if prices.empty:
            raise ValueError('no rows with a valid date')

        # Keep the first row per timestamp, then align to the hourly grid,
        # forward-filling hours that are missing in the file.
        prices = prices[~prices.index.duplicated(keep='first')].reindex(date_range, method='ffill')

        # After the reindex the index is exactly `date_range`; only the close column is needed.
        close_data = prices[['close']].rename(columns={'close': close_col_name})

        # Merge the "close_data" into the "merged_df"
        merged_df = pd.merge(merged_df, close_data, left_on='date', right_index=True, how='left')
    except ValueError as e:
        # Best effort: report the coin and carry on with the remaining ones.
        print(f'Error on coin {ticker}: {e}')


# Now, merged_df contains the desired data with the date range and "close_{ticker}" columns, with missing hours filled.

#### Split data into clusters

In [None]:
# Keep only the rows in which every column has a value.
merged_df = merged_df.dropna()

# Map each cluster label to the columns of merged_df that belong to that cluster.
# The loop variable is deliberately not named `criptos`: that would overwrite the
# notebook-level `criptos` list with the members of the last cluster.
clusters_data = {
    cluster: merged_df[cluster_criptos]
    for cluster, cluster_criptos in cripto_clusters.items()
}

# clusters_data now contains a dictionary with the cluster number as key and the dataframe with the criptos as value