### SET UP ENVIRONMENT

In [None]:
!pip install -r ../dev-requirements.txt

In [None]:
# Load .env file if it exists
# Don't use dotenv
!pip install python-dotenv

from dotenv import load_dotenv
load_dotenv('../.env')

### LOAD EXPERIMENT FROM MLFLOW

Start the MLflow tracking server before running the next cell:

``` bash
cd mlflow
mlflow server
```


In [None]:
import mlflow
import json

experiment_id = "508906627986939289"
run_id = "639b59a8bb2b476eb8a353a5ca4b6a66"

# MLflow section: fetch the runs of the experiment and keep the one matching `run_id`.
experiments = mlflow.search_runs(experiment_ids=[experiment_id])
experiment = experiments.loc[experiments['run_id'] == run_id]
if experiment.empty:
    raise ValueError(f'Run {run_id} was not found in experiment {experiment_id}')

# Pick the matching run by position: its row label inside `experiments` is arbitrary,
# so a label lookup such as `[1]` only works when the run happens to sit on row 1.
run_params = experiment.iloc[0]
cluster_lables = json.loads(run_params["params.Cluster_Labels"])
# Single quotes are swapped for double quotes so the logged list parses as JSON
# (presumably the parameter was logged as a Python list repr - TODO confirm).
criptos = json.loads(run_params["params.Criptocurrencies"].replace("'", '"'))

# Run without mlflow: uncomment the following lines and comment the MLflow section above
# criptos = ['close_ADA/USD', 'close_BCH/USD', 'close_BTC/USD', 'close_DOGE/USD', 'close_DOT/USD', 'close_EOS/USD', 'close_ETC/USD', 'close_ETH/USD', 'close_LTC/USD', 'close_XRP/USD']
# cluster_lables = [0, 0, 1, 1, 0, 0, 1, 0, 0]
# NOTE(review): the sample above lists 9 labels for 10 coins; the last coin would stay unassigned.

# Labels and coins are paired by position, so a length mismatch must not go unnoticed.
if len(cluster_lables) > len(criptos):
    raise ValueError(
        f'Got {len(cluster_lables)} cluster labels for only {len(criptos)} cryptocurrencies'
    )
if len(cluster_lables) < len(criptos):
    print(f'Warning: no cluster label for {criptos[len(cluster_lables):]}; they are left out of every cluster')

# Group the coins by cluster label in a single pass (cluster label -> list of column names).
cripto_clusters = {}
for cripto, label in zip(criptos, cluster_lables):
    cripto_clusters.setdefault(label, []).append(cripto)

for cluster, cluster_criptos in cripto_clusters.items():
    print(f'Cluster {cluster}: {cluster_criptos}')

### LOAD DATA

In [None]:
import pandas as pd
import os

# Folder scanned for the CSV files to load.
folder = "../airflow/assets"

# os.listdir returns names in arbitrary order; sorting makes the load order
# (and therefore the column order of anything built from `dfs`) reproducible.
csv_files = sorted(name for name in os.listdir(folder) if name.endswith(".csv"))

# skiprows=1 skips the first line of every file (presumably a banner above the header - TODO confirm).
dfs = [
    pd.read_csv(os.path.join(folder, name), skiprows=1, parse_dates=['date'])
    for name in csv_files
]
print(len(dfs))

#### Merge all in one dataframe

In [None]:
import pandas as pd

# Step 1: Convert "date" column to datetime in all dataframes (unparseable values become NaT)
for df in dfs:
    df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d %H:%M:%S', errors="coerce")

# Step 2: Find the oldest and newest dates across all dataframes.
# Series.min()/max() skip NaT, so the dates do not need to be flattened into a Python list.
all_dates = pd.concat([df['date'] for df in dfs], ignore_index=True)
oldest_date = all_dates.min()
newest_date = all_dates.max()

# Step 3: Create a new dataframe with the date range (hourly frequency).
# An explicit offset object is used instead of the 'H' string alias, which newer pandas deprecates.
date_range = pd.date_range(start=oldest_date, end=newest_date, freq=pd.offsets.Hour())
merged_df = pd.DataFrame({'date': date_range})

# Step 4: Add one "close_{ticker}" column per dataframe to merged_df
for df in dfs:
    ticker = '<unknown>'  # keeps the error message below valid even if the lookup itself fails
    try:
        ticker = df['symbol'].iloc[0]  # Each dataframe is expected to carry its ticker in "symbol"
        close_col_name = f'close_{ticker}'

        # Rows whose date could not be parsed are dropped first: a NaT in the index makes it
        # non-monotonic, and reindex(method='ffill') would then reject the whole coin.
        closes = df.dropna(subset=['date']).set_index('date').sort_index()
        closes = closes[~closes.index.duplicated(keep='first')]

        # Align the close prices on the hourly grid, forward-filling missing hours
        close_data = (
            closes[['close']]
            .reindex(date_range, method='ffill')
            .rename(columns={'close': close_col_name})
        )

        # Merge the "close_data" into the "merged_df"
        merged_df = pd.merge(merged_df, close_data, left_on='date', right_index=True, how='left')
    except ValueError as e:
        print(f'Error on coin {ticker}: {e}')


# Now, merged_df contains the desired data with the date range and "close_{ticker}" columns, with missing hours filled.

In [None]:
clusters_data = {}
# Keep only the rows where every coin has a price.
merged_df = merged_df.dropna()

# Build one dataframe per cluster: the cluster's coin columns plus 'date'.
for cluster, cluster_criptos in cripto_clusters.items():
    # A new column list is built instead of appending 'date' to the cluster's own list:
    # mutating cripto_clusters would add 'date' again on every re-run of this cell.
    # The filter also tolerates lists that already contain 'date'.
    columns = [name for name in cluster_criptos if name != 'date'] + ['date']
    # .copy() gives every cluster an independent frame instead of a slice of merged_df.
    clusters_data[cluster] = merged_df[columns].copy()

# clusters_data now contains a dictionary with the cluster number as key and the dataframe with the criptos as value

### MODELS SECTION

#### Data analysis

In [None]:
import pandas as pd
from statsmodels.tsa.stattools import adfuller

def adfuller_test(time_series, significance_level=0.05):
    """
    Run the Augmented Dickey-Fuller (ADF) test and label the series as stationary or not.

    Parameters:
    - time_series: The observations to test; passed unchanged to statsmodels' `adfuller`.
    - significance_level: p-value threshold at or below which the series is labelled
      stationary (default is 0.05).

    Returns:
    - A tuple `(adf_statistic, p_value, stationarity)`, where `stationarity` is a
      human-readable label that includes the threshold used.
    """
    # Only the first two entries of the statsmodels result are needed here.
    adf_statistic, p_value = adfuller(time_series)[:2]

    is_stationary = p_value <= significance_level
    stationarity = (
        f"Stationary (p <= {significance_level})"
        if is_stationary
        else f"Non-Stationary (p > {significance_level})"
    )

    return adf_statistic, p_value, stationarity

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def calculate_and_plot_price_changes(data, column_name, window_size=10):
    """
    Plot the percentage change of a price column together with its rolling mean and
    rolling standard deviation.

    The input DataFrame is left untouched: all derived series are kept local, so
    no helper columns leak into the caller's frame.

    Parameters:
    - data: A pandas DataFrame containing a 'date' column and the specified cryptocurrency price column.
    - column_name: The name of the column representing cryptocurrency prices.
    - window_size: The window size (in rows) for the rolling statistics (default is 10).

    Returns:
    - None (plots the results).

    Raises:
    - ValueError: if `data` has no 'date' column or no `column_name` column.
    """
    if 'date' not in data.columns:
        raise ValueError("The 'data' DataFrame must contain a 'date' column.")

    if column_name not in data.columns:
        raise ValueError(f"The specified column '{column_name}' is not found in the DataFrame.")

    # Row-to-row percentage change of the price
    price_change = data[column_name].pct_change() * 100

    # Rolling mean and rolling standard deviation of the percentage change
    rolling_window = price_change.rolling(window=window_size)
    rolling_mean = rolling_window.mean()
    rolling_std = rolling_window.std()

    # Plot against the 'date' column so the x axis matches its 'Date' label
    dates = data['date']
    plt.figure(figsize=(12, 6))
    plt.plot(dates, price_change, label='Price Change (%)', color='blue')
    plt.plot(dates, rolling_mean, label=f'Rolling Mean ({window_size}-hour)', color='green')
    plt.plot(dates, rolling_std, label=f'Rolling Std Deviation ({window_size}-hour)', color='red')

    plt.xlabel('Date')
    plt.ylabel('Percentage Change / Rolling Statistics')
    plt.title(f'{column_name} Price Changes and Rolling Statistics')
    plt.legend()
    plt.grid(True)

    plt.show()

In [None]:
# Stationarity test and price-change plot for every coin of every cluster.
for cluster in cripto_clusters:
    print(f'Cluster {cluster}: {cripto_clusters[cluster]}\n')
    for cripto in cripto_clusters[cluster]:
        # The cluster lists may contain the helper entry 'date', which is not a price series.
        if cripto == 'date':
            continue

        print(f'Analyzing: {cripto}')

        adf_statistic, p_value, stationarity = adfuller_test(clusters_data[cluster][cripto])
        print("ADF Statistic:", adf_statistic)
        print("p-value:", p_value)
        print("Stationarity:", stationarity)

        # A copy is handed over so the plotting helper can never add columns to
        # clusters_data, which is used again for model training.
        calculate_and_plot_price_changes(clusters_data[cluster].copy(), cripto)
        print("\n")
        print("\n---------------------------------\n")

#### Multivariate Time Series - Vector Auto Regression (VAR)

[Source](https://www.analyticsvidhya.com/blog/2018/09/multivariate-time-series-guide-forecasting-modeling-python-codes/)

In [None]:
from math import sqrt

from sklearn.metrics import mean_squared_error
from statsmodels.tsa.vector_ar.var_model import VAR

# NOTE(review): `cluster` is not read by mts_var_training below, and this lookup raises
# KeyError when no cluster is labelled 0 - confirm whether the assignment is still needed.
cluster = clusters_data[0]

def mts_var_training(cluster_data):
    """
    Fit a Vector Auto Regression (VAR) model on one cluster and score it on a validation split.

    The rows are split chronologically (in their existing order) into 70% train and
    the remainder halved into validation and test. The model is fitted on the train
    split, forecast over the length of the validation split, and the RMSE of every
    column is printed. The test split is created and counted but not used here.

    Parameters:
    - cluster_data: A pandas DataFrame with a 'date' column plus one column per series.
      It is not modified.

    Returns:
    - A dict mapping each series column name to its validation RMSE.

    Raises:
    - AssertionError: if the three splits do not add up to the full data length.
    """
    # Data preparation

    # Steps 1-2: drop the "date" column from the features and use it as the index instead
    data = cluster_data.drop(columns=['date'])
    data.index = cluster_data['date']
    cols = data.columns

    # Step 3: Dealing with missing values --> already done

    # Step 4: Split the data into train, validation and test sets
    total_size = len(data)
    train_size = int(total_size * 0.7)
    # Validation takes the floor half of the remainder; the test set takes what is left
    valid_end = train_size + (total_size - train_size) // 2

    train = data.iloc[:train_size]
    valid = data.iloc[train_size:valid_end]
    test = data.iloc[valid_end:]

    print('train size: ', len(train))
    print('valid size: ', len(valid))
    print('test size: ', len(test))
    print('total size: ', total_size)
    print("\n")
    assert total_size == len(train) + len(valid) + len(test)

    model = VAR(endog=train)
    model_fit = model.fit()
    print(model_fit)

    # make prediction on validation
    prediction = model_fit.forecast(model_fit.endog, steps=len(valid))

    # Wrap the forecast array in a dataframe aligned with the validation rows.
    # Building it in one call keeps plain column labels and avoids chained
    # `iloc[i][j] = ...` assignment, which may write to a temporary copy.
    pred = pd.DataFrame(prediction, index=valid.index, columns=cols)

    # check rmse
    rmse_by_column = {}
    for col in cols:
        rmse = sqrt(mean_squared_error(pred[col], valid[col]))
        print('RMSE value for', col, 'is : ', rmse)
        rmse_by_column[col] = rmse

    return rmse_by_column

In [None]:
# Train and score one VAR model per cluster, printing a separator after each.
for cluster, cluster_criptos in cripto_clusters.items():
    print(f'Cluster {cluster}: {cluster_criptos}\n')
    mts_var_training(clusters_data[cluster])
    print("\n---------------------------------\n")

#### Multivariate Time Series Forecasting with Deep Learning

[Source 1](https://towardsdatascience.com/multivariate-time-series-forecasting-with-deep-learning-3e7b3e2d2bcf)
[Source 2](https://machinelearningmastery.com/multivariate-time-series-forecasting-lstms-keras/)

In [None]:
# import pandas as pd
# import numpy as np
# import tensorflow as tf
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import MinMaxScaler
# from sklearn.metrics import mean_squared_error
# from math import sqrt

# # Load your cryptocurrency price data (replace this with your own data)
# data = clusters_data[0]
# data['date'] = pd.to_datetime(data['date'])  # Convert the 'date' column to datetime

# # Sort the data by date
# data = data.sort_values(by='date')

# # Define the number of previous time steps to consider for prediction
# n_steps = 10  # You can adjust this value

# # Create input data by shifting prices to create sequences
# X = data.drop(columns=['date']).values
# X_seq = [X[i:i + n_steps] for i in range(len(X) - n_steps)]

# # Shift the closing price to predict the next closing price
# y = data.drop(columns=['date']).shift(-n_steps).values

# train_size = int(len(data) * 0.7)
# test_size = (len(data) - train_size) / 2

# train = data.iloc[0:train_size] 
# valid = data.iloc[train_size:int(len(data) - test_size)]
# test = data.iloc[int(train_size + test_size):len(data)]

# print('train size: ', len(train))
# print('valid size: ', len(valid))
# print('test size: ', len(test))
# print('total size: ', len(data))
# print("\n")
# assert len(data) == len(train) + len(valid) + len(test)

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X_seq, y[:-n_steps], test_size=0.2, shuffle=False)

# # split the training set into training and validation sets
# X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, shuffle=False)

# # Reshape the data to 3D for LSTM
# X_train = np.array(X_train)
# X_test = np.array(X_test)
# y_train = np.array(y_train)
# y_test = np.array(y_test)

# # Create a function to calculate RMSE
# def calculate_rmse(y_true, y_pred):
#     return sqrt(mean_squared_error(y_true, y_pred))

# # Define the model
# model = tf.keras.Sequential([
#     tf.keras.layers.LSTM(64, activation='relu', input_shape=(n_steps, X_train.shape[2]), return_sequences=True),
#     tf.keras.layers.LSTM(64, activation='relu'),
#     tf.keras.layers.Dense(X_train.shape[2])
# ])

# # Compile the model
# model.compile(optimizer='adam', loss='mse')

# # Train the model
# model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

# # Make predictions
# y_pred = model.predict(X_test)

# # Calculate and print RMSE
# rmse = calculate_rmse(y_test, y_pred)
# print(f'Root Mean Squared Error (RMSE): {rmse:.4f}')
