### SET UP ENVIRONMENT

In [None]:
!pip install -r ../dev-requirements.txt

In [None]:
# Load .env file if it exists
# Don't use dotenv
!pip install python-dotenv

from dotenv import load_dotenv
load_dotenv('../.env')

### LOAD DATA

In [None]:
import pandas as pd
import os


folder = os.path.join("../../airflow/assets/binance_1d")
dfs = []
for file in os.listdir(folder):
    if file.endswith(".csv"):
        dfs.append(pd.read_csv(os.path.join(folder, file), skiprows=1, parse_dates=['Date']))
print(dfs.__len__())

#### Merge all in one dataframe

In [None]:
import pandas as pd

# Step 1: Convert "date" column to datetime in all dataframes
for df in dfs:
    df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d', errors="coerce")

# Step 2: Find the oldest and newest dates across all dataframes
all_dates = [df['Date'] for df in dfs]
all_dates_flat = [date for sublist in all_dates for date in sublist if not pd.isnull(date)]

oldest_date = '2019-01-01'
newest_date = max(all_dates_flat)

# Step 3: Create a new dataframe with the date range
date_range = pd.date_range(start=oldest_date, end=newest_date, freq='D')  # Daily frequency
merged_df = pd.DataFrame({'Date': date_range})

# Step 4: Add "close" and "Volume USDT" columns from each dataframe to the merged_df using list comprehension
for df in dfs:
    try:
        ticker = df['Symbol'].iloc[0]  # Assuming each dataframe has a "symbol" column
        close_col_name = f'close_{ticker}'
        volume_col_name = f'Volume USDT_{ticker}'  # Replace with the actual column name if it's different in your data

        df = df.set_index('Date').sort_index()

        # Create DataFrames with the "date" and "close" columns
        close_data = df[df.index.isin(date_range)][['Close']]
        close_data.rename(columns={'Close': close_col_name}, inplace=True)

        # Merge the "close_data" into the "merged_df"
        merged_df = pd.merge(merged_df, close_data, left_on='Date', right_index=True, how='left')

        # Add the "Volume USDT" column to the merged_df (replace 'Volume USDT' with the actual column name if it's different)
        # merged_df[volume_col_name] = df['Volume USDT']

    except ValueError as e:
        print(f'Error on coin {ticker}: {e}')


# print number of columns -1 of merged_df
print(merged_df.columns.__len__()-1)

### LOAD Experiment from MLFLOW

Run 

``` python
cd mlflow
mlflow server
```

In [None]:
import mlflow
import json

experiment_id = "110357928989408424"
run_id = "35f1bb80732f433297fda78e6638feab"

# Mlfow Section
experiments = mlflow.search_runs(experiment_ids=experiment_id)
experiment = experiments.loc[experiments['run_id'] == run_id]

# Use eval() to convert the string to a list of tuples
data_list = eval(experiment["params.Cluster_Labels"].tolist()[0])

# Convert the list of tuples to a dictionary
data_dict = dict(data_list)

# Create a map where keys are the values from the original dictionary
cripto_clusters = {}
for key, value in data_dict.items():
    if value in cripto_clusters:
        cripto_clusters[value].append(key)
    else:
        cripto_clusters[value] = [key]

clusters_data = {}

# loop on key and value of cripto_clusters
for cluster, criptos in cripto_clusters.items():
    _criptos = criptos + ['Date']
    clusters_data[cluster] = merged_df[_criptos]

# Clusters now contains a dictionary with the cluster number as key and the dataframe with the criptos as value

### MODELS SECTION

#### Data analysis

In [None]:
from statsmodels.tsa.stattools import adfuller

def adfuller_test(time_series, significance_level=0.05):
    """
    Perform Augmented Dickey-Fuller (ADF) test for stationarity.

    Parameters:
    - time_series: A pandas Series or NumPy array containing the time series data.
    - significance_level: The significance level for the test (default is 0.05).

    Returns:
    - ADF test result and p-value.
    - A string indicating the stationarity based on the p-value.
    """

    result = adfuller(time_series)
    adf_statistic = result[0]
    p_value = result[1]

    if p_value <= significance_level:
        stationarity = "Stationary (p <= {0})".format(significance_level)
    else:
        stationarity = "Non-Stationary (p > {0})".format(significance_level)

    return adf_statistic, p_value, stationarity

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def calculate_and_plot_price_changes(cluster_data, column_name, window_size=10):
    """
    Calculate and plot first-order percentage differences, rolling mean, and rolling standard deviation.

    Parameters:
    - data: A pandas DataFrame containing a 'date' index and the specified cryptocurrency price column.
    - column_name: The name of the column representing cryptocurrency prices.
    - window_size: The window size for the rolling statistics (default is 10).

    Returns:
    - None (plots the results).
    """
    data = cluster_data.copy()
    if 'Date' not in data.columns:
        raise ValueError("The 'data' DataFrame must contain a 'date' column as the index.")

    if column_name not in data.columns:
        raise ValueError(f"The specified column '{column_name}' is not found in the DataFrame.")
    
    # Calculate daily percentage changes
    data['PriceChange'] = data[column_name].pct_change() * 100

    # Calculate rolling mean and rolling standard deviation
    data['RollingMean'] = data['PriceChange'].rolling(window=window_size).mean()
    data['RollingStd'] = data['PriceChange'].rolling(window=window_size).std()

    # Plot the results
    plt.figure(figsize=(12, 6))
    plt.plot(data.index, data['PriceChange'], label='Price Change (%)', color='blue')
    plt.plot(data.index, data['RollingMean'], label=f'Rolling Mean ({window_size}-hour)', color='green')
    plt.plot(data.index, data['RollingStd'], label=f'Rolling Std Deviation ({window_size}-hour)', color='red')

    plt.xlabel('Date')
    plt.ylabel('Percentage Change / Rolling Statistics')
    plt.title(f'{column_name} Price Changes and Rolling Statistics')
    plt.legend()
    plt.grid(True)

    plt.show()

In [None]:
for cluster in cripto_clusters:
    print(f'Cluster {cluster}: {cripto_clusters[cluster]}\n')
    for cripto in clusters_data[cluster]:
        if cripto != 'Date':
            print(f'Analyzing: {cripto}')

            adf_statistic, p_value, stationarity = adfuller_test(clusters_data[cluster][cripto])
            print("ADF Statistic:", adf_statistic)
            print("p-value:", p_value)
            print("Stationarity:", stationarity)

            calculate_and_plot_price_changes(clusters_data[cluster], cripto)
            print("\n")
        print("\n---------------------------------\n")

#### Multivariate Time Series - Vector Auto Regression (VAR)

[Source](https://www.analyticsvidhya.com/blog/2018/09/multivariate-time-series-guide-forecasting-modeling-python-codes/)

In [167]:
from math import sqrt

from sklearn.metrics import mean_squared_error
from statsmodels.tsa.vector_ar.var_model import VAR

cluster = clusters_data[0]

def mts_var_training(cluster_data):
    
    # Data preparation

    # Step 1: Remove the "date" column
    data = cluster_data.drop(['Date'], axis=1)
    cols = data.columns
    # Step 2: Set the "date" column as the index
    data.index = cluster_data.Date

    # Step 3: Dealing with missing values --> already done

    # Step 4: Split the data into train and test sets
    train_size = int(len(data) * 0.7)
    test_size = (len(data) - train_size) / 2

    train = data.iloc[0:train_size] 
    valid = data.iloc[train_size:int(len(data) - test_size)]
    test = data.iloc[int(train_size + test_size):len(data)]

    print('train size: ', len(train))
    print('valid size: ', len(valid))
    print('test size: ', len(test))
    print('total size: ', len(data))
    print("\n")
    assert len(data) == len(train) + len(valid) + len(test)

    model = VAR(endog=train)
    model_fit = model.fit()
    print(model_fit)

    # make prediction on validation
    prediction = model_fit.forecast(model_fit.endog, steps=len(valid))

    #converting predictions to dataframe
    pred = pd.DataFrame(index=range(0,len(prediction)),columns=[cols])
    for j in range(0, len(cols)):
        for i in range(0, len(prediction)):
            pred.iloc[i][j] = prediction[i][j]

    #check rmse
    for i in cols:
        print('RMSE value for', i, 'is : ', sqrt(mean_squared_error(pred[i], valid[i])))

    return prediction, valid

In [168]:
for cluster in cripto_clusters:
    print(f'Cluster {cluster}: {cripto_clusters[cluster]}\n')
    prediction, valid = mts_var_training(clusters_data[cluster])
    print("\n---------------------------------\n")

Cluster 2: ['close_ETHUSDT', 'close_BNBUSDT', 'close_BTCUSDT', 'close_TRXUSDT']

train size:  1226
valid size:  263
test size:  263
total size:  1752


<statsmodels.tsa.vector_ar.var_model.VARResultsWrapper object at 0x7fd770d4a700>
RMSE value for close_ETHUSDT is :  315.2080548562406
RMSE value for close_BNBUSDT is :  85.0057633834698
RMSE value for close_BTCUSDT is :  5960.647829501882
RMSE value for close_TRXUSDT is :  0.017235701377382416

---------------------------------

Cluster 0: ['close_QTUMUSDT', 'close_VETUSDT', 'close_XRPUSDT', 'close_XLMUSDT', 'close_ICXUSDT', 'close_IOTAUSDT', 'close_ADAUSDT', 'close_ETCUSDT', 'close_NEOUSDT', 'close_LTCUSDT']

train size:  1226
valid size:  263
test size:  263
total size:  1752


<statsmodels.tsa.vector_ar.var_model.VARResultsWrapper object at 0x7fd770cf3d90>


  self._init_dates(dates, freq)
  pred.iloc[i][j] = prediction[i][j]
  self._init_dates(dates, freq)
  pred.iloc[i][j] = prediction[i][j]


RMSE value for close_QTUMUSDT is :  2.4671660177320085
RMSE value for close_VETUSDT is :  0.01933030801617754
RMSE value for close_XRPUSDT is :  0.13595817288349588
RMSE value for close_XLMUSDT is :  0.07409756398581484
RMSE value for close_ICXUSDT is :  0.47981077187734766
RMSE value for close_IOTAUSDT is :  0.34533006544278005
RMSE value for close_ADAUSDT is :  0.23066477800626328
RMSE value for close_ETCUSDT is :  8.513498167204158
RMSE value for close_NEOUSDT is :  13.391528379037657
RMSE value for close_LTCUSDT is :  42.466410704038836

---------------------------------

Cluster 1: ['close_EOSUSDT', 'close_ONTUSDT', 'close_NULSUSDT']

train size:  1226
valid size:  263
test size:  263
total size:  1752


<statsmodels.tsa.vector_ar.var_model.VARResultsWrapper object at 0x7fd770cf3310>
RMSE value for close_EOSUSDT is :  2.263578235903304
RMSE value for close_ONTUSDT is :  0.524117880645247
RMSE value for close_NULSUSDT is :  0.2368666152294531

---------------------------------



  self._init_dates(dates, freq)
  pred.iloc[i][j] = prediction[i][j]


In [155]:
# Create a dataframe
df = pd.DataFrame()
df["predicton"] = prediction[:, 0]
df["close_EOSUSDT"] = valid['close_EOSUSDT'].values

# Calculate MSE and RMSE for the prediction
from sklearn.metrics import mean_squared_error
from math import sqrt

mse = mean_squared_error(df["close_EOSUSDT"], df["predicton"])
mse
rmse = sqrt(mse)
rmse


2.263578235903304

#### Multivariate Time Series Forecasting with Deep Learning

[Source 1](https://towardsdatascience.com/multivariate-time-series-forecasting-with-deep-learning-3e7b3e2d2bcf) \
[Source 2](https://machinelearningmastery.com/multivariate-time-series-forecasting-lstms-keras/)

In [172]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

def get_train_test_data(cluster_data, n_steps=10, test_size=0.2, shuffle=False):

    data = cluster_data.copy() 

    data['Date'] = pd.to_datetime(data['Date'])  # Convert the 'date' column to datetime

    # Sort the data by date
    data = data.sort_values(by='Date')

    # Define the number of previous time steps to consider for prediction
    n_steps = 10  # You can adjust this value

    # Create input data by shifting prices to create sequences
    X = data.drop(columns=['Date']).values
    X_seq = [X[i:i + n_steps] for i in range(len(X) - n_steps)]

    # Shift the closing price to predict the next closing price
    y = data.drop(columns=['Date']).shift(-n_steps).values

    # Split the data into training and testing sets
    _X_train, X_test, _y_train, y_test = train_test_split(X_seq, y[:-n_steps], test_size=0.2, shuffle=False)

    # split the training set into training and validation sets
    X_train, X_valid, y_train, y_valid = train_test_split(_X_train, _y_train, test_size=0.2, shuffle=False)

    # Reshape the data to 3D for LSTM
    X_train = np.array(X_train)
    X_test = np.array(X_test)
    X_valid = np.array(X_valid)
    y_valid = np.array(y_valid)
    y_train = np.array(y_train)
    y_test = np.array(y_test)

    print('X_train shape:', X_train.shape)
    print('y_train shape:', y_train.shape)
    print('X_valid shape:', X_valid.shape)
    print('y_valid shape:', y_valid.shape)
    print('X_test shape:', X_test.shape)
    print('y_test shape:', y_test.shape)
    print("\n\n")

    return X_train, X_valid, X_test, y_train, y_valid, y_test

In [158]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

# Create a function to calculate RMSE
def calculate_rmse(y_true, y_pred):
    return sqrt(mean_squared_error(y_true, y_pred))

def lstm_training(cluster_data, n_steps=10, test_size=0.2, shuffle=False):
    data = cluster_data.copy()
    X_train, X_valid, X_test, y_train, y_valid, y_test = get_train_test_data(data, n_steps, test_size, shuffle)
    
    # Define the model
    model = tf.keras.Sequential([
        tf.keras.layers.LSTM(64, activation='relu', input_shape=(n_steps, X_train.shape[2]), return_sequences=True),
        tf.keras.layers.LSTM(64, activation='relu'),
        tf.keras.layers.Dense(X_train.shape[2])
    ])

    # Compile the model
    model.compile(optimizer='adam', loss='mse')

    # Train the model
    model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_valid, y_valid))

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate and print RMSE
    for i, cripto in enumerate(data.columns[:-1]):
        rmse = calculate_rmse(y_test[:, i], y_pred[:, i])
        print(f'Root Mean Squared Error (RMSE) for {cripto}: {rmse:.4f}')

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
for cluster in cripto_clusters:
    print(f'Cluster {cluster}: {cripto_clusters[cluster]}\n')
    lstm_training(clusters_data[cluster])
    print("\n---------------------------------\n")

#### Foo Model

In [173]:
import random
from sklearn.metrics import mean_squared_error
from math import sqrt

# Define a function to calculate RMSE
def calculate_rmse(y_true, y_pred):
    return sqrt(mean_squared_error(y_true, y_pred))

# Simulate the Foo Model
def foo_model(x):
    #x = x[1]
    return x + (random.randint(-1, 1) * 0.2 * x)

def foo_model_training(cluster_data):
    _, X_valid, _, _, y_valid, _ = get_train_test_data(cluster_data)
    data = cluster_data.copy()
    for i, cripto in enumerate(data.columns[:-1]):
        predictions = foo_model(clusters_data[cluster]["close_EOSUSDT"].values)
        rmse = calculate_rmse(y_valid[i], predictions)
        print(f'Root Mean Squared Error (RMSE) for {cripto}: {rmse:.4f}')

In [187]:
clusters_data[cluster]["close_EOSUSDT"].values

2.6184

In [174]:
for cluster in cripto_clusters:
    print(f'Cluster {cluster}: {cripto_clusters[cluster]}\n')
    foo_model_training(clusters_data[cluster])
    print("\n---------------------------------\n")

Cluster 2: ['close_ETHUSDT', 'close_BNBUSDT', 'close_BTCUSDT', 'close_TRXUSDT']

X_train shape: (1114, 10, 4)
y_train shape: (1114, 4)
X_valid shape: (279, 10, 4)
y_valid shape: (279, 4)
X_test shape: (349, 10, 4)
y_test shape: (349, 4)



Root Mean Squared Error (RMSE) for close_ETHUSDT: 2812.8998
Root Mean Squared Error (RMSE) for close_BNBUSDT: 4371.4659
Root Mean Squared Error (RMSE) for close_BTCUSDT: 1703.3081
Root Mean Squared Error (RMSE) for close_TRXUSDT: 2403.2470

---------------------------------

Cluster 0: ['close_QTUMUSDT', 'close_VETUSDT', 'close_XRPUSDT', 'close_XLMUSDT', 'close_ICXUSDT', 'close_IOTAUSDT', 'close_ADAUSDT', 'close_ETCUSDT', 'close_NEOUSDT', 'close_LTCUSDT']

X_train shape: (1114, 10, 10)
y_train shape: (1114, 10)
X_valid shape: (279, 10, 10)
y_valid shape: (279, 10)
X_test shape: (349, 10, 10)
y_test shape: (349, 10)



Root Mean Squared Error (RMSE) for close_QTUMUSDT: 2.3333
Root Mean Squared Error (RMSE) for close_VETUSDT: 5.6839
Root Mean Squared E