### Install libraries


In [None]:
!pip install -r ../dev-requirements.txt

In [None]:
# Load the dataset through the project-local helper. The cells below slice
# `merged_df.columns[1:]` and plot against "Date", so they presumably expect
# the first column to be "Date" and every other column to be one price
# series — TODO confirm against `common.get_dataframe`.
from common import get_dataframe, register_training_experiment
merged_df = get_dataframe()

In [None]:
# Drop every column that contains at least one NaN value (axis=1 removes
# columns, not rows); the frame is modified in place.
merged_df.dropna(axis=1, inplace=True)
# Number of columns left, not counting the first one (the later cells treat
# the first column as the "Date" axis rather than as a series).
print(len(merged_df.columns) - 1)

In [None]:
# # copy the files of the merged_df columns to airflow/assets
# columns = [col.replace("close_", "") for col in merged_df.columns[1:]]
# for col in columns:
#     os.system(f"cp ../data/binance_1d/Binance_{col}_d.csv ../airflow/assets/binance_1d/{col}.csv")

### Plot the time series and the scaled time series

In [None]:
import mlflow
# Make "EDA binance_1d" the active experiment for this notebook.
mlflow.set_experiment("EDA binance_1d")
# Start a run and keep its handle; the plotting cells below call
# mlflow.log_artifact while this run is open.
run = mlflow.start_run()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import math
# Lay the series out on a grid with three charts per row.
series_cols = merged_df.columns[1:]
chart_rows = math.ceil(len(series_cols) / 3)
# figsize is (width, height) in inches: the grid is always three charts wide,
# so the width is fixed and the height grows with the number of rows.
plt.figure(figsize=(20, 5 * chart_rows))
grid = plt.GridSpec(chart_rows, 3, hspace=1.5)  # Adjust the value of hspace as needed

# The full date range is the same for every chart, so compute it once.
date_min = merged_df['Date'].min()
date_max = merged_df['Date'].max()

for i, col in enumerate(series_cols):
    # Fill the grid row by row: i // 3 is the row, i % 3 the column.
    ax = plt.subplot(grid[i // 3, i % 3])
    sns.lineplot(data=merged_df, x="Date", y=col, ax=ax)
    ax.set_title(col)

    # Rotate x-axis labels to an oblique angle (set on the axes itself rather
    # than through pyplot's "current axes" state)
    ax.tick_params(axis="x", labelrotation=45)

    # Set x-axis limits to display the full date range
    ax.set_xlim(date_min, date_max)

# Save before showing so the file contains the rendered figure, then attach
# it to the active MLflow run under the "time series" artifact directory.
plt.savefig("fig.png")
plt.show()
mlflow.log_artifact("fig.png", "time series")


# Create a custom Min-Max scaling function
def custom_min_max_scaling(column):
    """
    Rescale a one-dimensional numeric column to the [0, 1] interval.

    Each value is mapped to (value - min) / (max - min).

    Parameters:
    - column: A pandas Series (or NumPy array) of numeric values.

    Returns:
    - The scaled column, of the same type and length as the input. A constant
      column (max == min) is mapped to all zeros instead of NaN.
    """
    min_val = column.min()
    value_range = column.max() - min_val
    if value_range == 0:
        # Constant column: dividing by a zero range would turn every value
        # into NaN. Dividing by 1 keeps the (all-zero) numerator instead.
        value_range = 1
    return (column - min_val) / value_range

# Scale the numeric columns in the DataFrame (excluding 'Date')
numeric_cols = merged_df.columns[1:]
scaled_df = merged_df.copy()
scaled_df[numeric_cols] = scaled_df[numeric_cols].apply(custom_min_max_scaling, axis=0)

# Size the grid from the number of series (three charts per row) instead of
# assuming a fixed number of rows, so any column count fits without an
# IndexError. 2.5 inches per row keeps the original 20-inch height at 8 rows.
scaled_rows = math.ceil(len(numeric_cols) / 3)
plt.figure(figsize=(20, 2.5 * scaled_rows))
grid = plt.GridSpec(scaled_rows, 3, hspace=1.5)  # Adjust the value of hspace as needed

# The full date range is the same for every chart, so compute it once.
scaled_date_min = scaled_df['Date'].min()
scaled_date_max = scaled_df['Date'].max()

for i, col in enumerate(numeric_cols):
    # Fill the grid row by row: i // 3 is the row, i % 3 the column.
    ax = plt.subplot(grid[i // 3, i % 3])
    sns.lineplot(data=scaled_df, x="Date", y=col, ax=ax)
    ax.set_title(col)

    # Rotate x-axis labels to an oblique angle
    ax.tick_params(axis="x", labelrotation=45)

    # Set x-axis limits to display the full date range
    ax.set_xlim(scaled_date_min, scaled_date_max)


# Save the figure, display it, and attach the saved file to the active MLflow run
plt.savefig("fig.png")
plt.show()
mlflow.log_artifact("fig.png")


### Perform PACF

In [None]:
# Plot PACF for each column of merged_df except the date column using plot_pacf

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from statsmodels.graphics.tsaplots import plot_pacf


# Work on the series only: a copy of merged_df without the date column
data = merged_df.drop(columns=["Date"])

for col in data.columns:
    # Select the series by name so the plotted data always matches the title.
    # Positional indexing into merged_df would be shifted by the "Date"
    # column and would never reach the last series.
    fig = plot_pacf(data[col], lags=20, alpha=0.05)
    plt.title(col)
    # Save and log before plt.show(): once the figure has been shown it may
    # be released, and a later savefig would write an empty image.
    fig.savefig("fig.png")
    mlflow.log_artifact("fig.png", f"PACF {col}")
    plt.show()


In [None]:
# Close the active MLflow run. The object returned by mlflow.start_run() does
# not provide an end_run() method; the run is ended through the module-level
# function.
mlflow.end_run()

### Stationarity test

In [None]:
from statsmodels.tsa.stattools import adfuller

def adfuller_test(time_series, significance_level=0.05):
    """
    Run the Augmented Dickey-Fuller (ADF) test on a series and classify it.

    Parameters:
    - time_series: The observations to test, passed unchanged to `adfuller`
      (e.g. a pandas Series or a NumPy array).
    - significance_level: Threshold compared against the p-value (default is 0.05).

    Returns:
    - A tuple (adf_statistic, p_value, stationarity), where the first two
      items are the first two entries of the `adfuller` result and
      `stationarity` is a label: "Stationary (...)" when the p-value is at or
      below the threshold, "Non-Stationary (...)" otherwise.
    """
    # Only the first two entries of the result are needed here.
    adf_statistic, p_value = adfuller(time_series)[:2]

    # Choose the label template first, then fill in the threshold.
    if p_value <= significance_level:
        label_template = "Stationary (p <= {0})"
    else:
        label_template = "Non-Stationary (p > {0})"

    return adf_statistic, p_value, label_template.format(significance_level)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def calculate_and_plot_price_changes(_data, column_name, window_size=10):
    """
    Plot the percentage price changes of one series with their rolling mean and rolling standard deviation.

    Parameters:
    - _data: A pandas DataFrame with a 'Date' column and the price column
      named by column_name. It is copied, so the caller's frame is not modified.
    - column_name: The name of the column representing cryptocurrency prices.
    - window_size: Number of consecutive rows in each rolling window (default is 10).

    Returns:
    - None (plots the results).

    Raises:
    - ValueError: If 'Date' or column_name is not a column of _data.
    """

    data = _data.copy()
    if 'Date' not in data.columns:
        raise ValueError("The 'data' DataFrame must contain a 'Date' column.")

    if column_name not in data.columns:
        raise ValueError(f"The specified column '{column_name}' is not found in the DataFrame.")

    # Row-to-row percentage changes (daily for this notebook's 1d data)
    data['PriceChange'] = data[column_name].pct_change() * 100

    # Rolling mean and rolling standard deviation of the percentage changes
    rolling_window = data['PriceChange'].rolling(window=window_size)
    data['RollingMean'] = rolling_window.mean()
    data['RollingStd'] = rolling_window.std()

    # Plot against the 'Date' column so the x-axis shows real dates, matching
    # its label, instead of the frame's positional index.
    dates = data['Date']
    plt.figure(figsize=(12, 6))
    plt.plot(dates, data['PriceChange'], label='Price Change (%)', color='blue')
    plt.plot(dates, data['RollingMean'], label=f'Rolling Mean ({window_size}-day)', color='green')
    plt.plot(dates, data['RollingStd'], label=f'Rolling Std Deviation ({window_size}-day)', color='red')

    plt.xlabel('Date')
    plt.ylabel('Percentage Change / Rolling Statistics')
    plt.title(f'{column_name} Price Changes and Rolling Statistics')
    plt.legend()
    plt.grid(True)

    plt.show()

In [None]:
# For every series (all columns after the first): print the ADF test outcome,
# then plot its percentage changes with rolling statistics.
result_labels = ("ADF Statistic:", "p-value:", "Stationarity:")
for cripto in merged_df.columns[1:]:
    print(f'Analyzing: {cripto}')

    adf_statistic, p_value, stationarity = adfuller_test(merged_df[cripto])
    # One "label value" line per item of the test result
    for label, value in zip(result_labels, (adf_statistic, p_value, stationarity)):
        print(label, value)

    # The plotting helper needs the 'Date' column next to the series itself
    series_frame = merged_df[['Date', cripto]]
    calculate_and_plot_price_changes(series_frame, cripto)
    print("\n")
    print("\n---------------------------------\n")