## Importing the required Libraries

In [46]:
import pandas as pd
import io
import os
import numpy as np
import seaborn as sns
import statsmodels.api as sm
from scipy.stats import kendalltau
from scipy.stats import pearsonr, spearmanr, kendalltau
import matplotlib.pyplot as plt
from scipy.fft import fft
from scipy.signal import hilbert
from scipy.stats import skew, kurtosis
import mplfinance as mpf
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel
from sklearn.model_selection import train_test_split
import plotly.graph_objs as go
import kaleido
import talib
import warnings
warnings.simplefilter("ignore")




In [None]:
df_3m = pd.read_csv('btcusdt_3m_train.csv')
df_3m.rename(columns={'datetime': 'Date'}, inplace=True)
df_3m['Date'] = pd.to_datetime(df_3m['Date'])
df_3m = df_3m.drop('Unnamed: 0', axis=1) 
df_3m.set_index('Date', inplace=True)

In [None]:
df_5m = pd.read_csv('btcusdt_5m_train.csv')
df_5m.rename(columns={'datetime': 'Date'}, inplace=True)
df_5m['Date'] = pd.to_datetime(df_5m['Date'])
df_5m = df_5m.drop('Unnamed: 0', axis=1) 
df_5m.set_index('Date', inplace=True)


In [26]:
df_15m = pd.read_csv('btcusdt_15m_train.csv')
df_15m.rename(columns={'datetime': 'Date'}, inplace=True)
df_15m['Date'] = pd.to_datetime(df_15m['Date'])
df_15m = df_15m.drop('Unnamed: 0', axis=1) 
df_15m.set_index('Date', inplace=True)
df_15m

Unnamed: 0_level_0,open,high,low,close,volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-01-01 05:30:00,13715.65,13715.65,13400.01,13556.15,123.616013
2018-01-01 05:45:00,13533.75,13550.87,13402.00,13521.12,98.136430
2018-01-01 06:00:00,13500.00,13545.37,13450.00,13470.41,79.904037
2018-01-01 06:15:00,13494.65,13690.87,13450.00,13529.01,141.699719
2018-01-01 06:30:00,13528.99,13571.74,13402.28,13445.63,72.537533
...,...,...,...,...,...
2021-05-31 23:00:00,36869.99,36906.96,36673.41,36784.46,798.973804
2021-05-31 23:15:00,36782.96,36949.80,36707.33,36934.73,721.908787
2021-05-31 23:30:00,36935.81,36955.02,36750.46,36759.67,555.415983
2021-05-31 23:45:00,36759.89,36874.39,36677.37,36758.45,577.061267


In [None]:
df_30m = pd.read_csv('btcusdt_30m_train.csv')
df_30m.rename(columns={'datetime': 'Date'}, inplace=True)
df_30m['Date'] = pd.to_datetime(df_30m['Date'])
df_30m = df_30m.drop('Unnamed: 0', axis=1) 
df_30m.set_index('Date', inplace=True)

In [13]:
df_1hr = pd.read_csv('btcusdt_1h_train.csv')
df_1hr.rename(columns={'datetime': 'Date'}, inplace=True)
df_1hr['Date'] = pd.to_datetime(df_1hr['Date'])
df_1hr = df_1hr.drop('Unnamed: 0', axis=1) 
df_1hr.set_index('Date', inplace=True)

In [None]:
daily_df = df_1hr.resample('D').agg({'open': 'first', 'high': 'max', 'low': 'min', 'close': 'last', 'volume': 'sum'})

daily_df

## List of Indicators to test

CCI

In [27]:
df_cci_15m=df_15m.copy()

In [28]:
df_cci_15m['Return_t+1'] = df_cci_15m['close'].shift(-1) / df_cci_15m['close'] - 1

In [None]:
df_cci_15m['TP'] = (df_cci_15m['high'] + df_cci_15m['low'] + df_cci_15m['close']) / 3
df_cci_15m['TPMA'] = df_cci_15m['TP'].rolling(window=20).mean()

def mean_deviation(series):
    mean = series.mean()
    return series.map(lambda x: abs(x - mean)).mean()

df_cci_15m['MeanDev'] = df_cci_15m['TP'].rolling(window=20).apply(mean_deviation)
df_cci_15m['CCI'] = (df_cci_15m['TP'] - df_cci_15m['TPMA']) / (df_cci_15m['MeanDev'] * 0.015)
df_cci_15m.dropna()

In [30]:
pearson_correlation = df_cci_15m['Return_t+1'].corr(df_cci_15m['CCI'], method='pearson')
spearman_correlation = df_cci_15m['Return_t+1'].corr(df_cci_15m['CCI'], method='spearman')
kendall_correlation = df_cci_15m['Return_t+1'].corr(df_cci_15m['CCI'], method='kendall')

In [32]:
print(f'pearson_correlation {pearson_correlation*100}')
print(f'spearman_correlation {spearman_correlation*100}')
print(f'kendall_correlation {kendall_correlation*100}')

pearson_correlation -0.20515667929865747
spearman_correlation -5.695783959697842
kendall_correlation -3.939327444634303


In [48]:
def calculate_ic_and_correlations_df(df, frequency='15T'):
    """
    Calculates the Information Coefficient (IC) and correlation coefficients (Pearson, Spearman, Kendall)
    between alpha values and next day returns using a single dataframe that includes both OHLCV data and alpha values.

    :param df: DataFrame containing the OHLCV data and alpha values
    :param frequency: The frequency of the data ('5T' for 5 minutes, '15T' for 15 minutes, etc.)
    :return: Prints the IC and correlation coefficients with 3 decimal points and as percentages
    """
    # Calculate the number of periods in one day
    periods_per_day = pd.Timedelta('1D') // pd.Timedelta(frequency)

    # Calculate next day returns
    df['next_day_return'] = df['close'].shift(-periods_per_day) / df['close'] - 1

    # Drop the rows with NaN values that occur due to shifting
    df.dropna(subset=['next_day_return', 'alpha'], inplace=True)

    # Calculate correlations
    pearson_corr = df['alpha'].corr(df['next_day_return'], method='pearson')
    spearman_corr = df['alpha'].corr(df['next_day_return'], method='spearman')
    kendall_corr = df['alpha'].corr(df['next_day_return'], method='kendall')

    # Calculate Information Coefficient (IC) as the mean of daily Pearson correlations
    # Group by date to consider the high frequency data as daily data
    daily_corrs = df.groupby(df.index.date).apply(
        lambda x: x['alpha'].corr(x['next_day_return'], method='pearson'))
    ic = daily_corrs.mean()

    # Output the results with 3 decimal points and as percentages
    print(f'Information Coefficient (IC): {ic:.3f}')
    print(f'Pearson Correlation (%): {pearson_corr*100:.3f}%')
    print(f'Spearman Correlation (%): {spearman_corr*100:.3f}%')
    print(f'Kendall Correlation (%): {kendall_corr*100:.3f}%')

# Assuming df is the DataFrame containing the ohlcv values and alpha values respectively
# The index of the dataframe should be datetime objects that align
# You would call the function as follows:
# calculate_ic_and_correlations_df(df, '15T')  # Replace '15T' with the actual frequency of your data



In [76]:
df_alp1=df_15m.copy()
df_alp1['alpha'] = np.maximum(
(df_alp1['low'] / df_alp1['open']) * (df_alp1['high'] / df_alp1['close']),
        np.cos(np.minimum(df_alp1['close'], df_alp1['open']))
    )
calculate_ic_and_correlations_df(df_alp1)

Information Coefficient (IC): -0.068
Pearson Correlation (%): 0.931%
Spearman Correlation (%): 0.111%
Kendall Correlation (%): 0.069%


In [77]:
df_alp101=df_15m.copy()
df_alp101['alpha'] = (df_alp101['close']-df_alp101['open'])/(df_alp101['high']-df_alp101['low'])
calculate_ic_and_correlations_df(df_alp101)

Information Coefficient (IC): -0.090
Pearson Correlation (%): 0.197%
Spearman Correlation (%): -0.488%
Kendall Correlation (%): -0.328%


In [78]:
def calculate_vwap(df):
    vwap = (df['volume'] * (df['high'] + df['low'] + df['close']) / 3).cumsum() / df['volume'].cumsum()
    return vwap

# Function to calculate Alpha#41
def calculate_alpha_41(df):

    df['VWAP'] = calculate_vwap(df)
    df['alpha'] = np.sqrt(df['high'] * df['low'] - df['VWAP'])
    return df

df_alp41=df_15m.copy()
df_alp41=calculate_alpha_41(df_alp41)
calculate_ic_and_correlations_df(df_alp41)

Information Coefficient (IC): -0.652
Pearson Correlation (%): -0.723%
Spearman Correlation (%): -0.447%
Kendall Correlation (%): -0.369%


In [91]:
df_alp2=df_15m.copy()
df_alp2['Previous Close'] = df_alp2['close'].shift(1)
df_alp2['Return'] = (df_alp2['close'] - df_alp2['Previous Close']) / df_alp2['Previous Close']
df_alp2['alpha'] = ((df_alp2['low'] / df_alp2['close']) - df_alp2['Return']) +((df_alp2['low'] / df_alp2['close']) * np.cos(df_alp2['Return']))
calculate_ic_and_correlations_df(df_alp2)


Information Coefficient (IC): 0.064
Pearson Correlation (%): -1.232%
Spearman Correlation (%): -0.189%
Kendall Correlation (%): -0.084%


In [80]:
df_alp3=df_15m.copy()
df_alp3['alpha'] = np.minimum(0.938,df_alp3['low'] / df_alp3['close'])
calculate_ic_and_correlations_df(df_alp3)

Information Coefficient (IC): -0.001
Pearson Correlation (%): -1.308%
Spearman Correlation (%): -0.715%
Kendall Correlation (%): -0.584%


In [85]:
df_alp4=df_15m.copy()
df_alp4['alpha'] = np.minimum(df_alp4['low'] - df_alp4['open'], 0.133) + np.maximum(np.cos(df_alp4['volume']), df_alp4['high'] - df_alp4['close'])
calculate_ic_and_correlations_df(df_alp4)


Information Coefficient (IC): -0.067
Pearson Correlation (%): 0.800%
Spearman Correlation (%): 0.085%
Kendall Correlation (%): 0.043%


In [73]:
df_alp5=df_15m.copy()
df_alp5['alpha'] = np.maximum(np.minimum(df_alp5['low'] - df_alp5['open'], 0.133),(df_alp5['low'] - df_alp5['close']) / np.sqrt(df_alp5['high']))
calculate_ic_and_correlations_df(df_alp5)

Information Coefficient (IC): 0.006
Pearson Correlation (%): -1.476%
Spearman Correlation (%): -1.031%
Kendall Correlation (%): -0.654%


In [75]:
df_alp6=df_15m.copy()
df_alp6['Previous Close'] = df_alp6['close'].shift(1)
df_alp6['Return'] = (df_alp6['close'] - df_alp6['Previous Close']) / df_alp6['Previous Close']
df_alp6['alpha'] = ((df_alp6['high'] - df_alp6['close']) - df_alp6['Return'])/((df_alp6['low'] + df_alp6['Return']) + (-0.177 / df_alp6['close']))
calculate_ic_and_correlations_df(df_alp6)

Information Coefficient (IC): 0.082
Pearson Correlation (%): 4.081%
Spearman Correlation (%): 1.656%
Kendall Correlation (%): 1.107%


In [72]:
df_alp7=df_15m.copy()
df_alp7['Previous Close'] = df_alp7['close'].shift(1)
df_alp7['Return'] = (df_alp7['close'] - df_alp7['Previous Close']) / df_alp7['Previous Close']

df_alp7['alpha'] =np.maximum((df_alp7['high'] / df_alp7['close']) - df_alp7['Return'].abs(),
                             np.minimum(df_alp7['low'] / df_alp7['open'], df_alp7['high'] / df_alp7['close']))
calculate_ic_and_correlations_df(df_alp7)

Information Coefficient (IC): 0.025
Pearson Correlation (%): 2.606%
Spearman Correlation (%): 0.670%
Kendall Correlation (%): 0.467%


In [83]:
df_alp8=df_15m.copy()
df_alp8['alpha'] =0.386/df_alp8['low'] + df_alp8['low']/df_alp8['close']
calculate_ic_and_correlations_df(df_alp8)

Information Coefficient (IC): 0.016
Pearson Correlation (%): -2.231%
Spearman Correlation (%): -0.980%
Kendall Correlation (%): -0.626%


In [87]:
df_alp54=df_15m.copy()
df_alp54['alpha'] =(-1)*(df_alp54['low']-df_alp54['close'])*((df_alp54['open']/df_alp54['close'])**5)/(df_alp54['low']-df_alp54['high'])
calculate_ic_and_correlations_df(df_alp54)

Information Coefficient (IC): 0.055
Pearson Correlation (%): -0.031%
Spearman Correlation (%): 0.511%
Kendall Correlation (%): 0.340%


In [92]:
df_alp9=df_15m.copy()
df_alp9['alpha'] = (df_alp9['close']-df_alp9['open'])/(df_alp9['high']-df_alp9['low'])+0.001
calculate_ic_and_correlations_df(df_alp9)

Information Coefficient (IC): -0.090
Pearson Correlation (%): 0.197%
Spearman Correlation (%): -0.488%
Kendall Correlation (%): -0.328%


In [88]:
def delta(series, period):
    return series.diff(period)

# Define sign function
def sign(series):
    return series.apply(np.sign)

# Define correlation function
def correlation(series1, series2, period):
    return series1.rolling(window=period).corr(series2)


# Alpha#12: (sign(delta(volume, 1)) * (-1 * delta(close, 1)))
df_alp12=df_15m.copy()
df_alp12['alpha'] = sign(delta(df_alp12['volume'], 1)) * (-1 * delta(df_15m['close'], 1))
calculate_ic_and_correlations_df(df_alp12)

Information Coefficient (IC): 0.100
Pearson Correlation (%): -0.007%
Spearman Correlation (%): 0.371%
Kendall Correlation (%): 0.280%


In [89]:
# Alpha#13/6: (-1 * correlation(open, volume, 10))
df_alp13=df_15m.copy()
df_alp13['alpha'] = -1 * correlation(df_alp13['open'], df_alp13['volume'], 10)
calculate_ic_and_correlations_df(df_alp13)


Information Coefficient (IC): 0.174
Pearson Correlation (%): -1.703%
Spearman Correlation (%): 0.707%
Kendall Correlation (%): 0.485%


In [57]:
window_sizes = range(5, 101)  
df_1hr['fpc'] = df_1hr['close'].shift(-1) - df_1hr['close']

pearson_correlations = {}
spearman_correlations = {}
kendall_correlations = {}

for window in window_sizes:
    df_1hr[f'ema_{window}'] = df_1hr['close'].ewm(span=window, adjust=False).mean()
    df_1hr[f'ema_change_{window}'] = df_1hr[f'ema_{window}'].diff()

    pearson_correlation = df_1hr[f'ema_change_{window}'].corr(df_1hr['fpc'], method='pearson')
    spearman_correlation = df_1hr[f'ema_change_{window}'].corr(df_1hr['fpc'], method='spearman')
    kendall_correlation = df_1hr[f'ema_change_{window}'].corr(df_1hr['fpc'], method='kendall')

    pearson_correlations[window] = pearson_correlation
    spearman_correlations[window] = spearman_correlation
    kendall_correlations[window] = kendall_correlation



In [59]:
optimal_window_pearson = max(pearson_correlations, key=lambda x: abs(pearson_correlations[x]))
optimal_window_spearman = max(spearman_correlations, key=lambda x: abs(spearman_correlations[x]))
optimal_window_kendall = max(kendall_correlations, key=lambda x: abs(kendall_correlations[x]))
print(f"Optimal TRIMA window size for Pearson: {optimal_window_pearson}, Maximum Correlation: {pearson_correlations[optimal_window_pearson]}")
print(f"Optimal TRIMA window size for Spearman: {optimal_window_spearman}, Maximum Correlation: {spearman_correlations[optimal_window_spearman]}")
print(f"Optimal TRIMA window size for Kendall: {optimal_window_kendall}, Maximum Correlation: {kendall_correlations[optimal_window_kendall]}")

Optimal TRIMA window size for Pearson: 5, Maximum Correlation: -0.0354255402810652
Optimal TRIMA window size for Spearman: 5, Maximum Correlation: -0.07945392030641331
Optimal TRIMA window size for Kendall: 5, Maximum Correlation: -0.06545836261560696
