In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn import mixture
%matplotlib inline

In [2]:
def calculate_classification_data(df_clean, split_time, is_pca_on, pca_n_components, cluster_n):
    """Standardise features, split them by date and K-Means-label the training rows.

    Parameters
    ----------
    df_clean : pandas.DataFrame
        Feature table. Every column is passed to ``scale`` and the index is
        compared against ``pd.to_datetime(split_time)``.
    split_time : str or datetime-like
        Anything ``pd.to_datetime`` accepts. Rows with index strictly before it
        form the training set; rows from ``split_time`` up to and including
        ``split_time + 1 year`` form the test set.
    is_pca_on : bool
        If true, the standardised features are replaced by their first
        ``pca_n_components`` principal components (columns ``pca_1`` ...).
        Otherwise the standardised original columns are used.
    pca_n_components : int
        Number of principal components; only used when ``is_pca_on`` is true.
    cluster_n : int
        Number of K-Means clusters.

    Returns
    -------
    (df_train, df_test) : tuple of pandas.DataFrame
        ``df_train`` holds the processed training features plus a ``label``
        column with the K-Means cluster of each row. ``df_test`` holds the
        processed test features only (no ``label`` column).

    Side effects
    ------------
    Prints the silhouette score of the clustering on the training rows.
    """
    # NOTE(review): the scaler (and PCA, when enabled) is fitted on *all* rows,
    # i.e. train and test together, before the date split. That lets test-period
    # statistics influence the training features - confirm this is intended.
    standardized = scale(df_clean)

    if is_pca_on:
        pca_model = PCA(n_components=pca_n_components, random_state=2021)
        pca_model.fit(standardized)
        # Project once; the projection does not depend on the component index.
        components = pca_model.transform(standardized)
        data_processed = pd.DataFrame(
            components,
            index=df_clean.index,
            columns=[f'pca_{i}' for i in range(1, pca_n_components + 1)],
        )
    else:
        data_processed = pd.DataFrame(standardized, index=df_clean.index, columns=df_clean.columns)

    # Date split: train is everything before split_time, test is the following year.
    split_ts = pd.to_datetime(split_time)
    test_end = split_ts + pd.DateOffset(years=1)
    row_dates = data_processed.index
    df_train = data_processed[row_dates < split_ts].copy()
    df_test = data_processed[(split_ts <= row_dates) & (row_dates <= test_end)].copy()

    # Cluster the training rows only.
    km = KMeans(n_clusters=cluster_n, random_state=2021)
    labels = km.fit_predict(df_train)

    # Score on the feature columns, before the label column is attached.
    score = metrics.silhouette_score(df_train, labels, metric='euclidean')
    print(f'Silhouette Score: {score}')

    df_train['label'] = labels
    return df_train, df_test

In [3]:
# Load the raw input table from a path relative to the notebook's directory.
# The following cell expects it to contain a 'Date' column.
df = pd.read_csv('../Data/rawData_Feb11.csv')

In [4]:
# Index the raw table by parsed date and drop rows with infinite or missing values.
df['Date'] = pd.to_datetime(df['Date'])
df = (
    df.set_index('Date')
      .replace([np.inf, -np.inf], np.nan)
      .dropna()
)

# Mask exact zeros as NaN, then convert every column to period-over-period
# percentage changes and drop the rows that are still incomplete.
df = df.where(df != 0)
df = df.pct_change().dropna()

In [9]:
# One (train, test) pair of CSV files per split year, read from the working directory.
df_fractional_train_2018, df_fractional_test_2018 = (
    pd.read_csv('base_ffd_train_split2018-01-01.csv'),
    pd.read_csv('base_ffd_test_split2018-01-01_end2019-01-01.csv'),
)
df_fractional_train_2019, df_fractional_test_2019 = (
    pd.read_csv('base_ffd_train_split2019-01-01.csv'),
    pd.read_csv('base_ffd_test_split2019-01-01_end2020-01-01.csv'),
)
df_fractional_train_2020, df_fractional_test_2020 = (
    pd.read_csv('base_ffd_train_split2020-01-01.csv'),
    pd.read_csv('base_ffd_test_split2020-01-01_end2021-01-01.csv'),
)
df_fractional_train_2021, df_fractional_test_2021 = (
    pd.read_csv('base_ffd_train_split2021-01-01.csv'),
    pd.read_csv('base_ffd_test_split2021-01-01_end2022-01-01.csv'),
)

In [10]:
# Stack each year's train rows followed by its test rows into one frame.
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement and,
# like append, keeps each input frame's original row index.
df_fractional_2018 = pd.concat([df_fractional_train_2018, df_fractional_test_2018])
df_fractional_2019 = pd.concat([df_fractional_train_2019, df_fractional_test_2019])
df_fractional_2020 = pd.concat([df_fractional_train_2020, df_fractional_test_2020])
df_fractional_2021 = pd.concat([df_fractional_train_2021, df_fractional_test_2021])

In [11]:
# Parse the 'Date' column of every combined frame into datetimes (in place).
for _frame in (df_fractional_2018, df_fractional_2019, df_fractional_2020, df_fractional_2021):
    _frame['Date'] = pd.to_datetime(_frame['Date'])

In [12]:
# Make 'Date' the row index of every combined frame (in place).
for _frame in (df_fractional_2018, df_fractional_2019, df_fractional_2020, df_fractional_2021):
    _frame.set_index('Date', inplace=True)

In [13]:
# Cluster the fractional feature set for each split year: PCA off, 2 clusters.
# pca_n_components is passed but unused while is_pca_on is False.
train_2018_fractional, test_2018_fractional = calculate_classification_data(
    df_fractional_2018, split_time='2018-01-01', is_pca_on=False, pca_n_components=8, cluster_n=2)
train_2019_fractional, test_2019_fractional = calculate_classification_data(
    df_fractional_2019, split_time='2019-01-01', is_pca_on=False, pca_n_components=8, cluster_n=2)
train_2020_fractional, test_2020_fractional = calculate_classification_data(
    df_fractional_2020, split_time='2020-01-01', is_pca_on=False, pca_n_components=8, cluster_n=2)
train_2021_fractional, test_2021_fractional = calculate_classification_data(
    df_fractional_2021, split_time='2021-01-01', is_pca_on=False, pca_n_components=8, cluster_n=2)

Silhouette Score: 0.26817335823466215
Silhouette Score: 0.2569776073835944
Silhouette Score: 0.26875229672351586
Silhouette Score: 0.262922477700826


In [14]:
# Cluster the percentage-change table `df` for each split year: PCA off, 2 clusters.
# pca_n_components is passed but unused while is_pca_on is False.
train_2018_ret, test_2018_ret = calculate_classification_data(
    df, split_time='2018-01-01', is_pca_on=False, pca_n_components=8, cluster_n=2)
train_2019_ret, test_2019_ret = calculate_classification_data(
    df, split_time='2019-01-01', is_pca_on=False, pca_n_components=8, cluster_n=2)
train_2020_ret, test_2020_ret = calculate_classification_data(
    df, split_time='2020-01-01', is_pca_on=False, pca_n_components=8, cluster_n=2)
train_2021_ret, test_2021_ret = calculate_classification_data(
    df, split_time='2021-01-01', is_pca_on=False, pca_n_components=8, cluster_n=2)

Silhouette Score: 0.21883207190797224
Silhouette Score: 0.22574529805788388
Silhouette Score: 0.22020203602787194
Silhouette Score: 0.22411556769895105


In [15]:
# Keep only the cluster label of each fractional training row and turn the
# index back into an ordinary column so it can be used as a merge key.
train_2018_fractional_label = train_2018_fractional[['label']].reset_index()
train_2019_fractional_label = train_2019_fractional[['label']].reset_index()
train_2020_fractional_label = train_2020_fractional[['label']].reset_index()
train_2021_fractional_label = train_2021_fractional[['label']].reset_index()

In [16]:
# Attach the fractional-feature cluster labels to the percentage-change training
# rows by date. Both sides carry a 'label' column, so pandas suffixes them:
# label_x comes from the left frame, label_y from the fractional labels.
df_2018_merge = train_2018_ret.merge(train_2018_fractional_label, on='Date')
df_2019_merge = train_2019_ret.merge(train_2019_fractional_label, on='Date')
df_2020_merge = train_2020_ret.merge(train_2020_fractional_label, on='Date')
df_2021_merge = train_2021_ret.merge(train_2021_fractional_label, on='Date')

In [20]:
# Per-cluster mean and standard deviation of every column for the 2019 split,
# transposed so that each (column, statistic) pair is a row.
(
    df_2019_merge
    .groupby('label_y')
    .agg(['mean', 'std'])
    .T
)

Unnamed: 0,label_y,0,1
RAY,mean,0.040835,-0.083326
RAY,std,0.620521,0.881637
VIX,mean,-0.036136,0.057174
VIX,std,0.964971,1.061627
QQQ,mean,0.049474,-0.091565
QQQ,std,0.699201,0.948831
T10Y3M,mean,0.019807,0.012382
T10Y3M,std,0.074939,0.100538
CL1,mean,0.027083,-0.011273
CL1,std,0.199935,0.421733
