# Clustering Analysis

In [2]:
#import needed packages
import os
import pandas as pd
import numpy as np

In [3]:
#Here I import the database and change pctchange into type float instead of object
dataSub = pd.read_csv('PANEL_MIS776_PROJECT_DB.xlsx - TheData.csv')

#rename %change to PCTChange
dataSub.columns.values[2] = "PCTChange"

# the reason the db was reading PCtchange as on object is because the column had % signs within the rows
#below I remove the percentage sign
dataSub['PCTChange'] = dataSub['PCTChange'].str.replace('%',' ')
#change pct change to type float
dataSub['PCTChange'] = dataSub.PCTChange.astype(float)
# divide by 100 to return PCt change to decimal form
dataSub['PCTChange'] = dataSub['PCTChange'].div(100)
dataSub.describe()

Unnamed: 0,Categories,close,PCTChange,EMA50,lagged EMA50,RSI,RSI-based MA,RSI-RSIMA,ATR,ADX,...,ASOPR,AccuTrend,NUPL,BTCINFLATION,Liveliness,NVT,CVDD,SP500,NASDAQ,POSNEG
count,1888.0,1887.0,1887.0,1887.0,1886.0,1887.0,1887.0,1887.0,1887.0,1887.0,...,1828.0,1887.0,1887.0,1887.0,1887.0,1887.0,1887.0,1303.0,1304.0,1888.0
mean,3.524364,19596.560212,0.001628,19382.962591,19382.697683,51.630688,51.686798,-0.05611,1187.914947,29.659611,...,1.014833,0.389399,0.360373,0.029376,0.607387,27.657978,6087.551868,3372.621957,10156.471826,0.516419
std,0.853584,17027.908287,0.041154,16687.159043,16691.580395,13.915309,12.326356,8.335639,1124.047393,11.09046,...,0.052776,0.342478,0.246017,0.011689,0.010918,13.854925,4382.261193,692.601534,2915.937781,0.499863
min,1.0,3183.0,-0.3881,3468.602,3468.602,9.49125,16.53173,-27.5029,81.59341,8.965417,...,0.726406,0.0,-0.43148,0.007084,0.562197,2.146851,708.4137,0.0,0.0,0.0
25%,3.0,7284.28,-0.0165,7299.6585,7299.44875,41.84719,43.00134,-5.226415,351.7895,21.052355,...,0.987671,0.084482,0.22839,0.018022,0.603012,17.923195,2899.2435,2787.855,7629.57425,0.0
50%,4.0,10343.23,0.0011,10286.25,10284.015,50.3235,49.88392,0.133769,583.7894,27.24957,...,1.006852,0.28116,0.402736,0.032114,0.608964,26.99885,4080.633,3122.87,9106.754,1.0
75%,4.0,32636.255,0.01965,34168.7,34215.915,60.62294,60.501555,5.510901,1923.285,37.182145,...,1.034633,0.684187,0.53226,0.039919,0.614559,34.418965,9729.394,3973.32,12904.6025,1.0
max,6.0,67554.84,0.2342,59135.22,59135.22,94.55025,83.39333,26.60058,5304.35,69.36892,...,1.39785,1.0,0.793382,0.055817,0.623179,91.0916,14690.94,4796.56,16057.44,1.0


In [4]:
#Features to incorporate for clustering, dataset - yvar(POSNEG) - ASOPR - NASDAQ - SP500 - lagged ema500 - categories
    #Asopr NASDAQ and sp500 were removed because they have too many missing values
        #NA's are handleable, I could just omit them but that would take away a large percentage of my data. 
            #Interpolation can be used as well, and that would be my next step in this project
    #Lagged ema 50 was removed because our original ema50 metric is now lagged, so the lagged ema50 is double lagged
    #Categories was removed because it is a non-lagged variable in the database that relates heavily to POSNEG
data = dataSub.drop(['POSNEG','Categories','lagged EMA50','ASOPR','SP500','NASDAQ'], axis=1)
data = data.dropna()

## Standardization

In [5]:
# Features to be standardized
    #features that are already in index or percentage form do not need to be standardized
        #Reasoning: index and % features are consistent throughout all time intervals
            # Example: RSI (an index) will be within 0 and 100 in any period so there are no standardization issues
                # closing price: trends upward over time (mostly) and can be associated with many different RSI levels
                     # By standardizing metrics like closing price we can remove the trend aspect of the metric

standFeatures = data.drop(['AccuTrend','AVGFees','ADX','RSI','RSI-based MA','RSI-RSIMA'], axis=1)
numerical_features = list(standFeatures)

In [6]:
# standardize the features that are not in index or percentage form

# 1. Import the class you will use
from sklearn.preprocessing import StandardScaler
# 2. Create an instance of the class
scaler = StandardScaler()
# 3. Use the fit method of the instance
scaler.fit(data[numerical_features])
# 4. Use the transform method to perform the transformation
data.loc[:, numerical_features] = scaler.transform(data[numerical_features])

In [7]:
data.head()

Unnamed: 0,close,PCTChange,EMA50,RSI,RSI-based MA,RSI-RSIMA,ATR,ADX,MACD,RP,...,AA,AVGFees,Puell,INVCAP,AccuTrend,NUPL,BTCINFLATION,Liveliness,NVT,CVDD
1,-0.960217,-3.85308,-0.928358,27.24211,54.74499,-27.5029,-0.778584,20.82422,-0.078634,-1.150489,...,-0.345856,0.000574,1.406163,-1.101545,0.414071,0.56182,1.686607,-3.937014,-1.526481,-1.215735
2,-0.396899,-2.917326,-0.384883,41.98441,55.34097,-13.3566,0.70888,39.10379,0.163959,-0.729508,...,0.956999,0.002262,2.223611,-0.71698,0.809552,1.024524,0.977185,-0.439023,-1.144019,-0.98726
3,-0.369403,-2.601357,-0.337741,42.88754,50.44409,-7.55655,0.484788,24.4313,-0.004643,-0.681355,...,2.144939,0.002065,2.945244,-0.673778,0.490867,0.958361,2.091637,-0.041237,-1.027067,-0.942247
4,-0.471501,-3.651346,-0.342361,36.64028,49.32975,-12.6895,0.451741,26.25201,-0.398609,-0.677275,...,0.901015,0.001998,1.621387,-0.670237,0.341794,0.720322,1.26044,0.005394,-1.40153,-0.932736
5,-0.564021,-2.504136,-0.405829,35.16951,39.18449,-4.01497,0.169113,38.74495,-0.797305,-0.680618,...,0.014682,0.000678,1.136904,-0.674486,0.01123,0.417459,1.44768,0.054957,-0.50544,-0.9135


## K means clustering

In [8]:
from sklearn.cluster import KMeans

In [9]:
#Center values for 2 clusters
kmeans = KMeans(n_clusters=2).fit(data)
centroids = (kmeans.cluster_centers_)
print(centroids)

[[-4.63525057e-02 -1.22573106e-01  5.42880359e-02  4.34859342e+01
   4.44939338e+01 -1.00799951e+00  1.48892609e-02  2.72057797e+01
  -4.50031162e-01  1.25707536e-01 -1.02351524e-01 -1.26836492e-01
  -3.14190990e-01 -4.20437200e-01 -2.45884102e-01  1.18475659e-01
  -1.91239325e-01  1.84545477e-04 -2.63457729e-01  1.24065000e-01
   3.33524645e-01 -2.88001563e-01 -1.48014184e-02  1.03409239e-01
   1.05996124e-02  1.33165948e-01]
 [ 8.18984888e-02  2.16569785e-01 -9.59194769e-02  6.60213448e+01
   6.43955966e+01  1.62574794e+00 -2.63072718e-02  3.39951935e+01
   7.95143036e-01 -2.22107890e-01  1.80841035e-01  2.24102600e-01
   5.55132174e-01  7.42854584e-01  4.34443318e-01 -2.09330159e-01
   3.37893529e-01  3.67503372e-04  4.65493495e-01 -2.19205756e-01
   4.88120331e-01  5.08859067e-01  2.61520662e-02 -1.82709872e-01
  -1.87280541e-02 -2.35285876e-01]]


In [10]:
# Calculate silhouette_score
from sklearn.metrics import silhouette_score

print(silhouette_score(data, kmeans.labels_))

0.36206727929847526


In [11]:
#Center values for 3 clusters
kmeans = KMeans(n_clusters=3).fit(data)
centroids = (kmeans.cluster_centers_)
print(centroids)

[[-1.78260143e-02 -1.97266105e-01  2.06605733e-01  3.61838848e+01
   3.70957583e+01 -9.11872933e-01  2.31967314e-01  3.73523923e+01
  -1.02493199e+00  2.90163435e-01 -6.77021817e-02 -1.37552534e-01
  -4.84825483e-01 -7.32504825e-01 -3.17576949e-01  2.07412031e-01
  -3.00583691e-01  1.78078125e-04 -4.06901737e-01  2.89634649e-01
   4.31232362e-01 -4.86771103e-01 -4.85568644e-02  3.16029812e-01
  -1.31193237e-01  2.60190403e-01]
 [ 1.85845135e-02  2.48749709e-01 -1.85575405e-01  6.88903658e+01
   6.76781385e+01  1.21222717e+00 -6.36840660e-02  3.87177933e+01
   9.30748547e-01 -3.25598658e-01  2.99394231e-01  3.27186238e-01
   7.13143483e-01  8.86968409e-01  6.26689001e-01 -3.00096686e-01
   4.12460543e-01  4.33203609e-04  6.16134063e-01 -3.21891897e-01
   5.70864666e-01  6.64406428e-01  7.16247451e-02 -3.18078454e-01
  -6.99562964e-02 -3.34701284e-01]
 [-7.92615140e-04 -2.97374981e-02 -5.32371100e-03  5.03815756e+01
   5.06587647e+01 -2.77189293e-01 -7.63700015e-02  2.16918740e+01
   2.1

In [12]:
# Calculate silhouette_score
from sklearn.metrics import silhouette_score

print(silhouette_score(data, kmeans.labels_))

#Silhouette score lower for 3 clusters. 

0.33135371613803183


In [12]:
# returning to 2 cluster version 
#Center values for 2 clusters
kmeans = KMeans(n_clusters=2).fit(data)
centroids = (kmeans.cluster_centers_)
print(centroids)

[[-4.63525057e-02 -1.22573106e-01  5.42880359e-02  4.34859342e+01
   4.44939338e+01 -1.00799951e+00  1.48892609e-02  2.72057797e+01
  -4.50031162e-01  1.25707536e-01 -1.02351524e-01 -1.26836492e-01
  -3.14190990e-01 -4.20437200e-01 -2.45884102e-01  1.18475659e-01
  -1.91239325e-01  1.84545477e-04 -2.63457729e-01  1.24065000e-01
   3.33524645e-01 -2.88001563e-01 -1.48014184e-02  1.03409239e-01
   1.05996124e-02  1.33165948e-01]
 [ 8.18984888e-02  2.16569785e-01 -9.59194769e-02  6.60213448e+01
   6.43955966e+01  1.62574794e+00 -2.63072718e-02  3.39951935e+01
   7.95143036e-01 -2.22107890e-01  1.80841035e-01  2.24102600e-01
   5.55132174e-01  7.42854584e-01  4.34443318e-01 -2.09330159e-01
   3.37893529e-01  3.67503372e-04  4.65493495e-01 -2.19205756e-01
   4.88120331e-01  5.08859067e-01  2.61520662e-02 -1.82709872e-01
  -1.87280541e-02 -2.35285876e-01]]


In [13]:
#Convert labels to dataframe
labels = (kmeans.labels_)
df_labels = pd.DataFrame(labels)
df_labels.rename(columns={0:'Cluster'}, inplace=True)

In [14]:
df_labels.head()

Unnamed: 0,Cluster
0,2
1,0
2,2
3,2
4,0


In [15]:
list(data.columns)

['close',
 'PCTChange',
 'EMA50',
 'RSI',
 'RSI-based MA',
 'RSI-RSIMA',
 'ATR',
 'ADX',
 'MACD',
 'RP',
 'RHODL',
 'RR',
 'MVRV-Z',
 'SupplyP',
 'MinerFeeRev%',
 'Thermocap',
 'AA',
 'AVGFees',
 'Puell',
 'INVCAP',
 'AccuTrend',
 'NUPL',
 'BTCINFLATION',
 'Liveliness',
 'NVT',
 'CVDD']

In [16]:
dY = dataSub.drop(['Categories','lagged EMA50','ASOPR','SP500','NASDAQ', 'PCTChange'], axis=1)


In [17]:
dY = dY.dropna()

In [18]:
BTC_Clust = pd.concat([dY, df_labels], axis=1)

In [19]:
#Seperate into 2 data sets
dat_C0 = BTC_Clust.loc[BTC_Clust['Cluster']==0]
dat_C1 = BTC_Clust.loc[BTC_Clust['Cluster']==1]

In [20]:
#Drop Cluster variable from each data set
dat_C0.drop(['Cluster'],inplace=True, axis = 1)
dat_C1.drop(['Cluster'],inplace=True, axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


## Testing Relevence of the Clusters

In [21]:
# Decision tree for full dataframe
from sklearn.tree import DecisionTreeClassifier
class_tree = DecisionTreeClassifier(max_depth=3)
class_tree.fit(dY.drop(['POSNEG'], axis=1), dY['POSNEG'])

DecisionTreeClassifier(max_depth=3)

In [22]:
from sklearn.metrics import confusion_matrix
y_train_pred = class_tree.predict(dY.drop(['POSNEG'], axis=1))
confusion_matrix(dY['POSNEG'],y_train_pred)

array([[665, 247],
       [419, 556]], dtype=int64)

In [23]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_true=dY['POSNEG'], y_pred=y_train_pred)
accuracy.round(3)

0.647

In [25]:
from sklearn.metrics import precision_score
precision = precision_score(y_true=dY['POSNEG'], y_pred=y_train_pred, average = None)
precision.round(3)

array([0.613, 0.692])

In [26]:
from sklearn.metrics import recall_score
recall = recall_score(y_true=dY['POSNEG'], y_pred=y_train_pred, average = None)
recall.round(3)

array([0.729, 0.57 ])

In [27]:
from sklearn.metrics import f1_score
f1_score(dY['POSNEG'], y_train_pred)

0.625421822272216

In [28]:
# Decion Tree for cluster 1
dat_C0 = dat_C0.dropna()
from sklearn.tree import DecisionTreeClassifier
class_tree = DecisionTreeClassifier(max_depth=3)
class_tree.fit(dat_C0.drop(['POSNEG'], axis=1), dat_C0['POSNEG'])

DecisionTreeClassifier(max_depth=3)

In [29]:
from sklearn.metrics import confusion_matrix
y_train_pred = class_tree.predict(dat_C0.drop(['POSNEG'], axis=1))
confusion_matrix(dat_C0['POSNEG'],y_train_pred)

array([[124, 117],
       [ 36, 171]], dtype=int64)

In [30]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_true=dat_C0['POSNEG'], y_pred=y_train_pred)
accuracy.round(3)

0.658

In [31]:
from sklearn.metrics import precision_score
precision = precision_score(y_true=dat_C0['POSNEG'], y_pred=y_train_pred, average = None)
precision.round(3)

array([0.775, 0.594])

In [32]:
from sklearn.metrics import recall_score
recall = recall_score(y_true=dat_C0['POSNEG'], y_pred=y_train_pred, average = None)
recall.round(3)

array([0.515, 0.826])

In [33]:
from sklearn.metrics import f1_score
f1_score(dat_C0['POSNEG'], y_train_pred)

0.6909090909090908

In [34]:
# Decion Tree for cluster 2
dat_C1 = dat_C1.dropna()
from sklearn.tree import DecisionTreeClassifier
class_tree = DecisionTreeClassifier(max_depth=3)
class_tree.fit(dat_C1.drop(['POSNEG'], axis=1), dat_C1['POSNEG'])

DecisionTreeClassifier(max_depth=3)

In [35]:
from sklearn.metrics import confusion_matrix
y_train_pred = class_tree.predict(dat_C1.drop(['POSNEG'], axis=1))
confusion_matrix(dat_C1['POSNEG'],y_train_pred)

array([[169,  24],
       [121, 157]], dtype=int64)

In [36]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_true=dat_C1['POSNEG'], y_pred=y_train_pred)
accuracy.round(3)

0.692

In [37]:
from sklearn.metrics import precision_score
precision = precision_score(y_true=dat_C1['POSNEG'], y_pred=y_train_pred, average = None)
precision.round(3)

array([0.583, 0.867])

In [38]:
from sklearn.metrics import recall_score
recall = recall_score(y_true=dat_C1['POSNEG'], y_pred=y_train_pred, average = None)
recall.round(3)

array([0.876, 0.565])

In [39]:
from sklearn.metrics import f1_score
f1_score(dat_C1['POSNEG'], y_train_pred)

0.6840958605664489

## Conclusions
The accuracy, precison, recall, and f1 scores for the full dataset are very close to the accuracy, precison, recall, and f1 scores for each of the clusters. This means the clusters are not adding any additional predictive power. 