In [30]:
import json
import requests
import pandas as pd
from pathlib import Path
import hvplot.pandas
import panel as pn
import holoviews as hv
import numpy as np
from pathlib import Path
from sklearn import cluster, covariance, manifold
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection

import warnings
warnings.filterwarnings('ignore')


In [31]:
# Load the 2021 positive institutional-adoption events, indexed by the parsed
# DATE column. Missing values become 0, only the indicator column is kept, and
# it is renamed so it cannot be confused with the negative-news indicator.
events_inst_pos = (
    pd.read_csv(
        Path('Institutional_Adoption_Events_positive_in_2021.csv'),
        index_col='DATE',
        parse_dates=True,
        infer_datetime_format=True,
    )
    .fillna(0)
    .loc[:, ['Institutional Adopt']]
    .rename(columns={'Institutional Adopt': 'Institutional Adopt Pos'})
)

In [32]:
# Load the 2021 negative institutional-adoption events, indexed by the parsed
# DATE column. Missing values become 0, only the indicator column is kept, and
# it is renamed so it cannot be confused with the positive-news indicator.
events_inst_neg = (
    pd.read_csv(
        Path('Institutional_Adoption_Events_negative_in_2021.csv'),
        index_col='DATE',
        parse_dates=True,
        infer_datetime_format=True,
    )
    .fillna(0)
    .loc[:, ['Institutional Adopt']]
    .rename(columns={'Institutional Adopt': 'Institutional Adopt Neg'})
)

In [33]:
# Load the 2021 China news events, indexed by the parsed DATE column.
# Missing values become 0 and only the INDICATOR column is kept.
events_china = (
    pd.read_csv(
        Path('China_events_2021.csv'),
        index_col='DATE',
        parse_dates=True,
        infer_datetime_format=True,
    )
    .fillna(0)
    .loc[:, ['INDICATOR']]
)

In [34]:
# Load the 2021 regulatory news events, indexed by the parsed DATE column.
# Missing values become 0 and only the Regulation column is kept.
events_regs = (
    pd.read_csv(
        Path('Regulation_events_2021.csv'),
        index_col='DATE',
        parse_dates=True,
        infer_datetime_format=True,
    )
    .fillna(0)
    .loc[:, ['Regulation']]
)

In [35]:
# Load the market cap data exported by the Price_Analysis notebook, using the
# parsed `date` column as the index.
mc = pd.read_csv(
    Path('crypto_market_caps.csv'),
    index_col='date', parse_dates=True, infer_datetime_format=True,
)

In [36]:
# Join market cap data with the positive institutional news indicator on the
# date index (merge's default inner join keeps only dates found in both frames)
combined_ins_pos = mc.merge(events_inst_pos, left_index=True, right_index=True)

In [37]:
# Join market cap data with the negative institutional news indicator on the
# date index (merge's default inner join keeps only dates found in both frames)
combined_ins_neg = mc.merge(events_inst_neg, left_index=True, right_index=True)

In [38]:
# Join market cap data with the China news indicator on the date index
# (merge's default inner join keeps only dates found in both frames)
combined_china = mc.merge(events_china, left_index=True, right_index=True)

In [39]:
# Join market cap data with the regulatory news indicator on the date index
# (merge's default inner join keeps only dates found in both frames)
combined_regs = mc.merge(events_regs, left_index=True, right_index=True)

In [40]:
def get_df(df, periods_per_year=252):
    '''
    Prepare a dataframe for clustering analysis

    Arguments:
        df: A dataframe combining events and changes in market cap.
            The last column is expected to be the news indicator and is
            discarded; every other column is treated as one token.

            Options include:
            - combined_ins_pos (Positive Institutional News)
            - combined_ins_neg (Negative Institutional News)
            - combined_china (China News)
            - combined_regs (Regulatory News)

        periods_per_year: Number of observations per year used to annualise
            the statistics. Defaults to 252, the value previously hard-coded.

    Returns:
        A dataframe indexed by token column name with two columns:
        - annual_return: column mean * periods_per_year
        - variance: column standard deviation * sqrt(periods_per_year).
          Despite its name this is an annualised standard deviation; the
          column name is kept because get_kmeans_plot plots y="variance".
    '''

    # Drop rows that are entirely NaN, then every column that still holds a NaN
    cleaned = df.dropna(axis=0, how='all').dropna(axis=1)

    # Remove the last column (the news indicator added by the merge)
    tokens = cleaned.iloc[:, :-1]

    # Vectorised per-column statistics; plain arrays are passed so the result
    # index is exactly the list of token names, with no inherited index name
    stats_df = pd.DataFrame(
        {
            'annual_return': (tokens.mean() * periods_per_year).to_numpy(),
            'variance': (tokens.std() * np.sqrt(periods_per_year)).to_numpy(),
        },
        index=list(tokens.columns),
    )

    return stats_df

In [41]:
def get_elbow(df):
    '''
    Plot the elbow curve to select optimal k-values

    Arguments:
        df: A dataframe combining events and changes in market cap

    Returns:
        A line plot of KMeans inertia against k for the selected dataframe

    Raises:
        ValueError: if no token is left to cluster after get_df cleans df
    '''

    # Use get_df to retrieve the per-token statistics that will be clustered
    stats_df = get_df(df)

    if stats_df.empty:
        raise ValueError('No token columns left to cluster after cleaning the dataframe')

    # KMeans cannot fit more clusters than there are samples, so evaluate
    # k = 1..10 but stop at the number of tokens when fewer are available
    max_k = min(10, len(stats_df))
    k = list(range(1, max_k + 1))

    # Fit one model per candidate k and record its inertia
    inertia = [
        KMeans(n_clusters=i, random_state=1).fit(stats_df).inertia_
        for i in k
    ]

    # Hold the values for k and the corresponding inertia in a DataFrame
    df_elbow = pd.DataFrame({"k": k, "inertia": inertia})

    # Plot the DataFrame
    plot = df_elbow.hvplot.line(
        x="k",
        y="inertia",
        title="Elbow Curve",
        xticks=k
    )

    return plot

In [42]:
def get_kmeans_plot(df, k):
    '''
    Plot visualizing token clusters using the KMeans algorithm

    Arguments:
        - df: A dataframe combining events and changes in market cap
        - k: The number of clusters to use

    Returns:
        A scatter plot of the tokens (annual_return against variance),
        coloured by KMeans cluster, overlaid with a text label per token
    '''

    # Per-token statistics indexed by token name
    stats_df = get_df(df)

    # Create the K-means model instance for k clusters
    model = KMeans(n_clusters=k, random_state=1)

    # Fit the data to the instance of the model
    model.fit(stats_df)

    # Assign each token to a cluster using the trained model
    crypto_clusters = model.predict(stats_df)

    # Add predicted clusters to the dataframe. The token names live in the
    # index of stats_df, so expose them as the `names` column that the labels
    # and the hover tool refer to.
    stats_df['crypto_cluster'] = crypto_clusters
    stats_df['names'] = stats_df.index

    # Scatter of the tokens, one colour per cluster
    scatter = stats_df.hvplot.scatter(
        x="annual_return",
        y="variance",
        by="crypto_cluster",
        hover=True,
        hover_cols=['names'],
    )

    # `text` and `text_baseline` are options of hvplot's `labels` plot type,
    # so the token names are drawn as a separate layer on top of the scatter
    labels = stats_df.hvplot.labels(
        x="annual_return",
        y="variance",
        text='names',
        text_baseline='top',
        hover=False,
    )

    plot = scatter * labels

    return plot

In [43]:
# Cluster the tokens around China news into 4 groups and display the plot
china_news = get_kmeans_plot(df=combined_china, k=4)
china_news



In [44]:
# Cluster the tokens around positive institutional news into 4 groups and display the plot
positive_inst_news = get_kmeans_plot(df=combined_ins_pos, k=4)
positive_inst_news



In [45]:
# Cluster the tokens around negative institutional news into 4 groups and display the plot
negative_inst_news = get_kmeans_plot(df=combined_ins_neg, k=4)
negative_inst_news



In [46]:
# Cluster the tokens around regulatory news into 4 groups and display the plot
regulatory_news = get_kmeans_plot(df=combined_regs, k=4)
regulatory_news

