<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Description" data-toc-modified-id="Description-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Description</a></span></li><li><span><a href="#Setup" data-toc-modified-id="Setup-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Setup</a></span></li><li><span><a href="#Other-functions" data-toc-modified-id="Other-functions-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Other functions</a></span><ul class="toc-item"><li><span><a href="#Filter" data-toc-modified-id="Filter-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Filter</a></span></li><li><span><a href="#Clustering" data-toc-modified-id="Clustering-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Clustering</a></span></li><li><span><a href="#MiniSom" data-toc-modified-id="MiniSom-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>MiniSom</a></span></li><li><span><a href="#Evaluation" data-toc-modified-id="Evaluation-3.4"><span class="toc-item-num">3.4&nbsp;&nbsp;</span>Evaluation</a></span></li><li><span><a href="#Plotting" data-toc-modified-id="Plotting-3.5"><span class="toc-item-num">3.5&nbsp;&nbsp;</span>Plotting</a></span></li><li><span><a href="#Grid-Wrapper" data-toc-modified-id="Grid-Wrapper-3.6"><span class="toc-item-num">3.6&nbsp;&nbsp;</span>Grid Wrapper</a></span></li></ul></li><li><span><a href="#Clustering" data-toc-modified-id="Clustering-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Clustering</a></span><ul class="toc-item"><li><span><a href="#Options" data-toc-modified-id="Options-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Options</a></span></li><li><span><a href="#Graphs" data-toc-modified-id="Graphs-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Graphs</a></span></li></ul></li><li><span><a href="#Visualize" data-toc-modified-id="Visualize-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Visualize</a></span></li></ul></div>

# Clustering

## Description

- Cluster funds using Spektral Clustering

## Setup

In [1]:
import feather
from itertools import product
from multiprocessing import Pool

import numpy as np
import pandas as pd
import seaborn as sns
from scipy import sparse
from scipy.spatial.distance import euclidean

from minisom import MiniSom

import matplotlib.pyplot as plt
import matplotlib.cm as cm

from sklearn.preprocessing import normalize, LabelEncoder
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.metrics import silhouette_samples, silhouette_score, davies_bouldin_score

In [2]:
### Returns
path = '../data/processed/returns.feather'
returns = feather.read_dataframe(path)
returns = returns.rename(columns = {'caldt' : 'report_dt'})
returns = returns.assign(year = returns['report_dt'].dt.year)

### row_info
path = '../data/processed/row_info.feather'
row_info = feather.read_dataframe(path)
le = LabelEncoder()
row_info['lipper_class_num'] = le.fit_transform(row_info['lipper_class'])

### col_info
path = '../data/processed/col_info.feather'
col_info = feather.read_dataframe(path)

### Holdings
path = '../data/processed/holdings.npz'
holdings = sparse.load_npz(path)

## Other functions

### Filter

In [3]:
def filter_data(param, verbose = False):
    
    verbose = param['verbose']
    year = param['year']
        
    row_info_f = row_info.copy()
    if (year != 'full'):    # If year = full take whole sample
        row_info_f = row_info_f.query('year == @year')

    begin_date = row_info_f.iloc[0,:]['report_dt']
    end_date = begin_date + pd.DateOffset(years=1) # 1 year offset
    row_info_f.reset_index(drop = True, inplace=True)

    # Filter returns
    crsp_fundno_unique = row_info_f['crsp_fundno'].unique()
    returns_f = returns.copy()
    query = '''report_dt >= @begin_date and report_dt <= @end_date and crsp_fundno in @crsp_fundno_unique'''
    returns_f = returns_f.query(query)

    # Change return of month for which holdings apply to 0
    returns_f = returns_f.copy()
    mask = returns_f['report_dt'] == begin_date
    returns_f.loc[mask,'mret'] = 0
    
    # Drop all funds with first return observation after starting date
    drop_fundnos = returns_f.drop_duplicates('crsp_fundno').query('mret != 0')['crsp_fundno']
    returns_f.query('crsp_fundno not in @drop_fundnos', inplace=True)
    row_info_f.query('crsp_fundno not in @drop_fundnos', inplace=True)
    
    # Filter holdings accordingly and delet all securities with less than two observations
    holdings_f = holdings.copy()
    holdings_f = holdings[row_info_f['row']]
    
    holdings_b = sparse.csr_matrix(holdings_f, copy=True)
    holdings_b.data = np.ones(len(holdings_f.data))

    sum_sec_boolean = holdings_b.toarray().sum(0)
    col_mask = (sum_sec_boolean >= 2).flatten()

    holdings_f = holdings_f.tocsc()
    holdings_f = holdings_f[:,col_mask]
    holdings_f = holdings_f.tocsr()
    
    ## Preprocessing
    holdings_ft = normalize(holdings_f)
    
    if (verbose):
        print('Numer of unique funds:           {:10,d}'.format(row_info_f.shape[0]))

        print('Numer of unique securities:      {:10,d}'.format(holdings_ft.shape[1]))

        print('Begin date:                      {}'.format(begin_date.date()))
        print('End date:                        {}'.format(end_date.date()))
    
    return(row_info_f, returns_f, holdings_ft, begin_date, end_date)

### Clustering

In [4]:
def spectral_clustering(holdings_ft, param):
    verbose = param['verbose']

    if(verbose): print('Start clustering...')
    clustering = SpectralClustering(n_clusters = param['n_clusters'],
                                    assign_labels = param['assign_labels'], # kmeans or discretize
                                    eigen_solver = 'arpack',
                                    affinity = param['affinity'],
                                    gamma = param['gamma'],
                                    n_init = param['n_init'],
                                    n_jobs = -1,
                                    random_state = 0).fit(holdings_ft)
    if(verbose): print('Clustering finished')
    
    return(clustering)

In [5]:
def kmeans_clustering(holdings_ft, param):
    verbose = param['verbose']

    if(verbose): print('Start clustering...')
    clustering = KMeans(n_clusters = param['n_clusters'],
                        verbose = verbose,
                        n_init = param['n_init'], # Number of runs
                        n_jobs= -1,
                        random_state = 1
                       ).fit(holdings_ft)
    
    if(verbose): print('Clustering finished')
    
    return(clustering)

In [6]:
def som_clustering(holdings_ft, param):
    verbose = param['verbose']
    if(verbose): print('Start clustering...')
    
    ### Initialization and training ###
    # Configure SOM
    som = MiniSom(x = 15,
                  y = 15,
                  input_len = holdings_ft.shape[1],
    #             neighborhood_function = 'triangle',
                  sigma = 3.0,
                  learning_rate = 0.5)

    # Initialize
    data = holdings_ft.toarray()
    som.random_weights_init(data)

    # Train
    som.train_random(data, 10_000, verbose = verbose) # training with 100 iterations
    
    som_quantized = som.quantization(data)

    clustering = KMeans(n_clusters = param['n_clusters'],
                        verbose = verbose,
                        n_init = param['n_init'], # Number of runs
                        n_jobs= -1,
                        random_state = 1
                       ).fit(som_quantized)

    if(verbose): print('Clustering finished')    
    return(clustering)

### Evaluation

In [7]:
def calc_styleadj_returns(row_info_f, returns_f, style_cols):

    row_info_m = row_info_f.copy()
    returns_m = returns_f.copy()
    
    returns_m = returns_m.sort_values(['crsp_fundno','report_dt'])

    # merge predicted styles onto returns
    returns_m = returns_m.merge(row_info_m[['crsp_fundno', 'report_dt', style_cols]],
                            how='left',
                            on=['crsp_fundno', 'report_dt'])

    # Forward fill all styles and drop nas
    returns_m = (returns_m.apply(lambda x: x.fillna(method = 'ffill'))
    )

    # Calc mean return per style
    style_returns = (returns_m
                        .groupby([style_cols,'report_dt'])
                        .mean()
                        .reset_index()
                        .drop(columns='crsp_fundno')
    )

    # Merge style returns onto fund returns and calc tracking error
    returns_m = (returns_m
                        .rename(columns = {'mret' : 'fund_ret'}) 
                        .merge(style_returns,
                                    how = 'left',
                                    on = [style_cols,'report_dt'])
                        .assign(error = lambda df: df['fund_ret'] - df['mret'])
                        .rename(columns = {'mret' : 'style_ret'}) 
    )

    returns_m = returns_m[['crsp_fundno', 'report_dt', style_cols,
                       'fund_ret', 'style_ret', 'error']]

    return(returns_m, style_returns)

In [8]:
def error_vola_deciles(returns_m): 
    error_vol = (returns_m
                 .groupby(['crsp_fundno'])['error']
                 .std()
                 .reset_index())
    error_vol['error'] = error_vol['error'] * 100
    
    error_vol = (error_vol[['error']]
                .apply(lambda x : x.quantile(np.round(np.arange(0.1,1,0.1),2))))
    return(error_vol)

def error_vola_describe(returns_m): 
    error_vol = (returns_m
                 .groupby(['crsp_fundno'])['error']
                 .std()
                 .reset_index())
    error_vol['error'] = error_vol['error'] * 100
    return(error_vol[['error']].describe().T)

In [9]:
def simulation(row_info_f, returns_f, n_iterations):
    np.random.seed()

    #n_iterations = 500
    
    # First choose n samples of funds with one fund per cluster
    funds_list = []
    cluster = np.array(row_info_f[['crsp_fundno','cluster']])
    arr = np.arange(row_info_f.shape[0])

    for i in np.arange(n_iterations):
        np.random.shuffle(arr)
        cluster = cluster[arr]
        index = np.unique(cluster[:,1], return_index = True, return_inverse = False)[1]
        funds = cluster[index,0]
        funds_list.append(funds)


    mean_return = []
    mean_std = []
    returns_fundnos = returns_f['crsp_fundno'].values

    for funds in funds_list:
        # Take returns for sample and calc equally weighted average return
        returns_index = np.isin(returns_fundnos,funds)
        returns_s = returns_f[returns_index]
        returns_s = returns_s.groupby('report_dt')['mret'].mean()

        # Calc mean and std
        mean_return.append(returns_s.std())
        mean_std.append(returns_s.mean())


    mean_return = pd.DataFrame(mean_return).mean()
    mean_std = pd.DataFrame(mean_std).mean()
    
    return([mean_return[0], mean_std[0]])

In [10]:
def simulation_wrapper(row_info_f, returns_f, n_iterations):
    result_list = []
    pool = Pool()
    
    for i in range(n_iterations):
        pool.apply_async(simulation, callback = result_list.append)
    pool.close()
    pool.join()
    result = result_list
    return(result)

### Plotting

In [11]:
def plot_cluster(row_info_f, style, ax):
    data = round(
        pd.crosstab(
            row_info_f[style],row_info_f['cluster'], 
            margins = True, normalize = 'columns') * 100, 2).T

    data.plot(kind='bar', 
                 stacked=True, ax = ax)

    ax.legend().remove()
    label_list = data.columns.values.astype(str).repeat(data.shape[0])
    rects = ax.patches

    # For each bar: Place a label
    for i, rect in enumerate(rects):
        if rect.get_height() > 10:
            # Get X and Y placement of label from rect.
            x_value = rect.get_x() + rect.get_width() / 2
            y_value = rect.get_y() + rect.get_height() / 2

            # Use X value as label and format number with one decimal place
            label = "{}".format(label_list[i])

            # Create annotation
            ax.annotate(
                label,                        # Use `label` as label
                (x_value, y_value),           # Place label at end of the bar
                xytext = (0, 0),              # Horizontally shift label by `space`
                textcoords = 'offset points', # Interpret `xytext` as offset in points
                va='center',                  # Vertically center label
                ha='center',
                color = 'white',
                size = 12)                  # Horizontally align label 
    return(ax)
    
def plot_cluster_wrapper(row_info_f):
    
    f, axes = plt.subplots(nrows = 4, ncols=1, sharex=True, 
                           figsize = (15,6), gridspec_kw={'height_ratios':[1,2,2,2]})
    
    data = row_info_f['cluster'].value_counts(sort=False).append(to_append = pd.Series([0]))
    data.plot(kind='bar', ax = axes[0])
    axes[0].annotate('Total: {:,d}'.format(np.sum(data)),(5,100),ha ='center')

    plot_cluster(row_info_f,'cap_class', ax = axes[1])
    plot_cluster(row_info_f,'style_class', ax = axes[2])
    plot_cluster(row_info_f,'lipper_class', ax = axes[3])

### Grid Wrapper

In [12]:
def expand_grid(dictionary):
    temp = pd.DataFrame([row for row in product(*dictionary.values())], 
                           columns=dictionary.keys())
    return(temp)

In [13]:
def weighted_average_score(param_grid, relevant_params, measures):


    param_grid['param_id'] = (param_grid
                                  .groupby(relevant_params)
                                  .ngroup())
        
    scores = param_grid[measures]
    params_only = param_grid.drop(columns = measures)
    
    weights = (param_grid[['year','count']]
                                  .drop_duplicates()
                                  .assign(weight = lambda x: x['count'] / np.sum(x['count'])))
    weights = weights[['weight']].values

    scores = scores.groupby(params_only['param_id']).apply(lambda x: np.sum(x * weights))

    params_only = (params_only
                      .drop_duplicates(relevant_params)
                      .drop(columns = ['year','count']))

    result = params_only.merge(scores, how = 'left', on = 'param_id')
    
    return(result)

In [14]:
def full_algo(param_grid):
    n_row = param_grid.shape[0]
    
    # Loop over all supplyed params
    print('Start with params...')
    for i, param in param_grid.iterrows():
        row_info_f, returns_f, holdings_ft, begin_date, end_date = filter_data(param)
        
        algo = param['algo']
        if(algo == 'kmeans'):   clustering = spectral_clustering(holdings_ft, param)
        if(algo == 'spectral'): clustering = kmeans_clustering(holdings_ft, param)
        if(algo == 'som'):      clustering = som_clustering(holdings_ft, param)

        row_info_f = row_info_f.assign(cluster = clustering.labels_)

        db_score = davies_bouldin_score(holdings_ft.toarray(), row_info_f['cluster'])
        s_score = silhouette_score(holdings_ft, row_info_f['cluster'])

        param_grid.loc[i,'count'] = row_info_f.shape[0]
        param_grid.loc[i,'score db'] = db_score
        param_grid.loc[i,'score silhouette'] = s_score

        sim_results = simulation(row_info_f, returns_f, n_iterations = 500)
        param_grid.loc[i,'sim mret'] = sim_results[0]
        param_grid.loc[i,'sim std'] = sim_results[1]

        returns_m, style_returns = calc_styleadj_returns(row_info_f, returns_f, style_cols='cluster')
        param_grid.loc[i,'median tevola'] = error_vola_describe(returns_m)['50%'][0]
        
        progress = (i+1) / n_row * 100
        print('Progress:                {:4.2f}%'.format(progress))
    
    print('Evaluate Lipper clusters...')
    # Evaluate standart lipper classification
    years = param_grid['year'].unique()

    lipper_grid = pd.DataFrame()
    param_lipper = dict(
                    year = years,
                    verbose = [False]
                    )
    param_grid_lipper = expand_grid(param_lipper)
        
    for i, param_lipper in param_grid_lipper.iterrows():
        row_info_f, returns_f, holdings_ft, begin_date, end_date = filter_data(param_lipper)
        row_info_f = row_info_f.assign(cluster = row_info_f['lipper_class_num'])

        db_score = davies_bouldin_score(holdings_ft.toarray(), row_info_f['cluster'])
        s_score = silhouette_score(holdings_ft, row_info_f['cluster'])

        lipper_grid.loc[i,'year'] = param_lipper['year']
        lipper_grid.loc[i,'count'] = row_info_f.shape[0]
        lipper_grid.loc[i,'score db'] = db_score
        lipper_grid.loc[i,'score silhouette'] = s_score

        sim_results = simulation(row_info_f, returns_f, n_iterations = 500)
        lipper_grid.loc[i,'sim mret'] = sim_results[0]
        lipper_grid.loc[i,'sim std'] = sim_results[1]
        lipper_grid.loc[i,'algo'] = 'lipper'
        
        returns_m, style_returns = calc_styleadj_returns(row_info_f, returns_f, style_cols='cluster')
        lipper_grid.loc[i,'median tevola'] = error_vola_describe(returns_m)['50%'][0]
        
    # Concat and calc year weighted averages
    param_grid_full = pd.concat([param_grid,lipper_grid], axis = 0, sort=False)
    result = weighted_average_score(param_grid_full,
                                    relevant_params = ['n_clusters','assign_labels',
                                                       'affinity','gamma','n_init','algo'], 
                                    measures = ['score db', 'score silhouette', 'sim mret', 
                                                'sim std', 'median tevola'])
    result = result.drop(columns = ['style_class','verbose'])
    result['years'] = '{} - {}'.format(np.min(years), np.max(years))
    print('Finished')

    return(result)

## Clustering

### Options

In [15]:
full = [2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]
param = dict(
    year             = [2015],         # Integer or string 'full' for the whole sample
    n_clusters       = [12],
    assign_labels    = ['kmeans'],       # 'kmeans' or 'discretize'
    affinity         = ['rbf'],          # One of: rbf, nearest_neighbors
    gamma            = [1],              # Sigma for rbf kernal
    n_init           = [100],            # N init of kmeans
    style_class      = ['lipper_class'], # Choose lipper_class, style_class or cap_class    
    verbose          = [False],
    algo             = ['kmeans','spectral','som']
)

param_grid = expand_grid(param)

In [None]:
result = full_algo(param_grid)

Start with params...
Progress:                33.33%


In [17]:
result

Unnamed: 0,n_clusters,assign_labels,affinity,gamma,n_init,algo,model,param_id,score db,score silhouette,sim mret,sim std,median tevola,years
0,12.0,kmeans,rbf,1.0,1.0,kmeans,clustering,0,4.32244,0.012,0.027271,0.010451,0.970588,2015 - 2016
1,12.0,kmeans,rbf,1.0,1.0,spectral,clustering,2,5.358464,0.03,0.028497,0.01049,0.951326,2015 - 2016
2,12.0,kmeans,rbf,1.0,1.0,som,clustering,1,5.815274,0.021137,0.028023,0.01059,0.927122,2015 - 2016
3,,,,,,lipper,,-1,7.137715,0.012997,0.029263,0.011409,0.932821,2015 - 2016


In [37]:
result.query(''' algo == 'kmeans' ''')

Unnamed: 0,n_clusters,assign_labels,affinity,gamma,n_init,algo,model,param_id,score db,score silhouette,sim mret,sim std,median tevola,years
0,12.0,kmeans,rbf,1.0,1.0,kmeans,clustering,0,4.32244,0.012,0.027411,0.010492,0.970588,2015 - 2016


In [27]:
result.query(''' model == 'lipper' ''')

Unnamed: 0,n_clusters,assign_labels,affinity,gamma,n_init,algo,model,param_id,score db,score silhouette,sim mret,sim std,median tevola
16,,,,,,,lipper,-1,7.299811,0.011866,0.031538,0.007055,0.912074


## Visualize

In [None]:
fig, ax1 = plt.subplots(figsize = (14,4))

ax2 = ax1.twinx()

sns.lineplot(data = result ,x = 'n_clusters', y='DB score', color='g', ax=ax1)
sns.lineplot(data = result ,x = 'n_clusters', y='Silhouette score', ax=ax2)

ax1.set_xlabel('n_cluster')
ax1.set_ylabel('DB score', color='g')
ax2.set_ylabel('Slihouette score', color='b')

plt.show()

In [None]:
plot_cluster_wrapper(row_info_f)

In [None]:
### Based on example code from sklearn ###

X = holdings_ft
n_clusters = [2,3,4,5,6,7,8,9,10,11,12,13,14,18,25]

clusters = []
results = []

for n_clusters in n_clusters:
    clusterer = SpectralClustering(
                    n_clusters=n_clusters,
                    n_init = 1000)
    
    
    cluster_fit = clusterer.fit(X)
    clusters.append(n_clusters)
    #results.append(cluster_fit.inertia_)    
    
    cluster_labels = cluster_fit.labels_
    
    # Create a subplot with 1 row and 2 columns
    fig, ax1 = plt.subplots()
    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot
    # The silhouette coefficient can range from -1, 1 but in this example all
    # lie within [-0.1, 1]
    ax1.set_xlim([-0.2, 0.6])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, X.shape[0] + (n_clusters + 1) * 10])
    
    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters =", n_clusters,
          'The average silhouette_score is: {}'.format(round(silhouette_avg,4)))

    # Compute the silhouette scores for each sample

    # TODO can pool.map be implemented here?
    sample_silhouette_values = silhouette_samples(X, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = %d" % n_clusters),
                 fontsize=14, fontweight='bold')

plt.show()

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Description" data-toc-modified-id="Description-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Description</a></span></li><li><span><a href="#Setup" data-toc-modified-id="Setup-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Setup</a></span></li><li><span><a href="#Other-functions" data-toc-modified-id="Other-functions-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Other functions</a></span></li><li><span><a href="#Clustering" data-toc-modified-id="Clustering-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Clustering</a></span><ul class="toc-item"><li><span><a href="#Options" data-toc-modified-id="Options-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Options</a></span></li><li><span><a href="#Spectral" data-toc-modified-id="Spectral-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Spectral</a></span></li><li><span><a href="#K-means" data-toc-modified-id="K-means-4.3"><span class="toc-item-num">4.3&nbsp;&nbsp;</span>K-means</a></span></li></ul></li><li><span><a href="#Visualize" data-toc-modified-id="Visualize-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Visualize</a></span></li><li><span><a href="#Chart-the-returns-of-the-formed-clustes" data-toc-modified-id="Chart-the-returns-of-the-formed-clustes-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Chart the returns of the formed clustes</a></span></li><li><span><a href="#Analysing-clusters" data-toc-modified-id="Analysing-clusters-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Analysing clusters</a></span></li></ul></div>