<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Description" data-toc-modified-id="Description-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Description</a></span></li><li><span><a href="#Setup" data-toc-modified-id="Setup-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Setup</a></span><ul class="toc-item"><li><span><a href="#Options" data-toc-modified-id="Options-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Options</a></span></li><li><span><a href="#Load-Data" data-toc-modified-id="Load-Data-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Load Data</a></span></li><li><span><a href="#Filter-data" data-toc-modified-id="Filter-data-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Filter data</a></span></li><li><span><a href="#Other-functions" data-toc-modified-id="Other-functions-2.4"><span class="toc-item-num">2.4&nbsp;&nbsp;</span>Other functions</a></span></li></ul></li><li><span><a href="#SOM-clustering" data-toc-modified-id="SOM-clustering-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>SOM clustering</a></span></li><li><span><a href="#Plot-SOM-Map" data-toc-modified-id="Plot-SOM-Map-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Plot SOM Map</a></span></li><li><span><a href="#Test-Som-Parameters" data-toc-modified-id="Test-Som-Parameters-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Test Som Parameters</a></span></li></ul></div>

# Clustering

## Description

- Cluster funds by training a self-organizing map (SOM) on their holdings and running k-means on the SOM-quantized data

## Setup

In [None]:
import feather
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import sparse

import holoviews as hv
from holoviews import opts
hv.extension('bokeh', 'matplotlib')

import matplotlib.pyplot as plt
import matplotlib.cm as cm

from sklearn.preprocessing import LabelEncoder, normalize, minmax_scale
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import KMeans

from minisom import MiniSom

In [None]:
# Show one randomly drawn row of row_info.
# NOTE(review): row_info is only assigned in the "Load Data" cell further down,
# so that cell has to be executed before this one.
row_info.sample()

In [None]:
# Wrap row_info in a HoloViews Dataset with (year, crsp_fundno) as key
# dimensions; each vdims entry pairs a column name with its display label.
vdims = [('lipper_class', 'Lipper Class'), ('cap_class', 'Cap Class'), ('tna_latest', 'TNA')]
ds = hv.Dataset(row_info, ['year', 'crsp_fundno'], vdims)

In [None]:
# Aggregate the dataset with np.mean (no explicit dimensions are passed)
# and display the result.
ds = ds.aggregate(function=np.mean)
ds

In [None]:
# Convert the dataset to Curve elements with `year` on the x-axis and
# `tna_latest` on the y-axis, then set the plot size options.
layout = (ds.to(hv.Curve, 'year', 'tna_latest') )
layout.opts(
    opts.Curve(width=500, height=250, framewise=True))

### Options

In [None]:
# Notebook-level options.
# NOTE(review): neither name is read in the visible cells -- the filter_data
# call passes the literal 2014 -- confirm whether these are still needed.
year             = 2017
style_class      = 'lipper_class' # Choose lipper_class, style_class or cap_class

### Load Data

In [None]:
# Load the processed inputs into notebook-level variables that the cells and
# functions below read directly.

### Returns
# `caldt` is renamed to `report_dt` and a `year` column is derived from it.
path = '../data/processed/returns.feather'
returns = feather.read_dataframe(path)
returns = returns.rename(columns = {'caldt' : 'report_dt'})
returns = returns.assign(year = returns['report_dt'].dt.year)

### row_info
# Its `row` column is used in filter_data to index the rows of `holdings`.
path = '../data/processed/row_info_f.feather'
row_info = feather.read_dataframe(path)

### col_info
path = '../data/processed/col_info_f.feather'
col_info = feather.read_dataframe(path)

### Holdings
# Sparse matrix stored in scipy's .npz format.
path = '../data/processed/holdings_f.npz'
holdings = sparse.load_npz(path)

### Filter data 

In [None]:
def filter_data(year, verbose = False, preprocessing = 'l2'):
    """Select one reporting year of funds with matching returns and holdings.

    Reads the notebook-level ``row_info``, ``returns`` and ``holdings``
    objects; none of them is modified.

    Parameters
    ----------
    year : int or 'full'
        Value matched against the ``year`` column of ``row_info``. The string
        ``'full'`` keeps every row.
    verbose : bool, default False
        Print the number of funds and securities and the date window.
    preprocessing : {'l2', 'l1', 'none'}, default 'l2'
        Row normalisation applied to the filtered holdings matrix.

    Returns
    -------
    tuple
        ``(row_info_f, returns_f, holdings_ft, begin_date, end_date)``.

    Raises
    ------
    ValueError
        If ``preprocessing`` is not recognised or no row matches ``year``.
    """
    if preprocessing not in ('none', 'l1', 'l2'):
        raise ValueError(
            "preprocessing must be 'none', 'l1' or 'l2', got {!r}".format(preprocessing))

    row_info_f = row_info.copy()
    if (year != 'full'):    # If year = full take whole sample
        row_info_f = row_info_f.query('year == @year')
    if row_info_f.empty:
        raise ValueError('row_info has no rows for year {!r}'.format(year))

    # The window starts at the report date of the first selected row and
    # spans one year from there.
    begin_date = row_info_f['report_dt'].iloc[0]
    end_date = begin_date + pd.DateOffset(years=1) # 1 year offset
    row_info_f = row_info_f.reset_index(drop = True)

    # Filter returns to the window and to the selected funds. Copy only the
    # filtered slice so the assignment below never touches `returns`.
    crsp_fundno_unique = row_info_f['crsp_fundno'].unique()
    query = '''report_dt >= @begin_date and report_dt <= @end_date and crsp_fundno in @crsp_fundno_unique'''
    returns_f = returns.query(query).copy()

    # Change return of month for which holdings apply to 0
    mask = returns_f['report_dt'] == begin_date
    returns_f.loc[mask,'mret'] = 0

    # Drop all funds with first return observation after starting date:
    # their first remaining row was not zeroed above.
    drop_fundnos = returns_f.drop_duplicates('crsp_fundno').query('mret != 0')['crsp_fundno']
    returns_f = returns_f.query('crsp_fundno not in @drop_fundnos')
    row_info_f = row_info_f.query('crsp_fundno not in @drop_fundnos')

    # Filter holdings accordingly and delete all securities with less than
    # two observations. In CSC form, consecutive differences of `indptr` give
    # the number of stored entries per column, so no dense copy is needed.
    holdings_csc = holdings[row_info_f['row'].values].tocsc()
    col_mask = np.diff(holdings_csc.indptr) >= 2
    holdings_f = holdings_csc[:,col_mask].tocsr()

    ## Preprocessing
    if preprocessing == 'none':
        holdings_ft = holdings_f
    else:
        holdings_ft = normalize(holdings_f, norm = preprocessing)

    if (verbose):
        print('Numer of unique funds:           {:10,d}'.format(row_info_f.shape[0]))
        print('Numer of unique securities:      {:10,d}'.format(holdings_ft.shape[1]))
        print('Begin date:                      {}'.format(begin_date.date()))
        print('End date:                        {}'.format(end_date.date()))

    return(row_info_f, returns_f, holdings_ft, begin_date, end_date)

In [None]:
# Build the filtered sample used by the cells below.
# NOTE(review): the year is passed as the literal 2014 rather than the `year`
# option set in the Options cell -- confirm which one is intended.
row_info_f, returns_f, holdings_ft, begin_date, end_date = filter_data(2014, verbose = True)

### Other functions

In [None]:
def style_map(row_info):
    """Draw each cluster id at its average market-cap / style position.

    Rows whose ``lipper_class`` is ``'EIEI'`` are excluded. For every cluster
    the percentage split over ``cap_class`` (weighted by 0..3) and over
    ``style_class`` (weighted by 0..2) gives the x and y coordinate; the font
    size grows with the number of rows in the cluster.

    Parameters
    ----------
    row_info : pandas.DataFrame
        Must provide the columns ``lipper_class``, ``cap_class``,
        ``style_class`` and ``cluster``.
    """
    row_info_temp = row_info.query(''' lipper_class !=  'EIEI' ''')

    cap = [0,1,2,3]
    style = [0,1,2]

    # Font size per cluster, keyed by cluster id so it can be looked up by
    # label below.
    counts = row_info_temp['cluster'].value_counts().sort_index()
    size = pd.Series(minmax_scale(counts,feature_range=(1,2)) * 20,
                     index = counts.index)

    def weighted_position(column, scores):
        # Percentage split of `column` within each cluster (plus the margin
        # row), reduced to one score-weighted average per row.
        shares = round(
            pd.crosstab(
                row_info_temp[column],row_info_temp['cluster'],
                margins = True, normalize = 'columns') * 100, 2).T
        return shares.apply(lambda row: np.sum(row * scores) / 100, axis = 1)

    x = weighted_position('cap_class', cap)
    y = weighted_position('style_class', style)

    # The last row is the margin added by crosstab, not a cluster.
    label = x.index[:-1]

    fig = plt.figure(figsize=(15,4))
    ax_s = fig.add_subplot(111)

    plt.xlabel('Market cap dimension')
    plt.xticks([0,1,2,3], ['SC','MC','ML','LC'])

    plt.ylabel('Style dimension')
    plt.yticks([0,1,2], ['V','C','G'])

    # Look everything up by cluster label rather than by loop position, so
    # text, coordinates and size stay aligned even when the cluster ids are
    # not exactly 0..k-1.
    for txt in label:
        ax_s.annotate(txt, (x.loc[txt], y.loc[txt]),
                     xytext = (0, 0),              # No offset from the anchor point
                     textcoords = 'offset points', # Interpret `xytext` as offset in points
                     va='center',                  # Vertically center label
                     ha='center',                  # Horizontally center label
                     color = 'black',
                     size = size.loc[txt])

    plt.show()
    
def plot_cluster(row_info, style, ax):
    """Draw a stacked percentage bar chart of `style` per cluster on `ax`.

    Each bar is one cluster (plus the crosstab margin); its segments show the
    percentage split over the values of ``row_info[style]``. Segments taller
    than 10 are labelled with the category they represent. Returns `ax`.
    """
    shares = pd.crosstab(
        row_info[style], row_info['cluster'],
        margins = True, normalize = 'columns')
    shares = round(shares * 100, 2).T

    shares.plot(kind='bar', stacked=True, ax = ax)
    ax.legend().remove()

    # One category name per segment: every category name is repeated once
    # for each bar.
    segment_names = shares.columns.values.astype(str).repeat(shares.shape[0])

    for idx, patch in enumerate(ax.patches):
        # Small segments have no room for a label.
        if not patch.get_height() > 10:
            continue

        # Centre of the segment.
        centre = (patch.get_x() + patch.get_width() / 2,
                  patch.get_y() + patch.get_height() / 2)

        ax.annotate(
            "{}".format(segment_names[idx]),
            centre,
            xytext = (0, 0),              # no offset from the centre
            textcoords = 'offset points', # `xytext` is given in points
            va = 'center',
            ha = 'center',
            color = 'white',
            size = 12)
    return(ax)
    
def plot_cluster_wrapper(row_info):
    """Show cluster sizes, three composition panels and the style map.

    The figure stacks four panels on a shared x-axis: the number of rows per
    cluster, then the `cap_class`, `style_class` and `lipper_class` splits
    drawn by `plot_cluster`. `style_map` is called afterwards.

    Parameters
    ----------
    row_info : pandas.DataFrame
        Must provide the ``cluster`` column plus the columns read by
        `plot_cluster` and `style_map`.
    """
    f, axes = plt.subplots(nrows = 4, ncols=1, sharex=True,
                           figsize = (15,6), gridspec_kw={'height_ratios':[1,2,2,2]})

    # Pad with one zero-height bar so this panel has as many bars as the
    # crosstab panels, which carry an extra margin bar.
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    counts = row_info['cluster'].value_counts().sort_index()
    data = pd.concat([counts, pd.Series([0])])
    data.plot(kind='bar', ax = axes[0])

    # Anchor the total in axes coordinates: a fixed data position can fall
    # outside the plotted range, in which case matplotlib clips the text.
    axes[0].annotate('Total: {:,d}'.format(np.sum(data)), (0.99, 0.9),
                     xycoords = 'axes fraction',
                     ha = 'right', va = 'top', size = 14)

    plot_cluster(row_info,'cap_class', ax = axes[1])
    plot_cluster(row_info,'style_class', ax = axes[2])
    plot_cluster(row_info,'lipper_class', ax = axes[3])
    plt.show()
    style_map(row_info)

In [None]:
def som_map(som, row_info, holdings, dimension = 'lipper_class', figsize=(15,15)):
    """Plot every observation as a text label on its winning SOM node.

    Parameters
    ----------
    som : object
        Trained map exposing ``winner(vector)`` and ``get_weights()``.
    row_info : pandas.DataFrame
        One row per row of `holdings`; ``row_info[dimension]`` supplies the
        text and colour of each label.
    holdings : scipy sparse matrix
        Observations, one per row. Converted to a dense array internally.
    dimension : str, default 'lipper_class'
        Column of `row_info` used for labelling.
    figsize : tuple, default (15, 15)
        Size handed to ``plt.figure``.

    Raises
    ------
    ValueError
        If `row_info` and `holdings` have a different number of rows.
    """
    data = holdings.toarray()
    labels = row_info[dimension].values
    n_obs = data.shape[0]
    if len(labels) != n_obs:
        raise ValueError(
            'row_info has {} rows but holdings has {}'.format(len(labels), n_obs))

    # Integer code per label value, used to pick a colour.
    codes = LabelEncoder().fit_transform(labels)
    cmap = cm.tab10(np.linspace(0.0, 1.0, np.max(codes) + 1), alpha = 0.7)

    # One jitter value per observation (row), so labels that share a node do
    # not overlap exactly and no observation is dropped by zip.
    jit_x = np.random.normal(loc = 0, scale = 0.15, size = n_obs)
    jit_y = np.random.normal(loc = 0, scale = 0.15, size = n_obs)

    plt.figure(figsize=figsize)

    for vector, text, code, dx, dy in zip(data, labels, codes, jit_x, jit_y):
        w = som.winner(vector)
        plt.text(w[0]+.5 + dx,  w[1]+.5 + dy,  str(text),
                 color = cmap[code], fontdict={'size': 10})

    grid_shape = som.get_weights().shape
    plt.axis([0, grid_shape[0], 0, grid_shape[1]])
    plt.show()

## SOM clustering

In [None]:
### Initialization and training ###
# 20 x 20 map whose input length equals the number of holdings columns.
som = MiniSom(x = 20,
              y = 20,
              input_len = holdings_ft.shape[1],
              neighborhood_function = 'gaussian',
              sigma = 1.0,
              learning_rate = 0.5)

In [None]:
# Initialise the SOM weights from the (densified) holdings and train the map.
data = holdings_ft.toarray()
som.random_weights_init(data)
print("Training...")

som.train_random(data, 20_000, verbose = True) # 20,000 training iterations
print("\n...ready!")

In [None]:
# Quantize the holdings with the trained SOM, then cluster the quantized
# data with k-means and store the labels on row_info_f.
print('Start quantization...')

data = holdings_ft.toarray()
som_quantized = som.quantization(data)

print('Start kMeans...')
# `n_jobs` is no longer passed: KMeans dropped that argument in
# scikit-learn 1.0, where supplying it raises a TypeError.
kmeans = KMeans(n_clusters = 10,
                verbose = True,
                n_init = 100, # Number of runs
               ).fit(som_quantized)

row_info_f = row_info_f.assign(cluster = kmeans.labels_)

## Plot SOM Map

In [None]:
# Draw one SOM map per labelling column: cap class, style class and the
# k-means cluster id.
for dim in ['cap_class','style_class','cluster']:
    som_map(som, row_info_f, holdings_ft, dimension = dim, figsize=(10,10))

In [None]:
# Cluster sizes, per-cluster composition and the style map for the clustered sample.
plot_cluster_wrapper(row_info_f)

## Test Som Parameters 

- Size makes a big difference but also takes a lot of time. Good compromise leaning to fast execution: 20
- Iterations are also important, but with diminishing returns; 5,000–10,000 work well
- Sigma does not seem to make a big difference. We stick with 1
- Learning rate does not seem to make a big difference. We stick with 0.5


In [None]:
# NOTE(review): every other MiniSom call in this notebook passes x, y and
# input_len; a bare call presumably raises a TypeError -- looks like a
# leftover used to inspect the signature, confirm before keeping.
MiniSom()

In [None]:
def SOM_training(dim = 20, 
                 n_iterations = 6_000, 
                 sigma = 1,
                 neighborhood_function = 'gaussian', # 'gaussian', 'mexican_hat', 'bubble', 'triangle'
                 learning_rate = 0.5):
    """Train a `dim` x `dim` SOM on the notebook-level ``holdings_ft``.

    Prints progress messages and the quantization error of the trained map.

    Parameters
    ----------
    dim : int, default 20
        Number of nodes along each side of the map.
    n_iterations : int, default 6_000
        Number of iterations handed to ``train_random``.
    sigma : float, default 1
        Passed to ``MiniSom`` as ``sigma``.
    neighborhood_function : str, default 'gaussian'
        Passed to ``MiniSom`` as ``neighborhood_function``.
    learning_rate : float, default 0.5
        Passed to ``MiniSom`` as ``learning_rate``.
    """
    ### Initialization and training ###
    # sigma and learning_rate are forwarded from the arguments, so the values
    # passed by the caller actually take effect.
    som = MiniSom(x = dim,
                  y = dim,
                  input_len = holdings_ft.shape[1],
                  neighborhood_function = neighborhood_function,
                  sigma = sigma,
                  learning_rate = learning_rate)

    data = holdings_ft.toarray()
    som.random_weights_init(data)
    print("Training...")

    som.train_random(data, n_iterations, verbose = True) # n_iterations training iterations

    # Report the error so runs with different parameters can be compared.
    print("Quantization error: {}".format(som.quantization_error(data)))

    print("\n...ready!")

In [None]:

# Compare neighborhood functions at 10,000 iterations each.
SOM_training(n_iterations= 10_000, neighborhood_function= 'gaussian')
SOM_training(n_iterations= 10_000, neighborhood_function= 'bubble')

In [None]:
# Same run with the 'triangle' neighborhood function.
SOM_training(n_iterations= 10_000, neighborhood_function= 'triangle')