<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Description" data-toc-modified-id="Description-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Description</a></span></li><li><span><a href="#Setup" data-toc-modified-id="Setup-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Setup</a></span><ul class="toc-item"><li><span><a href="#Options" data-toc-modified-id="Options-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Options</a></span></li><li><span><a href="#Load-Data" data-toc-modified-id="Load-Data-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Load Data</a></span></li><li><span><a href="#Filter-data" data-toc-modified-id="Filter-data-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Filter data</a></span></li><li><span><a href="#Other-functions" data-toc-modified-id="Other-functions-2.4"><span class="toc-item-num">2.4&nbsp;&nbsp;</span>Other functions</a></span></li></ul></li><li><span><a href="#SOM-clustering" data-toc-modified-id="SOM-clustering-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>SOM clustering</a></span></li><li><span><a href="#Plot-SOM-Map" data-toc-modified-id="Plot-SOM-Map-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Plot SOM Map</a></span></li><li><span><a href="#Test-Som-Parameters" data-toc-modified-id="Test-Som-Parameters-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Test Som Parameters</a></span></li></ul></div>

# Clustering

## Description

- Cluster funds by training a self-organizing map (SOM) on the holdings and then running k-means on the SOM-quantized data

## Setup

In [None]:
import feather
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import sparse

import pickle

import matplotlib.pyplot as plt
import matplotlib.cm as cm

from sklearn.preprocessing import LabelEncoder, normalize, minmax_scale
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import KMeans

from minisom import MiniSom

### Options

### Load Data

In [None]:
path = '../data/processed/full.pickle'
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load files that were produced by this project's own pipeline.
# The context manager closes the handle as soon as the data is read instead of
# leaving it open for the lifetime of the kernel.
with open(path, "rb") as pickle_off:
    dict_all_years = pickle.load(pickle_off)

### Filter data 

### Other functions

In [None]:
def style_map(row_info):
    """Place every cluster as a text label on a market-cap / style grid.

    Rows whose ``lipper_class`` is ``'EIEI'`` are excluded. For each cluster the
    percentage split over ``cap_class`` (x axis) and ``style_class`` (y axis) is
    turned into a weighted average position, and the cluster id is drawn at that
    position with a font size that grows with the number of rows in the cluster.

    Parameters
    ----------
    row_info : pandas.DataFrame
        Must provide the columns ``lipper_class``, ``cap_class``,
        ``style_class`` and ``cluster``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.

    Notes
    -----
    The weights ``[0, 1, 2, 3]`` and ``[0, 1, 2]`` are applied positionally to
    the crosstab columns.
    NOTE(review): this assumes exactly four ``cap_class`` and three
    ``style_class`` categories whose sorted order matches the tick labels
    ('SC','MC','ML','LC' and 'V','C','G') - confirm against the data.
    """
    row_info_temp = row_info.query(''' lipper_class !=  'EIEI' ''')

    cap = [0,1,2,3]
    style = [0,1,2]

    # Rows per cluster, rescaled to a font size between 20 and 40 points.
    counts = row_info_temp['cluster'].value_counts().sort_index()
    size = minmax_scale(counts,feature_range=(1,2)) * 20

    # Percentage of each cap class per cluster (plus an 'All' margin row).
    cap_share = round(
            pd.crosstab(
                row_info_temp['cap_class'],row_info_temp['cluster'],
                margins = True, normalize = 'columns') * 100, 2).T
    x = cap_share.apply(lambda row: np.sum(row * cap) / 100, axis = 1)

    # Percentage of each style class per cluster (plus an 'All' margin row).
    style_share = round(
            pd.crosstab(
                row_info_temp['style_class'],row_info_temp['cluster'],
                margins = True, normalize = 'columns') * 100, 2).T
    y = style_share.apply(lambda row: np.sum(row * style) / 100, axis = 1)

    # Drop the trailing 'All' margin entry; only real clusters are labelled.
    label = x.index[:-1]

    fig = plt.figure(figsize=(15,4))
    ax_s = fig.add_subplot(111)

    ax_s.set_xlabel('Market cap dimension')
    ax_s.set_xticks([0,1,2,3])
    ax_s.set_xticklabels(['SC','MC','ML','LC'])

    ax_s.set_ylabel('Style dimension')
    ax_s.set_yticks([0,1,2])
    ax_s.set_yticklabels(['V','C','G'])

    # x, y and size are all ordered by sorted cluster id, so walk them
    # positionally. Looking values up with the enumerate counter as a key only
    # worked when the cluster ids happened to be exactly 0..n-1.
    for txt, x_pos, y_pos, font_size in zip(label, x.values, y.values, size):
        ax_s.annotate(txt, (x_pos, y_pos),
                      xytext = (0, 0),              # No offset from the anchor point
                      textcoords = 'offset points', # Interpret `xytext` as offset in points
                      va='center',                  # Vertically center label
                      ha='center',                  # Horizontally center label
                      color = 'black',
                      size = font_size)

    plt.show()
    
def plot_cluster(row_info, style, ax):
    """Draw a stacked bar chart of one attribute's percentage split per cluster.

    Parameters
    ----------
    row_info : pandas.DataFrame
        Needs a ``cluster`` column and the column named by ``style``.
    style : str
        Column of ``row_info`` whose categories are stacked inside each bar.
    ax : matplotlib axes
        Axes to draw on; its legend is removed and segments taller than 10
        (percent) are annotated with their category name.

    Returns
    -------
    The ``ax`` that was passed in.
    """
    shares = pd.crosstab(row_info[style], row_info['cluster'],
                         margins = True, normalize = 'columns')
    data = round(shares * 100, 2).T

    data.plot(kind='bar', stacked=True, ax = ax)
    ax.legend().remove()

    # One label per patch: each category name repeated once per bar.
    # NOTE(review): relies on the patches being ordered category by category - confirm.
    label_list = data.columns.values.astype(str).repeat(data.shape[0])

    for idx, bar in enumerate(ax.patches):
        # Small segments stay unlabelled so the text does not overflow them.
        if not bar.get_height() > 10:
            continue

        # Centre of the segment in data coordinates.
        centre = (bar.get_x() + bar.get_width() / 2,
                  bar.get_y() + bar.get_height() / 2)

        ax.annotate(
            "{}".format(label_list[idx]),
            centre,
            xytext = (0, 0),
            textcoords = 'offset points',
            va = 'center',
            ha = 'center',
            color = 'white',
            size = 12)

    return ax
    
def plot_cluster_wrapper(row_info):
    """Show the cluster overview figure followed by the style map.

    The figure has four stacked panels sharing the x axis: the number of rows
    per cluster, then the percentage breakdown of each cluster by
    ``cap_class``, ``style_class`` and ``lipper_class`` (via ``plot_cluster``).
    Afterwards ``style_map`` is called on the same data.

    Parameters
    ----------
    row_info : pandas.DataFrame
        Needs the columns ``cluster``, ``cap_class``, ``style_class`` and
        ``lipper_class``.
    """
    f, axes = plt.subplots(nrows = 4, ncols=1, sharex=True,
                           figsize = (15,6), gridspec_kw={'height_ratios':[1,2,2,2]})

    # Series.append was removed in pandas 2.0; pd.concat builds the same series
    # on old and new pandas versions alike.
    # NOTE(review): the trailing zero presumably adds an empty bar so this panel
    # lines up with the 'All' margin bar of the panels below - confirm.
    counts = row_info['cluster'].value_counts().sort_index()
    data = pd.concat([counts, pd.Series([0])])
    data.plot(kind='bar', ax = axes[0])
    # The annotation position (12, 100) is given in data coordinates.
    axes[0].annotate('Total: {:,d}'.format(np.sum(data)),(12,100),ha ='center',size=14)

    plot_cluster(row_info,'cap_class', ax = axes[1])
    plot_cluster(row_info,'style_class', ax = axes[2])
    plot_cluster(row_info,'lipper_class', ax = axes[3])
    plt.show()
    style_map(row_info)

In [None]:
def som_map(som, row_info, holdings, dimension = 'lipper_class', figsize=(15,15)):
    """Scatter every sample onto its winning SOM node, labelled by one attribute.

    Each row of ``holdings`` is mapped to its winning node with
    ``som.winner``; the value of ``row_info[dimension]`` for that row is written
    at the node's cell centre plus a small random jitter so that samples sharing
    a node do not overlap exactly. Text colour encodes the label value.

    Parameters
    ----------
    som : trained map exposing ``winner`` and ``get_weights``
    row_info : pandas.DataFrame
        Row metadata; ``row_info[dimension]`` supplies the text labels.
        NOTE(review): assumed to be row-aligned with ``holdings`` - confirm.
    holdings : matrix exposing ``toarray()``
        One row per sample.
    dimension : str
        Column of ``row_info`` to display; also used as the figure title.
    figsize : tuple
        Figure size handed to ``plt.figure``.
    """
    data = holdings.toarray()

    labels = row_info[dimension].values
    codes = LabelEncoder().fit_transform(labels)

    # One colour per distinct label value, sampled along the tab10 colormap.
    cmap = cm.tab10(np.linspace(0.0, 1.0, np.max(codes) + 1), alpha = 0.7)

    # One jitter offset per sample (row). Sizing these by the number of columns
    # made zip() below silently drop samples whenever there were fewer columns
    # than rows.
    n_samples = data.shape[0]
    jitter_x = np.random.normal(loc = 0, scale = 0.15, size = n_samples)
    jitter_y = np.random.normal(loc = 0, scale = 0.15, size = n_samples)

    plt.figure(figsize=figsize)

    for sample, text, code, dx, dy in zip(data, labels, codes, jitter_x, jitter_y):
        w = som.winner(sample)
        # +.5 moves the text from the cell corner to the cell centre.
        plt.text(w[0]+.5 + dx,  w[1]+.5 + dy,  str(text),
                 color = cmap[code], fontdict={'size': 10})

    grid_shape = som.get_weights().shape
    plt.axis([0, grid_shape[0], 0, grid_shape[1]])
    plt.title(str(dimension))
    plt.xticks(size = 15)
    plt.yticks(size = 15)
    plt.show()

## SOM clustering

In [None]:
# Select the 2018 entry: the holdings matrix that is fed to the SOM and the
# row-level info table that later receives the cluster labels.
holdings_ft = dict_all_years[2018]['holdings_ft']
row_info_f = dict_all_years[2018]['row_info_f']

In [None]:
### Initialization and training ###
som = MiniSom(x = 15,
              y = 15,
              input_len = holdings_ft.shape[1],
              neighborhood_function = 'gaussian',
              sigma = 1.0,
              learning_rate = 0.5)

In [None]:
# Dense array version of the holdings matrix, used for initialisation and training.
data = holdings_ft.toarray()
# Initialise the SOM weights from the data before training starts.
som.random_weights_init(data)
print("Training...")

som.train_random(data, 10_000, verbose = True) # training with 10,000 iterations
print("\n...ready!")

In [None]:
print('Start quantization...')

data = holdings_ft.toarray()

# Quantize the samples with the trained SOM; k-means then runs on this
# SOM-quantized data instead of the raw holdings.
som_quantized = som.quantization(data)

print('Start kMeans...')
# `n_jobs` is no longer passed: scikit-learn deprecated it for KMeans in 0.23
# and removed it in 1.0, where it raises a TypeError.
# `random_state` keeps the cluster assignment repeatable between runs.
kmeans = KMeans(n_clusters = 13,
                verbose = True,
                n_init = 250, # Number of runs
                random_state = 42
               ).fit(som_quantized)

# Attach the cluster label of every row to the row info table.
row_info_f = row_info_f.assign(cluster = kmeans.labels_)

## Plot SOM Map

In [None]:
# Draw the SOM once per labelling: cap class, style class and assigned cluster.
for dim in ('cap_class', 'style_class', 'cluster'):
    som_map(som, row_info_f, holdings_ft, dim, (10, 10))

In [None]:
# Cluster sizes, per-cluster class breakdowns and the style map for the 2018 clustering.
plot_cluster_wrapper(row_info_f)

## Test Som Parameters 

- Map size makes a big difference, but larger maps also take much longer to train. A good compromise that leans towards fast execution: 20
- The number of iterations is also important, but with diminishing returns. 5,000 to 10,000 iterations work well
- Sigma does not seem to make a big difference. We stick with 1
- Learning rate does not seem to make a big difference. We stick with 0.5


In [None]:
# Side length of a square grid that holds 5 * sqrt(5000) nodes.
# NOTE(review): presumably the "5 * sqrt(N)" rule of thumb for SOM size with
# N ~ 5000 samples - confirm where the 5000 comes from.
np.sqrt(5 * np.sqrt(5000))

In [None]:
def SOM_training(dim = 15, 
                 n_iterations = 10_000, 
                 neighborhood_function = 'gaussian', # 'gaussian', 'mexican_hat', 'bubble'
                 sigma = 1.0,
                 learning_rate = 0.5,
                 random_seed = None,
                ):
    """Train a square SOM on the global ``holdings_ft`` and return its quantization error.

    Parameters
    ----------
    dim : int
        Side length of the map; the grid is ``dim`` x ``dim``.
    n_iterations : int
        Number of iterations passed to ``train_random``.
    neighborhood_function : str
        Neighbourhood function name handed to ``MiniSom``.
    sigma : float
        Neighbourhood spread handed to ``MiniSom`` (previously fixed at 1.0).
    learning_rate : float
        Learning rate handed to ``MiniSom`` (previously fixed at 0.5).
    random_seed : int or None
        Seed handed to ``MiniSom``; set it to compare parameter settings on
        equal footing. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    The value of ``som.quantization_error`` on the training data. This used to
    be computed and thrown away, so runs could not be compared programmatically.
    """
    ### Initialization and training ###
    som = MiniSom(x = dim,
                  y = dim,
                  input_len = holdings_ft.shape[1],
                  neighborhood_function = neighborhood_function,
                  sigma = sigma,
                  learning_rate = learning_rate,
                  random_seed = random_seed)

    data = holdings_ft.toarray()
    som.random_weights_init(data)
    print("Training...")

    som.train_random(data, n_iterations, verbose = True) # training with `n_iterations` iterations
    error = som.quantization_error(data)

    print("\n...ready!")
    return error

In [None]:
# 15 x 15 map, 10,000 iterations.
SOM_training(dim = 15, n_iterations = 10_000, neighborhood_function= 'gaussian')

In [None]:
# 20 x 20 map, 10,000 iterations.
SOM_training(dim = 20, n_iterations= 10_000, neighborhood_function= 'gaussian')

In [None]:
# 15 x 15 map, 5,000 iterations.
SOM_training(dim = 15, n_iterations = 5_000, neighborhood_function= 'gaussian')

In [None]:
# 20 x 20 map, 5,000 iterations.
SOM_training(dim = 20, n_iterations= 5_000, neighborhood_function= 'gaussian')