<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Clustering" data-toc-modified-id="Clustering-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Clustering</a></span></li><li><span><a href="#Setup" data-toc-modified-id="Setup-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Setup</a></span><ul class="toc-item"><li><span><a href="#Load-Data" data-toc-modified-id="Load-Data-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Load Data</a></span></li><li><span><a href="#Checking-the-data" data-toc-modified-id="Checking-the-data-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Checking the data</a></span></li><li><span><a href="#Clustering" data-toc-modified-id="Clustering-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Clustering</a></span></li><li><span><a href="#Chart-the-returns-of-the-formed-clustes" data-toc-modified-id="Chart-the-returns-of-the-formed-clustes-2.4"><span class="toc-item-num">2.4&nbsp;&nbsp;</span>Chart the returns of the formed clustes</a></span></li><li><span><a href="#Analysing-clusters" data-toc-modified-id="Analysing-clusters-2.5"><span class="toc-item-num">2.5&nbsp;&nbsp;</span>Analysing clusters</a></span></li><li><span><a href="#PCA" data-toc-modified-id="PCA-2.6"><span class="toc-item-num">2.6&nbsp;&nbsp;</span>PCA</a></span></li></ul></li><li><span><a href="#Visualize" data-toc-modified-id="Visualize-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Visualize</a></span></li></ul></div>

# Clustering 

### Description

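- Cluster funds on their portfolio holdings using k-means and spectral clustering, then inspect the resulting clusters via cumulative returns, PCA and silhouette analysis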

In [None]:
import datetime

import feather
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.plotting import register_matplotlib_converters
from scipy import sparse
from sklearn.cluster import KMeans, MiniBatchKMeans, SpectralClustering
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import MaxAbsScaler, Normalizer

register_matplotlib_converters()

# Setup

In [None]:
# Analysis window as ISO date strings; both bounds are applied inclusively
# by the queries in the data-loading cell.
begin_date, end_date = '2018-01-01', '2019-01-01'

## Load Data

In [None]:
### Returns
path = '../data/processed/returns.feather'
returns = feather.read_dataframe(path)

# Keep only return observations inside the analysis window (bounds inclusive).
returns = returns.query('caldt >= @begin_date and caldt <= @end_date')

### row_info
path = '../data/processed/row_info.feather'
row_info = feather.read_dataframe(path)

row_info = row_info.query('report_dt >= @begin_date and report_dt <= @end_date')

# Capture the positions of the surviving reports in the *unfiltered* table
# before the index is reset. Previously `row` was overwritten with the new
# 0..n-1 index first, so the holdings slice below always took the first n rows
# of the matrix instead of the rows that passed the date filter.
# NOTE(review): assumes row i of holdings.npz describes row i of
# row_info.feather (i.e. the freshly loaded index is positional) — confirm.
holdings_rows = row_info.index.values

# After this, `row` is the position of each report inside the filtered matrix.
row_info = row_info.reset_index(drop=True)
row_info['row'] = row_info.index

### col_info
path = '../data/processed/col_info.feather'
col_info = feather.read_dataframe(path)

### Holdings
path = '../data/processed/holdings.npz'
holdings = sparse.load_npz(path)

# Keep exactly the matrix rows that belong to the retained reports.
holdings = holdings[holdings_rows]

print('Shape of row_info information')
print(row_info.shape)
print('Shape of holding information')
print(holdings.shape)

In [None]:
# Summary statistics of the report dates that survived the date filter.
row_info['report_dt'].describe()

## Checking the data

In [None]:
# Boolean flag per column of `holdings`: True where the column total exceeds 10.
mask_columns = pd.Series(np.asarray(holdings.sum(axis=0)).ravel() > 10)

In [None]:
# Apply the column mask to the matrix and to its column metadata together so
# column j of `holdings` keeps matching row j of `col_info`.
holdings = holdings[:, mask_columns.values]

# Chained, non-mutating reset: avoids an in-place edit on a filtered slice and
# gives `col_info` a fresh 0..n-1 index that lines up with the matrix columns.
col_info = col_info.loc[mask_columns.values, :].reset_index(drop=True)

In [None]:
# Shapes of the holdings matrix, its row metadata and its column metadata.
print(holdings.shape, row_info.shape, col_info.shape)

In [None]:
# Sum per stock: column totals of the holdings matrix, plotted by column position
pd.DataFrame(holdings.sum(axis=0)).transpose().plot()

In [None]:
# Attach each column's total holdings to its metadata row, then list the ten largest.
col_info['sum'] = np.asarray(holdings.sum(axis=0)).ravel()
col_info.sort_values(by='sum', ascending=False).head(10)

In [None]:
# Inspect a single column of the holdings matrix (position i).
i = 100
my_sum = holdings[:, i].toarray().ravel()

total_held = np.sum(my_sum)       # total across all rows
n_holders = np.sum(my_sum != 0)   # rows with a non-zero entry

print('Security:')
print(col_info.loc[i, 'security_name'])
print()
print('Sum of holdings:             {:10.2f}'.format(total_held))
print('Number of companies holding: {:10.2f}'.format(n_holders))
print('Average position:            {:10.2f}'.format(total_held / n_holders))

In [None]:
# Column totals of `holdings` as a one-column DataFrame (one row per matrix column).
colsums = pd.DataFrame(holdings.sum(axis=0)).transpose()

In [None]:
# Position of the column with the largest total.
colsums.idxmax(axis=0)

In [None]:
# Metadata of the column with the largest total. The position is derived from
# `colsums` instead of being hard-coded, so the lookup stays correct when the
# date window or the column filter changes.
col_info.loc[colsums[0].idxmax()]

In [None]:
# Sum per fund: row totals of the holdings matrix, plotted by row position
pd.DataFrame(holdings.sum(axis=1)).plot()

In [None]:
# Inspect a single row of the holdings matrix (position i).
i = 70
fund_positions = holdings[i, :].toarray().T   # dense column vector for row i

print(np.sum(fund_positions))
print(row_info.fund_name[i])
plt.plot(fund_positions)
plt.show()

# Columns with a strictly positive entry in this row, largest overall totals first.
mask = fund_positions > 0
col_info.loc[mask.flatten()].sort_values(by='sum', ascending=False).head()

## Clustering

In [None]:
# Spectral clustering of the holdings rows into four groups.
clustering = SpectralClustering(
    n_clusters=4,
    assign_labels='discretize',
    eigen_solver='amg',
    n_jobs=-1,
    random_state=0,
)
clustering.fit(holdings)

In [None]:
# Cluster index assigned to each row of `holdings` by the spectral clustering fit.
clustering.labels_

In [None]:
print('Start kMeans...')
# `n_jobs` is no longer passed: scikit-learn deprecated it for KMeans in 0.23
# and removed it in 1.0, where it raises TypeError. Dropping it is accepted by
# every version.
kmeans = KMeans(n_clusters=4,
                verbose=True,
                n_init=5,  # Number of runs
                random_state=123).fit(holdings)

In [None]:
# Wrap the spectral-clustering labels in a single-column DataFrame.
# NOTE(review): the following cell rebinds `labels` to the k-means labels, so on
# a top-to-bottom run this assignment is overwritten before anything reads it.
labels = pd.DataFrame(clustering.labels_)

In [None]:
# Wrap the k-means labels in a single-column DataFrame; on a top-to-bottom run
# this is the `labels` binding that the later cells read.
labels = pd.DataFrame(kmeans.labels_)

In [None]:
# `summary` is not created by any earlier cell, so this line raised NameError
# on a fresh kernel. The labels come from a fit on `holdings`, whose rows were
# selected from `row_info`, so a copy of `row_info` is used as the per-row table.
# NOTE(review): confirm row_info carries the columns later read from `summary`
# (cap_class, style_class, fund_no, port_no, lipper_class).
summary = row_info.copy()
summary['cluster'] = labels.iloc[:, 0].values

In [None]:
# Fund names of the rows assigned to cluster 2.
in_cluster_2 = summary['cluster'] == 2
summary.loc[in_cluster_2, ['fund_name']].values

In [None]:
# Share (in percent, per cluster column) of each cap_class, with an 'All' margin.
(
    pd.crosstab(summary['cap_class'], summary['cluster'],
                margins=True, normalize='columns')
    .mul(100)
    .round(2)
)

In [None]:
# Share (in percent, per cluster column) of each style_class, with an 'All' margin.
(
    pd.crosstab(summary['style_class'], summary['cluster'],
                margins=True, normalize='columns')
    .mul(100)
    .round(2)
)

## Chart the returns of the formed clustes

In [None]:
# Show one randomly drawn row of the summary table.
summary.sample(n=1)

#### Merge new clusters into returns

In [None]:
# Left-join the cluster assignment onto the return rows, drop every row that
# has a missing value in any column, then keep only the columns used below.
keep_columns = ['caldt','fund_no','port_no','mret','lipper_class','cluster']
returns_merged = (
    returns
    .merge(summary, how='left', left_on='crsp_fundno', right_on='fund_no')
    .dropna(axis=0, how='any')
    .loc[:, keep_columns]
)
returns_merged.sample()

#### Calc mean return per cluster

In [None]:
# Distinct cluster codes present in the merged returns, in order of first appearance.
cluster_codes = returns_merged['cluster'].drop_duplicates().values
cluster_codes

In [None]:
def cumreturn(period_returns):
    """Compound a sequence of per-period simple returns into a cumulative return path.

    `cumreturn` was referenced by this cell but never defined in the notebook.

    NOTE(review): assumes returns are decimal fractions (0.01 == 1 %) — confirm
    against the units of `mret`.
    """
    return (1 + period_returns).cumprod() - 1


# Analysis of selected funds
# Mean, count and standard deviation of monthly returns per (cluster, date).
returns_clus = returns_merged.groupby(['cluster','caldt'])['mret'].agg(['mean','count','std'])

# `transform` returns a result aligned to the (cluster, caldt) index, so it can
# be assigned straight back as a column; compounding restarts for every cluster.
returns_clus['cumret'] = returns_clus.groupby('cluster')['mean'].transform(cumreturn)

#### Pivot to make multiple columns, one for each cluster

In [None]:
# Reshape to one row per date and one column per cluster, holding the cumulative return.
returns_clus = returns_clus.pivot_table(index=['caldt'],
                                        columns='cluster',
                                        values='cumret')

# One line per cluster over time.
plot_options = dict(kind='line',
                    use_index=True,
                    figsize=(16,8),
                    title='Mean cumreturn per Group',
                    legend='best')
returns_clus.plot(**plot_options)

## Analysing clusters

In [None]:
# Profile of the first k-means centroid across the matrix columns. The spectral
# model (`clustering`) exposes no `cluster_centers_` attribute, so the k-means
# fit is used here, consistent with the two cells that follow.
plt.plot(kmeans.cluster_centers_[0,:])

In [None]:
# Profile of the second k-means centroid across the matrix columns.
plt.plot(kmeans.cluster_centers_[1])

In [None]:
# Profile of the third k-means centroid across the matrix columns.
plt.plot(kmeans.cluster_centers_[2])

## PCA

In [None]:
# Fit a 10-component PCA on the densified holdings matrix.
# random_state pins the result in case scikit-learn selects a randomized
# solver, so repeated runs give the same components.
pca = PCA(n_components=10, random_state=0)
pca.fit(holdings.toarray())

# Fraction of total variance captured by each component.
print(pca.explained_variance_ratio_)

# Singular values associated with each component.
print(pca.singular_values_)

In [None]:
# Project every row of `holdings` onto the fitted components. The previous
# version used `pca.components_.T`, which has one row per matrix *column*
# (loadings), and paired it with `labels`, which has one entry per matrix
# *row* — the two do not describe the same objects. `transform` yields one
# score row per holdings row, matching `labels`.
scores = pca.transform(holdings.toarray())
df = pd.DataFrame({
    'pca_1' : scores[:, 0],
    'pca_2' : scores[:, 1],
    'labels' : labels.iloc[:, 0]
})

In [None]:
# Scatter of the first two PCA columns, coloured by cluster label.
df.plot(kind='scatter',
        x='pca_1',
        y='pca_2',
        c='labels',
        colormap='viridis')

# Visualize

In [None]:
# Interactive `MiniBatchKMeans?` help lookup removed: it is a development aid
# and produces no result the analysis depends on.

In [None]:
### Based on example code from sklearn ###

def _plot_silhouette(sample_silhouette_values, cluster_labels, n_clusters, silhouette_avg):
    """Draw the silhouette plot for one clustering.

    Parameters
    ----------
    sample_silhouette_values : array with one silhouette coefficient per sample.
    cluster_labels : array with the cluster index of each sample, same order.
    n_clusters : number of clusters in this fit.
    silhouette_avg : mean silhouette coefficient, drawn as a dashed red line.

    Returns
    -------
    The matplotlib figure that was created.
    """
    # Single axes holding the silhouette bands.
    fig, ax1 = plt.subplots()
    fig.set_size_inches(18, 7)

    # Silhouette coefficients are bounded by [-1, 1].
    ax1.set_xlim([-1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(sample_silhouette_values) + (n_clusters + 1) * 10])

    y_lower = 10
    for i in range(n_clusters):
        # Sorted silhouette scores of the samples belonging to cluster i.
        ith_cluster_silhouette_values = np.sort(
            sample_silhouette_values[cluster_labels == i])

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Leave a 10-row gap before the next cluster's band.
        y_lower = y_upper + 10

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = %d" % n_clusters),
                 fontsize=14, fontweight='bold')
    return fig


X = holdings
# Candidate cluster counts. Kept under its own name so the loop variable no
# longer overwrites the list it is iterating over.
cluster_counts = [2, 3, 4, 5]

clusters = []   # cluster counts tried, in order
results = []    # inertia of the fit for each entry of `clusters`

for n_clusters in cluster_counts:
    clusterer = MiniBatchKMeans(
                    n_clusters=n_clusters,
                    verbose=False,
                    n_init=1000,
                    random_state=0)  # seeded so reruns give the same clusters

    cluster_fit = clusterer.fit(X)
    clusters.append(n_clusters)
    results.append(cluster_fit.inertia_)

    cluster_labels = cluster_fit.labels_

    # Per-sample silhouette coefficients. The average score is their mean, so
    # it is derived here instead of calling silhouette_score, which would
    # recompute the same pairwise distances a second time.
    # TODO can pool.map be implemented here?
    sample_silhouette_values = silhouette_samples(X, cluster_labels)
    silhouette_avg = np.mean(sample_silhouette_values)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

    _plot_silhouette(sample_silhouette_values, cluster_labels,
                     n_clusters, silhouette_avg)
    # Show inside the loop so each figure appears next to its printed score.
    plt.show()