<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Description" data-toc-modified-id="Description-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Description</a></span></li><li><span><a href="#Setup" data-toc-modified-id="Setup-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Setup</a></span></li><li><span><a href="#Other-functions" data-toc-modified-id="Other-functions-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Other functions</a></span><ul class="toc-item"><li><span><a href="#Filter" data-toc-modified-id="Filter-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Filter</a></span></li></ul></li><li><span><a href="#Analysis-of-distance-measure" data-toc-modified-id="Analysis-of-distance-measure-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Analysis of distance measure</a></span></li></ul></div>

# Clustering 

## Description

- Cluster funds using Spectral Clustering

## Setup

In [None]:
import feather
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import sparse

import matplotlib.pyplot as plt
import matplotlib.cm as cm

from sklearn.preprocessing import normalize, LabelEncoder
from sklearn.neighbors import kneighbors_graph

from graph_tool.all import *
import cairo

In [None]:
### Returns
# Rename the date column to `report_dt` and derive a `year` column from it.
path = '../data/processed/returns.feather'
returns = (
    feather.read_dataframe(path)
    .rename(columns = {'caldt' : 'report_dt'})
    .assign(year = lambda frame: frame['report_dt'].dt.year)
)

### row_info
path = '../data/processed/row_info_f.feather'
row_info = feather.read_dataframe(path)

### col_info
path = '../data/processed/col_info_f.feather'
col_info = feather.read_dataframe(path)

### Holdings
# Sparse matrix stored in scipy's .npz format.
path = '../data/processed/holdings_f.npz'
holdings = sparse.load_npz(path)

## Other functions

### Filter

In [None]:
def filter_data(year, verbose = False, min_obs = 10):
    """Select one year's funds and build the matching returns and holdings.

    Reads the module-level ``row_info``, ``returns`` and ``holdings`` objects
    and does not modify them.

    Parameters
    ----------
    year : int or 'full'
        Value matched against ``row_info['year']``. ``'full'`` skips the year
        filter and keeps every row.
    verbose : bool, default False
        If True, print the number of funds and securities kept and the
        begin/end dates.
    min_obs : int, default 10
        Minimum number of stored holdings entries a security (column) must
        have to be kept and, afterwards, the minimum a fund (row) must have.

    Returns
    -------
    tuple
        ``(row_info_f, returns_f, holdings_ft, begin_date, end_date)`` where
        ``holdings_ft`` is the filtered holdings matrix with L2-normalised
        rows and ``end_date`` is ``begin_date`` plus one year.

    Raises
    ------
    ValueError
        If no row of ``row_info`` matches ``year``.
    """
    # Restrict the fund rows to the requested year ('full' keeps everything).
    if year == 'full':
        row_info_f = row_info.copy()
    else:
        row_info_f = row_info.query('year == @year')
    if row_info_f.empty:
        raise ValueError('row_info has no rows for year {!r}'.format(year))
    row_info_f = row_info_f.reset_index(drop = True)

    # The window starts at the first selected row's report date.
    begin_date = row_info_f['report_dt'].iloc[0]
    end_date = begin_date + pd.DateOffset(years=1) # 1 year offset

    # Filter returns: inside the window (both ends inclusive) and only for
    # funds that appear in the selected rows.
    in_window = returns['report_dt'].between(begin_date, end_date)
    in_sample = returns['crsp_fundno'].isin(row_info_f['crsp_fundno'].unique())
    returns_f = returns.loc[in_window & in_sample].copy()

    # Change return of month for which holdings apply to 0
    returns_f.loc[returns_f['report_dt'] == begin_date, 'mret'] = 0

    # Drop all funds with first return observation after starting date.
    # NOTE(review): this takes the first row per fund in the existing row
    # order, so it assumes `returns` is sorted by date -- confirm upstream.
    first_obs = returns_f.drop_duplicates('crsp_fundno')
    drop_fundnos = first_obs.loc[first_obs['mret'] != 0, 'crsp_fundno']
    returns_f = returns_f[~returns_f['crsp_fundno'].isin(drop_fundnos)]
    keep_funds = ~row_info_f['crsp_fundno'].isin(drop_fundnos)
    row_info_f = row_info_f[keep_funds].assign(
        style_class = lambda frame: frame['style_class'].astype(str) + 'E')

    # Keep the holdings rows referenced by the remaining funds, then drop
    # securities with fewer than `min_obs` stored entries. getnnz counts the
    # stored entries per column without building a dense array.
    holdings_f = holdings[row_info_f['row'].values]
    col_mask = holdings_f.getnnz(axis = 0) >= min_obs
    holdings_f = holdings_f.tocsc()[:, col_mask].tocsr()

    # Same threshold for funds, counted after the column filter.
    row_mask = holdings_f.getnnz(axis = 1) >= min_obs
    holdings_f = holdings_f[row_mask]
    row_info_f = row_info_f[row_mask]

    ## Preprocessing
    holdings_ft = normalize(holdings_f, norm = 'l2')

    if (verbose):
        print('Numer of unique funds:           {:10,d}'.format(row_info_f.shape[0]))

        print('Numer of unique securities:      {:10,d}'.format(holdings_ft.shape[1]))

        print('Begin date:                      {}'.format(begin_date.date()))
        print('End date:                        {}'.format(end_date.date()))

    return(row_info_f, returns_f, holdings_ft, begin_date, end_date)

In [None]:
# Build the 2016 sample and print its size and date window.
(row_info_f,
 returns_f,
 holdings_ft,
 begin_date,
 end_date) = filter_data(year = 2016, verbose = True)

## Analysis of distance measure

In [None]:
# Sparse neighbour graph whose stored values are Minkowski (p = 2, i.e.
# Euclidean) distances from each row of `holdings` to its 2059 nearest rows.
# NOTE(review): this uses the raw `holdings` matrix, not the filtered and
# normalised `holdings_ft` returned by filter_data -- confirm that is intended.
# NOTE(review): 2059 is hard-coded; presumably tied to a sample size -- confirm.
graph_data = kneighbors_graph(holdings, 
                              n_neighbors = 2059,
                              mode = 'distance',
                              p = 2)
# Flat array of every stored distance in the graph.
distance = graph_data.data

In [None]:
# Stored values of the column-wise (axis = 0) maximum distance.
mymax = graph_data.max(axis= 0).data
mymax

In [None]:
# Row-wise (axis = 1) minimum; note the axis differs from the cell above.
# NOTE(review): on a sparse matrix the unstored entries presumably count as 0
# in the minimum -- confirm this gives the intended "nearest distance".
mymin = graph_data.min(axis = 1)
mymin

In [None]:
# Histogram of all stored distances.
plt.hist(distance)

In [None]:
# Distances stored in row 100 of the graph, plotted in column order.
plt.plot(graph_data[100].toarray().T)

In [None]:
# Row of col_info with index label 89.
col_info.loc[89]

In [None]:
# Indices of the non-zero entries in the first row of the normalised holdings.
holdings_ft[0].nonzero()

In [None]:
# NOTE(review): nonzero() on the one-row slice returns row indices relative to
# that slice (all 0), so this presumably reads row 0 of `holdings` at the
# columns held by row 13226 rather than row 13226 itself -- confirm intent.
holdings[holdings[13226].nonzero()]

In [None]:
# Same lookup pattern using the non-zero columns of row 13225.
holdings[holdings[13225].nonzero()]

In [None]:
# Same lookup pattern on the normalised matrix, using row 1080's columns.
holdings_ft[holdings_ft[1080].nonzero()]

In [None]:
# Row of col_info with index label 1451.
col_info.loc[1451]

In [None]:
# Fund names for the row labels inspected above.
row_info.loc[0].fund_name

In [None]:
row_info.loc[13226].fund_name

In [None]:
row_info.loc[13225].fund_name

In [None]:
row_info.loc[13235].fund_name

In [None]:
# Label-based lookup in the filtered frame (its index is not reset after the
# final row filter in filter_data).
row_info_f.loc[1083].fund_name

In [None]:
# Row 0 of the graph as a column, sorted ascending by distance; unstored
# entries show up as 0.
pd.DataFrame(graph_data[0].toarray().T).sort_values(by=0, ascending = True)

In [None]:
# Sorted distance profile of row 8.
plt.plot(pd.DataFrame(graph_data[8].toarray().T).sort_values(by=0).reset_index(drop=True))


In [None]:
row_info.head(5)

In [None]:
# L2-normalise every row of the raw holdings and keep the first five rows.
holdings_t = normalize(holdings, norm = 'l2')

data = holdings_t[0:5]

In [None]:
# NOTE(review): import sits mid-notebook; consider moving it to the import cell.
from sklearn.metrics.pairwise import euclidean_distances

# Pairwise Euclidean distances between the five normalised rows.
distances = euclidean_distances(data)
pd.DataFrame(distances)
# plt.hist(distance, bins = 100)

In [None]:
## Options

# TODO adjust colour and shapes


def setup_graph(year = 2018,n_neighbors = 30,size = 1000):
    """Build a nearest-neighbour graph of funds and compute a layout for it.

    Parameters
    ----------
    year : int or 'full'
        Passed to ``filter_data`` to select the sample.
    n_neighbors : int
        Number of neighbours per fund passed to ``kneighbors_graph``.
    size : int
        Only the first ``size`` rows of the filtered holdings (and the
        matching rows of the fund info frame) are used.

    Returns
    -------
    tuple
        ``(g, pos, row_info_f)``: the undirected graph with one vertex per
        fund, the ``sfdp_layout`` positions, and the fund info rows matching
        the vertices (an independent copy, safe to add columns to).
    """
    row_info_f, returns_f, holdings_ft, begin_date, end_date = filter_data(year, verbose = True)
    raw_data = holdings_ft[0:size]
    # Copy so that later column assignments do not write into a slice/view.
    row_info_f = row_info_f.iloc[0:size].copy()

    graph_data = kneighbors_graph(raw_data,
                                  n_neighbors = n_neighbors,
                                  mode = 'distance',
                                  p = 2)

    # Take rows, columns and distances from a single COO view so the three
    # arrays always have the same length and order. Using nonzero() for the
    # indices and .data for the values misaligns them as soon as two funds
    # are at distance exactly 0, because nonzero() skips stored zeros.
    edges = graph_data.tocoo()

    g = Graph(directed=False)
    g.add_vertex(edges.shape[0])
    # One bulk insert; edges are created in the order given, matching edges.data.
    g.add_edge_list(np.column_stack((edges.row, edges.col)))

    e_distance = g.new_edge_property('double', vals = edges.data)
    # e_distance = prop_to_size(e_distance, mi=1, ma=100, log=False, power=0.5)

    # NOTE(review): the distance is passed as the edge *weight* of the layout;
    # confirm that a larger weight is meant to apply to more distant funds.
    pos = sfdp_layout(g, eweight = e_distance)

    return(g, pos, row_info_f)

def draw_graph(g, pos, row_info_f, dimension, output):
    """Draw the fund graph, colouring and labelling vertices by one column.

    Parameters
    ----------
    g : Graph
        Graph with one vertex per row of ``row_info_f``.
    pos : vertex property map
        Vertex positions (e.g. from ``setup_graph``). Used for the drawing so
        that figures for different ``dimension`` values share one layout.
    row_info_f : pandas.DataFrame
        Fund info; must contain the column named by ``dimension``. A column
        ``'f_encoded'`` with the integer codes is written to it.
    dimension : str
        Column whose values become the vertex label and fill colour.
    output : str
        Path of the image file to write.
    """
    labels = row_info_f[dimension]
    codes = LabelEncoder().fit_transform(labels)
    # Kept for backward compatibility: callers may read this column afterwards.
    row_info_f.loc[:,'f_encoded'] = codes

    v_fill  = g.new_vertex_property('int', vals = codes)
    v_text  = g.new_vertex_property('string', vals = labels.values)

    graph_draw(
    g,
    # Without pos, graph_draw computes its own layout and ignores the
    # distance-weighted one the caller passed in.
    pos = pos,
    inline = True,
    output = output,
    output_size=(2_000, 2_000),
    # bg_color = 'white',
    vprops={
        'size': 20,
        'color': 'black',
        'fill_color': v_fill,
        'text': v_text,
        'text_color': 'white',
        'font_size' : 20,
        'font_weight': cairo.FONT_WEIGHT_BOLD,
    },
    eprops={
        'pen_width': 0.1,
        'color': 'grey'
    })

In [None]:
g, pos, row_info_f = setup_graph(year=2014, n_neighbors=25, size=3000)

# Render the same graph once per classification column, in this order.
for dimension, output in (
    ('cap_class', '../reports/figures/graph/cap.png'),
    ('style_class', '../reports/figures/graph/style.png'),
):
    draw_graph(g, pos, row_info_f,
               dimension=dimension,
               output=output)