<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Description" data-toc-modified-id="Description-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Description</a></span></li><li><span><a href="#Setup" data-toc-modified-id="Setup-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Setup</a></span></li><li><span><a href="#Graph" data-toc-modified-id="Graph-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Graph</a></span></li><li><span><a href="#Other-functions" data-toc-modified-id="Other-functions-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Other functions</a></span></li><li><span><a href="#Analysis-of-distance-measure" data-toc-modified-id="Analysis-of-distance-measure-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Analysis of distance measure</a></span></li></ul></div>

# Clustering 

## Description

- Cluster funds using spectral clustering

## Setup

In [None]:
import feather
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import sparse

import matplotlib.pyplot as plt
import matplotlib.cm as cm

from sklearn.preprocessing import normalize, LabelEncoder
from sklearn.neighbors import kneighbors_graph

from graph_tool.all import *
import cairo

In [None]:
# Load the pre-processed bundle; it is indexed by year and then by table name.
# NOTE: pickle.load can execute arbitrary code while unpickling - only point
# `path` at a file produced by this project's own pipeline.
path = '../data/processed/full.pickle'
with open(path, "rb") as pickle_off:  # closes the handle even if unpickling fails
    dict_all_years = pickle.load(pickle_off)

# 2018 holdings, used by the exploratory cells further down.
holdings_ft = dict_all_years[2018]['holdings_ft']

## Graph

In [None]:
## Options

# TODO adjust colour and shapes

def setup_graph(year = 2018,n_neighbors = 15):
    """Build a k-nearest-neighbour graph of one year's funds and lay it out.

    Parameters
    ----------
    year : int
        Key into ``dict_all_years`` selecting the year to use.
    n_neighbors : int
        Number of nearest neighbours linked to every fund.

    Returns
    -------
    tuple
        ``(g, pos, row_info_f)``: the graph, the vertex positions returned by
        ``sfdp_layout`` and that year's ``row_info_f`` entry.
    """
    year_data = dict_all_years[year]
    holdings_ft = year_data['holdings_ft']
    row_info_f = year_data['row_info_f']

    graph_data = kneighbors_graph(holdings_ft,
                                  n_neighbors = n_neighbors,
                                  mode = 'distance',
                                  p = 2)

    # Take endpoints and distances from ONE COO view so they stay aligned.
    # `.nonzero()` skips stored entries whose value is exactly 0 (e.g. two funds
    # with identical holdings) whereas `.data` keeps them, so combining the two
    # yields a distance array that does not match the edge list.
    edges = graph_data.tocoo()
    vertex_n = graph_data.shape[0]

    # NOTE(review): the neighbour matrix is not symmetric, so a pair of mutual
    # neighbours is inserted twice into this undirected graph - confirm that
    # parallel edges are acceptable here.
    g = Graph(directed=False)
    g.add_vertex(vertex_n)
    g.add_edge_list(np.column_stack((edges.row, edges.col)))

    # Edges were inserted in COO order, so `edges.data` lines up with them.
    e_distance = g.new_edge_property('double', vals = edges.data)
    #e_distance = prop_to_size(e_distance, mi=1, ma=100, log=False, power=0.5)

    # NOTE(review): `eweight` is presumably used as attraction strength, which
    # would pull the most distant neighbours hardest - confirm raw distances
    # (rather than a similarity) are the intended weight.
    pos = sfdp_layout(g, eweight = e_distance)

    return(g, pos, row_info_f)

def draw_graph(g, pos, row_info_f, dimension, output):
    """Draw the graph with vertices coloured and labelled by one column.

    Parameters
    ----------
    g : Graph
        Graph to draw.
    pos : vertex property map
        Vertex positions, e.g. the layout returned by ``setup_graph``.
    row_info_f : pandas.DataFrame
        Table holding the ``dimension`` column; its rows are assumed to be in
        vertex order - TODO confirm. It is not modified.
    dimension : str
        Column of ``row_info_f`` used for both the vertex label and its colour.
    output : str
        Destination passed through to ``graph_draw``.
    """
    cluster_text = row_info_f[dimension]
    # Encode into a local array instead of writing an 'f_encoded' column back
    # into the caller's DataFrame (which is shared with `dict_all_years`).
    cluster_fill = LabelEncoder().fit_transform(cluster_text)

    v_fill  = g.new_vertex_property('int', vals = cluster_fill)
    v_text  = g.new_vertex_property('string', vals = cluster_text.values)

    graph_draw(
        g,
        # Reuse the supplied layout; without it graph_draw computes its own and
        # the `pos` argument would be ignored.
        pos = pos,
        inline = True,
        output = output,
        output_size=(2500, 2500),
        # bg_color = 'white',
        vprops={
            'size': 60,
            'color': 'black',
            'fill_color': v_fill,
            'text': v_text,
            'text_color': 'white',
            'font_size' : 25,
            'font_weight': cairo.FONT_WEIGHT_BOLD,
        },
        eprops={
            'pen_width': 0.2,
            'color': 'grey'
        })

In [None]:
# Build the 2018 graph once, then render it once per classification column.
g, pos, row_info_f = setup_graph(year=2018, n_neighbors=15)

for class_column, figure_path in (
    ('cap_class', '../reports/figures/graph/cap.png'),
    ('style_class', '../reports/figures/graph/style.png'),
):
    draw_graph(g, pos, row_info_f,
               dimension=class_column,
               output=figure_path)

## Other functions

In [None]:
# Show the repr of the 2018 holdings object loaded in the setup cell.
holdings_ft

In [None]:
# Drop explicitly stored zeros, then show the repr again to compare the counts.
# NOTE(review): assuming a scipy sparse matrix, this edits the object in place,
# and `holdings_ft` is the same object stored in dict_all_years[2018].
holdings_ft.eliminate_zeros()
holdings_ft

## Analysis of distance measure

In [None]:
# Pull the three 2018 tables out of the bundle.
data_2018 = dict_all_years[2018]
holdings_ft = data_2018['holdings_ft']
row_info_f = data_2018['row_info_f']
col_info_f = data_2018['col_info_f']

# Distance-weighted neighbour graph with a very large neighbourhood.
# NOTE(review): 1979 is presumably "number of funds minus one" (every other
# fund) - confirm against holdings_ft.shape[0].
graph_data = kneighbors_graph(holdings_ft, n_neighbors=1979, mode='distance', p=2)

# Flat array of the stored distances.
distance = graph_data.data

In [None]:
# Dense array copy of the neighbour-distance matrix; cells with no stored
# distance show up as 0 here.
graph_data_a = graph_data.toarray()

In [None]:
# Largest value in each row of the dense distance matrix.
mymax = graph_data_a.max(axis = 1)
mymax

In [None]:
# Smallest stored distance in each row, i.e. the distance to the nearest neighbour.
# Computed on the sparse matrix's stored entries: in the dense copy every cell
# without a stored distance (including each fund's own cell, which
# kneighbors_graph leaves out by default) is 0, so `graph_data_a.min(axis=1)`
# is 0 for every row.
# Kept under its own name so the row maxima in `mymax` are not overwritten.
# Relies on CSR layout with at least one stored entry per row, which is what
# kneighbors_graph produces.
mymin = np.minimum.reduceat(graph_data.data, graph_data.indptr[:-1])
mymin

In [None]:
# Scratch check: values starting at 0 in steps of 0.1 with stop 1.4.
np.arange(0,1.4,0.1)

In [None]:
# Scratch check: prints the float literal 9.36e+02, i.e. 936.0.
print(9.36000e+02)

In [None]:
# Histogram of every stored neighbour distance, with axes labelled so the
# figure can be read on its own.
# NOTE(review): the bin edges run from 0.01 to 1.31, so distances outside that
# range are left out of the plot - confirm the range is intended.
fig, ax = plt.subplots()
ax.hist(distance, bins = np.arange(0.01,1.4,0.1))
ax.set_xlabel('Distance to neighbour')
ax.set_ylabel('Number of fund pairs')
ax.set_title('Distribution of neighbour distances (2018)');

In [None]:
# Show the row at integer position 6 of row_info_f.
row_info_f.iloc[6]

In [None]:
# Sum of the 'tna_latest' column over all rows of the 2018 row_info_f table.
# NOTE(review): presumably total net assets - confirm units with the data source.
dict_all_years[2018]['row_info_f']['tna_latest'].sum()

In [None]:
# Distances from the fund at row k to every column of the neighbour matrix.
k = 1000
fund_k_column = graph_data[k].toarray().T
plt.plot(fund_k_column)

In [None]:
# Five smallest entries of row k; sorted once and reused for both the index
# (consumed by the next cell as `x`) and the displayed table.
nearest_five = (
    pd.DataFrame(graph_data[k].toarray().T)
    .sort_values(by=0, ascending = True)
    .head(5)
)
x = nearest_five.index
nearest_five

In [None]:
# For each selected fund, print its position, name, the columns where it has
# non-zero holdings, and those holdings.
for fund_idx in x:
    print(fund_idx)
    print(row_info_f.iloc[fund_idx].fund_name)
    held_columns = holdings_ft[fund_idx, :].nonzero()[1]
    print(held_columns)
    print(holdings_ft[fund_idx, held_columns])

In [None]:
# Dense DataFrame copy of the holdings matrix (one DataFrame row per matrix row).
holdings_ft_a = pd.DataFrame(holdings_ft.toarray())

In [None]:
# Show the repr of the holdings object for comparison with the shape below.
holdings_ft

In [None]:
# Shape after removing duplicate rows; a smaller row count than holdings_ft
# means some rows of the holdings matrix are identical.
holdings_ft_a.drop_duplicates().shape

In [None]:
# Row 8 of the neighbour matrix, sorted ascending and re-indexed 0..n-1,
# plotted as a single curve.
fund_8_column = pd.DataFrame(graph_data[8].toarray().T)
fund_8_sorted = fund_8_column.sort_values(by=0).reset_index(drop=True)
plt.plot(fund_8_sorted)
