# Machine Learning for Asset Managers

## Chapter 4 Optimal Clustering

### Snippet 4.1 Base Clustering

In [2]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples

In [None]:
def clusterKMeansBase(corr0, maxNumClusters=10, n_init=10):
    """Cluster a correlation matrix with K-means, picking the best partition.

    The correlation matrix is converted to the distance-like observations
    matrix ``sqrt((1 - corr) / 2)`` and K-means is fitted on it for every
    number of clusters from 2 up to ``maxNumClusters``, repeating the whole
    sweep ``n_init`` times. The partition kept is the one whose silhouette
    scores have the highest ``mean / std`` ratio.

    Parameters
    ----------
    corr0 : pandas.DataFrame
        Square correlation matrix with rows and columns in the same order.
        Missing entries are treated as zero correlation.
    maxNumClusters : int, default 10
        Largest number of clusters to try. It is capped at
        ``len(corr0) - 1`` because silhouette scores are only defined for
        2 to ``n_samples - 1`` distinct labels.
    n_init : int, default 10
        Number of times the sweep over cluster counts is repeated (each
        K-means fit itself uses a single initialisation).

    Returns
    -------
    corr1 : pandas.DataFrame
        ``corr0`` with rows and columns reordered so that members of the
        same cluster are adjacent.
    clstrs : dict
        Maps each cluster label to the list of ``corr0`` column names
        assigned to it.
    silh : pandas.Series
        Silhouette score of every observation for the selected partition,
        indexed like ``corr0``.

    Raises
    ------
    ValueError
        If no clustering could be evaluated (fewer than three observations,
        ``maxNumClusters < 2`` or ``n_init < 1``).
    """
    # Observations matrix. Clip at zero so that floating-point round-off
    # (a correlation marginally above 1) cannot turn into NaN under the root.
    x = ((1 - corr0.fillna(0)) / 2).clip(lower=0) ** 0.5

    # silhouette_samples requires 2 <= n_labels <= n_samples - 1.
    maxK = min(maxNumClusters, x.shape[0] - 1)

    silh, kmeans, bestStat = None, None, np.nan
    for _ in range(n_init):
        for i in range(2, maxK + 1):
            kmeans_ = KMeans(n_clusters=i, n_init=1).fit(x)
            silh_ = silhouette_samples(x, kmeans_.labels_)
            stat = silh_.mean() / silh_.std()

            # Accept the first candidate unconditionally (bestStat is NaN),
            # afterwards only strictly better ones.
            if np.isnan(bestStat) or stat > bestStat:
                silh, kmeans, bestStat = silh_, kmeans_, stat

    if kmeans is None:
        raise ValueError(
            "clusterKMeansBase needs at least 3 observations, "
            "maxNumClusters >= 2 and n_init >= 1 to evaluate a clustering"
        )

    # Reordering: sort observations by cluster label.
    newIdx = np.argsort(kmeans.labels_)
    corr1 = corr0.iloc[newIdx]      # reorder rows
    corr1 = corr1.iloc[:, newIdx]   # reorder columns

    # Cluster members
    clstrs = {
        i: corr0.columns[np.where(kmeans.labels_ == i)[0]].tolist()
        for i in np.unique(kmeans.labels_)
    }
    silh = pd.Series(silh, index=x.index)

    return corr1, clstrs, silh
    