In [None]:
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns

from pathlib import Path
from sklearn import preprocessing
from sklearn.metrics import pairwise
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.cluster import KMeans
from pandas.plotting import parallel_coordinates

%matplotlib inline

In [None]:
# Read the workbook, use the 'Company' column as the row index, and cast all
# remaining columns to float64 in a single chained expression.
stocks_df = (
    pd.read_excel('stocks.xlsx')
    .set_index('Company')
    .astype('float64')
)

# Preview the first rows, rounded to three decimals for display only.
stocks_df.head().round(3)

In [None]:
# Scatter plot of Beta (y) against AvRet (x), one labelled point per company.
fig, ax = plt.subplots()
stocks_df.plot(x='AvRet', y='Beta', kind='scatter', ax=ax)

# annotate() expects an (x, y) pair as the anchor. Build it from the two
# plotted columns explicitly: passing the whole row only lines up with the
# points when the frame holds exactly these two columns in this order.
for company, row in stocks_df.iterrows():
    ax.annotate(company, (row['AvRet'], row['Beta']), fontsize=7)

In [None]:
# Pairwise Euclidean distances between the rows of stocks_df (the values as
# loaded, before any normalisation).
d = pairwise.pairwise_distances(stocks_df, metric='euclidean')

# Show the first ten rows of the distance matrix, labelled on both axes by the
# frame's index.
pd.DataFrame(
    data=d,
    index=stocks_df.index,
    columns=stocks_df.index,
).head(10)

In [None]:
# Standardise every column to z-scores so that no single measurement dominates
# the Euclidean distance. pandas' .std() is the sample standard deviation
# (ddof=1); sklearn's preprocessing.scale would divide by the population
# standard deviation (ddof=0) instead and give slightly different values, so
# only one of the two should be used. This notebook uses the pandas form.
stocks_df_norm = (stocks_df - stocks_df.mean()) / stocks_df.std()

# compute the normalized distances:
d_norm = pairwise.pairwise_distances(stocks_df_norm, metric='euclidean')

# First ten rows of the normalised distance matrix; both axes are labelled
# from the same (normalised) frame.
pd.DataFrame(
    d_norm,
    columns=stocks_df_norm.index,
    index=stocks_df_norm.index,
).head(10)

In [None]:
# Agglomerative clustering of the normalised data with single linkage.
Z = linkage(stocks_df_norm, method='single')

# Height at which the tree is cut: it drives both the branch colouring and the
# dashed reference line, so it is defined once.
cut_height = 2.75

fig = plt.figure(figsize=(10, 6))
fig.subplots_adjust(bottom=0.23)  # extra bottom margin, presumably for the leaf labels
plt.title('Hierarchical Clustering (Single Linkage)')
plt.xlabel('Company')
dendrogram(Z, labels=stocks_df_norm.index, color_threshold=cut_height)
plt.axhline(y=cut_height, color='black', linewidth=0.8, linestyle='dashed')
plt.show()

In [None]:
# Agglomerative clustering of the normalised data with average linkage.
Z = linkage(stocks_df_norm, method='average')

# Height at which the tree is cut: it drives both the branch colouring and the
# dashed reference line, so it is defined once.
cut_height = 2.75

fig = plt.figure(figsize=(10, 6))
fig.subplots_adjust(bottom=0.23)  # extra bottom margin, presumably for the leaf labels
# The title names the linkage method actually passed to linkage() above.
plt.title('Hierarchical Clustering (Average Linkage)')
plt.xlabel('Company')
dendrogram(Z, labels=stocks_df_norm.index, color_threshold=cut_height)
plt.axhline(y=cut_height, color='black', linewidth=0.8, linestyle='dashed')
plt.show()

In [None]:
# Cut the single-linkage tree into at most six flat clusters and keep the
# resulting cluster id of every row, indexed like stocks_df_norm.
memb = pd.Series(
    data=fcluster(linkage(stocks_df_norm, method='single'), t=6, criterion='maxclust'),
    index=stocks_df_norm.index,
)

# One output line per cluster: "<cluster id> : <comma-separated index labels>".
for cluster_id, members in memb.groupby(memb):
    print(cluster_id, ','.join(members.index), sep=' : ')

In [None]:
# Heatmap of the normalised data with rows ordered by single-linkage
# clustering. Row labels are "<cluster id>:<company>", taken from `memb`.
#
# The labels are put on a copy. Relabelling stocks_df_norm itself would add
# another prefix every time this cell is re-run ("1:1:Name") and would leak the
# prefixed names into every later cell that reads stocks_df_norm.index.
stocks_df_labeled = stocks_df_norm.copy()
stocks_df_labeled.index = [
    '{}:{}'.format(cluster, company)
    for cluster, company in zip(memb, stocks_df_norm.index)
]

sns.clustermap(stocks_df_labeled, method='single', col_cluster=False, cmap="mako_r")
plt.show()

# K-Means Clustering

In [None]:
# Fit a 4-cluster k-means model to the normalised data; random_state is fixed
# so the run is repeatable. fit() returns the fitted estimator itself.
kmeans = KMeans(
    n_clusters=4,
    random_state=0,
).fit(stocks_df_norm)

In [None]:
# k-means cluster label of every row, indexed like stocks_df_norm.
memb = pd.Series(data=kmeans.labels_, index=stocks_df_norm.index)

# One output line per cluster: "<cluster label> : <comma-separated index labels>".
for cluster_id, members in memb.groupby(memb):
    print(cluster_id, ','.join(members.index), sep=' : ')

In [None]:
# Elbow chart: for k = 1..6, fit k-means and record inertia_ divided by the
# number of clusters, then plot that value against k.
cluster_counts = range(1, 7)

inertia = []
for n_clusters in cluster_counts:
    # Use a separate name for the trial model so the fitted 4-cluster `kmeans`
    # from the earlier cell is not silently replaced by the last (k=6) fit.
    candidate = KMeans(n_clusters=n_clusters, random_state=0).fit(stocks_df_norm)
    inertia.append(candidate.inertia_ / n_clusters)

inertias = pd.DataFrame({'n_clusters': cluster_counts, 'inertia': inertia})

ax = inertias.plot(x='n_clusters', y='inertia')
ax.set_xlabel('Number of clusters(k)')
ax.set_ylabel('Average Within-Cluster Squared Distances')
# Start the y-axis at zero and leave 10% headroom above the largest value.
ax.set_ylim((0, 1.1 * inertias['inertia'].max()))
ax.legend().set_visible(False)
plt.show()