In [None]:
""" 
K-means clustering of 22 NMF topic components for each REPORT 
"""

In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the table of NMF topic weights (one row per report, judging by the
# notebook title). NOTE: unpickling can execute arbitrary code, so only load
# pickle files produced by a trusted step of this project.
topics_path = '../data/topics/NMF_report_risk_topics.pickle'
with open(topics_path, 'rb') as pickle_handle:
    risk_topics = pickle.load(pickle_handle)

In [3]:
# List the available columns: filing metadata plus the C_0_N..C_21_N topic components.
risk_topics.columns

Index(['year', 'date_filed', 'accessionNo', 'formType', 'sic', 'ticker',
       'companyName', 'C_0_N', 'C_1_N', 'C_2_N', 'C_3_N', 'C_4_N', 'C_5_N',
       'C_6_N', 'C_7_N', 'C_8_N', 'C_9_N', 'C_10_N', 'C_11_N', 'C_12_N',
       'C_13_N', 'C_14_N', 'C_15_N', 'C_16_N', 'C_17_N', 'C_18_N', 'C_19_N',
       'C_20_N', 'C_21_N', 'ticker_year'],
      dtype='object')

In [4]:
# Feature matrix for clustering: the 22 topic-component columns
# C_0_N ... C_21_N, in that order, as a plain NumPy array.
topic_columns = [f'C_{k}_N' for k in range(22)]
X = np.array(risk_topics[topic_columns])

In [21]:
# Helper that displays the data in 2 dimensions (its first two columns) and highlights the clusters.
def display_cluster(X, km=None, num_clusters=0):
    """Scatter-plot the first two columns of ``X``, optionally coloured by cluster.

    Parameters
    ----------
    X : 2-D array
        Data to plot. Only columns 0 and 1 are drawn; any further columns
        are ignored.
    km : fitted clusterer, optional
        Object exposing ``labels_`` and ``cluster_centers_``. It is only
        read when ``num_clusters`` is non-zero.
    num_clusters : int, default 0
        Number of clusters to draw. With ``0`` every point is plotted in a
        single colour and ``km`` is not used.

    Notes
    -----
    Draws with ``plt.scatter`` and returns ``None``. The palette holds seven
    colours and is cycled, so more than seven clusters can be drawn; clusters
    ``i`` and ``i + 7`` then share a colour.
    """
    palette = 'brgcmyk'
    alpha = 0.5
    size = 20

    if num_clusters == 0:
        plt.scatter(X[:, 0], X[:, 1], c=palette[0], alpha=alpha, s=size)
        return

    for i in range(num_clusters):
        # Wrap around the palette instead of indexing past its end.
        colour = palette[i % len(palette)]
        # Compute the membership mask once and reuse it for both coordinates.
        members = km.labels_ == i
        plt.scatter(X[members, 0], X[members, 1], c=colour, alpha=alpha, s=size)
        # Mark the cluster centroid with an 'x' in the same colour.
        centre = km.cluster_centers_[i]
        plt.scatter(centre[0], centre[1], c=colour, marker='x', s=100)

In [6]:
# Fit k-means with k = 11 on the topic-weight matrix X.
# NOTE(review): this cell uses n_init=5 while the other k sweeps in this
# notebook use n_init=10 - confirm whether that difference is intentional.
num_clusters = 11
km = KMeans(
    n_clusters=num_clusters,
    n_init=5,         # number of times the k-means algorithm is run
    random_state=10,  # fixed seed
)
km.fit(X)

KMeans(n_clusters=11, n_init=5, random_state=10)

In [23]:
#display_cluster(X,km,num_clusters)

In [11]:
# One-column frame holding the fitted cluster label of every row of X.
clusters = pd.DataFrame({'cluster': km.labels_.tolist()})

In [9]:
# For every cluster, rank the topic-component indices from the largest to the
# smallest centroid coordinate (ascending argsort along each row, reversed).
order_centroids = np.argsort(km.cluster_centers_, axis=1)[:, ::-1]
# Show the ranking for cluster 0.
order_centroids[0]

array([15,  3,  9,  6, 18,  2, 21, 11,  7, 16, 12, 19,  0, 17, 14, 10,  1,
        8,  4, 20, 13,  5])

In [12]:
# Attach each report's cluster label by row position. The k-means labels follow
# the row order of X (which was built from risk_topics), so the labels are
# assigned positionally rather than with pd.concat(axis=1): concat aligns on
# index labels and would yield misaligned / NaN rows if risk_topics did not
# carry a default 0..n-1 index. A length mismatch raises instead of padding.
risks_clusters = risk_topics.assign(cluster=clusters['cluster'].to_numpy())

In [14]:
# Confirm that the 'cluster' column was appended after the original columns.
risks_clusters.columns

Index(['year', 'date_filed', 'accessionNo', 'formType', 'sic', 'ticker',
       'companyName', 'C_0_N', 'C_1_N', 'C_2_N', 'C_3_N', 'C_4_N', 'C_5_N',
       'C_6_N', 'C_7_N', 'C_8_N', 'C_9_N', 'C_10_N', 'C_11_N', 'C_12_N',
       'C_13_N', 'C_14_N', 'C_15_N', 'C_16_N', 'C_17_N', 'C_18_N', 'C_19_N',
       'C_20_N', 'C_21_N', 'ticker_year', 'cluster'],
      dtype='object')

In [18]:
# Preview the first rows to check that each report now carries a cluster label.
risks_clusters.head()

Unnamed: 0,year,date_filed,accessionNo,formType,sic,ticker,companyName,C_0_N,C_1_N,C_2_N,...,C_14_N,C_15_N,C_16_N,C_17_N,C_18_N,C_19_N,C_20_N,C_21_N,ticker_year,cluster
0,2011,2011-01-26,0001193125-11-014919,10-K,6141 Personal Credit Institutions,DFS,Discover Financial Services,5.584129,3.066305,10.936688,...,6.703711,18.419776,8.177222,7.926849,24.056956,9.213463,3.122806,13.527562,DFS_2011,0
1,2011,2011-01-26,0001193125-11-014958,10-K,2090 Miscellaneous Food Preparations &amp; Kin...,MKC,MCCORMICK & CO INC,3.77313,8.638299,16.756952,...,4.35998,0.570886,2.60513,14.03999,4.008455,1.592415,8.630127,19.83224,MKC_2011,1
2,2011,2011-01-27,0000796343-11-000003,10-K,7372 Services-Prepackaged Software,ADBE,ADOBE SYSTEMS INC,8.297228,14.661515,7.967888,...,7.813456,2.410104,17.577878,32.538083,2.791825,8.750564,13.307759,12.335037,ADBE_2011,2
3,2011,2011-01-27,0001193125-11-016253,10-K,5961 Retail-Catalog &amp; Mail-Order Houses,AMZN,AMAZON COM INC,2.012721,7.593752,3.399372,...,7.186137,1.661762,3.619441,25.972473,8.807608,3.476674,3.080109,15.646035,AMZN_2011,7
4,2011,2011-01-28,0001065088-11-000003,10-K,"7389 Services-Business Services, NEC",EBAY,EBAY INC,3.048407,22.189821,35.948725,...,13.187547,13.035927,22.637687,34.606789,9.73524,11.517763,5.449361,28.92705,EBAY_2011,2


In [20]:
# Export the labelled reports (k = 11) to a workbook in the current working
# directory; the DataFrame index is not written.
excel_path = 'report_clusters11.xlsx'
risks_clusters.to_excel(excel_path, index=False)

In [33]:
####### k = 10: refit, label every report, export, and inspect the centroids
num_clusters = 10
km = KMeans(n_clusters=num_clusters, random_state=10, n_init=10)  # n_init: number of times the k-means algorithm will run
km.fit(X)

clusters = pd.DataFrame(data=km.labels_.tolist(), columns=['cluster'])
# Assign labels by row position (they follow the row order of X / risk_topics).
# pd.concat(axis=1) would align on index labels and could misalign rows if
# risk_topics does not have a default 0..n-1 index.
risks_clusters = risk_topics.assign(cluster=clusters['cluster'].to_numpy())
# Derive the file name from num_clusters so it cannot drift from the fitted k.
risks_clusters.to_excel(f'report_clusters{num_clusters}.xlsx', index=False)

km.cluster_centers_

array([[ 6.20523729, 45.82510197, 29.21926968, 13.69078688, 18.04689713,
        13.51329645,  6.61101633, 11.12427696,  7.46013751,  2.62664567,
         6.64994942,  8.66506636,  9.53435617, 10.7774166 ,  7.97764869,
         2.29761613,  6.87753097, 11.54131393,  9.46620845,  9.0473154 ,
         8.21292475, 19.03362654],
       [ 5.73474057, 12.10240465, 13.3578268 , 10.34550863, 19.47842046,
        14.93040997,  6.4478146 ,  8.8908236 ,  6.91704327,  2.71977524,
         6.88991096,  9.3985431 ,  7.14746381,  6.7154593 ,  7.33627782,
         2.20381501, 12.82855338, 18.52443879,  7.59238609,  8.04800305,
         7.96154633, 24.64932234],
       [13.94254893, 13.45116829, 17.81266085, 32.73363622, 14.90196668,
        21.06283468, 11.16981439, 17.65590403,  9.52975921,  4.25553961,
        13.28663848, 13.28677222, 16.6111791 ,  6.33298371, 13.22171124,
         4.08343874,  9.27550032, 11.76902362,  8.52463502, 10.57144652,
         8.5537997 , 23.12685364],
       [ 8.64339723

In [26]:
####### k = 12: refit, label every report, and export
num_clusters = 12
km = KMeans(n_clusters=num_clusters, random_state=10, n_init=10)  # n_init: number of times the k-means algorithm will run
km.fit(X)
clusters = pd.DataFrame(data=km.labels_.tolist(), columns=['cluster'])
# Assign labels by row position (they follow the row order of X / risk_topics).
# pd.concat(axis=1) would align on index labels and could misalign rows if
# risk_topics does not have a default 0..n-1 index.
risks_clusters = risk_topics.assign(cluster=clusters['cluster'].to_numpy())
# Derive the file name from num_clusters so it cannot drift from the fitted k.
risks_clusters.to_excel(f'report_clusters{num_clusters}.xlsx', index=False)

In [27]:
####### k = 9: refit, label every report, and export
num_clusters = 9
km = KMeans(n_clusters=num_clusters, random_state=10, n_init=10)  # n_init: number of times the k-means algorithm will run
km.fit(X)
clusters = pd.DataFrame(data=km.labels_.tolist(), columns=['cluster'])
# Assign labels by row position (they follow the row order of X / risk_topics).
# pd.concat(axis=1) would align on index labels and could misalign rows if
# risk_topics does not have a default 0..n-1 index.
risks_clusters = risk_topics.assign(cluster=clusters['cluster'].to_numpy())
# Derive the file name from num_clusters so it cannot drift from the fitted k.
risks_clusters.to_excel(f'report_clusters{num_clusters}.xlsx', index=False)

In [28]:
####### k = 8: refit, label every report, and export
num_clusters = 8
km = KMeans(n_clusters=num_clusters, random_state=10, n_init=10)  # n_init: number of times the k-means algorithm will run
km.fit(X)
clusters = pd.DataFrame(data=km.labels_.tolist(), columns=['cluster'])
# Assign labels by row position (they follow the row order of X / risk_topics).
# pd.concat(axis=1) would align on index labels and could misalign rows if
# risk_topics does not have a default 0..n-1 index.
risks_clusters = risk_topics.assign(cluster=clusters['cluster'].to_numpy())
# Derive the file name from num_clusters so it cannot drift from the fitted k.
risks_clusters.to_excel(f'report_clusters{num_clusters}.xlsx', index=False)

In [31]:
####### k = 7: refit, label every report, and export
num_clusters = 7
km = KMeans(n_clusters=num_clusters, random_state=10, n_init=10)  # n_init: number of times the k-means algorithm will run
km.fit(X)
clusters = pd.DataFrame(data=km.labels_.tolist(), columns=['cluster'])
# Assign labels by row position (they follow the row order of X / risk_topics).
# pd.concat(axis=1) would align on index labels and could misalign rows if
# risk_topics does not have a default 0..n-1 index.
risks_clusters = risk_topics.assign(cluster=clusters['cluster'].to_numpy())
# Derive the file name from num_clusters so it cannot drift from the fitted k.
risks_clusters.to_excel(f'report_clusters{num_clusters}.xlsx', index=False)

#km.cluster_centers_

array([[ 7.71475549, 12.12249996, 16.12050803, 15.82311668, 19.73200403,
        18.89020522,  8.92044564, 11.3100315 ,  7.30179319,  3.41833512,
         8.8767562 , 11.31521706,  9.80515243,  7.05931357,  9.13534445,
         2.99169418, 16.11465682, 18.69374973,  8.5720701 ,  9.11712213,
         8.33675239, 29.94814985],
       [ 8.33737824, 11.29700751, 13.78586977, 13.94832317, 11.78921001,
         7.01355969, 11.39300695, 11.12223713,  5.86027243,  5.88745053,
         9.70377065, 15.09334734, 14.82894773,  6.15383635,  7.98871763,
         2.16537757,  8.05043499,  6.88279404,  9.85637246, 11.17504194,
        41.59329424, 27.9551027 ],
       [ 7.46771616,  5.53521646, 13.48037305, 30.00335744,  5.00637835,
         5.51068673, 20.80508283, 11.93652853,  5.10014374, 20.41074596,
         5.5425693 , 12.64655169,  9.44338377,  4.69465527,  6.20151736,
        29.77099897,  9.14978195,  5.74804133, 17.01433942,  7.98450871,
         4.42369251, 13.04930299],
       [ 4.70968669

In [30]:
####### k = 6: refit, label every report, and export
num_clusters = 6
km = KMeans(n_clusters=num_clusters, random_state=10, n_init=10)  # n_init: number of times the k-means algorithm will run
km.fit(X)
clusters = pd.DataFrame(data=km.labels_.tolist(), columns=['cluster'])
# Assign labels by row position (they follow the row order of X / risk_topics).
# pd.concat(axis=1) would align on index labels and could misalign rows if
# risk_topics does not have a default 0..n-1 index.
risks_clusters = risk_topics.assign(cluster=clusters['cluster'].to_numpy())
# Derive the file name from num_clusters so it cannot drift from the fitted k.
risks_clusters.to_excel(f'report_clusters{num_clusters}.xlsx', index=False)

# 6 is too few - the banks and insurers were dispersed across clusters