In [None]:
""" K-means clustering of 22 NMF topic components """

In [11]:
import pickle
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the pickled table of risk paragraphs with their NMF topic weights.
# NOTE: pickle.load can execute arbitrary code, so only open pickles from a trusted source.
with open('../data/topics/NMF_22_risk_topics_FINAL.pickle', mode='rb') as fh:
    risk_topics = pickle.load(fh)

In [5]:
# List the available columns. The output shows 'formType' and 'clean_text'
# each appearing twice, so selecting either label returns more than one column.
risk_topics.columns

Index(['index', 'date_filed', 'filedAt', 'formType', 'accessionNo', 'ticker',
       'cik', 'companyName', 'companyNameLong', 'linkToTxt', 'formType', 'sic',
       'fiscalYearEnd', 'value', 'preproc_text', 'clean_text',
       'risk_topic_nmf', 'clean_text', 'C_0', 'C_1', 'C_2', 'C_3', 'C_4',
       'C_5', 'C_6', 'C_7', 'C_8', 'C_9', 'C_10', 'C_11', 'C_12', 'C_13',
       'C_14', 'C_15', 'C_16', 'C_17', 'C_18', 'C_19', 'C_20', 'C_21',
       'risk_topics'],
      dtype='object')

In [7]:
# Feature matrix for clustering: the 22 component columns C_0 ... C_21, in order.
X = np.array(risk_topics[[f'C_{i}' for i in range(22)]])

In [12]:
# Helper that displays the data in 2 dimensions and highlights the clusters.
def display_cluster(X,km=[],num_clusters=0):
    """Scatter-plot the first two columns of ``X``, optionally coloured by cluster.

    Only columns 0 and 1 of ``X`` are drawn, whatever its width.

    Parameters
    ----------
    X : 2-D array
        Data to plot; indexed as ``X[:, 0]`` and ``X[:, 1]``.
    km : fitted clusterer, optional
        Object exposing ``labels_`` and ``cluster_centers_``. It is only read
        when ``num_clusters`` is non-zero.
    num_clusters : int, default 0
        Number of clusters to draw. With 0, every point is plotted in a single
        colour and ``km`` is ignored.

    Notes
    -----
    The palette has 7 colours. It is cycled when ``num_clusters`` exceeds 7, so
    clusters ``i`` and ``i + 7`` share a colour instead of raising IndexError.
    """
    palette = 'brgcmyk'
    alpha = 0.5
    size = 20
    if num_clusters == 0:
        plt.scatter(X[:, 0], X[:, 1], c=palette[0], alpha=alpha, s=size)
        return
    for i in range(num_clusters):
        # Wrap around the palette so more than len(palette) clusters can be drawn.
        colour = palette[i % len(palette)]
        members = km.labels_ == i
        plt.scatter(X[members, 0], X[members, 1], c=colour, alpha=alpha, s=size)
        # Mark the centroid of this cluster with a larger 'x' in the same colour.
        centre = km.cluster_centers_[i]
        plt.scatter(centre[0], centre[1], c=colour, marker='x', s=100)

In [9]:
# Fit k-means with 11 clusters on the topic-weight matrix X.
num_clusters = 11
km = KMeans(
    n_clusters=num_clusters,
    n_init=5,          # number of times the k-means algorithm is run
    random_state=10,   # fixed seed so the fit is repeatable
)
km.fit(X)

KMeans(n_clusters=11, n_init=5, random_state=10)

In [14]:
#display_cluster(X,km,num_clusters)

In [15]:
# One-column frame holding the cluster label assigned to each row of X.
clusters = pd.DataFrame({'cluster': km.labels_.tolist()})

In [16]:
# For every centroid, the feature indices sorted from largest to smallest
# coordinate (ascending argsort along each row, then reversed).
order_centroids = np.argsort(km.cluster_centers_, axis=-1)[:, ::-1]
# Ranking for the first centroid (cluster 0).
order_centroids[0]

array([ 3, 12, 11,  7,  0,  5, 14,  2, 15, 21,  6, 18, 20, 10, 19, 17,  8,
       16,  9, 13,  4,  1])

In [17]:
# Attach each row's cluster label by position rather than by index label.
# pd.concat(axis=1) aligns on the index, so a risk_topics index other than
# 0..n-1 would silently misalign labels and introduce NaN rows; assigning the
# raw array avoids that and raises if the lengths differ.
risks_clusters = risk_topics.assign(cluster=clusters['cluster'].to_numpy())

In [18]:
# Sanity check: (rows, columns) after adding the cluster column.
risks_clusters.shape

(330641, 42)

In [21]:
# Confirm that 'cluster' was appended as the last column.
risks_clusters.columns

Index(['index', 'date_filed', 'filedAt', 'formType', 'accessionNo', 'ticker',
       'cik', 'companyName', 'companyNameLong', 'linkToTxt', 'formType', 'sic',
       'fiscalYearEnd', 'value', 'preproc_text', 'clean_text',
       'risk_topic_nmf', 'clean_text', 'C_0', 'C_1', 'C_2', 'C_3', 'C_4',
       'C_5', 'C_6', 'C_7', 'C_8', 'C_9', 'C_10', 'C_11', 'C_12', 'C_13',
       'C_14', 'C_15', 'C_16', 'C_17', 'C_18', 'C_19', 'C_20', 'C_21',
       'risk_topics', 'cluster'],
      dtype='object')

In [22]:
# Narrow view used for inspecting clusters: the 'value', 'risk_topic_nmf' and
# 'risk_topics' columns plus the assigned k-means cluster.
subset = risks_clusters[
    ['value', 'risk_topic_nmf', 'risk_topics', 'cluster']
]

In [25]:
# Distribution of topic labels among the rows assigned to cluster 0.
subset.loc[subset['cluster'] == 0, 'risk_topics'].value_counts()

Interest Rate        7249
Derivatives            47
Loans                  42
Market                 38
Insurance              13
Liquidity2              8
Liquidity1              6
Commodity               6
Stock                   4
Credit                  4
Foreign Exchange        2
Investment              2
Capital                 2
Regulation              1
Name: risk_topics, dtype: int64

In [26]:
# Distribution of topic labels among the rows assigned to cluster 1.
subset.loc[subset['cluster'] == 1, 'risk_topics'].value_counts()

Risk Management        29653
Regulation             23686
Credit                 15722
Insurance              13852
Interest Rate          13518
Market                 12505
Capital                12228
Investment              9924
Product Development     9362
Commodity               8928
Other                   8155
Security                7578
Foreign Exchange        7162
Disclosure              4340
Loans                   4260
Stock                   3402
Derivatives             3022
Liquidity2              2791
Fair Value              2243
Liquidity1              2182
Accounting              1733
Internal Control        1639
Name: risk_topics, dtype: int64

In [27]:
# Distribution of topic labels among the rows assigned to cluster 2.
subset.loc[subset['cluster'] == 2, 'risk_topics'].value_counts()

Accounting    4501
Name: risk_topics, dtype: int64

In [28]:
# Distribution of topic labels among the rows assigned to cluster 3.
subset.loc[subset['cluster'] == 3, 'risk_topics'].value_counts()

Foreign Exchange       7079
Interest Rate           157
Derivatives             139
Regulation               51
Liquidity2               37
Commodity                13
Other                     6
Fair Value                3
Market                    2
Credit                    1
Product Development       1
Investment                1
Name: risk_topics, dtype: int64

In [29]:
# Distribution of topic labels among the rows assigned to cluster 4.
subset.loc[subset['cluster'] == 4, 'risk_topics'].value_counts()

Interest Rate          8082
Derivatives            5627
Market                 4810
Commodity              3723
Stock                  3022
Liquidity2             2495
Liquidity1             2466
Fair Value             1563
Other                   853
Foreign Exchange        581
Risk Management         453
Credit                  366
Investment              271
Loans                   237
Capital                 115
Insurance                84
Security                 78
Disclosure               50
Regulation               47
Product Development      36
Name: risk_topics, dtype: int64

In [30]:
# Distribution of topic labels among the rows assigned to cluster 5.
subset.loc[subset['cluster'] == 5, 'risk_topics'].value_counts()

Regulation             17711
Insurance               8274
Security                3239
Other                   1460
Commodity               1147
Capital                  889
Risk Management          576
Market                   433
Product Development      370
Disclosure               369
Investment               252
Foreign Exchange         173
Interest Rate            168
Credit                   112
Liquidity2               107
Loans                     87
Liquidity1                85
Derivatives               43
Internal Control          34
Stock                     25
Accounting                21
Fair Value                10
Name: risk_topics, dtype: int64

In [31]:
# Distribution of topic labels among the rows assigned to cluster 6.
subset.loc[subset['cluster'] == 6, 'risk_topics'].value_counts()

Internal Control    5786
Accounting            11
Security               3
Name: risk_topics, dtype: int64

In [32]:
# Distribution of topic labels among the rows assigned to cluster 7.
subset.loc[subset['cluster'] == 7, 'risk_topics'].value_counts()

Investment             6398
Liquidity1              152
Risk Management         137
Interest Rate           104
Market                   99
Liquidity2               71
Capital                  57
Fair Value               42
Insurance                27
Other                    23
Security                 18
Loans                    14
Commodity                11
Product Development       4
Regulation                3
Foreign Exchange          3
Credit                    2
Disclosure                1
Stock                     1
Name: risk_topics, dtype: int64

In [33]:
# Distribution of topic labels among the rows assigned to cluster 8.
subset.loc[subset['cluster'] == 8, 'risk_topics'].value_counts()

Disclosure             4538
Regulation               39
Other                     6
Product Development       5
Loans                     1
Name: risk_topics, dtype: int64

In [34]:
# Distribution of topic labels among the rows assigned to cluster 9.
subset.loc[subset['cluster'] == 9, 'risk_topics'].value_counts()

Product Development    7334
Regulation              285
Insurance               279
Commodity               160
Market                  157
Security                120
Interest Rate            42
Other                    35
Risk Management          33
Disclosure               10
Investment                7
Derivatives               6
Foreign Exchange          6
Loans                     6
Credit                    5
Liquidity1                4
Capital                   1
Name: risk_topics, dtype: int64

In [35]:
# Distribution of topic labels among the rows assigned to cluster 10.
subset.loc[subset['cluster'] == 10, 'risk_topics'].value_counts()

Credit                 10680
Loans                   4702
Liquidity2               302
Other                    210
Risk Management          183
Market                   128
Capital                   84
Derivatives               78
Interest Rate             75
Security                  68
Investment                50
Fair Value                49
Insurance                 35
Commodity                 35
Product Development       25
Regulation                20
Liquidity1                18
Foreign Exchange           6
Disclosure                 3
Name: risk_topics, dtype: int64

In [36]:
# Preview the first two rows together with their assigned cluster.
risks_clusters.head(2)

Unnamed: 0,index,date_filed,filedAt,formType,accessionNo,ticker,cik,companyName,companyNameLong,linkToTxt,...,C_14,C_15,C_16,C_17,C_18,C_19,C_20,C_21,risk_topics,cluster
0,0,2020-02-19,2020-02-19T21:42:50-05:00,10-K,0001562762-20-000064,T,732717,AT&T INC.,AT&T INC. (Filer),https://www.sec.gov/Archives/edgar/data/732717...,...,0.0018,0.00036,0.04513,0.06339,0.01076,0.00853,0.00754,0.02536,Product Development,1
1,1,2020-02-19,2020-02-19T21:42:50-05:00,10-K,0001562762-20-000064,T,732717,AT&T INC.,AT&T INC. (Filer),https://www.sec.gov/Archives/edgar/data/732717...,...,0.0,0.0,0.0,0.00951,0.0,0.03202,0.04999,0.0559,Investment,7


In [37]:
# Columns kept for the export: filing identifiers and text, the 22 component
# weights C_0 ... C_21, then the topic label and the k-means cluster.
clusters_info = risks_clusters[
    ['date_filed', 'ticker', 'companyName', 'sic', 'value', 'risk_topic_nmf']
    + [f'C_{i}' for i in range(22)]
    + ['risk_topics', 'cluster']
]

In [38]:
# Write the clustered table to an Excel workbook in the working directory.
# The file name is derived from num_clusters (currently 11, giving
# 'risk_clusters_11.xlsx') so it stays correct if the cluster count changes.
clusters_info.to_excel(f'risk_clusters_{num_clusters}.xlsx', index=False)