In [9]:
"""
NMF topic modeling of each risk paragraph
2011-2020
"""

''

In [34]:
import pandas as pd
import numpy as np
import pickle
import datetime
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [35]:
with open('../data/preproc/clean_text_all_years.pickle', 'rb') as read_file:
    prepped_text = pickle.load(read_file)

In [36]:
prepped_text.shape

(330641, 15)

In [37]:
prepped_text.columns

Index(['date_filed', 'filedAt', 'formType', 'accessionNo', 'ticker', 'cik',
       'companyName', 'companyNameLong', 'linkToTxt', 'formType', 'sic',
       'fiscalYearEnd', 'value', 'preproc_text', 'clean_text'],
      dtype='object')

In [42]:
# add additional stopwords
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(['risk', 'risks', 'factor', 'factors', 'item', 'quantitative', 'qualitative',
                 'company', 'business', 'firm', 'us', 'our', 
                'could', 'may', 'quarter', 'year'
                "january", "february", "march", "april", "may", "june", 
                "july", "august", "september", "october", "november", "december",
                "jan", "feb", "mar", "apr", "jun", "jul", "aug", "sep", "oct", "nov", "dec",
                "million", "thousand", "hundred", "billion"])

In [43]:
# Count Vectorizer with bi-grams
vectorizer = CountVectorizer(stop_words = stopwords, ngram_range=(1,2))
risk_words = vectorizer.fit_transform(prepped_text.clean_text)
risk_words.shape

(330641, 1097806)

In [7]:
nmf_model = NMF(20) 
risk_topic20 = nmf_model.fit_transform(risk_words)

words = vectorizer.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-11:-1]
topic_words20 = [[words[e] for e in l] for l in t]
topic_words20

[['value',
  'fair',
  'fair value',
  'liability',
  'asset',
  'change',
  'asset liability',
  'measurement',
  'carrying',
  'change fair'],
 ['financial',
  'audit',
  'internal',
  'internal control',
  'control',
  'financial statement',
  'statement',
  'reporting',
  'financial reporting',
  'control financial'],
 ['result',
  'financial',
  'operation',
  'condition',
  'result operation',
  'financial condition',
  'including',
  'impact',
  'affect',
  'cost'],
 ['rate',
  'interest',
  'interest rate',
  'debt',
  'swap',
  'fixed',
  'change',
  'rate swap',
  'variable',
  'fixed rate'],
 ['statement',
  'forward',
  'looking',
  'forward looking',
  'looking statement',
  'result',
  'future',
  'report',
  'uncertainty',
  'actual'],
 ['currency',
  'foreign',
  'foreign currency',
  'exchange',
  'exchange rate',
  'contract',
  'rate',
  'currency exchange',
  'dollar',
  'denominated'],
 ['credit',
  'rating',
  'counterparty',
  'counterparties',
  'exposure',
  'c

In [44]:
nmf_model = NMF(22) 
risk_topic22 = nmf_model.fit_transform(risk_words)

words = vectorizer.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-11:-1]
topic_words22 = [[words[e] for e in l] for l in t]
topic_words22

[['value',
  'fair',
  'fair value',
  'liability',
  'asset',
  'change',
  'carrying',
  'measurement',
  'asset liability',
  'input'],
 ['audit',
  'internal',
  'internal control',
  'control',
  'reporting',
  'control financial',
  'financial reporting',
  'financial statement',
  'statement',
  'material'],
 ['financial',
  'financial statement',
  'statement',
  'consolidated',
  'consolidated financial',
  'financial instrument',
  'instrument',
  'financial condition',
  'institution',
  'condition'],
 ['rate',
  'interest',
  'interest rate',
  'debt',
  'swap',
  'fixed',
  'change',
  'rate swap',
  'variable',
  'fixed rate'],
 ['statement',
  'forward',
  'looking',
  'forward looking',
  'looking statement',
  'result',
  'future',
  'report',
  'uncertainty',
  'actual'],
 ['currency',
  'foreign',
  'foreign currency',
  'exchange',
  'exchange rate',
  'rate',
  'currency exchange',
  'dollar',
  'contract',
  'denominated'],
 ['credit',
  'rating',
  'counterparty'

In [45]:
nmf_model = NMF(24) 
risk_topic24 = nmf_model.fit_transform(risk_words)

words = vectorizer.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-11:-1]
topic_words24 = [[words[e] for e in l] for l in t]
topic_words24

[['value',
  'fair',
  'fair value',
  'asset',
  'liability',
  'change',
  'asset liability',
  'measurement',
  'carrying',
  'change fair'],
 ['control',
  'internal',
  'internal control',
  'audit',
  'financial reporting',
  'control financial',
  'reporting',
  'financial',
  'material',
  'financial statement'],
 ['financial',
  'financial statement',
  'financial instrument',
  'instrument',
  'statement',
  'financial condition',
  'condition',
  'institution',
  'consolidated',
  'financial institution'],
 ['rate',
  'interest',
  'interest rate',
  'debt',
  'swap',
  'fixed',
  'rate swap',
  'change',
  'variable',
  'fixed rate'],
 ['statement',
  'forward',
  'looking',
  'forward looking',
  'looking statement',
  'result',
  'future',
  'report',
  'uncertainty',
  'actual'],
 ['currency',
  'foreign',
  'foreign currency',
  'exchange',
  'exchange rate',
  'rate',
  'currency exchange',
  'dollar',
  'contract',
  'denominated'],
 ['credit',
  'rating',
  'counterp

In [27]:
nmf_model = NMF(30) 
risk_topic30 = nmf_model.fit_transform(risk_words)

words = vectorizer.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-11:-1]
topic_words30 = [[words[e] for e in l] for l in t]
topic_words30



[['value',
  'fair',
  'fair value',
  'change',
  'liability',
  'change fair',
  'measurement',
  'input',
  'carrying',
  'valuation'],
 ['control',
  'internal',
  'internal control',
  'audit',
  'financial reporting',
  'control financial',
  'reporting',
  'financial',
  'material',
  'financial statement'],
 ['result',
  'operation',
  'condition',
  'result operation',
  'financial condition',
  'adversely',
  'affect',
  'adverse',
  'material',
  'impact'],
 ['rate',
  'interest',
  'interest rate',
  'swap',
  'debt',
  'change',
  'fixed',
  'rate swap',
  'variable',
  'fixed rate'],
 ['statement',
  'forward',
  'looking',
  'forward looking',
  'looking statement',
  'result',
  'future',
  'report',
  'uncertainty',
  'actual'],
 ['currency',
  'foreign',
  'foreign currency',
  'exchange',
  'exchange rate',
  'rate',
  'currency exchange',
  'dollar',
  'denominated',
  'exposure'],
 ['credit',
  'rating',
  'counterparty',
  'exposure',
  'counterparties',
  'collat

In [None]:
# after review decide on 22 topics

In [46]:
prepped_text['risk_topic_nmf'] = risk_topic22.argmax(axis=1)

In [47]:
risks_info = prepped_text.reset_index()

In [48]:
components = pd.DataFrame(risk_topic22.round(5),
             index = prepped_text.clean_text,
             columns = ["C_0", "C_1","C_2", "C_3", "C_4", "C_5", "C_6", "C_7", "C_8", "C_9", 
                       "C_10", "C_11", "C_12", "C_13", "C_14", "C_15", "C_16", "C_17", "C_18",
                       "C_19", "C_20", "C_21"])

c_info = components.reset_index()

c_info.head(2)

Unnamed: 0,clean_text,C_0,C_1,C_2,C_3,C_4,C_5,C_6,C_7,C_8,...,C_12,C_13,C_14,C_15,C_16,C_17,C_18,C_19,C_20,C_21
0,We produce and distribute high quality video c...,0.0,0.0023,0.02775,0.00046,0.00406,0.0,0.00323,0.02977,5e-05,...,0.0,0.01559,0.0018,0.00036,0.04513,0.06339,0.01076,0.00853,0.00754,0.02536
1,Our cost to provide current benefit and fundin...,0.0,0.01641,0.06734,0.14595,0.03456,0.0,0.0,0.24135,0.00342,...,0.0,0.01173,0.0,0.0,0.0,0.00951,0.0,0.03202,0.04999,0.0559


In [49]:
risks_info.shape

(330641, 17)

In [50]:
c_info.shape

(330641, 23)

In [51]:
# add the topic components to the full df

all_df = pd.concat([risks_info, c_info], axis=1)
all_df.shape

(330641, 40)

In [63]:
all_df.columns

Index(['index', 'date_filed', 'filedAt', 'formType', 'accessionNo', 'ticker',
       'cik', 'companyName', 'companyNameLong', 'linkToTxt', 'formType', 'sic',
       'fiscalYearEnd', 'value', 'preproc_text', 'clean_text',
       'risk_topic_nmf', 'clean_text', 'C_0', 'C_1', 'C_2', 'C_3', 'C_4',
       'C_5', 'C_6', 'C_7', 'C_8', 'C_9', 'C_10', 'C_11', 'C_12', 'C_13',
       'C_14', 'C_15', 'C_16', 'C_17', 'C_18', 'C_19', 'C_20', 'C_21'],
      dtype='object')

In [None]:
all_df_review = all_df[['date_filed', 'ticker', 'companyName', 'value', 'risk_topic_nmf']]
all_df_review.to_excel('review_22_nmf_topics.xlsx', index=False)

In [65]:
######
# CountVectorizer and NMF with 22 topics
topic_names = {0:"Fair Value", 1:"Accounting", 2:"Other", 3:"Interest Rate", 
               4:"Disclosure", 5:"Foreign Exchange ", 6:"Credit", 
               7:"Investment", 8:"Stock", 9:"Capital",
              10:"Liquidity1", 11:"Market", 12:"Derivatives", 13:"Internal Control", 
               14:"Liquidity2", 15:"Loans", 16:"Security", 17: "Product Development", 
               18:"Risk Management", 19:"Insurance", 20:"Commodity", 21: "Regulation"}



In [66]:
all_df["risk_topics"] = all_df.risk_topic_nmf.map(topic_names)
all_df.head(2)

Unnamed: 0,index,date_filed,filedAt,formType,accessionNo,ticker,cik,companyName,companyNameLong,linkToTxt,...,C_13,C_14,C_15,C_16,C_17,C_18,C_19,C_20,C_21,risk_topics
0,0,2020-02-19,2020-02-19T21:42:50-05:00,10-K,0001562762-20-000064,T,732717,AT&T INC.,AT&T INC. (Filer),https://www.sec.gov/Archives/edgar/data/732717...,...,0.01559,0.0018,0.00036,0.04513,0.06339,0.01076,0.00853,0.00754,0.02536,Product Development
1,1,2020-02-19,2020-02-19T21:42:50-05:00,10-K,0001562762-20-000064,T,732717,AT&T INC.,AT&T INC. (Filer),https://www.sec.gov/Archives/edgar/data/732717...,...,0.01173,0.0,0.0,0.0,0.00951,0.0,0.03202,0.04999,0.0559,Investment


In [67]:
with open('../data/topics/NMF_22_risk_topics_FINAL.pickle', 'wb') as to_write:
    pickle.dump(all_df, to_write)

In [None]:
#################

In [None]:
## Additional topic modeling with TF-IDF

In [39]:
#### TRY TFIDF and NMF
TFvectorizer = TfidfVectorizer(stop_words=stopwords, ngram_range=(1,2))
TFrisk_words = TFvectorizer.fit_transform(prepped_text.clean_text)
TFrisk_words.shape

(330641, 1097806)

In [41]:
nmf_model = NMF(22) 
TFrisk_topic22 = nmf_model.fit_transform(TFrisk_words)

words = vectorizer.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-11:-1]
TFtopic_words22 = [[words[e] for e in l] for l in t]
TFtopic_words22

[['interest rate',
  'interest',
  'rate',
  'swap',
  'debt',
  'rate management',
  'rate swap',
  'fixed',
  'rate debt',
  'variable'],
 ['control',
  'control become',
  'become inadequate',
  'period subject',
  'evaluation effectiveness',
  'subject control',
  'degree compliance',
  'change condition',
  'inadequate change',
  'condition degree'],
 ['currency',
  'foreign',
  'foreign currency',
  'exchange',
  'currency exchange',
  'exchange rate',
  'foreign exchange',
  'rate',
  'dollar',
  'denominated'],
 ['audit',
  'internal control',
  'internal',
  'control',
  'control financial',
  'financial reporting',
  'reporting',
  'reasonable',
  'material',
  'financial'],
 ['product',
  'insurance',
  'related',
  'customer',
  'service',
  'loss',
  'cost',
  'including',
  'system',
  'regulatory'],
 ['credit',
  'concentrations',
  'concentrations credit',
  'counterparty',
  'credit management',
  'counterparty credit',
  'rating',
  'loan',
  'counterparties',
  'cred

In [40]:
nmf_model = NMF(25) 
TFrisk_topic25 = nmf_model.fit_transform(TFrisk_words)

words = vectorizer.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-11:-1]
TFtopic_words25 = [[words[e] for e in l] for l in t]
TFtopic_words25

[['interest rate',
  'interest',
  'rate',
  'swap',
  'debt',
  'rate management',
  'rate swap',
  'fixed',
  'rate debt',
  'variable'],
 ['control',
  'control become',
  'become inadequate',
  'period subject',
  'evaluation effectiveness',
  'subject control',
  'degree compliance',
  'change condition',
  'inadequate change',
  'condition degree'],
 ['currency',
  'foreign currency',
  'foreign',
  'currency exchange',
  'exchange',
  'exchange rate',
  'rate',
  'denominated',
  'dollar',
  'fluctuation'],
 ['audit',
  'internal control',
  'internal',
  'control',
  'control financial',
  'financial reporting',
  'reporting',
  'reasonable',
  'material',
  'financial'],
 ['product',
  'insurance',
  'related',
  'customer',
  'service',
  'loss',
  'cost',
  'including',
  'system',
  'regulatory'],
 ['credit',
  'counterparty',
  'credit management',
  'counterparty credit',
  'rating',
  'loan',
  'counterparties',
  'credit rating',
  'collateral',
  'credit related'],
 ['

In [49]:
nmf_model = NMF(12)  
TFrisk_topic12 = nmf_model.fit_transform(doc_word)

words = vectorizer.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-11:-1]
TFtopic_words12 = [[words[e] for e in l] for l in t]
TFtopic_words12

[['interest rate',
  'interest',
  'rate',
  'rate management',
  'debt',
  'swaps',
  'interest rates',
  'rates',
  'rate swaps',
  'fixed'],
 ['periods subject',
  'subject controls',
  'controls become',
  'evaluation effectiveness',
  'become inadequate',
  'projections evaluation',
  'degree compliance',
  'changes conditions',
  'inherent limitations',
  'inadequate changes'],
 ['audit',
  'internal control',
  'internal',
  'control',
  'control financial',
  'financial reporting',
  'reporting',
  'reasonable',
  'standards',
  'material'],
 ['foreign',
  'currency',
  'foreign currency',
  'exchange',
  'currency exchange',
  'exchange rate',
  'foreign exchange',
  'rate',
  'exchange rates',
  'contracts'],
 ['value',
  'fair',
  'fair value',
  'cash',
  'derivative',
  'instruments',
  'changes',
  'rates',
  'hedge',
  'derivatives'],
 ['credit',
  'concentration',
  'concentration credit',
  'concentrations',
  'concentrations credit',
  'counterparty',
  'credit manage