In [9]:
"""
LDA topic modeling of each risk paragraph
2011-2020
"""

''

In [32]:
import pandas as pd
import numpy as np
import pickle
import datetime
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [33]:
with open('../data/preproc/clean_text_all_years.pickle', 'rb') as read_file:
    prepped_text = pickle.load(read_file)

In [34]:
prepped_text.shape

(330641, 15)

In [35]:
prepped_text.columns

Index(['date_filed', 'filedAt', 'formType', 'accessionNo', 'ticker', 'cik',
       'companyName', 'companyNameLong', 'linkToTxt', 'formType', 'sic',
       'fiscalYearEnd', 'value', 'preproc_text', 'clean_text'],
      dtype='object')

In [36]:
# add additional stopwords
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(['risk', 'risks', 'factor', 'factors', 'item', 'quantitative', 'qualitative',
                 'company', 'business', 'firm', 'us', 'our', 
                'could', 'may', 'quarter', 'year'
                "january", "february", "march", "april", "may", "june", 
                "july", "august", "september", "october", "november", "december",
                "jan", "feb", "mar", "apr", "jun", "jul", "aug", "sep", "oct", "nov", "dec",
                "million", "thousand", "hundred", "billion"])

In [37]:
# Count Vectorizer with bi-grams
vectorizer = CountVectorizer(stop_words = stopwords, ngram_range=(1,2))
risk_words = vectorizer.fit_transform(prepped_text.clean_text)
risk_words.shape

(330641, 1097806)

In [13]:
# LDA 20
lda_model = LatentDirichletAllocation(n_components=20)
lda_topics20 = lda_model.fit_transform(risk_words)
#lda_topics.shape

In [14]:
words = vectorizer.get_feature_names()
t = lda_model.components_.argsort(axis=1)[:,-1:-11:-1]
topic_words20 = [[words[e] for e in l] for l in t]
topic_words20

[['rate',
  'value',
  'based',
  'expected',
  'fair',
  'fair value',
  'option',
  'stock',
  'assumption',
  'interest'],
 ['investment',
  'asset',
  'plan',
  'financial',
  'condition',
  'return',
  'information',
  'discussion',
  'result',
  'financial condition'],
 ['currency',
  'foreign',
  'hedge',
  'foreign currency',
  'exchange',
  'value',
  'cash',
  'derivative',
  'flow',
  'cash flow'],
 ['management',
  'committee',
  'operational',
  'process',
  'control',
  'board',
  'officer',
  'policy',
  'framework',
  'compliance'],
 ['service',
  'system',
  'information',
  'security',
  'technology',
  'data',
  'customer',
  'party',
  'third',
  'third party'],
 ['product',
  'operation',
  'result',
  'cost',
  'market',
  'including',
  'condition',
  'customer',
  'change',
  'impact'],
 ['credit',
  'loan',
  'loss',
  'rating',
  'collateral',
  'portfolio',
  'counterparty',
  'customer',
  'allowance',
  'amount'],
 ['insurance',
  'claim',
  'liability',
  

In [31]:
# LDA 22
lda_model = LatentDirichletAllocation(n_components=22)
lda_topics22 = lda_model.fit_transform(risk_words)
#lda_topics.shape

words = vectorizer.get_feature_names()
t = lda_model.components_.argsort(axis=1)[:,-1:-11:-1]
topic_words22 = [[words[e] for e in l] for l in t]
topic_words22

[['insurance',
  'reinsurance',
  'contract',
  'mortgage',
  'management',
  'life',
  'premium',
  'state',
  'loan',
  'policy'],
 ['rate',
  'interest',
  'interest rate',
  'currency',
  'foreign',
  'exchange',
  'foreign currency',
  'derivative',
  'instrument',
  'market'],
 ['loan',
  'loss',
  'credit',
  'portfolio',
  'allowance',
  'commercial',
  'account',
  'based',
  'estimate',
  'claim'],
 ['control',
  'financial',
  'reporting',
  'internal',
  'internal control',
  'procedure',
  'financial reporting',
  'policy',
  'policy procedure',
  'compliance'],
 ['financial',
  'market',
  'condition',
  'liquidity',
  'result',
  'credit',
  'insurance',
  'operation',
  'economic',
  'adversely'],
 ['capital',
  'requirement',
  'ratio',
  'based',
  'bank',
  'asset',
  'regulatory',
  'federal',
  'basel',
  'tier'],
 ['result',
  'operation',
  'cost',
  'property',
  'claim',
  'insurance',
  'subject',
  'adverse',
  'liability',
  'environmental'],
 ['investment',

In [17]:
# LDA 24
lda_model = LatentDirichletAllocation(n_components=24)
lda_topics24 = lda_model.fit_transform(risk_words)
#lda_topics.shape

words = vectorizer.get_feature_names()
t = lda_model.components_.argsort(axis=1)[:,-1:-11:-1]
topic_words24 = [[words[e] for e in l] for l in t]
topic_words24

[['cash',
  'property',
  'facility',
  'insurance',
  'change',
  'amount',
  'loss',
  'operation',
  'coverage',
  'nuclear'],
 ['statement',
  'forward',
  'looking',
  'forward looking',
  'looking statement',
  'result',
  'report',
  'financial',
  'uncertainty',
  'future'],
 ['insurance',
  'claim',
  'health',
  'service',
  'coverage',
  'liability',
  'care',
  'product',
  'venture',
  'cost'],
 ['management',
  'committee',
  'credit',
  'policy',
  'loan',
  'board',
  'process',
  'liquidity',
  'officer',
  'operational'],
 ['capital',
  'ratio',
  'based',
  'requirement',
  'asset',
  'regulatory',
  'bank',
  'tier',
  'basel',
  'federal'],
 ['credit',
  'rating',
  'credit rating',
  'security',
  'investment',
  'related',
  'default',
  'exposure',
  'loss',
  'collateral'],
 ['reinsurance',
  'investment',
  'benefit',
  'contract',
  'entity',
  'insurance',
  'life',
  'annuity',
  'equity',
  'variable'],
 ['investment',
  'cash',
  'security',
  'credit',
 

In [18]:
# LDA 25
lda_model = LatentDirichletAllocation(n_components=25)
lda_topics25 = lda_model.fit_transform(risk_words)
#lda_topics.shape

words = vectorizer.get_feature_names()
t = lda_model.components_.argsort(axis=1)[:,-1:-11:-1]
topic_words25 = [[words[e] for e in l] for l in t]
topic_words25

[['loan',
  'asset',
  'credit',
  'investment',
  'portfolio',
  'loss',
  'plan',
  'return',
  'mortgage',
  'allowance'],
 ['data',
  'shall',
  'service',
  'certain',
  'performance',
  'financial',
  'guidance',
  'shares',
  'payment',
  'information'],
 ['interest',
  'debt',
  'rate',
  'term',
  'net',
  'income',
  'loan',
  'interest rate',
  'loss',
  'wa'],
 ['product',
  'service',
  'health',
  'change',
  'cash',
  'care',
  'financial',
  'market',
  'new',
  'amount'],
 ['rate',
  'cost',
  'investment',
  'market',
  'capital',
  'operating',
  'term',
  'change',
  'future',
  'fund'],
 ['value',
  'fair',
  'fair value',
  'asset',
  'market',
  'estimate',
  'flow',
  'cash flow',
  'rate',
  'cash'],
 ['insurance',
  'reinsurance',
  'loss',
  'claim',
  'liability',
  'coverage',
  'property',
  'policy',
  'premium',
  'amount'],
 ['currency',
  'foreign',
  'exchange',
  'foreign currency',
  'rate',
  'exchange rate',
  'contract',
  'currency exchange',
  

In [19]:
# LDA 30
lda_model = LatentDirichletAllocation(n_components=30)
lda_topics30 = lda_model.fit_transform(risk_words)
#lda_topics.shape

words = vectorizer.get_feature_names()
t = lda_model.components_.argsort(axis=1)[:,-1:-11:-1]
topic_words30 = [[words[e] for e in l] for l in t]
topic_words30

[['system',
  'information',
  'security',
  'data',
  'service',
  'technology',
  'party',
  'third',
  'third party',
  'customer'],
 ['insurance',
  'loss',
  'reinsurance',
  'liability',
  'claim',
  'coverage',
  'financial',
  'certain',
  'exposure',
  'service'],
 ['operation',
  'result',
  'regulation',
  'cost',
  'environmental',
  'financial',
  'subject',
  'regulatory',
  'result operation',
  'law'],
 ['entity',
  'venture',
  'joint',
  'joint venture',
  'interest',
  'vie',
  'investment',
  'activity',
  'partner',
  'equity'],
 ['investment',
  'security',
  'cash',
  'market',
  'equity',
  'credit',
  'asset',
  'portfolio',
  'fund',
  'change'],
 ['insurance',
  'including',
  'gas',
  'property',
  'change',
  'policy',
  'liability',
  'program',
  'coverage',
  'health'],
 ['contract',
  'revenue',
  'customer',
  'cost',
  'sale',
  'service',
  'product',
  'loss',
  'recognized',
  'payment'],
 ['capital',
  'requirement',
  'bank',
  'ratio',
  'based'

In [23]:
prepped_text['post_topic_lda'] = lda_topics25.argmax(axis=1)

In [24]:
risks_info = prepped_text.reset_index()

In [26]:
components = pd.DataFrame(lda_topics25.round(5),
             index = prepped_text.clean_text,
             columns = ["C_0", "C_1","C_2", "C_3", "C_4", "C_5", "C_6", "C_7", "C_8", "C_9", 
                       "C_10", "C_11", "C_12", "C_13", "C_14", "C_15", "C_16", "C_17", "C_18",
                       "C_19", "C_20", "C_21", "C_22", "C_23", "C_24"])

c_info = components.reset_index()

c_info.head(2)

Unnamed: 0,clean_text,C_0,C_1,C_2,C_3,C_4,C_5,C_6,C_7,C_8,...,C_15,C_16,C_17,C_18,C_19,C_20,C_21,C_22,C_23,C_24
0,We produce and distribute high quality video c...,0.00019,0.00019,0.00019,0.67532,0.00019,0.00019,0.00019,0.00019,0.00019,...,0.07563,0.00019,0.00019,0.00019,0.00019,0.00019,0.00019,0.00019,0.00019,0.00019
1,Our cost to provide current benefit and fundin...,0.1085,0.00019,0.06664,0.05732,0.63621,0.00019,0.00019,0.03738,0.00019,...,0.00019,0.00019,0.00019,0.02984,0.00019,0.00019,0.00019,0.03012,0.00019,0.00019


In [27]:
risks_info.shape

(331272, 17)

In [28]:
c_info.shape

(331272, 26)

In [29]:
# add the topic components to the full df

all_df = pd.concat([risks_info, c_info], axis=1)
all_df.shape

(331272, 43)

In [30]:
with open('../data/topics/LDA_25_risk_topics_DRAFT.pickle', 'wb') as to_write:
    pickle.dump(all_df, to_write)