In [1]:
import pandas as pd
import numpy as np
import os
import re
import pickle
import matplotlib.pyplot as plt

In [2]:
rf_df = pd.read_csv('RF_df.csv', index_col=0)
rf_df.dropna(inplace=True)
rf_df.sample(10)

Unnamed: 0,cik,reporting year,filing date,Item 1A
26,1627606,2018,20180419,We may also be subject to involuntary product ...
23,1138951,2007,20080313,Our international consumer business may not su...
37,1075066,2007,20080414,Our share price has been subject to extreme pr...
19,1001115,2015,20151119,We may experience difficulties in connection w...
7,1116521,2011,20111209,We cannot predict which competitors may enter ...
26,1487999,2012,20130221,"When lessees default, we may fail to recover a..."
12,1574774,2013,20140226,The success of our business is partially depen...
29,78239,2010,20100331,Our trademarks and other intellectual property...
17,1370489,2013,20140415,WE RELY ON ACCESS TO FUNDING FROM THE UNITED S...
42,21510,2005,20051215,Risks related to our industry\nOur market is u...


In [None]:
word_cnt = rf_df['Item 1A'].map(lambda x: len(x.split()))
Q05 = word_cnt.quantile(q=0.05)
Q95 = word_cnt.quantile(q=0.95)

print(f'5-th percentile: {Q05}')
print(f'95-th percentile: {Q95}')

fig = plt.figure(figsize=(10,6))
plt.hist(word_cnt, bins=100)
plt.show();

In [4]:
# Filter too short and too long risk factors
filtered_rf_df = rf_df[(word_cnt>Q05) & (word_cnt<Q95)]

# Create sample train data 
new_ind = (filtered_rf_df['cik'].map(str) + '-' + 
           filtered_rf_df['reporting year'].map(str) + '-' + 
           filtered_rf_df['filing date'].map(str) + '-' + 
           filtered_rf_df.index.map(str))

# Sample data to train the topic models on 50% of the data, randomly selected
raw_text_data = filtered_rf_df.set_index(new_ind)['Item 1A'].sample(len(filtered_rf_df)//2, random_state=101)

## **Model implementation**

### **Ensembe LDA**

In [5]:
import gensim.corpora as corpora
from gensim.models import EnsembleLda, CoherenceModel

from text_tokenizer import tokenizer

# Parameters:
num_cpu = os.cpu_count()

In [None]:
sample_tokens = tokenizer(raw_text_data, n_jobs=10)

with open("LDA/sample_tokens.txt", "wb") as fp:
    pickle.dump(sample_tokens, fp)

In [None]:
# Generate Dictionary
lda_dict = corpora.Dictionary(sample_tokens)

# Filter most common and rare words
lda_dict.filter_extremes(no_below=100, no_above=99000)

# Save lda_dict to disk
lda_dict.save('LDA/lda_dict')

In [None]:
# Creat corpus from Dictionary
lda_dict = corpora.Dictionary.load('LDA/lda_dict')
lda_corpus = [lda_dict.doc2bow(text) for text in sample_tokens]

In [None]:
def compute_coherence_values(dictionary, corpus, limit, start=10, step=5):
    """
    Compute coherence for num_topics in a specific range

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_model = {}
    for num_topics in range(start, limit, step):

        # Train ensemble LDA model
        model = EnsembleLda(
            topic_model_class='ldamulticore', corpus=corpus, id2word=dictionary, num_topics=num_topics, 
            num_models=5, random_state=101, ensemble_workers=num_cpu//2, distance_workers=num_cpu//2,
            alpha='asymmetric' #, decay=0.5, offset=64 # best params from Hoffman paper
            )
        
        cm = CoherenceModel(model=model, corpus=corpus, dictionary=dictionary, coherence='u_mass', )
        cm_score = cm.get_coherence()
        coherence_model[num_topics] = (cm_score, model)

        print(f'Model with {num_topics} trained | Coherence score: {cm_score}')

    return coherence_model

In [None]:
coherence_model = compute_coherence_values(dictionary=lda_dict, corpus=lda_corpus, 
                                           limit=121, start=40, step=10)

In [None]:
num_topics = [len(x[1].stable_topics) for x in coherence_model.values()]#list(coherence_model.keys())
coherence_values = [-x[0] for x in coherence_model.values()]

fig = plt.figure(figsize=(10,6))
plt.scatter(num_topics, coherence_values, alpha=0.5)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

In [None]:
max_c = max(coherence_model, key=lambda x: -coherence_model[x][0])
best_model = coherence_model[max_c][1]

best_model.save('LDA/best_lda')

In [None]:
print(max_c)
best_model.print_topics()

95


[(31,
  '0.193*"service" + 0.078*"provider" + 0.044*"provide" + 0.030*"arrangement" + 0.027*"network" + 0.015*"physician" + 0.013*"relationship" + 0.011*"ability" + 0.011*"maintain" + 0.009*"co"'),
 (1,
  '0.061*"personnel" + 0.057*"retain" + 0.039*"key" + 0.037*"employee" + 0.037*"attract" + 0.033*"business" + 0.029*"qualified" + 0.027*"ability" + 0.026*"management" + 0.019*"depend"'),
 (24,
  '0.071*"condition" + 0.070*"economic" + 0.034*"business" + 0.033*"result" + 0.031*"financial" + 0.031*"affect" + 0.024*"market" + 0.024*"adverse" + 0.022*"consumer" + 0.021*"demand"'),
 (11,
  '0.065*"acquisition" + 0.063*"business" + 0.046*"acquire" + 0.023*"integrate" + 0.023*"operation" + 0.019*"result" + 0.017*"risk" + 0.015*"company" + 0.015*"management" + 0.014*"include"'),
 (23,
  '0.209*"party" + 0.182*"third" + 0.020*"software" + 0.016*"service" + 0.014*"center" + 0.013*"business" + 0.011*"vendor" + 0.010*"use" + 0.010*"certain" + 0.010*"result"'),
 (33,
  '0.196*"rate" + 0.121*"interes

### Top2Vec

In [None]:
rf_list = pd.read_csv('RF_df.csv', index_col=0)

In [None]:
def tokenizer_func(text):
    
    doc = nlp(text)
    # Identify named entities
    ents = [ent.lemma_ for ent in doc.ents]
    # To remove stop words, punctuations, and currency tokens
    mask = lambda t: not (t.is_stop or t.is_punct or t.is_currency or t.is_space or t.ent_iob_ !='O')
    tokens = [tok.lemma_ for tok in filter(mask, doc)]
    tokens.extend(ents)

    return tokens

In [None]:
top2vec_model = Top2Vec(documents=sample_docs.tolist(), speed="deep-learn", 
                        workers=10, document_ids=sample_docs.index.tolist(), tokenizer=tokenizer_func)

2021-12-21 11:11:44,151 - top2vec - INFO - Pre-processing documents for training
2021-12-21 11:50:09,364 - top2vec - INFO - Creating joint document/word embedding
2021-12-21 14:52:27,308 - top2vec - INFO - Creating lower dimension embedding of documents
2021-12-21 14:56:13,577 - top2vec - INFO - Finding dense areas of documents
2021-12-21 14:56:23,666 - top2vec - INFO - Finding topics


In [None]:
top2vec_model.save('top2vec_model')

In [None]:
# Original script / custome tokenizer / 100000 random sample
print(top2vec_model.get_num_topics())
topic_words, word_scores, topic_nums = top2vec_model.get_topics()
topic_words

1762


array([['goodwill', 'impairment', 'intangible', ..., 'asc', 'estimate',
        'deferred'],
       ['weakness', 'misstatement', 'reporting', ..., 'scrutiny',
        'disclosure', 'public'],
       ['acceleration', 'default', 'repay', ..., 'subordinated',
        'govern', 'lend'],
       ...,
       ['fela', 'highlight', 'suitable', ..., 'objective', 'discovery',
        'purchaser'],
       ['cfc', 'lockout', 'recycling', ..., 'bottle', 'espcs', 'rural'],
       ['conformity', 'recycling', 'compliant', ..., 'principle',
        'cayman', 'distributable']], dtype='<U18')