In [3]:
#Dependencies
import pandas as pd
import gensim #the library for Topic modelling
from gensim.models.ldamulticore import LdaMulticore
from gensim import corpora, models
import pyLDAvis.gensim_models #LDA visualization library

from nltk.corpus import stopwords
import string
from nltk.stem.wordnet import WordNetLemmatizer

import warnings
warnings.simplefilter('ignore')
from itertools import chain

In [5]:
df = pd.read_csv('All entities.csv')
print(df.shape)
df

(372, 1)


Unnamed: 0,All entities
0,BLACKROCK ADV UKLTD-MORGAN-AGG
1,BLACKROCK FIN MG AAF-MORGANTRN
2,Commingled Pension Trust Fund (Core Bond) of J...
3,COMMINGLED PENSION TRUST FUND (CORE PLUS BOND)...
4,COMMINGLED PENSION TRUST FUND (CORE PLUS BOND)...
...,...
367,MORGAN STANLEY & CO INTERNATIONAL PLC
368,MORGAN STANLEY & CO INTL P-GBR
369,MORGAN STANLEY & CO INTL P-GBR
370,MORGAN STANLEY & CO INTL P-IOS


In [6]:
#clean the data
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

def clean(text):
    stop_free = ' '.join([word for word in text.lower().split() if word not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = ' '.join([lemma.lemmatize(word) for word in punc_free.split()])
    return normalized.split()

In [8]:
df['text_clean']=df['All entities'].apply(clean)

In [9]:
df

Unnamed: 0,All entities,text_clean
0,BLACKROCK ADV UKLTD-MORGAN-AGG,"[blackrock, adv, ukltdmorganagg]"
1,BLACKROCK FIN MG AAF-MORGANTRN,"[blackrock, fin, mg, aafmorgantrn]"
2,Commingled Pension Trust Fund (Core Bond) of J...,"[commingled, pension, trust, fund, core, bond,..."
3,COMMINGLED PENSION TRUST FUND (CORE PLUS BOND)...,"[commingled, pension, trust, fund, core, plus,..."
4,COMMINGLED PENSION TRUST FUND (CORE PLUS BOND)...,"[commingled, pension, trust, fund, core, plus,..."
...,...,...
367,MORGAN STANLEY & CO INTERNATIONAL PLC,"[morgan, stanley, co, international, plc]"
368,MORGAN STANLEY & CO INTL P-GBR,"[morgan, stanley, co, intl, pgbr]"
369,MORGAN STANLEY & CO INTL P-GBR,"[morgan, stanley, co, intl, pgbr]"
370,MORGAN STANLEY & CO INTL P-IOS,"[morgan, stanley, co, intl, pios]"


In [10]:
#create dictionary
dictionary = corpora.Dictionary(df['text_clean'])
#Total number of non-zeroes in the BOW matrix (sum of the number of unique words per document over the entire corpus).
print(dictionary.num_nnz)

2168


In [11]:
#create document term matrix
doc_term_matrix = [dictionary.doc2bow(doc) for doc in df['text_clean'] ]
print(len(doc_term_matrix))

372


In [12]:
lda = gensim.models.ldamodel.LdaModel

In [17]:
num_topics=15
%time ldamodel = lda(doc_term_matrix,num_topics=num_topics,id2word=dictionary,passes=50,minimum_probability=0)

Wall time: 4.23 s


In [18]:
ldamodel.print_topics(num_topics=num_topics)

[(0,
  '0.187*"j" + 0.187*"p" + 0.111*"morgan" + 0.028*"mgt" + 0.028*"incny" + 0.028*"inv" + 0.028*"canadator" + 0.028*"ont" + 0.002*"ldn" + 0.002*"jp"'),
 (1,
  '0.096*"blackrock" + 0.091*"morgan" + 0.091*"plc" + 0.091*"ireland" + 0.091*"trustee" + 0.090*"jp" + 0.088*"bank" + 0.088*"liability" + 0.046*"fund" + 0.045*"leveraged"'),
 (2,
  '0.208*"co" + 0.204*"stanley" + 0.198*"morgan" + 0.093*"plc" + 0.093*"international" + 0.079*"llc" + 0.042*"s" + 0.005*"usa" + 0.005*"america" + 0.005*"merrill"'),
 (3,
  '0.120*"cap" + 0.106*"morgan" + 0.100*"sv" + 0.091*"stanley" + 0.041*"ltd" + 0.041*"td" + 0.041*"ubs11" + 0.041*"2016" + 0.021*"nasg" + 0.021*"nany"'),
 (4,
  '0.218*"morgan" + 0.198*"jp" + 0.172*"security" + 0.127*"llc" + 0.048*"limited" + 0.035*"australia" + 0.018*"ltd" + 0.013*"mgt" + 0.013*"au" + 0.009*"asset"'),
 (5,
  '0.107*"morgan" + 0.076*"asia" + 0.063*"stanley" + 0.055*"ltd" + 0.055*"class" + 0.054*"2018h3" + 0.048*"trust" + 0.044*"mortgage" + 0.044*"commercial" + 0.044*"s

In [19]:
lda_display = pyLDAvis.gensim_models.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)