# Topic modelling 

## LDA 

In [1]:
# %pip install pyLDAvis  # run once if pyLDAvis is not already installed in this kernel's environment


In [2]:
import numpy as np
import pandas as pd 

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#spacy
import spacy
from nltk.corpus import stopwords

#vis
import pyLDAvis
import pyLDAvis.gensim_models

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [3]:
# Read the participant texts and keep only the columns whose name does not start with "Unnamed".
data = pd.read_csv('all_participants.csv').loc[
    :, lambda frame: ~frame.columns.str.contains('^Unnamed')
]

data

Unnamed: 0,Participant
0,"Yes, thank you, Uli. The good thing is that I ..."
1,Three quick questions. One is just a follow-up...
2,"Well, good afternoon to everybody here in Lond..."
3,"Well, thank you very much. We will now begin w..."


In [4]:
def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"], text_column="Participant"):
    """Lemmatise documents, keeping only tokens with an allowed part of speech.

    Args:
        texts: The documents to process. Either an iterable of strings, a
            single string (treated as one document), or a pandas DataFrame,
            in which case the strings are read from ``text_column``.
        allowed_postags: spaCy ``token.pos_`` values to keep; tokens with any
            other tag are dropped. The default list is never mutated.
        text_column: Column that holds the text when ``texts`` is a DataFrame.

    Returns:
        A list with one string per input document, in input order: the lemmas
        of the kept tokens joined by single spaces.
    """
    # Read from the argument, not from a notebook-level variable, so the
    # result depends only on what the caller passes in.
    if isinstance(texts, pd.DataFrame):
        texts = texts[text_column]
    elif isinstance(texts, str):
        # A bare string would otherwise be iterated character by character.
        texts = [texts]
    documents = list(texts)

    # Nothing to process: return before paying for the model load.
    if not documents:
        return []

    allowed = frozenset(allowed_postags)
    # Parser and NER are disabled; only pos_ and lemma_ are read below.
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

    # nlp.pipe processes the documents as a batch, one Doc per input string.
    return [
        " ".join(token.lemma_ for token in doc if token.pos_ in allowed)
        for doc in nlp.pipe(documents)
    ]

# Lemmatise the texts held in `data`, then display the first result as a quick sanity check.
lemmatized_texts = lemmatization(data)
lemmatized_texts[0]

"thank good thing be have interface surrounding again somy read glass break minute meeting be kind of enough toget quality product drugstore corner be online again thank thank think mention already part major part keyfigure result think start get detailed p&l look key metric remarkable development key metric whichwere only drive also market whomsoever first strong cash flow nearly reach mark again westarte little bit slow year think mention previous occasion wasone mobility contract come contract again little bit later whichwa cash negative first quarter turnaround have already willturnaround get money back second half year cash flow think support positive message quality theresult even little bit more depressed overall growth cash flow stay verypositive right hand side asset own management % cantell development dollar yield especially yield thegovernment bond today be approach so increase be reallyremarkable again be only drive very good result also yield andespecially currency exchan

In [5]:
def gen_words(texts):
    """Tokenise each document with gensim's ``simple_preprocess``.

    Args:
        texts: Iterable of document strings.

    Returns:
        A list with one token list per document, in input order. Tokens are
        produced with ``deacc=True``.
    """
    return [
        gensim.utils.simple_preprocess(document, deacc=True)
        for document in texts
    ]

# Tokenise every lemmatised document, then print the first 20 tokens of document 0.
data_words = gen_words(lemmatized_texts)

print(data_words[0][:20])

['thank', 'good', 'thing', 'be', 'have', 'interface', 'surrounding', 'again', 'somy', 'read', 'glass', 'break', 'minute', 'meeting', 'be', 'kind', 'of', 'enough', 'toget', 'quality']


In [6]:
# Build the token <-> integer id mapping from the tokenised documents.
id2word = corpora.Dictionary(data_words)

# Bag-of-words corpus: one doc2bow result per document, in document order.
# A comprehension avoids leaving loop temporaries in the notebook namespace.
corpus = [id2word.doc2bow(tokens) for tokens in data_words]

print(corpus[0][:20])

# Look up the token stored under id 0.
word = id2word[0]
print(word)

[(0, 1), (1, 6), (2, 3), (3, 2), (4, 2), (5, 1), (6, 1), (7, 1), (8, 16), (9, 4), (10, 1), (11, 1), (12, 1), (13, 2), (14, 1), (15, 8), (16, 1), (17, 27), (18, 1), (19, 1)]
absolute


In [7]:
# Fit a 5-topic LDA model on the bag-of-words corpus.
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=5,
    random_state=100,  # fixed seed
    update_every=1,
    chunksize=100,
    passes=10,
    alpha="auto",
)

### Visualising data and topics

In [8]:
# Switch pyLDAvis to notebook display, prepare the visualisation data for the
# fitted model, and show it as the cell's output.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(
    lda_model,
    corpus,
    id2word,
    mds="mmds",
    R=30,
)
vis

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


Initial attempt at LDA topic modelling, using 4 sample texts from earnings calls.

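**TODO**
- Clean the text to better categorise topics, e.g. remove stop words and repair fused tokens such as "toget" and "somy" that appear in the lemmatised output.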

source: https://github.com/wjbmattingly/topic_modeling_textbook/blob/main/03_03_lda_model_demo.ipynb