# Trying out an LDA model to apply topic modelling to MEAs

In [2]:
!pip install numpy --upgrade



In [1]:
# Handle imports
import numpy as np
import pandas as pd
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
import pickle
from gensim.models import CoherenceModel
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import string
import pprint
import pyLDAvis.gensim_models as gensimvis
from tqdm import tqdm_notebook as tqdm



In [2]:
# Load spaCy's 'en_core_web_trf' English pipeline; the print below lists its
# components (transformer, tagger, parser, attribute_ruler, lemmatizer, ner).
nlp = spacy.load('en_core_web_trf')
# Handle to the pipeline's lemmatizer component (not referenced again in the
# cells visible here).
lemmatizer = nlp.get_pipe("lemmatizer")
print(nlp.pipeline)

[('transformer', <spacy_transformers.pipeline_component.Transformer object at 0x000001431641EFA8>), ('tagger', <spacy.pipeline.tagger.Tagger object at 0x000001430DDDB6A8>), ('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x0000014316446C18>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x000001430DDE53C8>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x00000143162D6D88>), ('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x0000014316446D68>)]


In [3]:
# Corpus-specific words to treat as stop words in addition to spaCy's defaults.
CUSTOM_STOP_WORDS = [
    "ohio", "division", "department", "commerce", "financial", "revised",
    "code", "institutions", "chapter", "codified", "institution",
]
nlp.Defaults.stop_words.update(CUSTOM_STOP_WORDS)

# Set the "is_stop" flag on the vocabulary entry of every stop word. Iterate
# over the set that was just updated (rather than the separately imported
# STOP_WORDS) so the custom words are flagged without depending on both names
# referring to the same set object.
for word in nlp.Defaults.stop_words:
    lexeme = nlp.vocab[word]
    lexeme.is_stop = True

In [4]:
# Load data
# The path is relative to the notebook's working directory. Later cells read
# the document text from the "Text" column of this frame.
df = pd.read_csv("Standardized Data/Final_cleaned.csv")

In [5]:
def clean_text(sample_text):
    """Normalise one raw document into a list of lemmatised content tokens.

    Steps: strip ASCII punctuation, drop purely numeric words, lowercase,
    lemmatise with the module-level ``nlp`` pipeline, then remove stop words
    and punctuation tokens.

    Args:
        sample_text: Raw document text. Anything that is not a ``str``
            (e.g. ``None`` or the NaN pandas uses for a missing cell) is
            treated as an empty document.

    Returns:
        list[str]: The remaining lemmas in document order; ``[]`` when there
        is nothing left to analyse.
    """
    # Missing values have no string methods; treat them as empty documents
    # instead of failing with an AttributeError.
    if not isinstance(sample_text, str):
        return []

    sample_text = sample_text.translate(str.maketrans('', '', string.punctuation))
    sample_text = " ".join(w for w in sample_text.split() if not w.isdigit())
    sample_text = sample_text.lower()
    if not sample_text:
        # Nothing survived the cleaning above, so skip the pipeline call.
        return []

    # '-PRON-' is the placeholder lemma spaCy 2.x assigned to pronouns; the
    # filter is kept as a harmless guard.
    lemmas = [token.lemma_ for token in nlp(sample_text) if token.lemma_ != '-PRON-']
    # Re-tokenise the lemmatised text (tokenizer only) so the stop-word test
    # is applied to the lemmas rather than to the original surface forms.
    lemma_doc = nlp.make_doc(' '.join(lemmas))
    return [token.text for token in lemma_doc if not token.is_stop and not token.is_punct]

In [6]:
# Try clean_text on a single document: the "Text" value of the row labelled 3.
sample_text = df.loc[3,"Text"]
print(clean_text(sample_text))

['charge', 'responsibility', 'enforce', 'mortgage', 'loan', 'act', 'codify', 'rc', 'protect', 'public', 'proscribed', 'mortgage', 'lending', 'practice', 'find', 'order', 'necessary', 'appropriate', 'interest', 'public', 'consistent', 'purpose', 'mortgage', 'loan', 'act', 'aj', 'investments', 'inc', 'dba', 'aj', 'fund', 'respondent', 'corporation', 'register', 'mortgage', 'lender', 'pursuant', 'rc', 'fail', 'renew', 'registration', 'period', 'begin', 'july', 'business', 'address', 'record', 'respondent', 'east', '34th', 'street', 'willoughby', 'june', 'serve', 'respondent', 'certified', 'mail', 'notice', 'divisions', 'intent', 'revoke', 'respondent', 'mortgage', 'lender', 'certificate', 'registration', 'notice', 'respondent', 'right', 'hearing', 'matter', 'notice', 'pursuant', 'rc', 'notice', 'state', 'respondent', 'notify', 'entitled', 'adjudicative', 'hearing', 'request', 'thirty', 'day', 'date', 'mail', 'notice', 'respondent', 'request', 'hearing', 'thirty', 'day', 'date', 'mailing',

In [8]:
# tqdm.tqdm_notebook is deprecated in favour of tqdm.notebook.tqdm; it is
# imported here so this cell does not depend on the deprecated alias bound in
# the imports cell.
from tqdm.notebook import tqdm

# Trial run on the first 10 documents only.
text = df["Text"][:10]
# Tokenise each document, skipping rows whose text is missing.
text_list = [clean_text(doc) for doc in tqdm(text) if not pd.isna(doc)]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


  0%|          | 0/10 [00:00<?, ?it/s]

In [9]:
# Inspect the cleaned tokens of the third entry in text_list.
text_list[2]

['charge',
 'responsibility',
 'enforce',
 'mortgage',
 'broker',
 'act',
 'codify',
 'rc',
 'protect',
 'public',
 'proscribed',
 'mortgage',
 'lending',
 'practice',
 'find',
 'order',
 'necessary',
 'appropriate',
 'interest',
 'public',
 'consistent',
 'purpose',
 'mortgage',
 'broker',
 'act',
 'mortgage',
 'specialists',
 'inc',
 'respondent',
 'kentucky',
 'corporation',
 'renew',
 'certificate',
 'registration',
 'period',
 'april',
 'mortgage',
 'broker',
 'pursuant',
 'rc',
 'return',
 'certificate',
 'september',
 'business',
 'address',
 'record',
 'respondent',
 'john',
 'sutherland',
 'drive',
 'suite',
 'nicholasville',
 'ky',
 'july',
 'serve',
 'respondent',
 'certified',
 'mail',
 'notice',
 'divisions',
 'intent',
 'revoke',
 'respondent',
 'mortgage',
 'broker',
 'certificate',
 'registration',
 'notice',
 'respondent',
 'right',
 'hearing',
 'matter',
 'notice',
 'pursuant',
 'rc',
 'notice',
 'state',
 'respondent',
 'notify',
 'entitled',
 'adjudicative',
 'heari

In [10]:
# Build the gensim Dictionary, i.e. the mapping between integer word IDs and words.
words = corpora.Dictionary(documents=text_list)

# Represent every tokenised document as a bag of words.
corpus = [words.doc2bow(tokens) for tokens in text_list]

In [None]:
# Fit a 5-topic LDA model on the bag-of-words corpus. Training is stochastic,
# so the seed is fixed to make the topics reproducible between runs.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=words,
    num_topics=5,
    random_state=42,
)

In [None]:
import pyLDAvis.gensim_models as gensimvis

Error: Kernel is dead