In [None]:
# Text Mining - Federalist Papers
# Part 3
# We will combine all text pre-processing into a single cell and proceed from there
# Topic modeling

In [None]:
# import key libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# The chdir call below is commented out, so federalist.csv is read from the
# kernel's current working directory. Uncomment it if the dataset lives in a
# "Documents" folder relative to that directory.
import os
# os.chdir("Documents")

# Load the Federalist Papers dataset into a DataFrame. The pre-processing cell
# that follows reads its "Author" and "Text" columns.
papers = pd.read_csv("federalist.csv")
# Bare last expression: shows the loaded frame as the cell output
papers

In [None]:
# combining all pre-processing into a single cell
# Input : `papers`, the frame loaded from federalist.csv ("Author" and "Text" columns).
# Output: `papers` restricted to three author labels, with "Text" stripped of the
#         common salutation and punctuation, lowercased, stop-word filtered and
#         stemmed; `stop` holds the final stop-word list, `st` the stemmer.
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


def _drop_words(text, words):
    """Return `text` without the whitespace-separated tokens contained in `words`."""
    return " ".join(token for token in text.split() if token not in words)


# filter to papers written by Hamilton, Madison, and Unknown
# .copy() gives an independent frame, so the column assignments below write to
# `papers` itself and do not raise SettingWithCopyWarning
papers = papers[papers["Author"].isin(["HAMILTON", "MADISON", "UNKNOWN"])].copy()

# remove the common first sentence from all documents
# regex=False: the salutation is a literal string, and the default of `regex`
# differs between pandas versions
papers["Text"] = papers["Text"].str.replace(
    "To the People of the State of New York:", "", regex=False)

# Remove punctuation from the text column
# raw string, so \w and \s are passed to the regex engine instead of being
# treated as (invalid) Python string escapes
papers["Text"] = papers["Text"].str.replace(r'[^\w\s]', '', regex=True)

# convert all words to lowercase
papers["Text"] = papers["Text"].str.lower()

# removal of stop_words
nltk.download("stopwords")

stop = stopwords.words("english")

# a set makes the per-token membership test constant-time; `stop` stays a list
stop_set = set(stop)
papers["Text"] = papers["Text"].apply(lambda text: _drop_words(text, stop_set))

# stemming
st = PorterStemmer()
papers["Text"] = papers["Text"].apply(
    lambda text: " ".join(st.stem(word) for word in text.split()))

# further remove custom stopwords, which are problem specific
# this pass runs on the stemmed tokens and checks the full list again
# (standard + custom), exactly as before
stop += ["would", "may", "must", "one", "upon", "might", "shall", "could"]
stop_set = set(stop)
papers["Text"] = papers["Text"].apply(lambda text: _drop_words(text, stop_set))

papers["Text"]

In [None]:
# Build the gensim inputs for topic modeling: token lists, dictionary, bag-of-words
import gensim
from gensim import corpora, models

# One list of whitespace-separated tokens per document in the Text column
corpus = list(map(str.split, papers["Text"]))

# Term dictionary: assigns an integer id to every distinct token of the corpus
dictionary = corpora.Dictionary(corpus)

# Prune the vocabulary by document frequency (per the gensim docs: keep tokens
# found in at least 2 documents and in at most 75% of all documents)
dictionary.filter_extremes(no_below=2, no_above=0.75)

# Document-feature matrix: each document as its bag-of-words representation
DFM = list(map(dictionary.doc2bow, corpus))



In [None]:
# Create LDA model using gensim library
# https://radimrehurek.com/gensim/models/ldamodel.html
n_topics = 8
# LDA training is randomized; without a fixed seed every Restart & Run All
# yields different topics. Change the seed to explore alternative solutions.
lda_seed = 42
ldamodel = models.LdaModel(DFM, num_topics=n_topics, id2word=dictionary, passes=40,
                           random_state=lda_seed)

# Show the 10 highest-weighted words of each of the n_topics topics
print(ldamodel.print_topics(num_topics=n_topics, num_words=10))


In [None]:
# ! pip install pyLDAvis

In [None]:
# Interactive topic-model visualization with pyLDAvis
# https://pyldavis.readthedocs.io/en/latest/readme.html
# (if the package is missing: ! pip install pyLDAvis)
import pyLDAvis
import pyLDAvis.gensim_models

# Switch pyLDAvis to notebook mode before building the visualization
pyLDAvis.enable_notebook()

# Prepare the visualization from the fitted model, the bag-of-words corpus
# and the dictionary, then display it as the cell output
vis = pyLDAvis.gensim_models.prepare(
    topic_model=ldamodel,
    corpus=DFM,
    dictionary=dictionary,
)
vis
