In [None]:
# Text Mining - Federalist Papers
# Part 1

In [None]:
# import key libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# import os and move into the Documents folder (relative to the kernel's start directory)
import os

# os.chdir("Documents") is relative to the current directory, so running this
# cell a second time would look for Documents/Documents and raise
# FileNotFoundError. Only change directory when we are not already inside it.
if os.path.basename(os.getcwd()) != "Documents":
    os.chdir("Documents")
os.getcwd()

In [None]:
# read federalist.csv from the current working directory
papers = pd.read_csv("federalist.csv")

# fail fast with a readable message if the columns used by the following cells are missing
missing_cols = {"Author", "Text"} - set(papers.columns)
assert not missing_cols, f"federalist.csv is missing expected columns: {sorted(missing_cols)}"

# preview only the first rows instead of rendering the whole frame
papers.head()

In [None]:
# number of rows per distinct value of the Author column
papers.loc[:, "Author"].value_counts()

In [None]:
# filter to papers written by Hamilton, Madison, and Unknown
authors_of_interest = ["HAMILTON", "MADISON", "UNKNOWN"]
# .copy() makes the filtered frame independent of the frame it was sliced from,
# so assigning to papers["Text"] in later cells does not raise
# SettingWithCopyWarning or write through to a temporary object
papers = papers[papers["Author"].isin(authors_of_interest)].copy()
papers["Author"].value_counts()

In [None]:
# remove the common first sentence from all documents
# The salutation is a fixed phrase, so match it literally (regex=False) rather
# than relying on the default of str.replace, which differs between pandas versions.
papers["Text"] = papers["Text"].str.replace(
    "To the People of the State of New York:", "", regex=False
)
# preview only the first rows instead of rendering the whole frame
papers.head()

In [None]:
# Create a function dim() to output number of terms/features in a DFM/DTM
def dim(texts=None):
    """Print the number of distinct whitespace-separated terms in a text column.

    Parameters
    ----------
    texts : pandas.Series of str, optional
        Documents to measure. When omitted, the notebook-level papers["Text"]
        column is used, so existing dim() calls keep working unchanged.

    Returns
    -------
    None
        The count is printed, not returned.
    """
    if texts is None:
        texts = papers["Text"]
    # An empty (or missing) document explodes to NaN. nunique() ignores NaN,
    # so such documents do not add a phantom term to the count.
    dimensions = texts.str.split().explode().nunique()
    print(f'{dimensions} dimensions in the DFM.')
    
# baseline: distinct-term count before punctuation removal, lowercasing,
# stop-word removal and stemming
dim()

In [None]:
# simple frequency analysis - Top 30 words
# pool every document into one token stream, count occurrences, keep the head
freq = (
    pd.Series(" ".join(papers["Text"]).split())
    .value_counts()
    .head(30)
)
freq

In [None]:
# removing punctuations
# The pattern is a regular expression (anything that is neither a word
# character nor whitespace), so regex=True must be passed explicitly: since
# pandas 2.0 str.replace defaults to regex=False, which would treat the pattern
# as a literal string and silently leave the text unchanged.
papers["Text"] = papers["Text"].str.replace(r'[^\w\s]+', '', regex=True)
dim()

In [None]:
# normalise case so that e.g. "The" and "the" become the same token
papers = papers.assign(Text=papers["Text"].str.lower())
papers["Text"].head()

In [None]:
# term frequencies after case normalisation - 30 most common words
freq = (
    pd.Series(" ".join(papers["Text"]).split())
    .value_counts()
    .iloc[:30]
)
freq

In [None]:
# distinct-term count after lowercasing (case variants of a word are now merged)
dim()

In [None]:
# removal of stop_words
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")

# kept as a list because a later cell extends it with `stop += [...]`
stop = stopwords.words("english")
# Membership tests against a set are O(1); testing against the list rescans
# every stop word for every token in every document.
stop_lookup = set(stop)

# rebuild each document from the tokens that are not stop words
papers["Text"] = papers["Text"].apply(
    lambda text: " ".join(word for word in text.split() if word not in stop_lookup)
)

# frequency analysis after removing default stopwords
freq = pd.Series(" ".join(papers["Text"]).split()).value_counts()[:30]
freq

In [None]:
# distinct-term count after removing the NLTK English stop words
dim()

In [None]:
# stemming: replace every token by its Porter stem
from nltk.stem import PorterStemmer
st = PorterStemmer()


def _stem_document(text):
    """Return `text` with each whitespace-separated token run through st.stem."""
    return " ".join(map(st.stem, text.split()))


papers["Text"] = papers["Text"].apply(_stem_document)
dim()

In [None]:
# term frequencies of the stemmed corpus - 30 most common stems
freq = (
    pd.Series(" ".join(papers["Text"]).split())
    .value_counts()
    .head(30)
)
freq

In [None]:
# further remove custom stopwords, which are problem specific
custom_stop = ["would", "may", "must", "one", "upon", "might", "shall", "could"]
# Extend in place but skip words already present, so re-running this cell does
# not keep appending the same entries to `stop`.
stop.extend([word for word in custom_stop if word not in stop])
# Set lookup avoids rescanning the whole stop list for every token.
stop_lookup = set(stop)
papers["Text"] = papers["Text"].apply(
    lambda text: " ".join(word for word in text.split() if word not in stop_lookup)
)
dim()

In [None]:
# term frequencies after the second, problem-specific stop-word pass - 30 most common
freq = (
    pd.Series(" ".join(papers["Text"]).split())
    .value_counts()
    .iloc[:30]
)
freq