In [None]:
# Text Mining - Federalist Papers
# Part 4
# All text pre-processing is combined into a single cell; the notebook then
# applies SVD and predicts the authorship of the papers marked UNKNOWN.

In [None]:
# import key libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# The CSV is resolved relative to the current working directory.
# If the file lives in a Documents sub-folder instead, re-enable the
# (currently disabled) chdir call below before reading.
import os
# os.chdir("Documents")

# load the Federalist Papers dataset and display it
csv_name = "federalist.csv"
papers = pd.read_csv(csv_name)
papers

In [None]:
# Combined text pre-processing pipeline, kept in a single cell so it can be
# re-run as a unit. Steps: filter authors -> strip salutation -> strip
# punctuation -> lowercase -> remove stopwords -> stem -> remove stopwords again
# (now including the problem-specific custom words).
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download("stopwords")

# filter to papers written by Hamilton, Madison, and Unknown.
# .copy() gives an independent frame, so the column assignments below do not
# operate on a filtered slice (which triggers pandas' SettingWithCopyWarning).
papers = papers[papers["Author"].isin(["HAMILTON", "MADISON", "UNKNOWN"])].copy()

# remove the common first sentence from all documents
# (regex=False: the salutation is a literal string, not a pattern)
papers["Text"] = papers["Text"].str.replace(
    "To the People of the State of New York:", "", regex=False
)

# remove punctuation: drop every character that is neither a word character
# nor whitespace. Raw string so \w and \s are not treated as (invalid) string escapes.
papers["Text"] = papers["Text"].str.replace(r"[^\w\s]", "", regex=True)

# convert all words to lowercase
papers["Text"] = papers["Text"].str.lower()

# removal of stop_words
# NOTE(review): punctuation has already been stripped at this point, so any
# stopword entries that contain an apostrophe can no longer match -- confirm
# this ordering is intended.
stop = stopwords.words("english")
stop_lookup = set(stop)  # set membership is O(1) per token
papers["Text"] = papers["Text"].apply(
    lambda text: " ".join(word for word in text.split() if word not in stop_lookup)
)

# stemming
st = PorterStemmer()
papers["Text"] = papers["Text"].apply(
    lambda text: " ".join(st.stem(word) for word in text.split())
)

# further remove custom stopwords, which are problem specific.
# `stop` stays a list (extended in place); the second pass filters the stemmed
# tokens against the full extended list, exactly as before.
stop += ["would", "may", "must", "one", "upon", "might", "shall", "could"]
stop_lookup = set(stop)
papers["Text"] = papers["Text"].apply(
    lambda text: " ".join(word for word in text.split() if word not in stop_lookup)
)

papers["Text"]

In [None]:
# Build the gensim inputs: tokenized corpus, vocabulary, and bag-of-words vectors
import gensim
from gensim import corpora, models

# one whitespace-split token list per document in the Text column
corpus = list(map(str.split, papers["Text"]))

# vocabulary mapping each token to an integer id
dictionary = corpora.Dictionary(corpus)

# prune the vocabulary: drop tokens appearing in fewer than 2 documents
# or in more than 75% of the documents
dictionary.filter_extremes(no_below=2, no_above=0.75)

# bag-of-words (token id, count) representation of every document
DFM = list(map(dictionary.doc2bow, corpus))

In [None]:
# Project the TF-IDF weighted documents onto n_SVD latent dimensions using
# gensim's LSI model (https://radimrehurek.com/gensim/models/lsimodel.html)
n_SVD = 8

# weight the bag-of-words counts with TF-IDF
tfidf = models.TfidfModel(DFM)
DFM_tfidf = tfidf[DFM]

# fit the LSI model and transform every document into the reduced space
SVD_model = models.LsiModel(corpus=DFM_tfidf, num_topics=n_SVD, id2word=dictionary)
SVD = SVD_model[DFM_tfidf]

# corpus2csc returns a sparse matrix that is transposed here before being
# densified, giving one row per document
svd_sparse = gensim.matutils.corpus2csc(SVD)
svd_array = svd_sparse.transpose().toarray()

# wrap in a data frame and display the reduced document vectors
svd_df = pd.DataFrame(svd_array)
svd_df

In [None]:
# Assemble the modelling frame: Author label followed by the SVD components.
# The index of the filtered papers is reset so that it lines up row-by-row
# with svd_df's default 0..n-1 index.
author_col = papers["Author"].reset_index(drop=True)

# ignore_index=True relabels the columns 0..k; column 0 (the label) is then
# renamed back to "Author", leaving the features as integer-labelled columns
model_df = pd.concat([author_col, svd_df], axis=1, ignore_index=True)
model_df = model_df.rename(columns={0: "Author"})
model_df

In [None]:
# Partition the data so the test set is exactly the documents whose author is
# UNKNOWN; everything else is training data.
# No validation set due to small sample size.
is_unknown = model_df["Author"] == "UNKNOWN"
testData = model_df[is_unknown]
# .copy() so that recoding Author below modifies an independent frame rather
# than a filtered slice of model_df (avoids SettingWithCopyWarning)
trainData = model_df[~is_unknown].copy()

# dummy code the target variable: HAMILTON = 1, every other author = 0
trainData["Author"] = (trainData["Author"] == "HAMILTON").astype("int64")

# create DV and IV sets: the target is Author, the predictors are all other columns
y_train = trainData["Author"]
X_train = trainData.drop(columns="Author")
X_test = testData.drop(columns="Author")

# class balance of the training labels
y_train.value_counts()

In [None]:
# Decision tree classifier trained on the SVD features
from sklearn.tree import DecisionTreeClassifier

# random_state is pinned to 0; fit() returns the fitted estimator itself
tree = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

# predicted labels for the UNKNOWN-author documents
tree_predictions = tree.predict(X_test)
tree_predictions

# accuracy on the training data itself (in-sample score, shown as cell output)
tree.score(X_train, y_train)

In [None]:
# Logistic regression model, plus a side-by-side table of both models'
# predictions for the UNKNOWN-author documents.
from sklearn.linear_model import LogisticRegression

# class_weight="balanced" reweights the classes when fitting
LR = LogisticRegression(class_weight="balanced")
LR.fit(X_train, y_train)

lr_predictions = LR.predict(X_test)

# Accuracy on the training data itself (in-sample). Only the last expression
# of a cell is displayed, so the score is stored and printed explicitly
# instead of being computed and discarded mid-cell.
lr_train_accuracy = LR.score(X_train, y_train)
print(f"Logistic regression training accuracy: {lr_train_accuracy:.3f}")

# Label each prediction with the row index of the test document it belongs to.
# Taking the labels from X_test keeps them aligned with the predictions and
# works for any number of test rows, instead of assuming exactly 12 rows
# numbered 65-76.
doc_labels = list(X_test.index)
results = pd.DataFrame(
    {"Sklearn LR": lr_predictions, "Decision Tree": tree_predictions},
    index=doc_labels,
)

results