In [None]:
# Text Mining - Federalist Papers
# Part 2
# We will combine all text pre-processing into a single cell and proceed from there
# Word cloud, document similarity and clustering

In [None]:
# import key libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the Federalist Papers dataset.
# The CSV is read relative to the current working directory. If the file
# lives in the Documents folder instead, uncomment the os.chdir line below
# before running this cell.
import os
# os.chdir("Documents")

# read federalist.csv into a DataFrame and display it
papers = pd.read_csv(filepath_or_buffer="federalist.csv")
papers

In [None]:
# Text pre-processing, combined into a single cell so it can be re-run as a unit.
# Pipeline: filter authors -> drop the shared salutation -> strip punctuation
# -> lowercase -> remove English stop words -> stem -> remove custom stop words.
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# filter to papers written by Hamilton, Madison, and Unknown.
# .copy() makes the result an independent frame, so the column assignments
# below do not write into a slice of the original (no SettingWithCopyWarning).
papers = papers[papers["Author"].isin(["HAMILTON", "MADISON", "UNKNOWN"])].copy()

# remove the common first sentence from all documents.
# regex=False forces a literal match regardless of the pandas version default.
papers["Text"] = papers["Text"].str.replace(
    "To the People of the State of New York:", "", regex=False
)

# remove punctuation: drop every character that is neither a word character
# nor whitespace (raw string so \w and \s are not treated as string escapes)
papers["Text"] = papers["Text"].str.replace(r"[^\w\s]", "", regex=True)

# convert all words to lowercase
papers["Text"] = papers["Text"].str.lower()

# removal of stop words
nltk.download("stopwords")
stop = stopwords.words("english")
# a set gives constant-time membership tests; `stop` itself stays a list
stop_lookup = set(stop)
papers["Text"] = papers["Text"].apply(
    lambda text: " ".join(word for word in text.split() if word not in stop_lookup)
)

# stemming
st = PorterStemmer()
papers["Text"] = papers["Text"].apply(
    lambda text: " ".join(st.stem(word) for word in text.split())
)

# further remove custom stopwords, which are problem specific.
# This pass runs after stemming and checks against the full list
# (English stop words plus the custom additions).
stop += ["would", "may", "must", "one", "upon", "might", "shall", "could"]
stop_lookup = set(stop)
papers["Text"] = papers["Text"].apply(
    lambda text: " ".join(word for word in text.split() if word not in stop_lookup)
)

papers["Text"]


In [None]:
# remember to install wordcloud first
# %pip install wordcloud

from wordcloud import WordCloud

# Join every pre-processed document into one whitespace-separated string.
# WordCloud.generate expects plain text; wrapping a token list in str() would
# feed it the list's repr (brackets, quotes and commas) instead of the words.
comment_words = ' '.join(papers["Text"])

# word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(comment_words)

# if you get font error, can try the following
# wordcloud = WordCloud(font_path='/path/to/your/font.ttf', width=800, height=400,
# background_color='white').generate(comment_words)

# render the cloud as an image without axes
plt.figure(figsize=(8,8))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
# Further text mining with terms and documents.
# Tokenize each document in the Text column on whitespace, producing one
# list of tokens per document.
corpus = []
for doc in papers["Text"]:
    corpus.append(doc.split())

In [None]:
# preview the token lists of the first ten documents
corpus[:10]

In [None]:
import gensim
from gensim import corpora, models

In [None]:
# Build the term dictionary (token <-> integer id mapping) from the tokenized corpus
dictionary = corpora.Dictionary(documents=corpus)

In [None]:
# Prune the dictionary in place: keep only tokens that occur in at least
# 2 documents and in no more than 75% of all documents.
dictionary.filter_extremes(no_above=0.75, no_below=2)

In [None]:
# Convert the corpus into a Document Term Matrix: each document becomes a
# bag-of-words list of (token_id, count) pairs based on the pruned dictionary
DFM = list(map(dictionary.doc2bow, corpus))

In [None]:
# Preview the bag-of-words vectors of the first few documents only; printing
# the whole DFM floods the notebook output with every (token_id, count) pair.
print(DFM[:3])

In [None]:
# number of documents (DFM holds one bag-of-words vector per document)
len(DFM)

In [None]:
# number of terms remaining in the dictionary after pruning
len(dictionary)

In [None]:
# Compute pairwise similarity between documents.
from gensim.similarities import MatrixSimilarity

# index the bag-of-words vectors, then query the index with the same
# documents to obtain the full document-by-document similarity matrix
simil = MatrixSimilarity(DFM, num_features=len(dictionary))
sim_matrix = simil[DFM]

# Distance is 1-similarity
distance = 1 - sim_matrix

# convert simil to a data frame and show the similarities for document 2
text_sim = pd.DataFrame(sim_matrix)
text_sim[2]

In [None]:
text_sim[0:5]

In [None]:
# hierarchical clustering of the documents (single linkage)
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform
import matplotlib.pyplot as plt

# `distance` is a square document-by-document matrix, but the SciPy linkage
# functions expect a *condensed* distance vector. Given a square 2-D array
# they would treat every row as an observation vector and cluster on the
# distances between rows instead of on the distances themselves.
dist_square = np.asarray(distance, dtype=float)
# clean up floating point noise: enforce symmetry, no negative distances,
# and an exactly-zero diagonal before condensing
dist_square = np.clip((dist_square + dist_square.T) / 2, 0, None)
np.fill_diagonal(dist_square, 0)
condensed_distance = squareform(dist_square, checks=False)

Z = hierarchy.single(condensed_distance)

plt.figure(figsize=(20,10))

# label each leaf with the document's index in `papers`; a plain list is
# passed rather than a pandas Index, and the font size is numeric
dn = hierarchy.dendrogram(Z, orientation='right', leaf_font_size=11,
                          labels=papers.index.tolist())


In [None]:
# Creating TF-IDF features for K-means clustering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Vectorize the documents using TF-IDF: ignore terms found in fewer than
# 2 documents or in more than half of them, and apply sklearn's built-in
# English stop word list on top of the earlier pre-processing
tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=2, max_df=0.5)
tfidf_matrix = tfidf_vectorizer.fit_transform(papers["Text"])

# Dense DataFrame view of the TF-IDF matrix, one column per term (optional,
# only for inspection)
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(data=tfidf_matrix.toarray(), columns=feature_names)

# Show the first few rows of the TF-IDF DataFrame
print(tfidf_df.head())


In [None]:
# K-means clustering of the TF-IDF document vectors
num_clusters = 5  # number of clusters to partition the documents into
kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(tfidf_matrix)

# Store each document's cluster label alongside the document
papers['Cluster'] = kmeans.labels_

# Show the first few documents together with their assigned cluster
print(papers[['ID', 'Author', 'Cluster']].head())

# Top terms per cluster: for every centroid, order the term indices from
# highest to lowest weight and report the ten heaviest terms
print("Top terms per cluster:")

order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = tfidf_vectorizer.get_feature_names_out()

for i, centroid_order in enumerate(order_centroids):
    top_ten_words = [terms[ind] for ind in centroid_order[:10]]
    print(f"Cluster {i}: {' '.join(top_ten_words)}")

In [None]:
# Word similarity - Word2Vec, a word embedding model used to identify similar words
# https://radimrehurek.com/gensim/models/word2vec.html
# window: maximum distance between the target word and the words around it, default=5
# min_count: words with a total frequency below this are ignored, default=5
# workers: number of worker threads used for training, default=3
# sg: the training algorithm, CBOW (0) or skip-gram (1), default=CBOW

from gensim.models import Word2Vec

# train the model on the tokenized corpus
model = Word2Vec(sentences=corpus, window=3, min_count=20, sg=0, workers=3)

# ten most similar words to 'state'
model.wv.most_similar("state", topn=10)