# ADS 509 Final Project: Topic Modeling

In [2]:
# These libraries may be useful to you

#!pip3 install pyLDAvis==3.4.1 --user  #You need to restart the Kernel after installation.
# You also need a Python version => 3.9.0
from nltk.corpus import brown

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import pyLDAvis
import pyLDAvis.lda_model
import pyLDAvis.gensim_models

import spacy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation

from spacy.lang.en.stop_words import STOP_WORDS as stopwords

from collections import Counter, defaultdict

#!python3 -m spacy download en_core_web_sm

nlp = spacy.load('en_core_web_sm')

In [4]:
# add any additional libaries you need here
np.int = np.int_
np.float = np.float_
import re
import os
import nltk
import zipfile
import shutil
import html
import matplotlib
import matplotlib.pyplot as plt

#first, download the stopwords.zip file from https://www.nltk.org/nltk_data/ and then extract it
stop_source_dir = '/Users/calebmccurdy/downloads/stopwords'
stop_dest_dir = '/Users/calebmccurdy/nltk_data/corpora/stopwords'

if os.path.exists(stop_dest_dir):
    shutil.rmtree(stop_dest_dir)
shutil.move(stop_source_dir, stop_dest_dir)

#first, download the punkt.zip file from https://www.nltk.org/nltk_data/ and then extract it
punkt_source_dir = '/Users/calebmccurdy/downloads/punkt'
punkt_dest_dir = '/Users/calebmccurdy/nltk_data/corpora/punkt'

if os.path.exists(punkt_dest_dir):
    shutil.rmtree(punkt_dest_dir)
shutil.move(punkt_source_dir, punkt_dest_dir)

'/Users/calebmccurdy/nltk_data/corpora/punkt'

In [6]:
from nltk.corpus import stopwords
from string import punctuation
sw = stopwords.words("english")

In [7]:
punctuation = set(punctuation) # speeds up comparison
extra_sw = ['cnn', 'fox', 'news', 'said', 'told', 'would', 'read', 'get', 'could', 'also',
            'think', 'time', 'even', 'former', 'ask', 'asked', 'â']
sw.extend(extra_sw)

In [8]:
# This function comes from the BTAP repo.

def display_topics(model, features, no_top_words=5):
    for topic, words in enumerate(model.components_):
        total = words.sum()
        largest = words.argsort()[::-1] # invert sort order
        print("\nTopic %02d" % topic)
        for i in range(0, no_top_words):
            print("  %s (%2.2f)" % (features[largest[i]], abs(words[largest[i]]*100.0/total)))

## TF-IDF and Count Vectorization

In [10]:
df = pd.read_csv('/Users/calebmccurdy/Desktop/USD/ADS 509/Project/Politics_Project.csv')
df

Unnamed: 0,Organization,Article,Tokens
0,FOX,haley-blames-trump-gop-loss-key-special-electi...,"['haley', 'blames', 'trump', 'gop', 'loss', 'k..."
1,FOX,biden-says-his-memory-fine-he-most-qualified-p...,"['biden', 'says', 'memory', 'fine', 'qualified..."
2,FOX,nikki-haley-calls-robert-hur-report-unbelievab...,"['nikki', 'haley', 'calls', 'robert', 'hur', '..."
3,FOX,dnc-alleges-rfk-jr-campaign-violating-election...,"['dnc', 'alleges', 'rfk', 'jr', 'campaign', 'v..."
4,FOX,border-impeachment-courts-gridlock-reigns-wash...,"['border', 'impeachment', 'courts', 'gridlock'..."
...,...,...,...
126,CNN,_2024_02_13new-york-special-election-what-to-w...,"['watch', 'new', 'yorkâ', 's', 'highstakes', '..."
127,CNN,_2024_02_08biden-special-counsel-beau-death_index,"['biden', 'slams', 'special', 'counsel', 'hur'..."
128,CNN,_2024_02_12cq-brown-nato-trump_index,"['joint', 'chiefs', 'chairman', 'says', 'â', '..."
129,CNN,_2024_02_14new-york-special-election-takeaways...,"['takeaways', 'new', 'yorkâ', 's', 'highstakes..."


In [25]:
# Split the DataFrame by Organization
cnn_df = df[df['Organization'] == 'CNN']
fox_df = df[df['Organization'] == 'FOX']

In [26]:
cnn_df

Unnamed: 0,Organization,Article,Tokens
38,CNN,_2024_02_08donald-trump-supreme-court-2024-ana...,"['supreme', 'court', 'faces', 'greatest', 'tes..."
39,CNN,_2024_02_08americans-detained-idf-gaza_index,"['two', 'americans', 'detained', 'idf', 'gaza'..."
40,CNN,_2024_02_13biden-trump-nato-comments_index,"['biden', 'accuses', 'trump', 'bowing', 'putin..."
41,CNN,_2024_02_14house-intel-chairman-serious-nation...,"['house', 'intel', 'chairman', 'announces', 'â..."
42,CNN,_2024_02_09kamala-harris-responds-special-coun...,"['white', 'house', 'wonâ', 't', 'rule', 'relea..."
...,...,...,...
126,CNN,_2024_02_13new-york-special-election-what-to-w...,"['watch', 'new', 'yorkâ', 's', 'highstakes', '..."
127,CNN,_2024_02_08biden-special-counsel-beau-death_index,"['biden', 'slams', 'special', 'counsel', 'hur'..."
128,CNN,_2024_02_12cq-brown-nato-trump_index,"['joint', 'chiefs', 'chairman', 'says', 'â', '..."
129,CNN,_2024_02_14new-york-special-election-takeaways...,"['takeaways', 'new', 'yorkâ', 's', 'highstakes..."


In [27]:
fox_df

Unnamed: 0,Organization,Article,Tokens
0,FOX,haley-blames-trump-gop-loss-key-special-electi...,"['haley', 'blames', 'trump', 'gop', 'loss', 'k..."
1,FOX,biden-says-his-memory-fine-he-most-qualified-p...,"['biden', 'says', 'memory', 'fine', 'qualified..."
2,FOX,nikki-haley-calls-robert-hur-report-unbelievab...,"['nikki', 'haley', 'calls', 'robert', 'hur', '..."
3,FOX,dnc-alleges-rfk-jr-campaign-violating-election...,"['dnc', 'alleges', 'rfk', 'jr', 'campaign', 'v..."
4,FOX,border-impeachment-courts-gridlock-reigns-wash...,"['border', 'impeachment', 'courts', 'gridlock'..."
5,FOX,fox-news-politics-age-old-question,"['politics', 'dems', 'ageold', 'question', 'ma..."
6,FOX,gop-rep-tenney-calls-invoke-25th-amendment-rem...,"['gop', 'rep', 'tenney', 'calls', 'invoke', '2..."
7,FOX,.DS_Store,"['bud1', 'e', 'dsdb']"
8,FOX,fox-news-politics-feds-decline-charge-elderly-...,"['politics', 'feds', 'decline', 'charge', 'eld..."
9,FOX,trumps-nato-comments-trigger-fierce-media-euro...,"['trumpâ', 's', 'nato', 'comments', 'trigger',..."


In [51]:
count_text_vectorizer = CountVectorizer(stop_words=list(sw), min_df=2, max_df=0.9)
count_text_vectors = count_text_vectorizer.fit_transform(df["Tokens"])
count_text_vectors.shape

(131, 5186)

In [52]:
cnn_count_text_vectorizer = CountVectorizer(stop_words=list(sw), min_df=2, max_df=0.9)
cnn_count_text_vectors = cnn_count_text_vectorizer.fit_transform(cnn_df["Tokens"])
cnn_count_text_vectors.shape

(93, 4208)

In [53]:
fox_count_text_vectorizer = CountVectorizer(stop_words=list(sw), min_df=2, max_df=0.9)
fox_count_text_vectors = fox_count_text_vectorizer.fit_transform(fox_df["Tokens"])
fox_count_text_vectors.shape

(38, 1953)

In [54]:
tfidf_text_vectorizer = TfidfVectorizer(stop_words=list(sw), min_df=2, max_df=0.9)
tfidf_text_vectors = tfidf_text_vectorizer.fit_transform(df['Tokens'])
tfidf_text_vectors.shape

(131, 5186)

In [65]:
cnn_tfidf_text_vectorizer = TfidfVectorizer(stop_words=list(sw), min_df=2, max_df=0.9)
cnn_tfidf_text_vectors = cnn_tfidf_text_vectorizer.fit_transform(cnn_df['Tokens'])
cnn_tfidf_text_vectors.shape

(93, 4208)

In [69]:
fox_tfidf_text_vectorizer = TfidfVectorizer(stop_words=list(sw), min_df=2, max_df=0.9)
fox_tfidf_text_vectors = tfidf_text_vectorizer.fit_transform(fox_df['Tokens'])
fox_tfidf_text_vectors.shape

(38, 1953)

## Fitting a Non-Negative Matrix Factorization Model

In [57]:
cnn_nmf_model = NMF(n_components=5, random_state=314)
cnn_W_nmf_matrix = cnn_nmf_model.fit_transform(cnn_tfidf_text_vectors)
cnn_H_nmf_matrix = cnn_nmf_model.components_

In [70]:
fox_nmf_model = NMF(n_components=5, random_state=315)
fox_W_nmf_matrix = fox_nmf_model.fit_transform(fox_tfidf_text_vectors)
fox_H_nmf_matrix = fox_nmf_model.components_

In [59]:
# assertion statements to ensure the document-topic and topic-feature matrices have the intended shapes
assert cnn_W_nmf_matrix.shape == (93, 5), f"Expected shape (93, 5), but got {cnn_W_nmf_matrix.shape}"
assert cnn_H_nmf_matrix.shape == (5, 4208), f"Expected shape (5, 4208), but got {cnn_H_nmf_matrix.shape}"
assert fox_W_nmf_matrix.shape == (38, 5), f"Expected shape (38, 5), but got {fox_W_nmf_matrix.shape}"
assert fox_H_nmf_matrix.shape == (5, 1953), f"Expected shape (5, 1953), but got {fox_H_nmf_matrix.shape}"

In [60]:
display_topics(cnn_nmf_model, cnn_tfidf_text_vectorizer.get_feature_names_out())


Topic 00
  senate (1.15)
  ukraine (1.13)
  aid (1.11)
  border (1.04)
  bill (0.97)

Topic 01
  biden (1.48)
  classified (1.43)
  report (1.41)
  hur (1.14)
  bidenâ (1.12)

Topic 02
  court (1.83)
  trump (1.20)
  supreme (1.05)
  case (0.92)
  trumpâ (0.80)

Topic 03
  suozzi (2.11)
  pilip (1.62)
  santos (1.08)
  york (0.94)
  election (0.89)

Topic 04
  trump (1.49)
  haley (0.88)
  biden (0.65)
  nato (0.59)
  voters (0.51)


In [68]:
display_topics(fox_nmf_model, fox_tfidf_text_vectorizer.get_feature_names_out())

NotFittedError: Vocabulary not fitted or provided

## Fitting an LSA Model

In [62]:
cnn_svd_model = TruncatedSVD(n_components=5, random_state=320)
cnn_W_svd_matrix = cnn_svd_model.fit_transform(cnn_tfidf_text_vectors)
cnn_H_svd_matrix = cnn_svd_model.components_

In [63]:
fox_svd_model = TruncatedSVD(n_components=5, random_state=321)
fox_W_svd_matrix = fox_svd_model.fit_transform(fox_tfidf_text_vectors)
fox_H_svd_matrix = fox_svd_model.components_

In [64]:
assert cnn_W_svd_matrix.shape == (93, 5), f"Expected shape (93, 5), but got {cnn_W_svd_matrix.shape}"
assert cnn_H_svd_matrix.shape == (5, 4208), f"Expected shape (5, 4208), but got {cnn_H_svd_matrix.shape}"
assert fox_W_svd_matrix.shape == (38, 5), f"Expected shape (38, 5), but got {fox_W_svd_matrix.shape}"
assert fox_H_svd_matrix.shape == (5, 1953), f"Expected shape (5, 1953), but got {fox_H_svd_matrix.shape}"

In [49]:
display_topics(cnn_svd_model, cnn_tfidf_text_vectorizer.get_feature_names_out())


Topic 00
  trump (1.08)
  election (0.52)
  court (0.50)
  trumpâ (0.46)
  bidenâ (0.45)

Topic 01
  senate (38.71)
  aid (31.84)
  ukraine (31.34)
  border (31.02)
  bill (28.53)

Topic 02
  report (8.08)
  classified (7.94)
  hur (6.49)
  bidenâ (6.25)
  documents (6.08)

Topic 03
  suozzi (90.83)
  pilip (69.21)
  democrats (46.95)
  santos (45.48)
  voters (40.81)

Topic 04
  nato (65.81)
  trump (45.53)
  haley (40.57)
  comments (20.68)
  carolina (20.26)


In [50]:
display_topics(fox_svd_model, fox_tfidf_text_vectorizer.get_feature_names_out())

NotFittedError: Vocabulary not fitted or provided

## Fitting an LDA Model

Finally, fit a five-topic LDA model using the count vectors (`count_text_vectors` from above).

In [71]:
cnn_lda_model = LatentDirichletAllocation(n_components=5, random_state=40)
cnn_W_lda_matrix = cnn_lda_model.fit_transform(cnn_count_text_vectors)
cnn_H_lda_matrix = cnn_lda_model.components_

In [72]:
fox_lda_model = LatentDirichletAllocation(n_components=5, random_state=41)
fox_W_lda_matrix = fox_lda_model.fit_transform(fox_count_text_vectors)
fox_H_lda_matrix = fox_lda_model.components_

In [73]:
assert cnn_W_lda_matrix.shape == (93, 5), f"Expected shape (93, 5), but got {cnn_W_lda_matrix.shape}"
assert cnn_H_lda_matrix.shape == (5, 4208), f"Expected shape (5, 4208), but got {cnn_H_lda_matrix.shape}"
assert fox_W_lda_matrix.shape == (38, 5), f"Expected shape (38, 5), but got {fox_W_lda_matrix.shape}"
assert fox_H_lda_matrix.shape == (5, 1953), f"Expected shape (5, 1953), but got {fox_H_lda_matrix.shape}"

In [74]:
display_topics(cnn_lda_model, cnn_count_text_vectorizer.get_feature_names_out())


Topic 00
  house (1.43)
  republicans (1.03)
  republican (0.72)
  president (0.69)
  border (0.68)

Topic 01
  trump (2.57)
  senate (1.07)
  biden (1.05)
  republican (0.89)
  ukraine (0.78)

Topic 02
  biden (2.35)
  president (1.25)
  bidenâ (1.14)
  report (0.97)
  classified (0.78)

Topic 03
  trump (1.98)
  campaign (1.03)
  kennedy (0.98)
  dnc (0.67)
  american (0.57)

Topic 04
  court (2.07)
  trump (2.05)
  election (1.31)
  trumpâ (1.09)
  case (1.07)


In [75]:
display_topics(fox_lda_model, fox_count_text_vectorizer.get_feature_names_out())


Topic 00
  border (2.01)
  bill (1.43)
  senate (1.27)
  aid (1.17)
  ukraine (1.14)

Topic 01
  biden (5.32)
  bobulinski (3.69)
  hunter (3.33)
  joe (2.80)
  business (1.84)

Topic 02
  trump (3.08)
  president (1.65)
  nato (1.56)
  donald (1.00)
  media (0.98)

Topic 03
  trump (1.97)
  republican (1.38)
  senate (1.35)
  new (1.29)
  gop (0.93)

Topic 04
  biden (4.37)
  president (2.46)
  special (1.45)
  report (1.32)
  counsel (1.07)


In [76]:
cnn_lda_display = pyLDAvis.lda_model.prepare(cnn_lda_model, cnn_count_text_vectors, cnn_count_text_vectorizer, sort_topics=False)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow st

In [77]:
fox_lda_display = pyLDAvis.lda_model.prepare(fox_lda_model, fox_count_text_vectors, fox_count_text_vectorizer, sort_topics=False)

In [78]:
pyLDAvis.display(cnn_lda_display)

In [79]:
pyLDAvis.display(fox_lda_display)

Conclusions from visualizations:
