In [1]:
# Import the dependencies 
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# Show up to 200 characters per column so long headlines are not truncated.
# Use the fully qualified option name: abbreviated keys are resolved by pattern
# matching and can become ambiguous if pandas adds similarly named options.
pd.set_option('display.max_colwidth', 200)

In [2]:
# Load the news_articles.csv into a DataFrame.
news_articles_df = pd.read_csv('Resources/news_articles.csv')
# Display the first 10 headlines.
news_articles_df.head(10)

Unnamed: 0,headline
0,Is 22 Too Young To Marry A 36-Year-Old? 'The Bachelor' Investigates
1,The Only Shopping Guide For Cyber Monday You Need
2,Taylor Swift Dances When No One Can See Her In New 'Delicate' Video
3,How To Say 'Cheers' In 20 Languages (AUDIO)
4,'Welcome To Hell': Rio Police Warn They Can't Promise Olympic Protection
5,Conservative Pundit Points Out Where Real Blame For GOP’s ‘Descent Into Madness’ Lies
6,We Asked The American Public To Settle 5 Of The Internet's Dumbest Debates
7,'Teen Mom OG's' Catelynn Lowell Heads To Treatment Over Suicidal Thoughts
8,The Major Problem With Electric Cars | TIME.com
9,Why Is Nobel-Winning Economist Richard Thaler So Jovial?


## Preprocess the Text

In [3]:
# Keep only ASCII letters and whitespace in every headline.
# Removed characters are deleted outright (not replaced by a space), so
# hyphenated or dotted tokens collapse into one word, e.g. "36-Year-Old" -> "YearOld".
# Each value is passed through str() first so non-string entries are handled too.
news_articles_df['headline'] = news_articles_df['headline'].map(
    lambda headline: re.sub(r'[^a-zA-Z\s ]', '', str(headline))
)
news_articles_df.head(10)

Unnamed: 0,headline
0,Is Too Young To Marry A YearOld The Bachelor Investigates
1,The Only Shopping Guide For Cyber Monday You Need
2,Taylor Swift Dances When No One Can See Her In New Delicate Video
3,How To Say Cheers In Languages AUDIO
4,Welcome To Hell Rio Police Warn They Cant Promise Olympic Protection
5,Conservative Pundit Points Out Where Real Blame For GOPs Descent Into Madness Lies
6,We Asked The American Public To Settle Of The Internets Dumbest Debates
7,Teen Mom OGs Catelynn Lowell Heads To Treatment Over Suicidal Thoughts
8,The Major Problem With Electric Cars TIMEcom
9,Why Is NobelWinning Economist Richard Thaler So Jovial


## Create a TF-IDF matrix from our documents.

In [4]:
# Build the TF-IDF vectorizer:
#   - stop_words='english' ignores common English stop words,
#   - min_df=10 drops words that appear in fewer than 10 headlines,
#   - max_df=0.95 drops words that appear in more than 95% of headlines.
tfidf = TfidfVectorizer(
    stop_words='english',
    min_df=10,
    max_df=0.95,
)
tfidf

In [5]:
# Fit the vectorizer on the headlines and transform them into a document-term matrix (DTM).
dtm = tfidf.fit_transform(news_articles_df["headline"])
# Print the shape of the DTM: (number of documents, number of vocabulary words).
print(dtm.shape)

(23377, 3149)


In [6]:
# Print the sparse matrix of the transformed data (only non-zero entries are listed).
# Each printed line has the form "(document, word) score":
#   - the first number in the tuple is the document (row) number; there are 23,377 documents,
#   - the second number in the tuple is the index of the word in the vocabulary created by fit_transform,
#   - the value after the tuple is the TF-IDF score of that word in that document.
print(dtm)

  (0, 183)	0.6345657383532866
  (0, 3131)	0.5447013164784799
  (0, 3138)	0.5482944460185232
  (1, 1881)	0.35546248241584566
  (1, 1831)	0.4901265408973161
  (1, 677)	0.46003283951735585
  (1, 1220)	0.4113174150555592
  (1, 2510)	0.5026033904983209
  (2, 2981)	0.4263827647977379
  (2, 1892)	0.3375155487496725
  (2, 2725)	0.6290303789178769
  (2, 2749)	0.5555193737702372
  (3, 168)	0.8128170960317599
  (3, 2414)	0.5825189854403862
  (4, 1949)	0.3359899169063007
  (4, 2158)	0.4724418120156065
  (4, 2081)	0.38124866781045463
  (4, 2327)	0.3818919620518235
  (4, 1281)	0.4127191813776179
  (4, 3060)	0.4498670580656673
  (5, 1618)	0.35810060060128407
  (5, 1681)	0.3894063836492395
  (5, 1182)	0.4127062165437718
  (5, 277)	0.4026181555145373
  (5, 2228)	0.30472503847033977
  :	:
  (23371, 1702)	0.4830356653587415
  (23371, 2781)	0.4443808362877062
  (23371, 2277)	0.3677085237137301
  (23371, 2963)	0.4044727605382417
  (23371, 255)	0.3064420691643532
  (23372, 2630)	0.452129735249142
  (23372, 

In [7]:
# Get the feature names (vocabulary words) from the TfidfVectorizer.
feature_names = tfidf.get_feature_names_out()

# Densify only the first row of the sparse DTM. Calling dtm.toarray() first
# would materialize the whole documents x vocabulary matrix just to read one row.
# The result is the full first row (zeros included) as a 1-D array.
non_zero_elements = dtm[0].toarray().ravel()

# Get the vocabulary indices of the non-zero entries in that row.
non_zero_indices = non_zero_elements.nonzero()[0]

# Print each word present in the first headline with its TF-IDF score
# (a weight, not a raw occurrence count).
for idx in non_zero_indices:
    print(f"Word: {feature_names[idx]} | Word index {idx} | Value = {non_zero_elements[idx]}")

Word: bachelor | Word index 183 | Value = 0.6345657383532866
Word: yearold | Word index 3131 | Value = 0.5447013164784799
Word: young | Word index 3138 | Value = 0.5482944460185232


## Applying NMF

In [8]:
# Initialize the NMF model with 7 components (topics); random_state=42 fixes the seed
# so the factorization is repeatable between runs.
nmf_model = NMF(n_components=7,random_state=42)
# Fit the model on the TF-IDF document-term matrix.
nmf_model.fit(dtm)

In [9]:
# Each row of components_ is one topic's weight vector; print its length.
# Every length should equal the vocabulary size.
for topic_weights in nmf_model.components_:
    print(len(topic_weights))

3149
3149
3149
3149
3149
3149
3149


In [10]:
# Get the weight array of the first topic (one entry per vocabulary word).
first_topic = nmf_model.components_[0]
# Each value is the weight of a word in this topic. Lower values have less impact than higher values.
print(first_topic)

[0.0001327  0.00022051 0.         ... 0.00101038 0.         0.        ]


In [11]:
# Indices of the ten highest-weighted words in the first topic, strongest first:
# sort ascending, flip to descending, then keep the leading ten.
top_word_indices = np.argsort(first_topic)[::-1][:10]
print(top_word_indices)

[ 247 3114   94 1079   97 1336 1960 3115 2295  210]


In [12]:
# Look up the vocabulary once instead of rebuilding it on every loop iteration.
feature_names = tfidf.get_feature_names_out()
# Print the top ten words of the first topic, strongest first.
for index in top_word_indices:
    print(feature_names[index])

best
world
america
food
americas
hotels
order
worlds
restaurants
bars


In [13]:
# Print the top 30 words for each topic.
# The words are listed in ascending order of weight, so the strongest word is last.
n_top_words = 30
# Look up the vocabulary once; rebuilding it for every word of every topic is wasted work.
feature_names = tfidf.get_feature_names_out()
for index,topic in enumerate(nmf_model.components_):
    print(f'The top {n_top_words} words for topic #{index+1}')
    print([feature_names[i] for i in topic.argsort()[-n_top_words:]])
    print('\n')

The top 30 words for topic #1
['way', 'things', 'hotel', 'friday', 'cheese', 'huffpost', 'foods', 'ways', 'black', 'test', 'eat', 'deathmatch', 'according', 'beaches', 'deals', 'taste', 'time', 'worst', 'places', 'cities', 'bars', 'restaurants', 'worlds', 'order', 'hotels', 'americas', 'food', 'america', 'world', 'best']


The top 30 words for topic #2
['trailer', 'tech', 'rumors', 'time', 'ad', 'shows', 'love', 'heres', 'netflix', 'prime', 'year', 'game', 'hulu', 'orleans', 'like', 'amazon', 'world', 'just', 'youtube', 'videos', 'years', 'iphone', 'video', 'thats', 'city', 'apple', 'week', 'watch', 'york', 'new']


The top 30 words for topic #3
['breakfast', 'chocolate', 'fall', 'taste', 'coffee', 'italian', 'instagram', 'dinner', 'time', 'easy', 'ice', 'worst', 'summer', 'things', 'ways', 'cream', 'need', 'favorite', 'order', 'better', 'eat', 'cook', 'cheese', 'love', 'dessert', 'like', 'food', 'make', 'recipes', 'photos']


The top 30 words for topic #4
['giuliani', 'wont', 'fox', '

### Taking our best guess at the topics.
---
- TOPIC 1: **Entertainment**
- TOPIC 2: **Technology**
- TOPIC 3: **Food and Drink**
- TOPIC 4: **Politics**
- TOPIC 5: **Business**
- TOPIC 6: **Sports**
- TOPIC 7: **Travel**

## Assigning the Topic to the Headline

In [14]:
# Project the DTM onto the fitted topics. The result has shape
# (number_of_documents, number_of_topics): one row per headline, one weight per topic.
topic_results = nmf_model.transform(dtm)

# Get the shape of the topic results.
topic_results.shape

(23377, 7)

In [15]:
# Order the topic indices of the first headline from highest to lowest weight
# (negating the row makes argsort return descending order).
sorted_indices = np.argsort(-topic_results[0])
# Print the ranking of topics for the headline.
# NMF transform values are non-negative topic weights, not probabilities
# (they are not normalized to sum to 1), so they are labelled as weights.
print("Ranking of topics for the first headline:")
for rank, topic_index in enumerate(sorted_indices):
    print(f"   Rank {rank+1}: Topic {topic_index+1}, Weight: {topic_results[0, topic_index]:.6f}")

Ranking of topics for the first headline:
   Rank 1: Topic 7, Probability: 0.002277
   Rank 2: Topic 4, Probability: 0.001495
   Rank 3: Topic 2, Probability: 0.001041
   Rank 4: Topic 5, Probability: 0.000894
   Rank 5: Topic 3, Probability: 0.000472
   Rank 6: Topic 1, Probability: 0.000284
   Rank 7: Topic 6, Probability: 0.000208


In [16]:
# Reload the untouched headlines (the earlier DataFrame had its text stripped of
# punctuation and digits) and attach each headline's strongest topic.
# argmax yields a 0-based topic index, so add 1 to match the 1-based topic numbers.
news_articles_df_2 = pd.read_csv('Resources/news_articles.csv').assign(
    topic=topic_results.argmax(axis=1) + 1
)

In [17]:
# Show the first 10 headlines together with their assigned topic number.
news_articles_df_2.head(10)

Unnamed: 0,headline,topic
0,Is 22 Too Young To Marry A 36-Year-Old? 'The Bachelor' Investigates,7
1,The Only Shopping Guide For Cyber Monday You Need,7
2,Taylor Swift Dances When No One Can See Her In New 'Delicate' Video,2
3,How To Say 'Cheers' In 20 Languages (AUDIO),4
4,'Welcome To Hell': Rio Police Warn They Can't Promise Olympic Protection,4
5,Conservative Pundit Points Out Where Real Blame For GOP’s ‘Descent Into Madness’ Lies,4
6,We Asked The American Public To Settle 5 Of The Internet's Dumbest Debates,4
7,'Teen Mom OG's' Catelynn Lowell Heads To Treatment Over Suicidal Thoughts,4
8,The Major Problem With Electric Cars | TIME.com,4
9,Why Is Nobel-Winning Economist Richard Thaler So Jovial?,4


In [18]:
# Show the last 10 headlines together with their assigned topic number.
news_articles_df_2.tail(10)

Unnamed: 0,headline,topic
23367,"These Are 33 Of The Best, Most Iconic American Foods",1
23368,Does Your Marketing Plan Need an Exit Strategy?,7
23369,"Summer Fancy Food Show, Part I",3
23370,7 Reasons to Include Galapagos Islands on Your Bucket List,7
23371,"Biden To Republicans Threatening To Challenge Vaccine, Testing Mandates: ‘Have At It’",4
23372,Biden's Health Agenda Starts With Reversing Everything Trump Did In The Last 4 Years,4
23373,You Know Where You Are From the Very First Bite,7
23374,"9 Cheeses We Would Happily Marry, If That Was Allowed",6
23375,Donald Trump Has A Surprising Response To Golfer Rory McIlroy's Criticism,4
23376,Fast Food Strikes Hit Cities Throughout The Country,3
