# Topic Modelling for News

![](https://images.unsplash.com/photo-1495020689067-958852a7765e?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&auto=format&fit=crop&w=1050&q=80)

Photo by [Roman Kraft](https://unsplash.com/photos/_Zua2hyvTBk)

This exercise is about modelling the main topics of a dataset of news headlines.

Begin by importing the needed libraries:

In [214]:
# TODO: import needed libraries
import numpy as np
import pandas as pd

import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk import pos_tag

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

from gensim.corpora import Dictionary
from gensim.matutils import corpus2csc
from gensim.models import TfidfModel, LsiModel, LdaModel

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from IPython.display import IFrame

[nltk_data] Downloading package punkt to /Users/Coope/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/Coope/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/Coope/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/Coope/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Load the data in the file `random_headlines.csv`

In [202]:
# Load the headlines dataset from the CSV file that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer="random_headlines.csv")

# Preview the first five rows as a quick sanity check of the columns.
df.head(n=5)

Unnamed: 0,publish_date,headline_text
0,20120305,ute driver hurt in intersection crash
1,20081128,6yo dies in cycling accident
2,20090325,bumper olive harvest expected
3,20100201,replica replaces northernmost sign
4,20080225,woods targets perfect season


It is always a good idea to perform some EDA (exploratory data analysis) on a dataset...

In [203]:
# Short EDA: print the DataFrame summary (row count, column names,
# non-null counts, dtypes and memory usage) to spot missing values
# or unexpected column types before preprocessing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   publish_date   20000 non-null  int64 
 1   headline_text  20000 non-null  object
dtypes: int64(1), object(1)
memory usage: 312.6+ KB


Now perform all the needed preprocessing on those headlines: case lowering, tokenization, punctuation removal, stopwords removal, stemming/lemmatization.

In [204]:
# Preprocess the headlines: tokenize, lower-case, drop punctuation and
# English stopwords, then stem the remaining tokens.
ColumnName = "headline_text"

# Build the lookup sets and the stemmer once. Recreating them for every
# token re-reads the stopword list for each word of each headline, which
# dominates the runtime of this cell.
punctuation_chars = set(string.punctuation)
english_stopwords = set(stopwords.words("english"))
stemmer = PorterStemmer()

# Tokens are lower-cased before filtering so that the stopword test is
# case-insensitive, matching the punctuation test and the stemmer (which
# lower-cases its input by default).
filtered_tokens = [
    [
        stemmer.stem(token)
        for token in map(str.lower, word_tokenize(sentence))
        if token not in punctuation_chars and token not in english_stopwords
    ]
    for sentence in df[ColumnName]
]

# One row per headline, holding its list of stemmed tokens.
filtered_Df = pd.DataFrame({"stemmed": filtered_tokens})

In [205]:
# Build the bag-of-words (BOW) representation with Gensim.

# Map every distinct stemmed token to an integer id.
dictionary = Dictionary(filtered_tokens)

# One sparse bag-of-words vector per headline.
bow_corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in filtered_tokens]

# Dense count table: the matrix from corpus2csc is transposed so that each
# row is a headline, and the columns are labelled with the dictionary tokens
# in id order.
vocabulary = [dictionary[token_id] for token_id in range(len(dictionary))]
term_doc_matrix = corpus2csc(bow_corpus)
bow_df = pd.DataFrame(term_doc_matrix.toarray().T, columns=vocabulary)

In [206]:
# Compute TF-IDF.
# Fit the TF-IDF weighting on the bag-of-words corpus.
tfidf_model = TfidfModel(corpus=bow_corpus)

# Apply the fitted model to the same corpus to get the re-weighted vectors.
corpus_tfidf = tfidf_model[bow_corpus]

In [207]:
# Compute LSA (called LSI in Gensim) on the TF-IDF weighted corpus.
num_topics = 4
lsa_model = LsiModel(
    corpus=corpus_tfidf,
    num_topics=num_topics,
    id2word=dictionary,
)
corpus_lsa = lsa_model[corpus_tfidf]

# Print the most significant words of each topic.
num_words = 4
topics = lsa_model.print_topics(num_topics=num_topics, num_words=num_words)

for topic_id, description in topics:
    print(f"Topic {topic_id}: {description}")

  sparsetools.csc_matvecs(
  out = (1 - tri(m.shape[0], m.shape[1], k - 1, m.dtype.char)) * m
  out = (1 - tri(m.shape[0], m.shape[1], k - 1, m.dtype.char)) * m
  out = (1 - tri(m.shape[0], m.shape[1], k - 1, m.dtype.char)) * m


Topic 0: 0.458*"man" + 0.390*"polic" + 0.316*"charg" + 0.148*"court"
Topic 1: -0.435*"second" + -0.410*"90" + -0.339*"abc" + -0.301*"news"
Topic 2: -0.379*"man" + -0.272*"charg" + -0.262*"second" + -0.252*"90"
Topic 3: -0.773*"polic" + 0.232*"man" + 0.222*"charg" + -0.151*"probe"


In [208]:
# Compute LDA on the raw bag-of-words counts.
# random_state is fixed so that repeated runs give the same topics.
lda_model = LdaModel(
    corpus=bow_corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=10,
    random_state=42,
)
corpus_lda = lda_model[bow_corpus]

# Print the most significant words of each topic.
# num_topics and num_words are passed explicitly, as in the LSA cell:
# print_topics otherwise caps its output at its own default number of
# topics, and a literal word count would drift from `num_words` once the
# settings are tuned.
topics = lda_model.print_topics(num_topics=num_topics, num_words=num_words)
for topic_num, topic in topics:
    print(f"Topic {topic_num}: {topic}")

Topic 0: 0.023*"polic" + 0.017*"man" + 0.012*"charg" + 0.010*"court"
Topic 1: 0.009*"call" + 0.008*"interview" + 0.008*"govt" + 0.008*"fund"
Topic 2: 0.010*"plan" + 0.009*"council" + 0.007*"mine" + 0.006*"chang"
Topic 3: 0.011*"us" + 0.007*"open" + 0.006*"win" + 0.006*"new"


In [211]:
# Build the pyLDAvis data from the fitted LDA model, the bag-of-words
# corpus and the Gensim dictionary.
lda_vis_data = gensimvis.prepare(
    topic_model=lda_model,
    corpus=bow_corpus,
    dictionary=dictionary,
)

# Write the interactive visualization to an HTML file so it can be
# embedded in the notebook or opened in a browser.
pyLDAvis.save_html(lda_vis_data, 'lda_visualization.html')


In [215]:
# Embed the previously saved visualization in the notebook output.
# This must stay the last expression of the cell so Jupyter renders it.
IFrame(src='lda_visualization.html', width=800, height=600)


Depending on your results, you can try to fine-tune the algorithm: number of topics, hyperparameters...
Then compare your results with those of others.