In [5]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


In [6]:
!pip install gensim
!pip install pyLDAvis
!pip install wordcloud



In [8]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Gensim & visualization
import gensim
from gensim import corpora
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Sklearn for NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# WordCloud
from wordcloud import WordCloud
import matplotlib.pyplot as plt

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [9]:
import pandas as pd

# Load RSS data
df = pd.read_csv("/content/bbc_news.csv")   # adjust filename

print(df.head())


                                               title  \
0  Ukraine: Angry Zelensky vows to punish Russian...   
1  War in Ukraine: Taking cover in a town under a...   
2         Ukraine war 'catastrophic for global food'   
3  Manchester Arena bombing: Saffie Roussos's par...   
4  Ukraine conflict: Oil price soars to highest l...   

                         pubDate  \
0  Mon, 07 Mar 2022 08:01:56 GMT   
1  Sun, 06 Mar 2022 22:49:58 GMT   
2  Mon, 07 Mar 2022 00:14:42 GMT   
3  Mon, 07 Mar 2022 00:05:40 GMT   
4  Mon, 07 Mar 2022 08:15:53 GMT   

                                               guid  \
0  https://www.bbc.co.uk/news/world-europe-60638042   
1  https://www.bbc.co.uk/news/world-europe-60641873   
2      https://www.bbc.co.uk/news/business-60623941   
3            https://www.bbc.co.uk/news/uk-60579079   
4      https://www.bbc.co.uk/news/business-60642786   

                                                link  \
0  https://www.bbc.co.uk/news/world-europe-606380...   
1  

In [10]:
# Combine title and description
df["text"] = df["title"].fillna("") + " " + df["description"].fillna("")
texts = df["text"].values


In [11]:
df.head()

Unnamed: 0,title,pubDate,guid,link,description,text
0,Ukraine: Angry Zelensky vows to punish Russian...,"Mon, 07 Mar 2022 08:01:56 GMT",https://www.bbc.co.uk/news/world-europe-60638042,https://www.bbc.co.uk/news/world-europe-606380...,The Ukrainian president says the country will ...,Ukraine: Angry Zelensky vows to punish Russian...
1,War in Ukraine: Taking cover in a town under a...,"Sun, 06 Mar 2022 22:49:58 GMT",https://www.bbc.co.uk/news/world-europe-60641873,https://www.bbc.co.uk/news/world-europe-606418...,"Jeremy Bowen was on the frontline in Irpin, as...",War in Ukraine: Taking cover in a town under a...
2,Ukraine war 'catastrophic for global food',"Mon, 07 Mar 2022 00:14:42 GMT",https://www.bbc.co.uk/news/business-60623941,https://www.bbc.co.uk/news/business-60623941?a...,One of the world's biggest fertiliser firms sa...,Ukraine war 'catastrophic for global food' One...
3,Manchester Arena bombing: Saffie Roussos's par...,"Mon, 07 Mar 2022 00:05:40 GMT",https://www.bbc.co.uk/news/uk-60579079,https://www.bbc.co.uk/news/uk-60579079?at_medi...,The parents of the Manchester Arena bombing's ...,Manchester Arena bombing: Saffie Roussos's par...
4,Ukraine conflict: Oil price soars to highest l...,"Mon, 07 Mar 2022 08:15:53 GMT",https://www.bbc.co.uk/news/business-60642786,https://www.bbc.co.uk/news/business-60642786?a...,Consumers are feeling the impact of higher ene...,Ukraine conflict: Oil price soars to highest l...


In [12]:
import nltk, re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text)
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and len(word) > 2]
    return tokens

processed_texts = [preprocess(doc) for doc in texts]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [13]:
from gensim import corpora, models

# Dictionary & Corpus
dictionary = corpora.Dictionary(processed_texts)
corpus = [dictionary.doc2bow(text) for text in processed_texts]

# Train LDA
lda_model = models.LdaModel(corpus=corpus,
                            id2word=dictionary,
                            num_topics=5,   # try 5–10 topics
                            random_state=42,
                            passes=10,
                            per_word_topics=True)

# Show topics
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx}: {topic}")


Topic 0: 0.018*"win" + 0.018*"england" + 0.015*"world" + 0.011*"cup" + 0.011*"league" + 0.009*"manchester" + 0.009*"final" + 0.008*"city" + 0.007*"first" + 0.007*"champion"
Topic 1: 0.010*"euro" + 0.009*"year" + 0.008*"king" + 0.007*"star" + 0.006*"south" + 0.006*"general" + 0.006*"tour" + 0.006*"bbc" + 0.005*"show" + 0.005*"former"
Topic 2: 0.012*"say" + 0.008*"israel" + 0.008*"people" + 0.008*"gaza" + 0.008*"ukraine" + 0.007*"war" + 0.006*"day" + 0.006*"attack" + 0.006*"bbc" + 0.005*"israeli"
Topic 3: 0.018*"say" + 0.011*"election" + 0.009*"government" + 0.008*"party" + 0.008*"trump" + 0.008*"new" + 0.007*"minister" + 0.007*"labour" + 0.007*"could" + 0.006*"leader"
Topic 4: 0.016*"say" + 0.013*"police" + 0.012*"woman" + 0.010*"man" + 0.008*"death" + 0.007*"died" + 0.007*"family" + 0.006*"dy" + 0.005*"child" + 0.005*"home"


In [14]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

pyLDAvis.enable_notebook()  # Jupyter
vis = gensimvis.prepare(lda_model, corpus, dictionary)
vis


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# Join tokens back into strings
docs = [" ".join(tokens) for tokens in processed_texts]

tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words="english")
tfidf = tfidf_vectorizer.fit_transform(docs)

nmf_model = NMF(n_components=5, random_state=42)
nmf_model.fit(tfidf)

feature_names = tfidf_vectorizer.get_feature_names_out()

def display_topics(model, feature_names, no_top_words):
    for idx, topic in enumerate(model.components_):
        print(f"Topic {idx}: ", " ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

display_topics(nmf_model, feature_names, 10)


Topic 0:  say people police year strike israel gaza woman attack cost
Topic 1:  world cup england womens final win australia wale france euro
Topic 2:  ukraine war russia russian ukrainian putin invasion president kyiv attack
Topic 3:  league city manchester premier win united champion liverpool arsenal man
Topic 4:  election party labour minister general tory vote sunak leader rishi
