<a href="https://colab.research.google.com/github/2303A51786/nlp/blob/main/ass.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab: install required packages (run once)
!pip install -q scikit-learn nltk pandas matplotlib wordcloud

# Imports
import os
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.preprocessing import normalize
from collections import Counter
import matplotlib.pyplot as plt


In [5]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
# Try a few likely filenames (change if your file is elsewhere)
candidates = ['/mnt/data/bbc_news.csv', 'bbc_news.csv', 'english_news_dataset.csv', '/content/bbc_news.csv']
file_found = None
for fn in candidates:
    if os.path.exists(fn):
        file_found = fn
        break

if file_found is None:
    raise FileNotFoundError(f"Could not find input CSV. Put your dataset in the Colab working directory and name it one of: {candidates}")

print("Loaded file:", file_found)
df = pd.read_csv(file_found)

# Inspect columns and pick text column automatically if possible
print("Columns:", df.columns.tolist())
# Try common column names
for candidate_col in ['text','article','content','news','body','headline']:
    if candidate_col in df.columns:
        text_col = candidate_col
        break
else:
    # fallback: take the longest text-like column
    text_col = df.select_dtypes(include=['object']).columns[0]

print("Using text column:", text_col)
df = df[[text_col]].dropna().reset_index(drop=True)
df = df.rename(columns={text_col: 'text'})
print("Number of documents:", len(df))
df.head(3)


Loaded file: bbc_news.csv
Columns: ['title', 'pubDate', 'guid', 'link', 'description']
Using text column: title
Number of documents: 42115


Unnamed: 0,text
0,Ukraine: Angry Zelensky vows to punish Russian...
1,War in Ukraine: Taking cover in a town under a...
2,Ukraine war 'catastrophic for global food'


In [6]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'http\S+', ' ', text)           # remove urls
    text = re.sub(r'\d+', ' ', text)               # remove numbers
    text = re.sub(r'[^\w\s]', ' ', text)           # remove punctuation
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t.isalpha() and t not in stop_words and len(t) > 2]
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return " ".join(tokens)

# Apply preprocessing (this may take a few seconds)
df['clean_text'] = df['text'].apply(preprocess_text)

# quick check
print(df['clean_text'].str.split().apply(len).describe())
df.head(3)


count    42115.000000
mean         6.949472
std          1.934265
min          1.000000
25%          6.000000
50%          7.000000
75%          8.000000
max         18.000000
Name: clean_text, dtype: float64


Unnamed: 0,text,clean_text
0,Ukraine: Angry Zelensky vows to punish Russian...,ukraine angry zelensky vow punish russian atro...
1,War in Ukraine: Taking cover in a town under a...,war ukraine taking cover town attack
2,Ukraine war 'catastrophic for global food',ukraine war catastrophic global food


In [None]:
n_topics = 5
max_features = 5000

# TF-IDF for NMF
tfidf_vectorizer = TfidfVectorizer(max_features=max_features, min_df=5, max_df=0.9)
tfidf = tfidf_vectorizer.fit_transform(df['clean_text'])
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

nmf_model = NMF(n_components=n_topics, random_state=42, init='nndsvda', max_iter=400)
W = nmf_model.fit_transform(tfidf)  # document-topic matrix (N x k)
H = nmf_model.components_           # topic-term matrix (k x features)

def print_top_words(topic_matrix, feature_names, top_n=10, model_name="Model"):
    print("\n" + "="*40)
    print(f"{model_name} top {top_n} words per topic")
    print("="*40)
    for topic_idx, topic in enumerate(topic_matrix):
        top_indices = topic.argsort()[-top_n:][::-1]
        top_words = [feature_names[i] for i in top_indices]
        print(f"Topic {topic_idx+1}: {', '.join(top_words)}")

print_top_words(H, tfidf_feature_names, top_n=10, model_name="NMF (TF-IDF)")

# LDA (CountVectorizer)
count_vectorizer = CountVectorizer(max_features=max_features, min_df=5, max_df=0.9)
count = count_vectorizer.fit_transform(df['clean_text'])
count_feature_names = count_vectorizer.get_feature_names_out()

lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=42, learning_method='batch', max_iter=20)
lda_W = lda_model.fit_transform(count)  # document-topic probabilities
lda_H = lda_model.components_

print_top_words(lda_H, count_feature_names, top_n=10, model_name="LDA (Count)")



NMF (TF-IDF) top 10 words per topic
Topic 1: ukraine, war, russia, russian, putin, invasion, attack, zelensky, kyiv, map
Topic 2: world, cup, england, final, woman, win, australia, beat, rugby, wale
Topic 3: man, year, win, new, day, woman, city, first, dy, league
Topic 4: say, israel, gaza, strike, hamas, attack, hostage, killed, israeli, people
Topic 5: cost, election, living, energy, rate, bill, price, inflation, rise, rising


In [8]:
def top_word_sets(topic_matrix, feature_names, top_n=10):
    sets = []
    for topic in topic_matrix:
        top_indices = topic.argsort()[-top_n:][::-1]
        sets.append(set([feature_names[i] for i in top_indices]))
    return sets

nmf_top_sets = top_word_sets(H, tfidf_feature_names, top_n=10)
lda_top_sets = top_word_sets(lda_H, count_feature_names, top_n=10)

# overlap table: nmf_topic x lda_topic -> number of shared top words
overlap = np.zeros((n_topics, n_topics), dtype=int)
for i in range(n_topics):
    for j in range(n_topics):
        overlap[i, j] = len(nmf_top_sets[i].intersection(lda_top_sets[j]))

print("Overlap matrix (rows=NMF topics, cols=LDA topics):\n(Values = number of shared top-10 words)\n")
print(pd.DataFrame(overlap, index=[f"NMF_{i+1}" for i in range(n_topics)],
                   columns=[f"LDA_{j+1}" for j in range(n_topics)]))


Overlap matrix (rows=NMF topics, cols=LDA topics):
(Values = number of shared top-10 words)

       LDA_1  LDA_2  LDA_3  LDA_4  LDA_5
NMF_1      0      0      1      0      4
NMF_2      1      0      1      0      6
NMF_3      4      1      2      2      2
NMF_4      1      2      5      0      0
NMF_5      1      2      0      3      0


In [9]:
# choose first NMF topic's top 2 words
nmf_topic0_top = sorted(list(nmf_top_sets[0]), key=lambda w: H[0, tfidf_vectorizer.vocabulary_.get(w, 0)], reverse=True)
# There is a potential mismatch because tfidf_vectorizer.vocabulary_ uses lowercase tokens; ensure words exist
top2 = list(nmf_top_sets[0])[:2]
print("Selected words from NMF Topic 1 (top 2):", top2)

def max_synset_similarity(word1, word2, sim_func='wup'):
    synsets1 = wordnet.synsets(word1)
    synsets2 = wordnet.synsets(word2)
    if not synsets1 or not synsets2:
        return None, []
    best = (-1, None)
    details = []
    for s1 in synsets1:
        for s2 in synsets2:
            if sim_func == 'wup':
                score = s1.wup_similarity(s2)
            else:
                score = s1.path_similarity(s2)
            if score is None:
                sc = -1.0
            else:
                sc = float(score)
            details.append((s1, s2, sc))
            if sc > best[0]:
                best = (sc, (s1, s2))
    return best[0], details

w1, w2 = top2[0], (top2[1] if len(top2)>1 else top2[0])
wup_score, details = max_synset_similarity(w1, w2, sim_func='wup')
path_score, _ = max_synset_similarity(w1, w2, sim_func='path')

print(f"\nWu-Palmer similarity between '{w1}' and '{w2}':", wup_score)
print(f"Path similarity between '{w1}' and '{w2}':", path_score)

# Show the best synset pair
if details:
    # find best
    best_detail = max(details, key=lambda x: x[2])
    print("\nBest synset pair and score (wup):")
    print("Synset 1:", best_detail[0], "-", best_detail[0].definition())
    print("Synset 2:", best_detail[1], "-", best_detail[1].definition())
    print("Score:", best_detail[2])

# Interpretation:
if wup_score is None:
    print("\nInterpretation: Could not find synsets for one or both words in WordNet.")
else:
    if wup_score >= 0.9:
        interp = "very close / essentially synonyms in at least one sense"
    elif wup_score >= 0.7:
        interp = "closely related"
    elif wup_score >= 0.4:
        interp = "moderately related"
    else:
        interp = "weakly related or not closely related"
    print("\nInterpretation (based on Wu-Palmer):", interp)


Selected words from NMF Topic 1 (top 2): ['zelensky', 'putin']

Wu-Palmer similarity between 'zelensky' and 'putin': None
Path similarity between 'zelensky' and 'putin': None

Interpretation: Could not find synsets for one or both words in WordNet.


In [None]:
# For each NMF topic, pick the document index with highest topic weight
doc_indices = []
for topic_idx in range(n_topics):
    doc_idx = np.argmax(W[:, topic_idx])
    doc_indices.append(doc_idx)
# Keep unique and pick first 3 unique topics (if topics < 3 unique docs, fallback to first/last)
unique_indices = []
for idx in doc_indices:
    if idx not in unique_indices:
        unique_indices.append(idx)
    if len(unique_indices) == 3:
        break
if len(unique_indices) < 3:
    # fallback: take first, middle, last
    unique_indices = [0, len(df)//2, len(df)-1]

selected_idxs = unique_indices[:3]
print("Selected document indices:", selected_idxs)

# prepare token sets for Jaccard (use preprocess tokens)
def token_set_from_text(text):
    tokens = text.split()
    return set(tokens)

doc_texts = df.loc[selected_idxs, 'clean_text'].tolist()
doc_tokens = [token_set_from_text(t) for t in doc_texts]

# pairwise Jaccard
from itertools import combinations
pairs = list(combinations(range(3), 2))

def jaccard(a, b):
    if len(a)==0 and len(b)==0:
        return 1.0
    inter = len(a & b)
    uni = len(a | b)
    return inter / uni if uni>0 else 0.0

scores = {}
for (i,j) in pairs:
    score = jaccard(doc_tokens[i], doc_tokens[j])
    scores[(i,j)] = score
    print(f"Jaccard similarity (doc{selected_idxs[i]} vs doc{selected_idxs[j]}): {score:.4f}")

# identify most and least similar
most_pair = max(scores.items(), key=lambda x: x[1])
least_pair = min(scores.items(), key=lambda x: x[1])
(print("\nMost similar pair:", most_pair[0], "score=", most_pair[1]))
(print("Least similar pair:", least_pair[0], "score=", least_pair[1]))

# Show short explanation using topic top words for each document
def top_topic_for_doc(doc_idx, W_matrix):
    topic = np.argmax(W_matrix[doc_idx])
    return topic

for i in range(3):
    doc_idx = selected_idxs[i]
    top_topic = top_topic_for_doc(doc_idx, W)
    print(f"\nDocument {doc_idx} — top NMF topic: {top_topic+1}")
    print("Top NMF words for that topic:", ", ".join(sorted(list(nmf_top_sets[top_topic]))))

print("\nExplanation (brief):")
# Reasoning: pair with higher Jaccard likely share more keywords / same topic words
for (i,j), sc in scores.items():
    reason = []
    common = doc_tokens[i].intersection(doc_tokens[j])
    # show up to 10 common words
    reason_words = list(common)[:10]
    print(f"doc{selected_idxs[i]} vs doc{selected_idxs[j]} => Jaccard {sc:.4f}; common tokens (sample): {reason_words}")
