<a href="https://colab.research.google.com/github/2303a51019/NLP/blob/main/NLP_LAB_10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Task 1: Topic Modeling with NMF

In [16]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, download
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import nltk

# Download NLTK data
# Fetch the NLTK resources used later in this notebook. The recorded output of
# this cell shows that packages already present are reported as up-to-date.
nltk.download('punkt')      # tokenizer models; presumably required by word_tokenize
nltk.download('stopwords')  # backs stopwords.words('english')
nltk.download('wordnet')    # backs WordNetLemmatizer and the wordnet corpus reader
# NOTE(review): punkt_tab appears to be the tokenizer data newer NLTK releases
# look for; confirm against the installed NLTK version. As the last expression
# in the cell, this call's return value is what the cell displays.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [17]:
# 2️⃣ Load Dataset (BBC News or any text dataset)
# Absolute Colab path: upload bbc_news.csv to /content (or adjust the path) before running.
df = pd.read_csv("/content/bbc_news.csv")
# Drop rows with a missing description *before* casting to str. Otherwise
# astype(str) turns NaN into the literal string "nan", which is alphabetic and
# not a stopword, so it would survive preprocessing as a bogus vocabulary token.
texts = df['description'].dropna().astype(str).tolist()

In [18]:
# 3️⃣ Preprocess Text
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Normalize one document into a space-separated string of lemmas.

    The text is lowercased and tokenized; tokens that are not purely
    alphabetic or that appear in ``stop_words`` are discarded, and the
    remaining tokens are lemmatized with ``lemmatizer``.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        # Skip punctuation/numbers and stopwords before lemmatizing.
        if not token.isalpha() or token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)

# Apply the same normalization to every raw document, preserving order.
processed_texts = list(map(preprocess, texts))


In [19]:
# 4️⃣ Apply NMF for 5 Topics
# Build a TF-IDF matrix from the cleaned documents.
tfidf_vectorizer = TfidfVectorizer(max_features=2000, min_df=5, max_df=0.9)
tfidf = tfidf_vectorizer.fit_transform(processed_texts)

# Factorize into 5 topics; the fixed random_state keeps runs repeatable.
nmf_model = NMF(n_components=5, random_state=42)
nmf_topics = nmf_model.fit_transform(tfidf)

# For each topic, list the 10 highest-weighted vocabulary terms.
feature_names = tfidf_vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(nmf_model.components_):
    # argsort is ascending, so reverse it and keep the first 10 indices.
    top_word_ids = topic.argsort()[::-1][:10]
    top_words = [feature_names[word_id] for word_id in top_word_ids]
    header = f"\nNMF Topic {topic_idx + 1}:"
    print(header)
    print(" ".join(top_words))


NMF Topic 1:
say police family man official former president want attack could

NMF Topic 2:
world england cup woman win final watch first wale championship

NMF Topic 3:
year bbc people uk new two first one government ukraine

NMF Topic 4:
day seven past going attention closely paying image selection taken

NMF Topic 5:
league manchester city premier united champion win liverpool arsenal season


In [20]:
# 5️⃣ Compare with LDA
# Build a raw term-count matrix from the cleaned documents.
count_vectorizer = CountVectorizer(max_features=2000, min_df=5, max_df=0.9)
count = count_vectorizer.fit_transform(processed_texts)

# Fit a 5-topic LDA model; the fixed random_state keeps runs repeatable.
lda_model = LatentDirichletAllocation(n_components=5, random_state=42)
lda_topics = lda_model.fit_transform(count)

# For each topic, list the 10 highest-weighted vocabulary terms.
feature_names_lda = count_vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda_model.components_):
    # argsort is ascending, so reverse it and keep the first 10 indices.
    top_word_ids = topic.argsort()[::-1][:10]
    top_words = [feature_names_lda[word_id] for word_id in top_word_ids]
    header = f"\nLDA Topic {topic_idx + 1}:"
    print(header)
    print(" ".join(top_words))


LDA Topic 1:
say uk people could year england cost pay new service

LDA Topic 2:
bbc say ukraine russia russian tell war president israel city

LDA Topic 3:
world england cup win league first final day manchester woman

LDA Topic 4:
say minister government election uk party leader former first new

LDA Topic 5:
say police people year two died show one woman found


Task 2: WordNet Similarity

In [21]:
# 1️⃣ Import WordNet Tools
from nltk.corpus import wordnet as wn

# 2️⃣ Choose two words from same topic
word1 = "economy"
word2 = "finance"

# 3️⃣ Get Synsets (Word Senses)
# wn.synsets() returns an empty list for a word WordNet does not know, so check
# before indexing rather than letting [0] raise IndexError.
synsets1 = wn.synsets(word1)
synsets2 = wn.synsets(word2)
missing_words = [w for w, s in ((word1, synsets1), (word2, synsets2)) if not s]

if missing_words:
    print(f"\nNo WordNet synsets found for: {', '.join(missing_words)}")
else:
    # Only the first listed sense of each word is compared.
    syn1 = synsets1[0]
    syn2 = synsets2[0]

    # 4️⃣ Compute Path and Wu-Palmer Similarity
    path_sim = syn1.path_similarity(syn2)
    wup_sim = syn1.wup_similarity(syn2)

    # 5️⃣ Display Results
    print(f"\nPath Similarity between '{word1}' and '{word2}': {path_sim}")
    print(f"Wu-Palmer Similarity between '{word1}' and '{word2}': {wup_sim}")

    # The similarity methods can return None when no connecting path exists
    # between the two senses; comparing None with a float raises TypeError.
    if wup_sim is None:
        print("WordNet could not compute a Wu-Palmer similarity for these senses.")
    elif wup_sim > 0.6:
        print("These words are semantically close (similar meaning).")
    else:
        print("These words are not closely related semantically.")



Path Similarity between 'economy' and 'finance': 0.09090909090909091
Wu-Palmer Similarity between 'economy' and 'finance': 0.2857142857142857
These words are not closely related semantically.


Task 3: Pairwise Document Similarity

In [22]:
# 1️⃣ Select Three Documents
# Take the first three preprocessed documents for the pairwise comparison.
doc1, doc2, doc3 = (processed_texts[position] for position in range(3))

# 2️⃣ Convert Documents to Sets of Words
# Each document becomes the set of its distinct whitespace-separated tokens.
set1, set2, set3 = (set(document.split()) for document in (doc1, doc2, doc3))

# 3️⃣ Define Jaccard Similarity Function
def jaccard_similarity(a, b):
    """Return the Jaccard similarity |a ∩ b| / |a ∪ b| of two collections.

    Args:
        a: First collection of hashable items (a set or any iterable).
        b: Second collection of hashable items (a set or any iterable).

    Returns:
        A float in [0.0, 1.0]. When both collections are empty the ratio is
        undefined (0 / 0); 0.0 is returned in that case so that two documents
        with no remaining tokens are not reported as similar.
    """
    # Coerce to sets so lists/tuples of tokens work as well as sets.
    a, b = set(a), set(b)
    union_size = len(a | b)
    if union_size == 0:
        # Both inputs are empty: avoid ZeroDivisionError.
        return 0.0
    return len(a & b) / union_size

# 4️⃣ Compute Pairwise Similarities
# Score every pair of the three selected documents.
sim_12 = jaccard_similarity(set1, set2)
sim_13 = jaccard_similarity(set1, set3)
sim_23 = jaccard_similarity(set2, set3)

# 5️⃣ Display Results
print("\nJaccard Similarities:")
print("\n".join(
    f"{label}: {score:.3f}"
    for label, score in (
        ("Doc1 & Doc2", sim_12),
        ("Doc1 & Doc3", sim_13),
        ("Doc2 & Doc3", sim_23),
    )
))

# Rank the pairs. On ties, max()/min() keep the first key in insertion order,
# so the order of entries in this dict determines which tied pair is reported.
sims = {'Doc1-Doc2': sim_12, 'Doc1-Doc3': sim_13, 'Doc2-Doc3': sim_23}
most_similar = max(sims, key=lambda pair: sims[pair])
least_similar = min(sims, key=lambda pair: sims[pair])

print(f"\nMost similar pair: {most_similar}")
print(f"Least similar pair: {least_similar}")



Jaccard Similarities:
Doc1 & Doc2: 0.000
Doc1 & Doc3: 0.053
Doc2 & Doc3: 0.000

Most similar pair: Doc1-Doc3
Least similar pair: Doc1-Doc2
