<a href="https://colab.research.google.com/github/2303A51577/Natural-language-processing/blob/main/lab8_1577.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:


from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import re
import numpy as np
from sklearn.metrics import pairwise_distances
from itertools import combinations
from collections import Counter

# NLTK resources needed below (downloading is only needed once per environment)
NLTK_PACKAGES = (
    'stopwords',
    'wordnet',
    'omw-1.4',  # WordNet multilingual data (helps sometimes)
)
for package in NLTK_PACKAGES:
    nltk.download(package)

# Load the full 20 newsgroups dataset; headers/footers/quotes are stripped
# so that only the message content is kept.
newsgroups = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)
documents = newsgroups.data  # list of raw strings
print(f"Loaded {len(documents)} documents.")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Loaded 18846 documents.


In [2]:
# Block 2: preprocessing function (returns list of cleaned documents)
# Run after Block 1.

# Option A: spaCy lemmatization (preferred if spaCy installed)
USE_SPACY = False  # set True if you installed spaCy and the model

if USE_SPACY:
    import spacy
    nlp = spacy.load("en_core_web_sm", disable=["ner","parser"])

    def preprocess_doc_spacy(doc, extra_stop=None):
        """Clean one raw document with spaCy and return its lemmas as one string.

        Steps: lowercase, blank out URLs / e-mail addresses / digits, blank out
        every character that is not a-z or whitespace, run the spaCy pipeline,
        keep lemmas longer than one character whose token is not a spaCy stop
        word, then drop any lemma listed in ``extra_stop``.

        Parameters
        ----------
        doc : str
            Raw document text.
        extra_stop : collection of str, optional
            Additional lemmas to remove. Defaults to no extra stop words.

        Returns
        -------
        str
            Space-joined lemmas.
        """
        # A None sentinel avoids sharing one mutable default set across calls.
        if extra_stop is None:
            extra_stop = frozenset()
        doc = doc.lower()
        # remove urls and emails and numeric tokens
        doc = re.sub(r'(http\S+)|(\S+@\S+)|\d+', ' ', doc)
        # keep only letters
        doc = re.sub(r'[^a-z\s]', ' ', doc)
        lemmas = (
            token.lemma_
            for token in nlp(doc)
            if len(token.lemma_) > 1 and not token.is_stop
        )
        return " ".join(w for w in lemmas if w not in extra_stop)

# Option B: NLTK WordNet lemmatizer (always works)
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def simple_preprocess(doc, extra_stop=None):
    """Clean one raw document and return its lemmas as one space-joined string.

    Steps: lowercase, blank out URLs / e-mail addresses / digits, blank out
    every character that is not a-z or whitespace, split on whitespace, drop
    NLTK English stop words and tokens of length <= 2, lemmatize each remaining
    token with the WordNet lemmatizer, then drop lemmas listed in ``extra_stop``.

    Parameters
    ----------
    doc : str
        Raw document text.
    extra_stop : collection of str, optional
        Additional lemmas to remove. Defaults to no extra stop words.

    Returns
    -------
    str
        Space-joined lemmas (empty string if nothing survives the filters).
    """
    # A None sentinel avoids sharing one mutable default set across calls.
    if extra_stop is None:
        extra_stop = frozenset()
    # Lowercase
    doc = doc.lower()
    # remove urls/emails/numbers
    doc = re.sub(r'(http\S+)|(\S+@\S+)|\d+', ' ', doc)
    # keep only letters
    doc = re.sub(r'[^a-z\s]', ' ', doc)
    # remove stopwords and short tokens, then lemmatize
    # NOTE(review): filtering happens before lemmatization, so a lemma can
    # itself be a stop word or a short token (presumably how "do" reaches the
    # topic lists) - consider re-filtering after lemmatizing; confirm intent.
    lemmas = (
        lemmatizer.lemmatize(t)
        for t in doc.split()
        if t not in stop_words and len(t) > 2
    )
    return " ".join(w for w in lemmas if w not in extra_stop)

# Apply preprocessing to all documents (this may take some time).
# Honour the USE_SPACY switch: previously simple_preprocess was always used,
# so enabling spaCy had no effect on the cleaned corpus.
preprocess = preprocess_doc_spacy if USE_SPACY else simple_preprocess
clean_docs = [preprocess(doc) for doc in documents]
print("Preprocessing done. Example cleaned doc:")
print(clean_docs[0][:400])


Preprocessing done. Example cleaned doc:
sure bashers pen fan pretty confused lack kind post recent pen massacre devil actually bit puzzled bit relieved however going put end non pittsburghers relief bit praise pen man killing devil worse thought jagr showed much better regular season stats also lot fun watch playoff bowman let jagr lot fun next couple game since pen going beat pulp jersey anyway disappointed see islander lose final regu


In [3]:


n_features = 10000  # vocabulary size; adjust downward if memory limited

# Both vectorizers use the same document-frequency pruning and vocabulary cap.
vectorizer_options = dict(max_df=0.95, min_df=5, max_features=n_features)

# TF-IDF weights
tfidf_vectorizer = TfidfVectorizer(**vectorizer_options)
tfidf = tfidf_vectorizer.fit_transform(clean_docs)  # shape (n_docs, n_features)

# Raw term counts
count_vectorizer = CountVectorizer(**vectorizer_options)
counts = count_vectorizer.fit_transform(clean_docs)

print("TF-IDF shape:", tfidf.shape)
print("Count shape:", counts.shape)


TF-IDF shape: (18846, 10000)
Count shape: (18846, 10000)


In [4]:
# Block 4: fit NMF and LDA, extract top words
# Run after Block 3.

n_topics = 5
n_top_words = 10

# NMF on TF-IDF
nmf = NMF(
    n_components=n_topics,
    init='nndsvda',
    max_iter=400,
    random_state=42,
)
W = nmf.fit_transform(tfidf)  # document-topic matrix
H = nmf.components_           # topic-term matrix

# LDA on counts
lda = LatentDirichletAllocation(
    n_components=n_topics,
    learning_method='batch',
    max_iter=10,
    random_state=42,
)
lda_doc_topic = lda.fit_transform(counts)
lda_components = lda.components_

# Utility to get top words
def show_top_words(model_components, feature_names, n_top_words):
    """Return the highest-weighted feature names of every topic.

    Parameters
    ----------
    model_components : iterable of array-like
        One weight vector per topic; each must support ``.argsort()``.
    feature_names : sequence of str
        Name of the feature at each column index.
    n_top_words : int
        How many names to keep per topic.

    Returns
    -------
    list of list of str
        For each topic, its feature names ordered by descending weight.
    """
    return [
        # argsort is ascending, so reverse it before taking the first n entries
        [feature_names[idx] for idx in weights.argsort()[::-1][:n_top_words]]
        for weights in model_components
    ]

# Vocabulary of each vectorizer, indexed by column
tfidf_features = tfidf_vectorizer.get_feature_names_out()
count_features = count_vectorizer.get_feature_names_out()

# Top words per topic for both models
nmf_topics = show_top_words(H, tfidf_features, n_top_words)
lda_topics = show_top_words(lda_components, count_features, n_top_words)

print("=== NMF topics (top words) ===")
for topic_number, words in enumerate(nmf_topics, start=1):
    print(f"Topic {topic_number}:", ", ".join(words))

print("\n=== LDA topics (top words) ===")
for topic_number, words in enumerate(lda_topics, start=1):
    print(f"Topic {topic_number}:", ", ".join(words))


=== NMF topics (top words) ===
Topic 1: would, one, people, like, think, get, right, time, know, thing
Topic 2: window, file, thanks, please, program, anyone, know, mail, do, driver
Topic 3: game, team, year, player, hockey, season, play, baseball, win, last
Topic 4: drive, scsi, disk, card, hard, controller, ide, floppy, mac, meg
Topic 5: god, christian, jesus, bible, believe, christ, say, belief, faith, sin

=== LDA topics (top words) ===
Topic 1: window, one, file, use, drive, get, would, problem, like, know
Topic 2: one, would, people, god, like, think, know, say, time, thing
Topic 3: game, one, would, year, get, team, time, gun, people, like
Topic 4: system, key, file, space, information, available, use, program, data, also
Topic 5: max, armenian, state, year, people, would, government, right, president, one


In [5]:


def topic_overlap(top_words_a, top_words_b):
    """Jaccard overlap between two lists of topic words.

    Parameters
    ----------
    top_words_a, top_words_b : iterable of str
        Top words of the two topics; duplicates are ignored.

    Returns
    -------
    float
        ``|A & B| / |A | B|`` in the range [0, 1]. When both inputs are empty
        the union is empty, so 1.0 is returned (two empty sets are identical)
        instead of dividing by zero.
    """
    set_a = set(top_words_a)
    set_b = set(top_words_b)
    union = set_a | set_b
    if not union:
        # Both topics are empty: avoid ZeroDivisionError.
        return 1.0
    return len(set_a & set_b) / len(union)  # Jaccard-like overlap

# Pairwise overlap scores: row i is NMF topic i, column j is LDA topic j
overlap_matrix = np.zeros((n_topics, n_topics))
for i, j in np.ndindex(n_topics, n_topics):
    overlap_matrix[i, j] = topic_overlap(nmf_topics[i], lda_topics[j])

print("Overlap matrix (rows=NMF topics, cols=LDA topics):")
print(np.round(overlap_matrix, 2))

# For each NMF topic, show best matching LDA topic
for i, row in enumerate(overlap_matrix):
    best_j = row.argmax()
    print(f"NMF Topic {i+1} best matches LDA Topic {best_j+1} with overlap {row[best_j]:.2f}")


Overlap matrix (rows=NMF topics, cols=LDA topics):
[[0.33 0.67 0.43 0.   0.25]
 [0.18 0.05 0.   0.11 0.  ]
 [0.   0.   0.18 0.   0.05]
 [0.05 0.   0.   0.   0.  ]
 [0.   0.11 0.   0.   0.  ]]
NMF Topic 1 best matches LDA Topic 2 with overlap 0.67
NMF Topic 2 best matches LDA Topic 1 with overlap 0.18
NMF Topic 3 best matches LDA Topic 3 with overlap 0.18
NMF Topic 4 best matches LDA Topic 1 with overlap 0.05
NMF Topic 5 best matches LDA Topic 2 with overlap 0.11


In [7]:


from nltk.corpus import wordnet as wn

def max_similarity_between_words(word1, word2, metric='wup'):
    """Best WordNet similarity over all synset pairs of two words.

    Every synset of ``word1`` is compared with every synset of ``word2`` and
    the highest-scoring pair is kept.

    Parameters
    ----------
    word1, word2 : str
        Words to look up in WordNet.
    metric : {'wup', 'path'}
        'wup' uses ``Synset.wup_similarity``; 'path' uses
        ``Synset.path_similarity``.

    Returns
    -------
    tuple
        ``(similarity, synset1, synset2)`` for the best pair, or
        ``(None, None, None)`` when either word has no synsets or no synset
        pair yields a similarity.

    Raises
    ------
    ValueError
        If ``metric`` is not 'wup' or 'path'.
    """
    # Validate up front so a bad metric is reported even for unknown words.
    if metric not in ('wup', 'path'):
        raise ValueError("metric must be 'wup' or 'path'")

    # get all synsets for each word (all parts of speech are checked)
    synsets1 = wn.synsets(word1)
    synsets2 = wn.synsets(word2)
    if not synsets1 or not synsets2:
        return None, None, None  # not found

    # None (not -1.0) marks "nothing comparable yet", so the sentinel can never
    # be returned to the caller as if it were a real similarity score.
    best_sim = None
    best_pair = (None, None)
    for s1 in synsets1:
        for s2 in synsets2:
            if metric == 'wup':
                sim = s1.wup_similarity(s2)
            else:
                sim = s1.path_similarity(s2)
            if sim is None:
                # this pair of synsets has no defined similarity
                continue
            if best_sim is None or sim > best_sim:
                best_sim = sim
                best_pair = (s1, s2)
    return best_sim, best_pair[0], best_pair[1]

# Example: choose two words from a chosen topic (modify indices as you like)
# Here the two highest-ranked words of the first NMF topic are compared.
first_topic = nmf_topics[0]
word_a, word_b = first_topic[0], first_topic[1]
print("Comparing:", word_a, "AND", word_b)

# Best synset pair under Wu-Palmer; only the score is kept for path similarity
wup_sim, s1, s2 = max_similarity_between_words(word_a, word_b, metric='wup')
path_sim, _, _ = max_similarity_between_words(word_a, word_b, metric='path')

for label, value in (
    ("Wu-Palmer similarity:", wup_sim),
    ("Path similarity:", path_sim),
):
    print(label, value)
print("Best synset pair:", s1, s2)


Comparing: would AND one
Wu-Palmer similarity: None
Path similarity: None
Best synset pair: None None


In [8]:


def jaccard(a, b):
    """Jaccard similarity between the whitespace-token sets of two strings.

    Returns 1.0 when neither string has any tokens, 0.0 when exactly one of
    them has none, and ``|A & B| / |A | B|`` otherwise.
    """
    tokens_a, tokens_b = set(a.split()), set(b.split())
    union = tokens_a | tokens_b
    if not union:
        # both documents are empty
        return 1.0
    # if only one side is empty the intersection is empty, giving 0.0
    return len(tokens_a & tokens_b) / len(union)

# Choose document indices (you can change these to inspect different docs)
doc_indices = [10, 25, 200]  # example indices; change as you like

docs_selected = [clean_docs[i] for i in doc_indices]
for idx, text in zip(doc_indices, docs_selected):
    # show top keywords by term frequency in that doc
    tokens = text.split()
    most_common = Counter(tokens).most_common(10)
    print(f"\nDocument index: {idx}")
    print("Top tokens:", [t for t,c in most_common])

# compute pairwise Jaccard over every pair of selected documents
# (derived from doc_indices instead of a hard-coded 3, so editing the list works)
pairs = list(combinations(range(len(doc_indices)), 2))
scores = {}
for (i,j) in pairs:
    score = jaccard(docs_selected[i], docs_selected[j])
    scores[(doc_indices[i], doc_indices[j])] = score
    print(f"Jaccard similarity between doc {doc_indices[i]} and doc {doc_indices[j]}: {score:.3f}")

# identify most and least similar
if scores:
    most_sim_pair = max(scores.items(), key=lambda x:x[1])
    least_sim_pair = min(scores.items(), key=lambda x:x[1])

    print("\nMost similar pair:", most_sim_pair)
    print("Least similar pair:", least_sim_pair)
    if most_sim_pair[1] == least_sim_pair[1]:
        # max/min both return the first pair on a tie, which would otherwise
        # read as if one pair were simultaneously the most and least similar.
        print("Note: all pairs have the same Jaccard score, so the ranking is not meaningful.")
else:
    # fewer than two documents: max()/min() on an empty dict would raise
    most_sim_pair = least_sim_pair = None
    print("\nNeed at least two documents to compare.")



Document index: 10
Top tokens: ['blood', 'used', 'lamb', 'hard', 'task', 'culture', 'animal', 'sacrifice', 'something', 'related']

Document index: 25
Top tokens: ['anyone', 'brief', 'blurb', 'manned', 'lunar', 'exploration', 'confernce', 'may', 'crystal', 'city']

Document index: 200
Top tokens: ['jesus', 'peace']
Jaccard similarity between doc 10 and doc 25: 0.000
Jaccard similarity between doc 10 and doc 200: 0.000
Jaccard similarity between doc 25 and doc 200: 0.000

Most similar pair: ((10, 25), 0.0)
Least similar pair: ((10, 25), 0.0)


In [9]:
# Block 8: create simple tables (pandas) for topics and doc-topic scores (optional)
import pandas as pd

# Shared labels: one per topic (columns) and one per word rank (rows)
topic_labels = [f"Topic_{i+1}" for i in range(n_topics)]
rank_labels = [f"Top_{i+1}" for i in range(n_top_words)]

# NMF topic table: column = topic, row = rank of the word within that topic
nmf_topic_table = pd.DataFrame(
    {label: nmf_topics[i] for i, label in enumerate(topic_labels)}
).set_axis(rank_labels, axis=0)
print(nmf_topic_table)

# Document-topic matrix quick look: the frame keeps the first 10 docs,
# and head() prints the first 5 of them
doc_topic_df = pd.DataFrame(W[:10], columns=topic_labels)
print(doc_topic_df.head())


       Topic_1  Topic_2   Topic_3     Topic_4    Topic_5
Top_1    would   window      game       drive        god
Top_2      one     file      team        scsi  christian
Top_3   people   thanks      year        disk      jesus
Top_4     like   please    player        card      bible
Top_5    think  program    hockey        hard    believe
Top_6      get   anyone    season  controller     christ
Top_7    right     know      play         ide        say
Top_8     time     mail  baseball      floppy     belief
Top_9     know       do       win         mac      faith
Top_10   thing   driver      last         meg        sin
    Topic_1   Topic_2   Topic_3   Topic_4   Topic_5
0  0.007299  0.000000  0.094179  0.001667  0.000457
1  0.000000  0.033600  0.000159  0.048721  0.000000
2  0.042391  0.000000  0.000000  0.000000  0.000000
3  0.000000  0.000000  0.000000  0.089181  0.000000
4  0.011746  0.026967  0.000000  0.101664  0.001811
