<a href="https://colab.research.google.com/github/2403a54127-lab/Natural-language-processing/blob/main/NLP_Lab7_VASANTHA_2403A54127.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [60]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK resources used below: tokenizer models, the stop-word list,
# and WordNet. NLTK >= 3.8.2 makes word_tokenize load 'punkt_tab', while older
# releases load 'punkt'; requesting both keeps the notebook runnable on either.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [61]:
# Toy corpus for the similarity experiments: 20 short English sentences,
# five per topic, listed in topic order (sports, politics, health, technology).
documents = [
    # Sports
    "India won the cricket match yesterday.",
    "The football team scored three goals.",
    "Virat Kohli played an amazing innings.",
    "The Olympic games include many sports.",
    "The tennis player won the championship.",

    # Politics
    "The election results were announced today.",
    "The government passed a new policy.",
    "The president gave a speech in parliament.",
    "Political debates are important in democracy.",
    "The minister discussed economic reforms.",

    # Health
    "Regular exercise improves heart health.",
    "Doctors recommend drinking more water.",
    "A healthy diet prevents diseases.",
    "Hospitals provide medical treatment.",
    "Vaccination protects against infections.",

    # Technology
    "Artificial intelligence is transforming industries.",
    "Python is popular for machine learning.",
    "Cyber security protects computer systems.",
    "The new smartphone has advanced features.",
    "Cloud computing stores data online."
]



In [62]:
# Sanity check: report how many sentences the corpus contains.
print("Total documents:", len(documents))

Total documents: 20


Preprocessing

In [63]:
# English stop words from the NLTK corpus, held in a set for membership tests.
stop_words = set(stopwords.words('english'))


def preprocess(text):
    """Normalise one document into a space-separated string of content words.

    The text is lower-cased, every character other than a-z and whitespace is
    deleted, the remainder is tokenised with ``word_tokenize``, and tokens
    found in ``stop_words`` are dropped.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    content_words = filter(
        lambda token: token not in stop_words,
        word_tokenize(letters_only),
    )
    return " ".join(content_words)


# Preprocessed counterpart of `documents`, in the same order.
clean_docs = list(map(preprocess, documents))

TF-IDF Representation

In [64]:
# Learn a TF-IDF vocabulary from the preprocessed documents and vectorise them
# in one step; `tfidf_matrix` holds one row per entry of `clean_docs`.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(clean_docs)


🔹 METHOD 1 — Cosine Similarity (TF-IDF)

In [65]:
# Pairwise cosine similarity between all TF-IDF document vectors.
cos_sim = cosine_similarity(tfidf_matrix)

print("\n=== COSINE SIMILARITY ===\n")

# Report every unordered pair of documents exactly once (i < j).
for i, first_doc in enumerate(documents):
    similarity_row = cos_sim[i]
    for j, second_doc in enumerate(documents[i + 1:], start=i + 1):
        print("Doc 1:", first_doc)
        print("Doc 2:", second_doc)
        print("Score:", round(similarity_row[j], 3))
        print("-" * 60)


=== COSINE SIMILARITY ===

Doc 1: India won the cricket match yesterday.
Doc 2: The football team scored three goals.
Score: 0.0
------------------------------------------------------------
Doc 1: India won the cricket match yesterday.
Doc 2: Virat Kohli played an amazing innings.
Score: 0.0
------------------------------------------------------------
Doc 1: India won the cricket match yesterday.
Doc 2: The Olympic games include many sports.
Score: 0.0
------------------------------------------------------------
Doc 1: India won the cricket match yesterday.
Doc 2: The tennis player won the championship.
Score: 0.0
------------------------------------------------------------
Doc 1: India won the cricket match yesterday.
Doc 2: The election results were announced today.
Score: 0.0
------------------------------------------------------------
Doc 1: India won the cricket match yesterday.
Doc 2: The government passed a new policy.
Score: 0.0
------------------------------------------------

In [66]:
# Look up one pair in the TF-IDF cosine-similarity matrix. The matrix is named
# `cos_sim` where it is built; the previous `cosine_sim` was never defined in
# this notebook and raised NameError on a fresh kernel.
print("Similarity between doc 0 and doc 2:",
      cos_sim[0][2])


Similarity between doc 0 and doc 2: 0.0


METHOD 2 — Jaccard Similarity

In [67]:
def jaccard(s1, s2):
    """Return the Jaccard similarity of two whitespace-tokenised strings.

    Each string is split on whitespace and reduced to its set of distinct
    tokens; the score is ``|A & B| / |A | B|``.

    Args:
        s1: First string of space-separated tokens.
        s2: Second string of space-separated tokens.

    Returns:
        A float in ``[0.0, 1.0]``. When neither string contains any token the
        union is empty and the ratio is undefined; ``0.0`` is returned in that
        case instead of raising ``ZeroDivisionError``.
    """
    a = set(s1.split())
    b = set(s2.split())
    union = a | b
    if not union:
        # Both inputs are empty (e.g. a document made only of stop words).
        return 0.0
    return len(a & b) / len(union)

print("\n=== JACCARD SIMILARITY ===\n")

# Score every unordered pair of preprocessed documents exactly once (i < j),
# but display the original sentences so the output stays readable.
for i, first_clean in enumerate(clean_docs):
    for j, second_clean in enumerate(clean_docs[i + 1:], start=i + 1):
        score = jaccard(first_clean, second_clean)
        print("Doc 1:", documents[i])
        print("Doc 2:", documents[j])
        print("Score:", round(score, 3))
        print("-" * 60)


=== JACCARD SIMILARITY ===

Doc 1: India won the cricket match yesterday.
Doc 2: The football team scored three goals.
Score: 0.0
------------------------------------------------------------
Doc 1: India won the cricket match yesterday.
Doc 2: Virat Kohli played an amazing innings.
Score: 0.0
------------------------------------------------------------
Doc 1: India won the cricket match yesterday.
Doc 2: The Olympic games include many sports.
Score: 0.0
------------------------------------------------------------
Doc 1: India won the cricket match yesterday.
Doc 2: The tennis player won the championship.
Score: 0.0
------------------------------------------------------------
Doc 1: India won the cricket match yesterday.
Doc 2: The election results were announced today.
Score: 0.0
------------------------------------------------------------
Doc 1: India won the cricket match yesterday.
Doc 2: The government passed a new policy.
Score: 0.0
-----------------------------------------------

METHOD 3 — WordNet Semantic Similarity

In [68]:
def sentence_wordnet_similarity(s1, s2):
    """Average WordNet ``wup_similarity`` over all word pairs of two sentences.

    Each sentence is split on whitespace. Every token is mapped to the first
    synset WordNet returns for it; tokens with no synset are ignored. The
    ``wup_similarity`` score is computed for every (token of s1, token of s2)
    pair, and pairs for which it returns ``None`` are skipped.

    Args:
        s1: First sentence as space-separated tokens.
        s2: Second sentence as space-separated tokens.

    Returns:
        The mean of the collected pair scores, or ``0.0`` when no pair
        produced a score.
    """
    def first_synsets(sentence):
        # One WordNet lookup per token (instead of one per word pair).
        found = []
        for word in sentence.split():
            synsets = wn.synsets(word)
            if synsets:
                found.append(synsets[0])
        return found

    senses1 = first_synsets(s1)
    senses2 = first_synsets(s2)

    scores = []
    for sense1 in senses1:
        for sense2 in senses2:
            sim = sense1.wup_similarity(sense2)
            if sim is not None:
                scores.append(sim)

    return sum(scores) / len(scores) if scores else 0.0

print("\n=== WORDNET SIMILARITY ===\n")

# All unordered document pairs (i < j) in row-major order.
n_docs = len(clean_docs)
pair_indices = [(a, b) for a in range(n_docs) for b in range(a + 1, n_docs)]

count = 0
# only first 10 pairs (lab requirement)
for i, j in pair_indices[:10]:
    score = sentence_wordnet_similarity(clean_docs[i], clean_docs[j])
    print("Doc 1:", documents[i])
    print("Doc 2:", documents[j])
    print("Score:", round(score, 3))
    print("-" * 60)
    count += 1


=== WORDNET SIMILARITY ===

Doc 1: India won the cricket match yesterday.
Doc 2: The football team scored three goals.
Score: 0.164
------------------------------------------------------------
Doc 1: India won the cricket match yesterday.
Doc 2: Virat Kohli played an amazing innings.
Score: 0.151
------------------------------------------------------------
Doc 1: India won the cricket match yesterday.
Doc 2: The Olympic games include many sports.
Score: 0.165
------------------------------------------------------------
Doc 1: India won the cricket match yesterday.
Doc 2: The tennis player won the championship.
Score: 0.226
------------------------------------------------------------
Doc 1: India won the cricket match yesterday.
Doc 2: The election results were announced today.
Score: 0.177
------------------------------------------------------------
Doc 1: India won the cricket match yesterday.
Doc 2: The government passed a new policy.
Score: 0.158
-----------------------------------

METHOD 4 — Compare All Three for One Pair

In [69]:
# Indices of the document pair to compare with all three measures.
i = 0
j = 2

# The TF-IDF cosine matrix is named `cos_sim` where it is built; the previous
# `cosine_sim` was never defined in this notebook and raised NameError on a
# fresh kernel.
print("Cosine:", cos_sim[i][j])
print("Jaccard:", jaccard(clean_docs[i],
                                      clean_docs[j]))
print("WordNet:", sentence_wordnet_similarity(clean_docs[i],
                                              clean_docs[j]))

Cosine: 0.0
Jaccard: 0.0
WordNet: 0.15098595576536752
