<a href="https://colab.research.google.com/github/2403a52225-maker/NLP/blob/main/Assignment_7_4_2403A52225_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Import Libraries**

In [5]:

import numpy as np
import pandas as pd
import nltk
import string
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer


**Create Dataset**

In [6]:
# Hand-built evaluation set of 30 sentence pairs in three groups of ten:
# identical pairs, paraphrase pairs, and unrelated pairs (in that order).

# Rows 0-9: the same sentence appears in both columns.
identical_sentences = [
    "The cat is sleeping on the mat.",
    "He is driving a car.",
    "The weather is very cold today.",
    "A doctor treated the patient.",
    "The sun rises in the east.",
    "She bought a new laptop.",
    "The boy is playing football.",
    "The food tastes delicious.",
    "The teacher explained the lesson.",
    "The movie was very interesting.",
]
identical_pairs = [(sentence, sentence) for sentence in identical_sentences]

# Rows 10-19: paraphrased
paraphrase_pairs = [
    ("The cat sleeps on the mat.", "The cat is on the mat sleeping."),
    ("He drives a vehicle.", "He is operating a car."),
    ("It is extremely cold today.", "Today the weather is very chilly."),
    ("A physician treated the sick person.", "The doctor helped the patient recover."),
    ("The sun always rises from the east.", "The east is where the sun rises."),
    ("She purchased a new computer.", "She got herself a new laptop."),
    ("The child is playing soccer.", "The boy enjoys playing football."),
    ("The meal is very tasty.", "The food is really tasty."),
    ("The instructor explained the topic.", "The teacher taught the lesson."),
    ("The film was really engaging.", "The movie was quite interesting."),
]

# Rows 20-29: unrelated
unrelated_pairs = [
    ("The airplane is flying high.", "My brother is reading a book."),
    ("I love eating mangoes.", "The computer is running slow."),
    ("The river flows through the forest.", "The baby is crying loudly."),
    ("My phone battery is low.", "The mountain is covered with snow."),
    ("The train arrived late.", "The shopkeeper sold all vegetables."),
    ("Birds are chirping in the morning.", "The dog is sleeping peacefully."),
    ("He is learning guitar.", "The laptop screen is broken."),
    ("The shop is closed today.", "The flowers bloomed beautifully."),
    ("The festival was celebrated grandly.", "The car engine stopped working."),
    ("The dog barked loudly.", "The stars shine brightly at night."),
]

sentence_pairs = identical_pairs + paraphrase_pairs + unrelated_pairs

# Split the pairs back into the two parallel columns the rest of the
# notebook expects.
data = {
    "sentence1": [first for first, _ in sentence_pairs],
    "sentence2": [second for _, second in sentence_pairs],
}

df = pd.DataFrame(data)
df.head()



Unnamed: 0,sentence1,sentence2
0,The cat is sleeping on the mat.,The cat is sleeping on the mat.
1,He is driving a car.,He is driving a car.
2,The weather is very cold today.,The weather is very cold today.
3,A doctor treated the patient.,A doctor treated the patient.
4,The sun rises in the east.,The sun rises in the east.


**Preprocessing**

In [7]:
# Download the NLTK resources the preprocessing step relies on.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Shared helpers used by preprocess(): the lemmatizer and the English
# stop-word list (held as a set for fast membership tests).
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Normalise a sentence into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase the text, drop every character that is not
    a-z or whitespace, split on whitespace, discard tokens found in
    ``stop_words``, and lemmatize what remains.

    Note: ``lemmatize`` is called without a ``pos`` argument, so tokens are
    lemmatized with the lemmatizer's default part of speech.

    Args:
        text: The raw sentence (a ``str``).

    Returns:
        The cleaned tokens joined by single spaces; an empty string when no
        token survives.
    """
    lowered = text.lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)

    kept = []
    for word in letters_only.split():
        if word in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(word))

    return " ".join(kept)

# Clean both sentence columns with the same pipeline so the similarity
# measures below compare like with like.
for raw_col, clean_col in (("sentence1", "clean1"), ("sentence2", "clean2")):
    df[clean_col] = df[raw_col].apply(preprocess)

df.head()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,sentence1,sentence2,clean1,clean2
0,The cat is sleeping on the mat.,The cat is sleeping on the mat.,cat sleeping mat,cat sleeping mat
1,He is driving a car.,He is driving a car.,driving car,driving car
2,The weather is very cold today.,The weather is very cold today.,weather cold today,weather cold today
3,A doctor treated the patient.,A doctor treated the patient.,doctor treated patient,doctor treated patient
4,The sun rises in the east.,The sun rises in the east.,sun rise east,sun rise east


**TF-IDF Representation**

In [8]:
# Stack both cleaned columns into one corpus so a single vocabulary and IDF
# weighting is fitted for all sentences. Rows 0..len(df)-1 of the resulting
# matrix come from clean1, rows len(df)..2*len(df)-1 from clean2.
combined = [*df["clean1"], *df["clean2"]]

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(combined)

**Cosine Similarity**

In [9]:
# The TF-IDF matrix holds the clean1 rows first and the clean2 rows after
# them, so the partner of row k sits at row k + n_pairs.
n_pairs = len(df)

cosine_scores = [
    cosine_similarity(tfidf_matrix[row], tfidf_matrix[row + n_pairs])[0][0]
    for row in range(n_pairs)
]

df["cosine_similarity"] = cosine_scores
df[["sentence1", "sentence2", "cosine_similarity"]].head(10)


Unnamed: 0,sentence1,sentence2,cosine_similarity
0,The cat is sleeping on the mat.,The cat is sleeping on the mat.,1.0
1,He is driving a car.,He is driving a car.,1.0
2,The weather is very cold today.,The weather is very cold today.,1.0
3,A doctor treated the patient.,A doctor treated the patient.,1.0
4,The sun rises in the east.,The sun rises in the east.,1.0
5,She bought a new laptop.,She bought a new laptop.,1.0
6,The boy is playing football.,The boy is playing football.,1.0
7,The food tastes delicious.,The food tastes delicious.,1.0
8,The teacher explained the lesson.,The teacher explained the lesson.,1.0
9,The movie was very interesting.,The movie was very interesting.,1.0


**Jaccard Similarity**

In [10]:
def jaccard_similarity(s1, s2):
    """Return the Jaccard similarity of two whitespace-tokenised strings.

    Each string is split on whitespace and reduced to its set of unique
    tokens; the score is ``|A & B| / |A | B|``.

    Args:
        s1: First string of space-separated tokens.
        s2: Second string of space-separated tokens.

    Returns:
        A float in ``[0.0, 1.0]``. When both strings contain no tokens (for
        example, both sentences consisted only of stop words) the union is
        empty and 0.0 is returned instead of raising ``ZeroDivisionError``.
    """
    set1 = set(s1.split())
    set2 = set(s2.split())
    union = set1 | set2

    # Guard the 0/0 case: two empty token sets carry no evidence of
    # similarity, so report 0.0 rather than crashing.
    if not union:
        return 0.0

    return len(set1 & set2) / len(union)

# Score every cleaned sentence pair, one value per row of df.
df["jaccard_similarity"] = [
    jaccard_similarity(first, second)
    for first, second in zip(df["clean1"], df["clean2"])
]

df[["sentence1", "sentence2", "jaccard_similarity"]].head(10)


Unnamed: 0,sentence1,sentence2,jaccard_similarity
0,The cat is sleeping on the mat.,The cat is sleeping on the mat.,1.0
1,He is driving a car.,He is driving a car.,1.0
2,The weather is very cold today.,The weather is very cold today.,1.0
3,A doctor treated the patient.,A doctor treated the patient.,1.0
4,The sun rises in the east.,The sun rises in the east.,1.0
5,She bought a new laptop.,She bought a new laptop.,1.0
6,The boy is playing football.,The boy is playing football.,1.0
7,The food tastes delicious.,The food tastes delicious.,1.0
8,The teacher explained the lesson.,The teacher explained the lesson.,1.0
9,The movie was very interesting.,The movie was very interesting.,1.0


**WordNet Similarity**

In [11]:
def wordnet_sentence_similarity(s1, s2):
    """Score how similar ``s1`` is to ``s2`` using WordNet ``wup_similarity``.

    Every token is represented by the first synset WordNet returns for it;
    tokens without any synset are ignored. For each remaining token of
    ``s1`` the highest ``wup_similarity`` against the tokens of ``s2`` is
    taken, and those best scores are averaged. Tokens of ``s1`` whose best
    score is zero or missing do not count towards the average.

    The measure is directional: it iterates over the tokens of ``s1`` only,
    so swapping the arguments can give a different value.

    Args:
        s1: First string of space-separated tokens.
        s2: Second string of space-separated tokens.

    Returns:
        The mean best-match score, or 0 when no token of ``s1`` obtained a
        non-zero score.
    """

    def _first_synsets(sentence):
        # Look up each token once and keep only its first synset.
        firsts = []
        for word in sentence.split():
            senses = wordnet.synsets(word)
            if senses:
                firsts.append(senses[0])
        return firsts

    # Resolve the synsets of both sentences up front: the s2 lookups do not
    # depend on the current s1 token, so there is no need to repeat them
    # inside the outer loop.
    senses1 = _first_synsets(s1)
    senses2 = _first_synsets(s2)

    total_score = 0
    count = 0

    for sense1 in senses1:
        best_score = 0
        for sense2 in senses2:
            score = sense1.wup_similarity(sense2)
            # wup_similarity may yield None; the truthiness test skips it.
            if score and score > best_score:
                best_score = score

        if best_score:
            total_score += best_score
            count += 1

    return total_score / count if count > 0 else 0

# Score every cleaned sentence pair, one value per row of df.
df["wordnet_similarity"] = [
    wordnet_sentence_similarity(first, second)
    for first, second in zip(df["clean1"], df["clean2"])
]

df[["sentence1", "sentence2", "wordnet_similarity"]].head(10)


Unnamed: 0,sentence1,sentence2,wordnet_similarity
0,The cat is sleeping on the mat.,The cat is sleeping on the mat.,1.0
1,He is driving a car.,He is driving a car.,1.0
2,The weather is very cold today.,The weather is very cold today.,1.0
3,A doctor treated the patient.,A doctor treated the patient.,1.0
4,The sun rises in the east.,The sun rises in the east.,1.0
5,She bought a new laptop.,She bought a new laptop.,1.0
6,The boy is playing football.,The boy is playing football.,1.0
7,The food tastes delicious.,The food tastes delicious.,1.0
8,The teacher explained the lesson.,The teacher explained the lesson.,1.0
9,The movie was very interesting.,The movie was very interesting.,1.0


**Comparison Summary**

In [12]:
# Descriptive statistics for the three similarity measures, side by side.
similarity_columns = ["cosine_similarity", "jaccard_similarity", "wordnet_similarity"]
df[similarity_columns].describe()

Unnamed: 0,cosine_similarity,jaccard_similarity,wordnet_similarity
count,30.0,30.0,30.0
mean,0.418399,0.402222,0.698335
std,0.459116,0.459734,0.277002
min,0.0,0.0,0.27862
25%,0.0,0.0,0.422146
50%,0.234988,0.183333,0.711364
75%,1.0,1.0,1.0
max,1.0,1.0,1.0
