<a href="https://colab.research.google.com/github/2403a52241-svg/NLP/blob/main/lab_7_2241_b09_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# Import Required Libraries


import pandas as pd
import numpy as np
import string
import nltk

from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download required NLTK resources (a package that is already present is
# reported as up to date rather than fetched again).
nltk.download('punkt')      # tokenizer models used by word_tokenize
nltk.download('stopwords')  # stop-word lists read via stopwords.words("english")
nltk.download('wordnet')    # WordNet data for WordNetLemmatizer and wordnet.synsets
nltk.download('punkt_tab')  # NOTE(review): presumably needed by word_tokenize on newer NLTK releases — confirm


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [6]:
# Prepare Dataset


# Evaluation set of (Sentence1, Sentence2) tuples, grouped by how closely the
# two sentences are expected to match: identical, paraphrased, unrelated.
sentence_pairs = [

# -------- 10 Identical --------
("The cat is sleeping on the sofa.",
 "The cat is sleeping on the sofa."),

("I love machine learning.",
 "I love machine learning."),

("Python is a popular programming language.",
 "Python is a popular programming language."),

("She enjoys reading books.",
 "She enjoys reading books."),

("The sun rises in the east.",
 "The sun rises in the east."),

("Artificial intelligence is the future.",
 "Artificial intelligence is the future."),

("The dog barked loudly.",
 "The dog barked loudly."),

("He plays cricket every day.",
 "He plays cricket every day."),

("Water boils at 100 degrees Celsius.",
 "Water boils at 100 degrees Celsius."),

("The train arrived on time.",
 "The train arrived on time."),

# -------- 10 Paraphrased (same meaning, different wording) --------
 ("The doctor treated the patient.",
 "The physician helped the sick person."),

("I am happy today.",
 "I feel joyful this day."),

("She bought a new car.",
 "She purchased a brand new vehicle."),

("He is very intelligent.",
 "He is extremely smart."),

("The child is playing in the garden.",
 "The kid is playing outside in the yard."),

("The meeting was canceled.",
 "The conference was called off."),

("He completed the task quickly.",
 "He finished the job rapidly."),

("The movie was interesting.",
 "The film was very engaging."),

("She is a good teacher.",
 "She is an excellent instructor."),

("The weather is very cold.",
 "It is freezing outside."),

# -------- 9 Unrelated (the list holds 29 pairs in total) --------
("I love playing football.",
 "The sun is very bright today."),

("Machine learning is fascinating.",
 "The pizza tastes delicious."),

("She went to the market.",
 "The computer crashed suddenly."),
 ("He likes swimming.",
 "Mathematics is a complex subject."),

("The baby is crying.",
 "Cars are parked outside."),

("I enjoy coding in Python.",
 "The mountain is very tall."),

("The teacher explained the lesson.",
 "Ice cream melts quickly."),

("The phone battery is low.",
 "The river flows rapidly."),

("Students are studying in the library.",
 "The airplane landed safely.")
]

# One row per pair; the default integer index follows the list order above.
df = pd.DataFrame(sentence_pairs, columns=["Sentence1", "Sentence2"])
df.head()


Unnamed: 0,Sentence1,Sentence2
0,The cat is sleeping on the sofa.,The cat is sleeping on the sofa.
1,I love machine learning.,I love machine learning.
2,Python is a popular programming language.,Python is a popular programming language.
3,She enjoys reading books.,She enjoys reading books.
4,The sun rises in the east.,The sun rises in the east.


In [9]:
# English stop words held as a set; preprocess() tests every token against it.
stop_words = set(stopwords.words("english"))
# Single lemmatizer instance reused by preprocess() for every token.
lemmatizer = WordNetLemmatizer()
def preprocess(text):
    """Normalize one sentence into a space-separated string of lemmas.

    Steps, in order: lowercase, delete every character listed in
    ``string.punctuation``, tokenize with ``word_tokenize``, discard tokens
    found in the module-level ``stop_words`` set, and lemmatize the remaining
    tokens with the module-level ``lemmatizer`` (called without a POS
    argument).

    Args:
        text: The raw sentence.

    Returns:
        The surviving lemmas joined by single spaces; an empty string when
        no token survives.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    normalized = text.lower().translate(strip_punctuation)
    content_tokens = (
        token for token in word_tokenize(normalized) if token not in stop_words
    )
    return " ".join(lemmatizer.lemmatize(token) for token in content_tokens)
# Add a cleaned counterpart for each raw sentence column.
for raw_column, clean_column in (("Sentence1", "Clean1"), ("Sentence2", "Clean2")):
    df[clean_column] = df[raw_column].apply(preprocess)

df.head()

Unnamed: 0,Sentence1,Sentence2,Clean1,Clean2
0,The cat is sleeping on the sofa.,The cat is sleeping on the sofa.,cat sleeping sofa,cat sleeping sofa
1,I love machine learning.,I love machine learning.,love machine learning,love machine learning
2,Python is a popular programming language.,Python is a popular programming language.,python popular programming language,python popular programming language
3,She enjoys reading books.,She enjoys reading books.,enjoys reading book,enjoys reading book
4,The sun rises in the east.,The sun rises in the east.,sun rise east,sun rise east


In [10]:
# Fit one TF-IDF model on the cleaned text of both columns stacked together,
# so a single fitted vectorizer (one vocabulary) covers the sentences on
# either side of every pair.
vectorizer = TfidfVectorizer()
all_sentences = pd.concat([df["Clean1"], df["Clean2"]])
# Left as the cell's last expression, so the fitted estimator is displayed.
vectorizer.fit(all_sentences)



in a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

In [11]:
# Cosine similarity between the TF-IDF vectors of the two sentences in each pair.
#
# Each column is vectorized with one batched transform() call rather than one
# call per sentence. Matrix rows follow the column's row order, so row k of
# both matrices belongs to the k-th pair. Rows are then addressed by position,
# which keeps the result correct even if df does not have the default
# 0..n-1 index (label lookups such as df["Clean1"][i] would not).
tfidf_first = vectorizer.transform(df["Clean1"])
tfidf_second = vectorizer.transform(df["Clean2"])

cosine_scores = [
    # cosine_similarity returns a 1x1 matrix for two single-row inputs.
    cosine_similarity(tfidf_first[row], tfidf_second[row])[0][0]
    for row in range(tfidf_first.shape[0])
]

df["Cosine_Similarity"] = cosine_scores
df[["Sentence1","Sentence2","Cosine_Similarity"]].head(10)


Unnamed: 0,Sentence1,Sentence2,Cosine_Similarity
0,The cat is sleeping on the sofa.,The cat is sleeping on the sofa.,1.0
1,I love machine learning.,I love machine learning.,1.0
2,Python is a popular programming language.,Python is a popular programming language.,1.0
3,She enjoys reading books.,She enjoys reading books.,1.0
4,The sun rises in the east.,The sun rises in the east.,1.0
5,Artificial intelligence is the future.,Artificial intelligence is the future.,1.0
6,The dog barked loudly.,The dog barked loudly.,1.0
7,He plays cricket every day.,He plays cricket every day.,1.0
8,Water boils at 100 degrees Celsius.,Water boils at 100 degrees Celsius.,1.0
9,The train arrived on time.,The train arrived on time.,1.0


In [12]:
def jaccard_similarity(text1, text2):
    """Jaccard index of the whitespace-separated token sets of two strings.

    Args:
        text1: First text; split on whitespace, duplicate tokens ignored.
        text2: Second text, handled the same way.

    Returns:
        ``|A ∩ B| / |A ∪ B|`` as a float, or the integer ``0`` when neither
        text contains any token.
    """
    tokens_a = set(text1.split())
    tokens_b = set(text2.split())
    combined = tokens_a | tokens_b
    if not combined:
        # Both inputs are empty: avoid dividing by zero.
        return 0
    return len(tokens_a & tokens_b) / len(combined)
# Jaccard score of every pair, computed on the cleaned text.
# zip() walks the two columns by position, so this does not rely on df having
# the default 0..n-1 index (label lookups such as df["Clean1"][i] do).
jaccard_scores = [
    jaccard_similarity(clean_first, clean_second)
    for clean_first, clean_second in zip(df["Clean1"], df["Clean2"])
]
df["Jaccard_Similarity"] = jaccard_scores
df[["Sentence1","Sentence2","Jaccard_Similarity"]].head(10)



Unnamed: 0,Sentence1,Sentence2,Jaccard_Similarity
0,The cat is sleeping on the sofa.,The cat is sleeping on the sofa.,1.0
1,I love machine learning.,I love machine learning.,1.0
2,Python is a popular programming language.,Python is a popular programming language.,1.0
3,She enjoys reading books.,She enjoys reading books.,1.0
4,The sun rises in the east.,The sun rises in the east.,1.0
5,Artificial intelligence is the future.,Artificial intelligence is the future.,1.0
6,The dog barked loudly.,The dog barked loudly.,1.0
7,He plays cricket every day.,He plays cricket every day.,1.0
8,Water boils at 100 degrees Celsius.,Water boils at 100 degrees Celsius.,1.0
9,The train arrived on time.,The train arrived on time.,1.0


In [14]:
def wordnet_similarity(word1, word2):
    """Wu-Palmer similarity between the first WordNet synsets of two words.

    Only the first synset returned by ``wordnet.synsets`` is used for each
    word; no other senses are compared.

    Args:
        word1: First word to look up.
        word2: Second word to look up.

    Returns:
        Whatever ``wup_similarity`` returns for the two first synsets, or
        ``None`` when either word has no synsets.
    """
    senses_a = wordnet.synsets(word1)
    senses_b = wordnet.synsets(word2)
    if not (senses_a and senses_b):
        return None
    return senses_a[0].wup_similarity(senses_b[0])
# Print a WordNet-based score for the first 10 pairs.
#
# .head(10) + zip() iterate by position and simply stop early when df has
# fewer than 10 rows, instead of raising KeyError on a missing label.
#
# NOTE: the score is the mean of wordnet_similarity over EVERY combination of
# a word from sentence 1 with a word from sentence 2, not only the best match
# per word. Unrelated word combinations therefore pull the mean down, which is
# why even identical sentences score roughly 0.4-0.5 here rather than 1.0.
print("WordNet Similarity for First 10 Pairs:\n")
first_pairs = zip(df["Clean1"].head(10), df["Clean2"].head(10))
for pair_number, (clean_first, clean_second) in enumerate(first_pairs, start=1):
    words1 = clean_first.split()
    words2 = clean_second.split()
    candidates = (wordnet_similarity(w1, w2) for w1 in words1 for w2 in words2)
    # Skip combinations for which no similarity could be computed.
    scores = [sim for sim in candidates if sim is not None]
    if scores:
        print(f"Pair {pair_number}: ", np.mean(scores))
    else:
        print(f"Pair {pair_number}: 0")

WordNet Similarity for First 10 Pairs:

Pair 1:  0.452300785634119
Pair 2:  0.45608465608465604
Pair 3:  0.3824726078015551
Pair 4:  0.4326167659500993
Pair 5:  0.4444444444444444
Pair 6:  0.5190796857463524
Pair 7:  0.44576719576719576
Pair 8:  0.39244711042311664
Pair 9:  0.3825008675534991
Pair 10:  0.4425925925925926


In [15]:
# Summary statistics of both similarity scores over all sentence pairs.
df[["Cosine_Similarity","Jaccard_Similarity"]].describe()

Unnamed: 0,Cosine_Similarity,Jaccard_Similarity
count,29.0,29.0
mean,0.361787,0.356322
std,0.4753,0.477103
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,1.0,1.0
max,1.0,1.0


In [16]:
# Show the first 15 rows: raw sentences, cleaned text and both similarity scores.
df.head(15)


Unnamed: 0,Sentence1,Sentence2,Clean1,Clean2,Cosine_Similarity,Jaccard_Similarity
0,The cat is sleeping on the sofa.,The cat is sleeping on the sofa.,cat sleeping sofa,cat sleeping sofa,1.0,1.0
1,I love machine learning.,I love machine learning.,love machine learning,love machine learning,1.0,1.0
2,Python is a popular programming language.,Python is a popular programming language.,python popular programming language,python popular programming language,1.0,1.0
3,She enjoys reading books.,She enjoys reading books.,enjoys reading book,enjoys reading book,1.0,1.0
4,The sun rises in the east.,The sun rises in the east.,sun rise east,sun rise east,1.0,1.0
5,Artificial intelligence is the future.,Artificial intelligence is the future.,artificial intelligence future,artificial intelligence future,1.0,1.0
6,The dog barked loudly.,The dog barked loudly.,dog barked loudly,dog barked loudly,1.0,1.0
7,He plays cricket every day.,He plays cricket every day.,play cricket every day,play cricket every day,1.0,1.0
8,Water boils at 100 degrees Celsius.,Water boils at 100 degrees Celsius.,water boil 100 degree celsius,water boil 100 degree celsius,1.0,1.0
9,The train arrived on time.,The train arrived on time.,train arrived time,train arrived time,1.0,1.0
