<a href="https://colab.research.google.com/github/2403a52030-sketch/NLP-LAB/blob/main/NLP_LAB_7_2403A52030.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK resources this notebook relies on: the punkt tokenizer data,
# the stop-word corpus and the WordNet corpus.
nltk.download('punkt')
nltk.download('stopwords')
# Last expression in the cell, so its return value is what the cell displays.
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
# Toy corpus of 20 one-sentence documents. The sentences appear to be arranged
# in runs of five per topic (sports, politics, health, technology).
documents = [
    # 0-4
    "The football team won the championship match",
    "Cricket players trained hard for the tournament",
    "The athlete broke the world record in sprinting",
    "Basketball requires teamwork and coordination",
    "The coach planned a new strategy for the game",
    # 5-9
    "The government passed a new education policy",
    "Elections were held to choose the new president",
    "The parliament debated economic reforms",
    "Political leaders addressed the nation",
    "The senator proposed a healthcare bill",
    # 10-14
    "Doctors recommend regular exercise for good health",
    "The hospital introduced advanced medical equipment",
    "Mental health awareness is increasing globally",
    "Vaccination helps prevent infectious diseases",
    "A balanced diet improves overall wellbeing",
    # 15-19
    "Artificial intelligence is transforming technology",
    "Cybersecurity is important for data protection",
    "Smartphones use advanced processors",
    "Machine learning improves prediction accuracy",
    "Cloud computing enables scalable applications",
]

# One row per document, in list order, under a single "Text" column.
df = pd.DataFrame(documents, columns=["Text"])
df.head()


Unnamed: 0,Text
0,The football team won the championship match
1,Cricket players trained hard for the tournament
2,The athlete broke the world record in sprinting
3,Basketball requires teamwork and coordination
4,The coach planned a new strategy for the game


In [5]:
# (nltk is also imported in the notebook's first code cell.)
import nltk
# Additional tokenizer resource.
# NOTE(review): presumably needed by word_tokenize on the installed NLTK version - confirm.
nltk.download('punkt_tab')

# English stop words, held in a set; preprocess() uses it for membership tests.
stop_words = set(stopwords.words('english'))
# Shared lemmatizer instance reused by preprocess() for every token.
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Normalise a raw sentence into a space-separated string of lemmas.

    Steps, in order:
      1. lowercase the text;
      2. replace every character that is not ``a-z`` or whitespace with a space;
      3. tokenize with ``word_tokenize``;
      4. drop tokens found in the module-level ``stop_words`` set;
      5. lemmatize each remaining token with the module-level ``lemmatizer``
         (called without a part-of-speech argument);
      6. join the lemmas with single spaces.

    Args:
        text: The raw document as a ``str``.

    Returns:
        The cleaned document as a single string; empty if no token survives.
    """
    text = text.lower()
    # Substitute a space instead of deleting the character: deleting would glue
    # neighbouring words together ("match.the" -> "matchthe") and turn
    # contractions into tokens that the stop-word filter no longer matches.
    text = re.sub(r'[^a-z\s]', ' ', text)
    tokens = word_tokenize(text)
    # Filter first, then lemmatize, so stop-word matching sees the surface form.
    return " ".join(lemmatizer.lemmatize(w) for w in tokens if w not in stop_words)

# Store the cleaned text next to the raw text; this adds a column to df in place.
df["Processed_Text"] = df["Text"].apply(preprocess)
df.head()

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Unnamed: 0,Text,Processed_Text
0,The football team won the championship match,football team championship match
1,Cricket players trained hard for the tournament,cricket player trained hard tournament
2,The athlete broke the world record in sprinting,athlete broke world record sprinting
3,Basketball requires teamwork and coordination,basketball requires teamwork coordination
4,The coach planned a new strategy for the game,coach planned new strategy game


In [6]:
# Fit a TfidfVectorizer (all default settings) on the preprocessed documents
# and transform them in the same call.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df["Processed_Text"])

# Displayed as the cell output: one row per document, one column per learned term.
tfidf_matrix.shape


(20, 88)

In [7]:
# Called with a single matrix: similarities are computed between every pair of
# rows of tfidf_matrix.
cosine_sim = cosine_similarity(tfidf_matrix)
# Wrapped in a DataFrame (default integer labels) purely for tabular display.
cosine_df = pd.DataFrame(cosine_sim)
cosine_df.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.135906,0.135906,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Spot-check a handful of document pairs; each tuple holds the two document
# positions used to index the similarity matrix.
for left, right in [(0, 1), (0, 2), (5, 6), (10, 14), (15, 18)]:
    print(f"Doc {left} & {right}:", cosine_sim[left][right])


Doc 0 & 1: 0.0
Doc 0 & 2: 0.0
Doc 5 & 6: 0.13590619775559398
Doc 10 & 14: 0.0
Doc 15 & 18: 0.0


In [9]:
def jaccard_similarity(text1, text2):
    """Return the Jaccard similarity of the whitespace-separated tokens of two texts.

    Each text is split on whitespace and reduced to its set of distinct tokens;
    the score is ``|A & B| / |A | B|``.

    Args:
        text1: First text (``str``).
        text2: Second text (``str``).

    Returns:
        A float in ``[0.0, 1.0]``. When neither text contains any token the
        union is empty and the ratio is undefined; ``0.0`` is returned in that
        case instead of raising ``ZeroDivisionError``.
    """
    set1 = set(text1.split())
    set2 = set(text2.split())
    union = set1 | set2
    # Two empty/whitespace-only texts share no evidence of similarity.
    if not union:
        return 0.0
    return len(set1 & set2) / len(union)

# Compare document 0 against each of the first five documents (itself included).
doc0_text = df["Processed_Text"][0]
for i in range(5):
    score = jaccard_similarity(doc0_text, df["Processed_Text"][i])
    print(f"Jaccard (Doc 0 & Doc {i}):", score)


Jaccard (Doc 0 & Doc 0): 1.0
Jaccard (Doc 0 & Doc 1): 0.0
Jaccard (Doc 0 & Doc 2): 0.0
Jaccard (Doc 0 & Doc 3): 0.0
Jaccard (Doc 0 & Doc 4): 0.0


In [10]:
def wordnet_similarity(word1, word2):
    """Score two words by the Wu-Palmer similarity of their first WordNet synsets.

    Only the first synset returned for each word is compared; other senses of
    the words are not considered.

    Args:
        word1: First word to look up.
        word2: Second word to look up.

    Returns:
        The result of ``wup_similarity`` between the two first synsets, or
        ``None`` if either word has no synsets.
    """
    first_senses = wordnet.synsets(word1)
    second_senses = wordnet.synsets(word2)
    # Nothing to compare when either lookup comes back empty.
    if not first_senses or not second_senses:
        return None
    return first_senses[0].wup_similarity(second_senses[0])

# Word pairs to score with wordnet_similarity.
pairs = [
    ("doctor", "physician"),
    ("football", "sport"),
    ("hospital", "clinic"),
    ("health", "wellbeing"),
    ("president", "leader"),
]

# One line per pair: "<word1> & <word2> : <score>".
for w1, w2 in pairs:
    score = wordnet_similarity(w1, w2)
    print(f"{w1} & {w2} : {score}")


doctor & physician : 1.0
football & sport : 0.8888888888888888
hospital & clinic : 0.11764705882352941
health & wellbeing : 0.9473684210526315
president & leader : 0.5714285714285714
