<a href="https://colab.research.google.com/github/BandaAkshitha/Natural-Language-Processing/blob/main/NLP_lab07.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import re
import string

import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer

# Similarity measures
from sklearn.metrics.pairwise import cosine_similarity
# Fetch every NLTK resource this notebook relies on in one place, so a fresh
# kernel can run top to bottom. nltk.download() reports packages that are
# already up to date instead of failing, so re-running this cell is safe.
NLTK_RESOURCES = (
    "punkt",       # tokenizer models (looked up by older NLTK releases)
    "punkt_tab",   # tokenizer tables that newer NLTK releases look up instead
    "stopwords",   # stop-word lists, including English
    "wordnet",     # WordNet database for lemmatization and synset lookups
    "omw-1.4",     # Open Multilingual WordNet data used alongside wordnet
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

Dataset

In [14]:
# Toy corpus: six one-sentence documents for each of four topics.
_docs_by_topic = {
    "Sports": [
        "The football team won the championship match.",
        "Cricket players practice daily for tournaments.",
        "The athlete broke the world record in sprinting.",
        "Basketball requires speed and teamwork.",
        "The tennis match was exciting and competitive.",
        "Olympic games bring athletes from around the world.",
    ],
    "Politics": [
        "The government passed a new education policy.",
        "The election campaign focused on economic reforms.",
        "Parliament debated the new healthcare bill.",
        "The president addressed the nation on security.",
        "Political parties are preparing for elections.",
        "The minister announced new tax regulations.",
    ],
    "Health": [
        "Doctors recommend regular exercise for good health.",
        "A balanced diet improves overall well-being.",
        "The hospital introduced advanced medical equipment.",
        "Vaccination prevents serious diseases.",
        "Mental health awareness is increasing globally.",
        "Nurses provide essential patient care.",
    ],
    "Technology": [
        "Artificial intelligence is transforming industries.",
        "Cybersecurity protects systems from hackers.",
        "Machine learning improves data analysis.",
        "Software development requires programming skills.",
        "Cloud computing enables online storage solutions.",
        "Smartphones have advanced communication technology.",
    ],
}

# Flatten in topic order, so rows 0-5 are Sports, 6-11 Politics,
# 12-17 Health and 18-23 Technology.
documents = [
    sentence
    for topic_sentences in _docs_by_topic.values()
    for sentence in topic_sentences
]

df = pd.DataFrame(documents, columns=["Text"])
print(df)

                                                 Text
0       The football team won the championship match.
1     Cricket players practice daily for tournaments.
2    The athlete broke the world record in sprinting.
3             Basketball requires speed and teamwork.
4      The tennis match was exciting and competitive.
5   Olympic games bring athletes from around the w...
6       The government passed a new education policy.
7   The election campaign focused on economic refo...
8         Parliament debated the new healthcare bill.
9     The president addressed the nation on security.
10     Political parties are preparing for elections.
11        The minister announced new tax regulations.
12  Doctors recommend regular exercise for good he...
13       A balanced diet improves overall well-being.
14  The hospital introduced advanced medical equip...
15             Vaccination prevents serious diseases.
16    Mental health awareness is increasing globally.
17             Nurses provid

Text Preprocessing

In [15]:
# Extra tokenizer resource, fetched right before tokenization is first used.
nltk.download("punkt_tab")

# Shared by preprocess(): the English stop words as a set for fast membership
# checks, and a single reusable WordNet lemmatizer instance.
stop_words = {*stopwords.words("english")}
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Normalize one document into a space-joined string of lemmas.

    The text is lower-cased, every character that is neither an ASCII
    letter nor whitespace is deleted, and the remainder is tokenized with
    ``word_tokenize``. Tokens found in the module-level ``stop_words`` set
    are dropped and each surviving token is passed through the module-level
    ``lemmatizer``.

    Args:
        text: Raw document string.

    Returns:
        The cleaned tokens joined by single spaces ("" if nothing survives).
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(letters_only)
        if token not in stop_words
    ]
    return " ".join(lemmas)

# Store the cleaned version of every sentence next to the raw text.
df["Processed_Text"] = df["Text"].map(preprocess)
print(df)

                                                 Text  \
0       The football team won the championship match.   
1     Cricket players practice daily for tournaments.   
2    The athlete broke the world record in sprinting.   
3             Basketball requires speed and teamwork.   
4      The tennis match was exciting and competitive.   
5   Olympic games bring athletes from around the w...   
6       The government passed a new education policy.   
7   The election campaign focused on economic refo...   
8         Parliament debated the new healthcare bill.   
9     The president addressed the nation on security.   
10     Political parties are preparing for elections.   
11        The minister announced new tax regulations.   
12  Doctors recommend regular exercise for good he...   
13       A balanced diet improves overall well-being.   
14  The hospital introduced advanced medical equip...   
15             Vaccination prevents serious diseases.   
16    Mental health awareness i

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


TF-IDF Representation

In [6]:
# Build a TF-IDF vectorizer with scikit-learn's default settings.
vectorizer = TfidfVectorizer()
# Fit on the preprocessed documents and encode them in one step; the
# resulting matrix is the input to the cosine-similarity cell below.
tfidf_matrix = vectorizer.fit_transform(df["Processed_Text"])

Cosine Similarity

In [17]:
# Pairwise cosine similarity between all TF-IDF document vectors; entry
# [i, j] compares document i with document j.
cosine_sim = cosine_similarity(tfidf_matrix)

print("Cosine Similarity between Doc 0 and Doc 1:", cosine_sim[0, 1])

# Display similarity matrix. A bare expression is rendered only when it is
# the last statement of the cell, so it must come after the print call.
cosine_df = pd.DataFrame(cosine_sim)
cosine_df

Cosine Similarity between Doc 0 and Doc 1: 0.0


Jaccard Similarity

In [18]:
def jaccard_similarity(text1, text2):
    """Return the Jaccard similarity of two whitespace-tokenized texts.

    Each text is split on whitespace and reduced to its set of unique
    tokens; the score is the size of the intersection divided by the size
    of the union of the two sets.

    Args:
        text1: First document as a string of whitespace-separated tokens.
        text2: Second document in the same form.

    Returns:
        A float between 0.0 and 1.0. When neither text contains a token
        (for example, documents that were entirely stop words before
        preprocessing) the union is empty and the ratio is undefined; 0.0 is
        returned in that case instead of raising ``ZeroDivisionError``.
    """
    tokens1 = set(text1.split())
    tokens2 = set(text2.split())

    union = tokens1 | tokens2
    if not union:
        # Nothing to compare: avoid dividing by zero.
        return 0.0
    return len(tokens1 & tokens2) / len(union)

# Compare the first two preprocessed documents (rows 0 and 1).
print(
    "Jaccard Similarity between Doc 0 and Doc 1:",
    jaccard_similarity(df.loc[0, "Processed_Text"], df.loc[1, "Processed_Text"]),
)

Jaccard Similarity between Doc 0 and Doc 1: 0.0


WordNet Semantic Similarity

In [19]:
def wordnet_similarity(word1, word2):
    """Return the best Wu-Palmer score between any senses of two words.

    Every WordNet synset of ``word1`` is compared with every synset of
    ``word2`` via ``wup_similarity`` and the largest score is returned.
    Falsy scores (``None`` or zero) are ignored; 0 is returned when either
    word has no synsets or no pair produces a positive score.
    """
    senses1 = wordnet.synsets(word1)
    senses2 = wordnet.synsets(word2)

    pair_scores = (a.wup_similarity(b) for a in senses1 for b in senses2)
    # Seed with 0 so an empty or all-falsy set of scores yields 0.
    return max([0, *(score for score in pair_scores if score)])

# Single check on one word pair before scoring the wider list below.
print("Similarity between doctor and physician:",
      wordnet_similarity("doctor", "physician"))
# Word pairs to score with wordnet_similarity; the first entry repeats the
# pair printed above.
pairs = [
    ("doctor", "physician"),
    ("government", "minister"),
    ("football", "basketball"),
    ("computer", "technology"),
    ("diet", "health")
]

# One output line per pair: "<word1> - <word2> : <score>".
for w1, w2 in pairs:
    print(f"{w1} - {w2} :", wordnet_similarity(w1, w2))

Similarity between doctor and physician: 1.0
doctor - physician : 1.0
government - minister : 0.5555555555555556
football - basketball : 0.9
computer - technology : 0.14285714285714285
diet - health : 0.2857142857142857
