<a href="https://colab.research.google.com/github/2403A52058/NLP_LABASSIGNMENTS/blob/main/NLP_LAB(07)_2403A52058.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import Required Libraries

In [3]:
# Import basic libraries
import numpy as np          # Numerical computations
import pandas as pd         # Dataset handling
import re                   # Text cleaning

# Import NLP libraries
import nltk
from nltk.corpus import stopwords     # Stopword removal
from nltk.tokenize import word_tokenize  # Tokenization
from nltk.stem import WordNetLemmatizer  # Lemmatization
from nltk.corpus import wordnet       # Semantic similarity

# Import ML libraries
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF
from sklearn.metrics.pairwise import cosine_similarity       # Cosine similarity



Download NLTK Resources

In [11]:
# Fetch the NLTK data packages used by the rest of the notebook
for resource in ("punkt", "stopwords", "wordnet"):
    # punkt: tokenizer, stopwords: stopword lists, wordnet: WordNet database
    nltk.download(resource)

# Left as the cell's final expression so its return value is still displayed
nltk.download("punkt_tab")   # Required for tokenization

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

Create Dataset

In [None]:
# Toy corpus: 20 one-sentence documents, five per topic, listed in topic order
documents = [
    # Sports (rows 0-4)
    "The football team won the championship",
    "Cricket players trained hard for the match",
    "The athlete broke the world record",
    "The coach discussed game strategy",
    "The tournament attracted many fans",

    # Politics (rows 5-9)
    "The government passed a new law",
    "Elections were held across the country",
    "The president addressed the nation",
    "Political parties debated policies",
    "The parliament session was intense",

    # Health (rows 10-14)
    "Doctors recommend regular exercise",
    "The hospital treated many patients",
    "Healthy diet improves immunity",
    "The physician prescribed medicine",
    "Mental health awareness is important",

    # Technology (rows 15-19)
    "Artificial intelligence is transforming industries",
    "The smartphone has advanced features",
    "Cybersecurity protects digital data",
    "Software development requires coding skills",
    "Machine learning improves predictions",
]

# Wrap the corpus in a single-column DataFrame
df = pd.DataFrame(documents, columns=["Document"])

# Preview the first rows
df.head()


Text Preprocessing Function

In [7]:
# Shared preprocessing resources, created once and reused by preprocess_text
lemmatizer = WordNetLemmatizer()                # WordNet lemmatizer instance
stop_words = set(stopwords.words("english"))    # English stopwords held in a set

# Preprocessing function
def preprocess_text(text: str) -> str:
    """Normalize a raw document into a space-joined string of lemmatized tokens.

    Steps: lowercase the text, replace every character other than ``a-z`` and
    whitespace with a space, tokenize, drop tokens found in ``stop_words``,
    and lemmatize each remaining token with ``lemmatizer``.

    Args:
        text: Raw document text.

    Returns:
        The cleaned tokens joined by single spaces (an empty string when no
        token survives).
    """
    text = text.lower()                       # Convert to lowercase
    # Replace (rather than delete) punctuation and digits with a space so that
    # words joined by them, e.g. "state-of-the-art" or "health/fitness", stay
    # separate tokens instead of being fused into a single unknown word.
    text = re.sub(r'[^a-z\s]', ' ', text)
    tokens = word_tokenize(text)              # Tokenize text
    tokens = [
        lemmatizer.lemmatize(w)               # Lemmatize the surviving words
        for w in tokens
        if w not in stop_words                # Remove stopwords
    ]
    return " ".join(tokens)                   # Join tokens


Apply Preprocessing

In [12]:
# Run every raw document through preprocess_text and store the result
df["Cleaned_Document"] = df["Document"].map(preprocess_text)

# Preview raw and cleaned text side by side
df.loc[:, ["Document", "Cleaned_Document"]].head()


Unnamed: 0,Document,Cleaned_Document
0,The football team won the championship,football team championship
1,Cricket players trained hard for the match,cricket player trained hard match
2,The athlete broke the world record,athlete broke world record
3,The coach discussed game strategy,coach discussed game strategy
4,The tournament attracted many fans,tournament attracted many fan


TF-IDF Representation

In [13]:
# Build a TF-IDF vectorizer with default settings
vectorizer = TfidfVectorizer()

# Learn the vocabulary from the cleaned corpus and vectorize it in one step
tfidf_matrix = vectorizer.fit_transform(df.loc[:, "Cleaned_Document"])

# Matrix dimensions: (documents, unique words)
tfidf_matrix.shape


(20, 75)

Cosine Similarity Calculation

In [14]:
# Pairwise cosine similarity between the TF-IDF rows of all documents
cosine_sim = cosine_similarity(tfidf_matrix)

# Print a few sample pairs (indexed as [row, column] of the similarity matrix)
print("Doc 0 vs Doc 1:", cosine_sim[0, 1])       # Sports vs Sports
print("Doc 0 vs Doc 10:", cosine_sim[0, 10])     # Sports vs Health
print("Doc 15 vs Doc 19:", cosine_sim[15, 19])   # Technology vs Technology


Doc 0 vs Doc 1: 0.0
Doc 0 vs Doc 10: 0.0
Doc 15 vs Doc 19: 0.0


Jaccard Similarity Function

In [15]:
# Jaccard similarity function
def jaccard_similarity(doc1: str, doc2: str) -> float:
    """Jaccard similarity between the word sets of two documents.

    Each document is split on whitespace and reduced to its set of unique
    words; the score is ``|intersection| / |union|``.

    Args:
        doc1: First document as a whitespace-separated string.
        doc2: Second document as a whitespace-separated string.

    Returns:
        A value in ``[0.0, 1.0]``. When both documents contain no words the
        union is empty and 0.0 is returned instead of dividing by zero.
    """
    set1 = set(doc1.split())          # Convert document 1 to word set
    set2 = set(doc2.split())          # Convert document 2 to word set
    union = set1 | set2
    if not union:
        # Both documents are empty (e.g. every token was a stopword):
        # nothing to compare, so report no similarity rather than crash.
        return 0.0
    return len(set1 & set2) / len(union)  # Jaccard formula


Jaccard Similarity Output

In [16]:
# Print Jaccard scores for a few sample document pairs
cleaned_docs = df["Cleaned_Document"]

print("Doc 0 vs Doc 1:", jaccard_similarity(cleaned_docs[0], cleaned_docs[1]))
print("Doc 0 vs Doc 10:", jaccard_similarity(cleaned_docs[0], cleaned_docs[10]))
print("Doc 12 vs Doc 14:", jaccard_similarity(cleaned_docs[12], cleaned_docs[14]))


Doc 0 vs Doc 1: 0.0
Doc 0 vs Doc 10: 0.0
Doc 12 vs Doc 14: 0.0


WordNet Semantic Similarity Function

In [17]:
# WordNet similarity using Wu-Palmer method
def wordnet_similarity(word1, word2):
    """Wu-Palmer similarity between the first WordNet synsets of two words.

    Only the first synset returned for each word is compared. Returns 0 when
    either word has no synsets, or when the similarity lookup yields a falsy
    result.
    """
    senses1 = wordnet.synsets(word1)  # All synsets for word1
    senses2 = wordnet.synsets(word2)  # All synsets for word2
    if senses1 and senses2:
        score = senses1[0].wup_similarity(senses2[0])
        return score if score else 0
    return 0                          # A word was not found in WordNet


WordNet Similarity Output

In [18]:
# Print the WordNet similarity for a handful of word pairs
samples = [
    ("doctor - physician:", "doctor", "physician"),
    ("football - sport:", "football", "sport"),
    ("government - politics:", "government", "politics"),
    ("smartphone - device:", "smartphone", "device"),
    ("exercise - health:", "exercise", "health"),
]
for label, first_word, second_word in samples:
    print(label, wordnet_similarity(first_word, second_word))


doctor - physician: 1.0
football - sport: 0.8888888888888888
government - politics: 0.3333333333333333
smartphone - device: 0
exercise - health: 0.2
