<a href="https://colab.research.google.com/github/2403a52029-lab/NLP_LAB-ASSIGNMENTS/blob/main/NLP_LAB_07_2403A52029.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
# Basic libraries
import numpy as np # Numerical computing library
import pandas as pd # Data manipulation and analysis library
import string # String constant module

# NLP libraries
import nltk # Natural Language Toolkit for text processing
from nltk.corpus import stopwords, wordnet # For stop words removal and WordNet lexicon
from nltk.stem import WordNetLemmatizer # For word lemmatization

# Vectorization and similarity
from sklearn.feature_extraction.text import TfidfVectorizer # For converting text to TF-IDF features
from sklearn.metrics.pairwise import cosine_similarity # For calculating cosine similarity between vectors

In [12]:
# Fetch the NLTK data packages this notebook relies on.
# Keep the last call as a bare expression: its return value is what the cell echoes.
nltk.download('punkt') # Tokenizer data; text is tokenized with nltk.word_tokenize later on
nltk.download('stopwords') # Stop-word lists read through nltk.corpus.stopwords
nltk.download('wordnet') # WordNet data used by WordNetLemmatizer and the synset similarity lookup

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [13]:
# Toy corpus: 20 short sentences covering sports, politics, health and technology.
documents = [
    "The cricket team won the match by scoring many runs",
    "Football players trained hard for the tournament",
    "The government announced new economic policies",
    "The election results were declared yesterday",
    "Doctors recommend regular exercise for good health",
    "A balanced diet improves overall fitness",
    "Artificial intelligence is transforming technology",
    "Machine learning improves computer systems",
    "The parliament passed a new healthcare bill",
    "The patient was treated by a skilled physician",
    "Smartphones use advanced processors",
    "Cybersecurity is important in modern technology",
    "Athletes need discipline and practice",
    "Political debates influence public opinion",
    "Hospitals use technology for better diagnosis",
    "New software updates improve performance",
    "Vaccines help prevent serious diseases",
    "Sports require physical and mental strength",
    "The minister addressed the press conference",
    "Wearable devices track health data",
]

# One row per sentence, held in a single 'Text' column.
df = pd.DataFrame(documents, columns=['Text'])
df.head() # Preview the first five rows

Unnamed: 0,Text
0,The cricket team won the match by scoring many...
1,Football players trained hard for the tournament
2,The government announced new economic policies
3,The election results were declared yesterday
4,Doctors recommend regular exercise for good he...


In [14]:
# Shared resources for preprocess(): built once here rather than on every call.
lemmatizer = WordNetLemmatizer() # Reused for every token
nltk.download('punkt_tab') # Tokenizer data fetched before nltk.word_tokenize is used below
stop_words = {*stopwords.words('english')} # Set, so membership checks are hash lookups

def preprocess(text):
    """Normalise a raw sentence into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase, replace punctuation and digits with spaces,
    tokenize with ``nltk.word_tokenize``, drop tokens found in the module-level
    ``stop_words`` set, and lemmatize the rest with the module-level
    ``lemmatizer`` (called without a POS tag).

    Args:
        text: The raw input string.

    Returns:
        The cleaned tokens joined by single spaces; an empty string if every
        token was removed.
    """
    text = text.lower()
    # Map punctuation/digits to spaces instead of deleting them. Deleting fuses
    # neighbouring pieces ("don't" -> "dont", "well-known" -> "wellknown"), and
    # fused contractions slip past the stop-word filter, whereas the split
    # fragments ("don", "t", "s", "ll", ...) are entries in NLTK's English list.
    strip_chars = string.punctuation + string.digits
    text = text.translate(str.maketrans(strip_chars, ' ' * len(strip_chars)))
    tokens = nltk.word_tokenize(text)
    # Filter first, then lemmatize, so only surviving tokens are lemmatized.
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

# Run every sentence through preprocess() and keep the result next to the original text.
df['Clean_Text'] = df['Text'].map(preprocess)
df.head() # Preview the raw and cleaned text side by side

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,Text,Clean_Text
0,The cricket team won the match by scoring many...,cricket team match scoring many run
1,Football players trained hard for the tournament,football player trained hard tournament
2,The government announced new economic policies,government announced new economic policy
3,The election results were declared yesterday,election result declared yesterday
4,Doctors recommend regular exercise for good he...,doctor recommend regular exercise good health


In [15]:
# Learn the vocabulary and IDF weights from the cleaned corpus, then encode that
# same corpus as a TF-IDF matrix (one row per document).
vectorizer = TfidfVectorizer()
vectorizer.fit(df['Clean_Text'])
tfidf_matrix = vectorizer.transform(df['Clean_Text'])

In [16]:
# Pairwise cosine similarity of every TF-IDF row against every other row.
cosine_sim = cosine_similarity(X=tfidf_matrix)

# Wrap the matrix in a DataFrame so it renders as a table.
cosine_df = pd.DataFrame(data=cosine_sim)
cosine_df.head() # Preview the first five rows of the similarity matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.135906,0.0,0.0,0.0,0.0,0.0,0.0,0.135906,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.147205


In [17]:
def jaccard_similarity(text1, text2):
    """Return the Jaccard similarity of two whitespace-tokenized texts.

    Each text is split on whitespace and reduced to its set of distinct
    words; the score is ``|A & B| / |A | B|``.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        A float in ``[0.0, 1.0]``. When neither text contains any word
        (both empty or whitespace-only) the ratio is undefined, and 0.0 is
        returned instead of raising ``ZeroDivisionError``.
    """
    set1 = set(text1.split())
    set2 = set(text2.split())
    union = set1 | set2
    if not union:
        # Both texts are empty, e.g. documents made up only of stop words
        # after cleaning. Treat "nothing to compare" as no similarity.
        return 0.0
    return len(set1 & set2) / len(union)

# Score every unordered pair of documents exactly once (i < j), which skips
# self-comparisons and mirrored duplicates.
clean_texts = df['Clean_Text']
doc_count = len(df)
jaccard_scores = [
    (i, j, jaccard_similarity(clean_texts[i], clean_texts[j]))
    for i in range(doc_count)
    for j in range(i + 1, doc_count)
]

# Long-format table: one row per document pair.
jaccard_df = pd.DataFrame(jaccard_scores, columns=['Doc1', 'Doc2', 'Jaccard_Score'])
jaccard_df.head() # Preview the first five pairs

Unnamed: 0,Doc1,Doc2,Jaccard_Score
0,0,1,0.0
1,0,2,0.0
2,0,3,0.0
3,0,4,0.0
4,0,5,0.0


In [18]:
def wordnet_similarity(word1, word2):
    """Wu-Palmer similarity between the first WordNet synsets of two words.

    Only the first synset returned for each word is compared; other senses
    are ignored.

    Args:
        word1: First word to look up.
        word2: Second word to look up.

    Returns:
        The result of ``wup_similarity`` on the two first synsets, or
        ``None`` when either word has no synsets.
    """
    first_senses = wordnet.synsets(word1)
    second_senses = wordnet.synsets(word2)
    # Bail out early if either lookup came back empty.
    if not first_senses or not second_senses:
        return None
    return first_senses[0].wup_similarity(second_senses[0])

# Word pairs to score with WordNet.
pairs = [
    ("doctor", "physician"),
    ("health", "fitness"),
    ("government", "minister"),
    ("technology", "software"),
    ("sports", "athletes"),
    ("hospital", "patient"),
    ("computer", "machine"),
    ("disease", "illness"),
    ("election", "vote"),
    ("diet", "nutrition"),
]

# Print one "word - word : score" line per pair.
for first_word, second_word in pairs:
    similarity = wordnet_similarity(first_word, second_word)
    print(f"{first_word} - {second_word} : {similarity}")

doctor - physician : 1.0
health - fitness : 0.375
government - minister : 0.13333333333333333
technology - software : 0.23529411764705882
sports - athletes : 0.14285714285714285
hospital - patient : 0.4444444444444444
computer - machine : 0.9411764705882353
disease - illness : 0.9473684210526315
election - vote : 0.625
diet - nutrition : 0.3333333333333333
