# Imports

In [6]:
import nltk
from nltk.corpus import wordnet
from nltk.wsd import lesk

# Question 1

In [7]:
nltk.download('wordnet')

# Function to calculate semantic similarity between two words
def word_similarity(word1, word2):
    # Get the synsets for each word
    synsets1 = wordnet.synsets(word1)
    synsets2 = wordnet.synsets(word2)
    
    if not synsets1 or not synsets2:
        return None
    
    max_similarity = 0.0
    
    # Calculate the similarity between synsets
    for synset1 in synsets1:
        for synset2 in synsets2:
            similarity = synset1.wup_similarity(synset2)  # Wu-Palmer Similarity
            if similarity is not None and similarity > max_similarity:
                max_similarity = similarity
    
    return max_similarity

# Example usage
word1 = "car"
word2 = "vehicle"
similarity = word_similarity(word1, word2)
if similarity is not None:
    print(f"Semantic similarity between '{word1}' and '{word2}': {similarity:.2f}")
else:
    print(f"No synsets found for one or both words.")

# Example to find synonyms (synsets), hyponyms, and hypernyms for a word
word = "car"
synsets = wordnet.synsets(word)
if synsets:
    print(f"Synsets for '{word}':")
    for synset in synsets:
        print(f"- {synset.name()}: {synset.definition()}")
    
    # Hyponyms (more specific terms)
    hyponyms = synsets[0].hyponyms()
    if hyponyms:
        print(f"\nHyponyms (more specific terms) for '{word}':")
        for hyponym in hyponyms:
            print(f"- {hyponym.name()}: {hyponym.definition()}")
    
    # Hypernyms (more general terms)
    hypernyms = synsets[0].hypernyms()
    if hypernyms:
        print(f"\nHypernyms (more general terms) for '{word}':")
        for hypernym in hypernyms:
            print(f"- {hypernym.name()}: {hypernym.definition()}")
else:
    print(f"No synsets found for '{word}'.")

Semantic similarity between 'car' and 'vehicle': 0.89
Synsets for 'car':
- car.n.01: a motor vehicle with four wheels; usually propelled by an internal combustion engine
- car.n.02: a wheeled vehicle adapted to the rails of railroad
- car.n.03: the compartment that is suspended from an airship and that carries personnel and the cargo and the power plant
- car.n.04: where passengers ride up and down
- cable_car.n.01: a conveyance for passengers or freight on a cable railway

Hyponyms (more specific terms) for 'car':
- ambulance.n.01: a vehicle that takes people to and from hospitals
- beach_wagon.n.01: a car that has a long body and rear door with space behind rear seat
- bus.n.04: a car that is old and unreliable
- cab.n.03: a car driven by a person whose job is to take passengers where they want to go in exchange for money
- compact.n.03: a small and economical car
- convertible.n.01: a car that has top that can be folded or removed
- coupe.n.01: a car with two doors and front seats a

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aghaffar23\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Question 2

In [8]:
# Get synsets for the word "car" as a noun ('n')
car_synsets = wordnet.synsets('car', 'n')

# Create a list of synsets along with their lemma counts
synset_counts = [(synset, synset.lemmas()[0].count()) for synset in car_synsets]

# Sort the synsets by lemma counts in descending order
sorted_synsets = sorted(synset_counts, key=lambda x: x[1], reverse=True)

# Print synsets and their counts in descending order of frequency
for synset, count in sorted_synsets:
    print(f"Synset: {synset.name()}: {synset.definition()}, Lemma Count: {count}")

Synset: car.n.01: a motor vehicle with four wheels; usually propelled by an internal combustion engine, Lemma Count: 71
Synset: car.n.02: a wheeled vehicle adapted to the rails of railroad, Lemma Count: 2
Synset: car.n.03: the compartment that is suspended from an airship and that carries personnel and the cargo and the power plant, Lemma Count: 0
Synset: car.n.04: where passengers ride up and down, Lemma Count: 0
Synset: cable_car.n.01: a conveyance for passengers or freight on a cable railway, Lemma Count: 0


# Question 3

In [27]:
import numpy as np
from nltk.corpus import genesis
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import TfidfVectorizer

genesis_ic = wn.ic(genesis, False, 0.0)

# Function to calculate the semantic similarity between two sentences
def sentence_similarity(sentence1, sentence2):
    # Tokenize the sentences into words
    words1 = word_tokenize(sentence1)
    words2 = word_tokenize(sentence2)

    # Get synsets for each word in the sentences
    synsets1 = [wordnet.synsets(word) for word in words1]
    synsets2 = [wordnet.synsets(word) for word in words2]

    # Calculate the maximum similarity between pairs of synsets
    max_similarity = 0.0
    for synset1 in synsets1:
        for synset2 in synsets2:
            if synset1 and synset2:
                similarity = max(s1.wup_similarity(s2) for s1 in synset1 for s2 in synset2)
                if similarity is not None and similarity > max_similarity:
                    max_similarity = similarity
    
    return max_similarity

def wup(S1, S2):
    return S1.wup_similarity(S2)

def Resnik(S1, S2):
    return S1.res_similarity(S2, genesis_ic)

options = {0 : wup,
           1 : Resnik,
          }

def preProcess(preprocess, sentence):
    Stopwords = list(set(nltk.corpus.stopwords.words('english')))
    stemmer = SnowballStemmer("english")#we will avoid the stemming because it will give a pbm with sysnset search
    
    words = word_tokenize(sentence)
    #words = [stemmer.stem(word) for word in words] 
    if preprocess:
        words = [word.lower() for word in words if word.isalpha() and word not in Stopwords] #get rid of numbers and Stopwords
 
    return words



def word_similarity(w1,w2,num):
    S1 = wn.synsets(w1)[0]
    S2 = wn.synsets(w2)[0]
#    print(w1,w2)
#    print(S1,'\n_________________\n',S2)
    if S1 and S2:
       similarity = options[num](S1, S2)
       if similarity:
          return round(similarity,2)
    return 0

def Similarity(preprocess, T1, T2, num):
    words1 = preProcess(preprocess, T1)
    words2 = preProcess(preprocess, T2)
    
    tf = TfidfVectorizer(use_idf=True)
    tf.fit_transform([' '.join(words1), ' '.join(words2)])

    Idf = dict(zip(tf.get_feature_names_out(), tf.idf_))
    
    Sim = 0
    Sim_score1 = 0
    Sim_score2 = 0
    
    for w1 in words1:
        Max = 0
        for w2 in words2:
            score = word_similarity(w1,w2,num)
            if Max < score:
               Max = score
        Sim_score1 += Max*Idf[w1]
    Sim_score1 /= sum([Idf[w1] for w1 in words1])
    
#     print(round(Sim_score1,2))
    for w2 in words2:
        Max = 0
        for w1 in words1:
            score = word_similarity(w1,w2,num)
            if Max < score:
               Max = score
        Sim_score2 += Max*Idf[w2]
        
    Sim_score2 /= sum([Idf[w1] for w2 in words2])
#     print(round(Sim_score2,2))

    Sim = (Sim_score1+Sim_score2)/2
    
    return round(Sim,2)

# Example sentences
T1 = "Students feel unhappy today about the class today."
T2 = "Many students felt concepts of class test relevant."

# Calculate and print the semantic similarity
similarity = sentence_similarity(T1, T2)
similarity_wup = Similarity(1, T1, T2, 0)
# similarity_resnik = Similarity(T1, T2, 1)
print(f"Semantic similarity between sentences: {round(similarity, 2)}")
print('Wup Similarity(T1, T2) =',similarity_wup)
# print('Resnik Similarity(T1, T2) =',similarity_resnik)

Semantic similarity between sentences: 1.0
Wup Similarity(T1, T2) = 0.59


# Question 4

In [28]:
# Example sentences

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

def preprocess_sentence(sentence, remove_stopwords=False, stem_words=False):
    # Tokenize the sentence into words
    words = word_tokenize(sentence.lower())  # Convert to lowercase

    # Remove stopwords if specified
    if remove_stopwords:
        stop_words = set(stopwords.words('english'))
        words = [word for word in words if word not in stop_words]

    # Stem words if specified
    if stem_words:
        stemmer = PorterStemmer()
        words = [stemmer.stem(word) for word in words]

    return words

# Calculate and print the semantic similarity with preprocessing
T1_p = preprocess_sentence(T1, remove_stopwords=True, stem_words=True)
T2_p = preprocess_sentence(T2, remove_stopwords=True, stem_words=True)
T1_p_s = ' '.join(str(e) for e in T1_p)
T2_p_s = ' '.join(str(e) for e in T2_p)
similarity_with_preprocessing = sentence_similarity(T1_p_s, T2_p_s)
print(f"Semantic similarity with preprocessing: {similarity_with_preprocessing:.2f}")

# Calculate and print the semantic similarity without preprocessing
similarity_without_preprocessing = sentence_similarity(T1, T2)
print(f"Semantic similarity without preprocessing: {similarity_without_preprocessing:.2f}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aghaffar23\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aghaffar23\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Semantic similarity with preprocessing: 1.00
Semantic similarity without preprocessing: 1.00


# Question 5

In [29]:
from nltk.stem import WordNetLemmatizer
from fuzzywuzzy import fuzz

# Function to preprocess and lemmatize a sentence
def preprocess_and_lemmatize(sentence):
    # Tokenize the sentence into words
    words = word_tokenize(sentence.lower())  # Convert to lowercase

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Lemmatize using WordNet lemmatizer
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

    # Join the lemmatized words into a sentence
    lemmatized_sentence = ' '.join(lemmatized_words)

    return lemmatized_sentence

# Example sentences
T1 = "Students feel unhappy today about the class today."
T2 = "Many students felt concepts of class test relevant."

similarity_score = fuzz.ratio(T1, T2)

# Preprocess and lemmatize the sentences
preprocessed_T1 = preprocess_and_lemmatize(T1)
preprocessed_T2 = preprocess_and_lemmatize(T2)

# Calculate the FuzzyWuzzy similarity score
similarity_score_p = fuzz.ratio(preprocessed_T1, preprocessed_T2)
print(f"FuzzyWuzzy similarity score between sentences before preprocess: {similarity_score}")
print(f"FuzzyWuzzy similarity score between sentences after preprocess: {similarity_score_p}")

FuzzyWuzzy similarity score between sentences before preprocess: 53
FuzzyWuzzy similarity score between sentences after preprocess: 60


# Question 6 and 7

In [30]:
import gensim
from gensim.test.utils import datapath
import numpy as np

# Tokenize the sentences into words
words_T1 = word_tokenize(T1.lower())  # Convert to lowercase
words_T2 = word_tokenize(T2.lower())  # Convert to lowercase

# Function to calculate the average word embedding vector for a sentence
def average_embedding(sentence_words, model):
    vectors = [model[word] for word in sentence_words if word in model]
    if not vectors:
        return None
    return sum(vectors) / len(vectors)

In [31]:
model_fasttext = gensim.models.fasttext.load_facebook_vectors(datapath('C:\\Users\\aghaffar23\\OneDrive - Oulun yliopisto\\Work\\Uni\\Courses\\NLP\\codes\\cc.en.300.bin\\cc.en.300.bin'))

In [32]:
# Calculate average word embedding vectors for the sentences
embedding_T1 = average_embedding(words_T1, model_fasttext)
embedding_T2 = average_embedding(words_T2, model_fasttext)

# Calculate cosine similarity between the two sentence embeddings
embedding_T1 = {index: value for index, value in enumerate(embedding_T1)}
embedding_T2 = {index: value for index, value in enumerate(embedding_T2)}
# similarity_score = np.dot(embedding_T1, embedding_T2) / (np.linalg.norm(embedding_T1) * np.linalg.norm(embedding_T2))
similarity_score = gensim.matutils.cossim(embedding_T1, embedding_T2)
print(f"Cosine similarity between sentences using fasttext model: {similarity_score}")

Cosine similarity between sentences using fasttext model: 0.8119011127826077


In [33]:
# I ran python -m  gensim.scripts.glove2word2vec -i glove.6B.300d.txt -o glove.6B.300d.word2vec.txt code to convert txt file to the txt file below to read it with this command
model_glove = gensim.models.KeyedVectors.load_word2vec_format('C:\\Users\\aghaffar23\\OneDrive - Oulun yliopisto\\Work\\Uni\\Courses\\NLP\\codes\\glove.6B.300d.word2vec.txt', binary=False)  

In [34]:
# Calculate average word embedding vectors for the sentences
embedding_T1 = average_embedding(words_T1, model_glove)
embedding_T2 = average_embedding(words_T2, model_glove)

embedding_T1 = {index: value for index, value in enumerate(embedding_T1)}
embedding_T2 = {index: value for index, value in enumerate(embedding_T2)}

# Calculate cosine similarity between the two sentence embeddings
similarity_score = gensim.matutils.cossim(embedding_T1, embedding_T2)
print(f"Cosine similarity between sentences using Glove model: {similarity_score}")

Cosine similarity between sentences using Glove model: 0.8663384252910884


In [35]:
model_word2vec = gensim.models.KeyedVectors.load_word2vec_format('C:\\Users\\aghaffar23\\OneDrive - Oulun yliopisto\\Work\\Uni\\Courses\\NLP\\codes\\glove.6B.300d.word2vec.txt', binary=False)

In [36]:
# Calculate average word embedding vectors for the sentences
embedding_T1 = average_embedding(words_T1, model_word2vec)
embedding_T2 = average_embedding(words_T2, model_word2vec)

embedding_T1 = {index: value for index, value in enumerate(embedding_T1)}
embedding_T2 = {index: value for index, value in enumerate(embedding_T2)}

# Calculate cosine similarity between the two sentence embeddings
similarity_score = gensim.matutils.cossim(embedding_T1, embedding_T2)
print(f"Cosine similarity between sentences using Word2vec model: {similarity_score}")

Cosine similarity between sentences using Word2vec model: 0.8663384252910884


# Question 8

In [40]:
import pandas as pd

# Function to calculate the average word embedding vector for a sentence
def average_embedding(sentence_words, model):
    vectors = [model[word] for word in sentence_words if word in model]
    if not vectors:
        return None
    return sum(vectors) / len(vectors)

csv_file = 'questions.csv'

# Load the CSV file into a DataFrame
df = pd.read_csv(csv_file)

# Replace 'column1' and 'column2' with the actual column names you want to read
column1_name = 'question1'
column2_name = 'question2'

# Extract the columns as lists of strings
sentences1_list = df[column1_name].astype(str).tolist()
sentences2_list = df[column2_name].astype(str).tolist()

sim = {}
for indx, T1 in enumerate(sentences1_list):
#     if indx>10:
#         break
    T2 = sentences2_list[indx]
    result1 = []
    # Tokenize the sentences into words
    words_T1 = word_tokenize(T1.lower())  # Convert to lowercase
    words_T2 = word_tokenize(T2.lower())  # Convert to lowercase

    # Calculate average word embedding vectors for the sentences
    embedding_T1 = average_embedding(words_T1, model_fasttext)
    embedding_T2 = average_embedding(words_T2, model_fasttext)

    embedding_T1 = {index: value for index, value in enumerate(embedding_T1)}
    embedding_T2 = {index: value for index, value in enumerate(embedding_T2)}

    # Calculate cosine similarity between the two sentence embeddings
    similarity_score = gensim.matutils.cossim(embedding_T1, embedding_T2)
    result1.append(similarity_score)
#     print(f"Cosine similarity between sentences using fasttext model: {similarity_score}")

    # Calculate average word embedding vectors for the sentences
    embedding_T1 = average_embedding(words_T1, model_glove)
    embedding_T2 = average_embedding(words_T2, model_glove)

    embedding_T1 = {index: value for index, value in enumerate(embedding_T1)}
    embedding_T2 = {index: value for index, value in enumerate(embedding_T2)}

    # Calculate cosine similarity between the two sentence embeddings
    similarity_score = gensim.matutils.cossim(embedding_T1, embedding_T2)
    result1.append(similarity_score)
#     print(f"Cosine similarity between sentences using Glove model: {similarity_score}")

    # Calculate average word embedding vectors for the sentences
    embedding_T1 = average_embedding(words_T1, model_word2vec)
    embedding_T2 = average_embedding(words_T2, model_word2vec)

    embedding_T1 = {index: value for index, value in enumerate(embedding_T1)}
    embedding_T2 = {index: value for index, value in enumerate(embedding_T2)}

    # Calculate cosine similarity between the two sentence embeddings
    similarity_score = gensim.matutils.cossim(embedding_T1, embedding_T2)
    result1.append(similarity_score)
#     print(f"Cosine similarity between sentences using Word2vec model: {similarity_score}")

    similarity_score = fuzz.ratio(T1, T2)
    result1.append(similarity_score)
    # Preprocess and lemmatize the sentences
    preprocessed_T1 = preprocess_and_lemmatize(T1)
    preprocessed_T2 = preprocess_and_lemmatize(T2)

    # Calculate the FuzzyWuzzy similarity score
    similarity_score_p = fuzz.ratio(preprocessed_T1, preprocessed_T2)
#     print(f"FuzzyWuzzy similarity score between sentences before preprocess: {similarity_score}")
#     print(f"FuzzyWuzzy similarity score between sentences after preprocess: {similarity_score_p}")
    result1.append(similarity_score_p)

    similarity_score = sentence_similarity(T1, T2)
#     print(f"Semantic similarity using Mihalacea between sentences: {similarity_score}")
    result1.append(similarity_score)

#     similarity_wup = Similarity(1, T1, T2, 0)
#     result1.append(similarity_wup)
#     print(f"Semantic similarity using WUP between sentences: {similarity_wup}")
    sim[indx] = result1
#     print(f"Sentences No. {indx} is done!")
print(80*"-")
print("Similarities with different metrics for all of the questions are stored in sim dictionary just first 10 of list we show:")

for i in range(10):
    print(sim[i+1])

--------------------------------------------------------------------------------
Similarities with different metrics for all of the questions are stored in sim dictionary just first 10 of list we show:
[0.9042288550402525, 0.8826061661557302, 0.8826061661557302, 65, 65, 1.0]
[0.7958676359362555, 0.8999295536947399, 0.8999295536947399, 55, 57, 1.0]
[0.5380886783321475, 0.7154154328444283, 0.7154154328444283, 28, 24, 1.0]
[0.7987387016393281, 0.8481633348352544, 0.8481633348352544, 45, 43, 1.0]
[0.9240177045349875, 0.9499700800375436, 0.9499700800375436, 66, 64, 1.0]
[0.6253272739721533, 0.6715399182452603, 0.6715399182452603, 22, 26, 0.6666666666666666]
[0.9643473296476769, 0.9699451765344302, 0.9699451765344302, 59, 79, 1.0]
[0.7392092219678874, 0.9087268318739488, 0.9087268318739488, 85, 68, 1.0]
[0.8146498289631632, 0.854215765124131, 0.854215765124131, 51, 50, 1.0]
[0.8083993017154593, 0.733711311109538, 0.733711311109538, 36, 40, 0.7692307692307693]
