In [1]:
!pip install nltk



In [27]:
# Importing libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import random

from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import make_pipeline
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords

In [28]:
# nltk - wordnet and lexical database: Setting up WordNet in NLTK
import nltk
nltk.download('wordnet')

# For wordent with different language
nltk.download('omw-1.4')

# For tokenization
nltk.download('punkt')

# POS tagger: part of speech
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

# For stopwords
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to C:\Users\sheila
[nltk_data]     brown\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\sheila
[nltk_data]     brown\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\sheila
[nltk_data]     brown\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\sheila brown\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\sheila brown\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to C:\Users\sheila
[nltk_

True

In [4]:
# Implement lesk Algorithm - is simple knowledge-based WSD method - lesk()
from nltk.wsd import lesk
from nltk.corpus import wordnet as wn

In [5]:
# Importing CSV file
df = pd.read_csv('WSD_2905.csv')
pd.set_option('display.max_rows', None)
df.head(10)

Unnamed: 0,sn,sentence/context,polysemy_word
0,1.0,I have bank account.,bank
1,2.0,Loan amount is approved by the bank.,bank
2,3.0,He returned to office after he deposited cash ...,bank
3,4.0,They started using new software in their bank.,bank
4,5.0,he went to bank balance inquiry.,bank
5,6.0,I wonder why some bank have more interest rate...,bank
6,7.0,You have to deposit certain percentage of your...,bank
7,8.0,He took loan from a Bank.,bank
8,9.0,he is waking along the river bank.,bank
9,10.0,The red boat in the bank is already sold.,bank


In [15]:
# Selecting data with 'bank' as the polysemy word
word = 'bank'
bank_df = df[df['polysemy_word'] == word]
bank_df = bank_df.reset_index(drop=True, inplace=False)

# Iterating over the DataFrame to set the definition
for index, row in bank_df.iterrows():
    synsets = wn.synsets(word) 
    if index <= 7:
        bank_df.loc[index, 'definition'] = synsets[1].name()
    else:
        bank_df.loc[index, 'definition'] = synsets[0].name()

# Display the updated DataFrame
bank_df

Unnamed: 0,sn,sentence/context,polysemy_word,definition
0,1.0,I have bank account.,bank,depository_financial_institution.n.01
1,2.0,Loan amount is approved by the bank.,bank,depository_financial_institution.n.01
2,3.0,He returned to office after he deposited cash ...,bank,depository_financial_institution.n.01
3,4.0,They started using new software in their bank.,bank,depository_financial_institution.n.01
4,5.0,he went to bank balance inquiry.,bank,depository_financial_institution.n.01
5,6.0,I wonder why some bank have more interest rate...,bank,depository_financial_institution.n.01
6,7.0,You have to deposit certain percentage of your...,bank,depository_financial_institution.n.01
7,8.0,He took loan from a Bank.,bank,depository_financial_institution.n.01
8,9.0,he is waking along the river bank.,bank,bank.n.01
9,10.0,The red boat in the bank is already sold.,bank,bank.n.01


In [7]:
# Selecting data with 'plant' as the polysemy word
word = 'plant'
plant_df = df[df['polysemy_word'] == word]
plant_df = plant_df.reset_index(drop=True, inplace=False)

# Iterating over the DataFrame to set the definition
for index, row in plant_df.iterrows():
    synsets = wn.synsets(word) 
    if index <= 3:
        plant_df.loc[index, 'definition'] = synsets[0].name()
    elif 3 < index <= 7:
        plant_df.loc[index, 'definition'] = synsets[1].name()

# Display the updated DataFrame
plant_df

Unnamed: 0,sn,sentence/context,polysemy_word,definition
0,493.0,They built a large plant to manufacture automo...,plant,plant.n.01
1,494.0,He laid at that plant.,plant,plant.n.01
2,495.0,Our company is planning to build a new chemica...,plant,plant.n.01
3,496.0,Fear of pollution discouraged people from buil...,plant,plant.n.01
4,497.0,Plant dies without water.,plant,plant.n.02
5,498.0,Take care of your plant in the garden.,plant,plant.n.02
6,499.0,You have various plant flower.,plant,plant.n.02
7,500.0,Plant grows only in the tropical regions.,plant,plant.n.02


In [17]:
# Selecting data with 'bark' as sense word
word = 'bark'
bark_df = df[df['polysemy_word'] == word]
bark_df = bark_df.reset_index(drop=True, inplace=False)

# Iterating over the DataFrame to set the definition
for index, row in bark_df.iterrows():
    synsets = wn.synsets(word) 
    if index <= 4:
        bark_df.loc[index, 'definition'] = synsets[0].name()
    elif 4 < index <= 9:
        bark_df.loc[index, 'definition'] = synsets[1].name()

# Display the updated DataFrame
bark_df

Unnamed: 0,sn,sentence/context,polysemy_word,definition
0,1931.0,The bark of this tree is very rough.,bark,bark.n.01
1,1932.0,Bears like to scratch their back on tree bark.,bark,bark.n.01
2,1933.0,Bears often scratch their backs on the bark of...,bark,bark.n.01
3,1934.0,They stripped the tree of its bark.,bark,bark.n.01
4,1935.0,The collected bark of tree.,bark,bark.n.01
5,1936.0,That dog is trained to bark at stranger.,bark,bark.n.02
6,1937.0,The dog ‘s bark is against the intruder.,bark,bark.n.02
7,1938.0,Dog that bark a lot usually aren t dangerous.,bark,bark.n.02
8,1939.0,The bark of that dog wouldn t even scare off a...,bark,bark.n.02
9,1940.0,Sometimes I hear dog ‘s bark in the middle of ...,bark,bark.n.02


In [18]:
# Selecting data with 'bass' as sense word
word = 'bass'
bass_df = df[df['polysemy_word'] == word]
bass_df = bass_df.reset_index(drop=True, inplace=False)

# Iterating over the DataFrame to set the definition
for index, row in bass_df.iterrows():
    synsets = wn.synsets(word) 
    if index <= 5:
        bass_df.loc[index, 'definition'] = synsets[0].name()
    elif 5 < index <= 9:
        bass_df.loc[index, 'definition'] = synsets[3].name()

# Display the updated DataFrame
bass_df

Unnamed: 0,sn,sentence/context,polysemy_word,definition
0,48.0,He can play the bass.,bass,bass.n.01
1,49.0,My bass string broke.,bass,bass.n.01
2,50.0,"It is all about the bass, no treble.",bass,bass.n.01
3,51.0,Most bass guitar strings are made of nickel-wr...,bass,bass.n.01
4,52.0,Some bass has Cobalt string.,bass,bass.n.01
5,53.0,The bass of female voice is different than tha...,bass,bass.n.01
6,54.0,I caught a bass in my net.,bass,sea_bass.n.01
7,55.0,I quite like to eat bass.,bass,sea_bass.n.01
8,56.0,The bass is very nutritious for the person suf...,bass,sea_bass.n.01
9,57.0,Bass is a North American delicacy.,bass,sea_bass.n.01


In [19]:
# Selecting data with 'stable' as sense word
word = 'stable'
stable_df = df[df['polysemy_word'] == word]
stable_df = stable_df.reset_index(drop=True, inplace=False)

# Iterating over the DataFrame to set the definition
for index, row in stable_df.iterrows():
    synsets = wn.synsets(word) 
    if index <= 2:
        stable_df.loc[index, 'definition'] = synsets[0].name()
    elif 2 < index <= 9:
        stable_df.loc[index, 'definition'] = synsets[3].name()

# Display the updated DataFrame
stable_df

Unnamed: 0,sn,sentence/context,polysemy_word,definition
0,107.0,He owns a horse stable.,stable,stable.n.01
1,108.0,The horse is in the stable.,stable,stable.n.01
2,109.0,A horse is tied in the stable.,stable,stable.n.01
3,110.0,He is not stable after the new circumstance.,stable,stable.s.02
4,111.0,The price in the market is stable for the last...,stable,stable.s.02
5,112.0,"Do not worry, the ladder is stable.",stable,stable.s.02
6,113.0,Although there is fluctuation in economy of Ne...,stable,stable.s.02
7,114.0,Some elements are stable in chemical reaction.,stable,stable.s.02
8,115.0,Most of the intermediate product in chemical r...,stable,stable.s.02
9,116.0,Some chemical compounds are very stable.,stable,stable.s.02


In [31]:
# Concatenate all dataframes into one dataframe
combined_df = pd.concat([bank_df, plant_df, bark_df, bass_df, stable_df], axis=0)
combined_df = combined_df.reset_index(drop=True, inplace=False)
combined_df

Unnamed: 0,sn,sentence/context,polysemy_word,definition
0,1.0,I have bank account.,bank,depository_financial_institution.n.01
1,2.0,Loan amount is approved by the bank.,bank,depository_financial_institution.n.01
2,3.0,He returned to office after he deposited cash ...,bank,depository_financial_institution.n.01
3,4.0,They started using new software in their bank.,bank,depository_financial_institution.n.01
4,5.0,he went to bank balance inquiry.,bank,depository_financial_institution.n.01
5,6.0,I wonder why some bank have more interest rate...,bank,depository_financial_institution.n.01
6,7.0,You have to deposit certain percentage of your...,bank,depository_financial_institution.n.01
7,8.0,He took loan from a Bank.,bank,depository_financial_institution.n.01
8,9.0,he is waking along the river bank.,bank,bank.n.01
9,10.0,The red boat in the bank is already sold.,bank,bank.n.01


In [32]:
# Keywords
bank_savings_list = ['account', 'loan', 'amount', 'deposited', 'software', 'cash', 'money', 'balance', 'interest', 'deposit']
bank_river_list = ['boat', 'river', 'sea', 'vacation']

plant_power_list = ['manufacture', 'laid', 'chemical', 'power']
plant_tree_list = ['water', 'garden', 'flower', 'grows']

bark_tree_list = ['tree', 'trees']
bark_dog_list = ['dog', 'dogs', "dog's"]

bass_music_list = ['play', 'string', 'treble', 'strings', 'voice']
bass_fish_list = ['caught', 'net', 'eat', 'nutritious', 'delicacy']

stable_horse_list = ['horse']
stable_condition_list = ['circumstance', 'price', 'worry', 'economy', 'chemical']

In [33]:
def get_best_sense(word, context):
    synsets = wn.synsets(word)    
    tokens = nltk.word_tokenize(context)
    
    match word:
        case 'bank': 
            for token in tokens:
                if token in bank_river_list:
                    selected_sense = synsets[0]
                    break
                elif token in bank_savings_list:
                    selected_sense = synsets[1]
                    break
                else:
                    selected_sense = None

        case 'plant':
            for token in tokens:
                if token in plant_power_list:
                    selected_sense = synsets[0]
                    break
                elif token in plant_tree_list:
                    selected_sense = synsets[1]
                    break
                else:
                    selected_sense = None          

        case 'bark':
            for token in tokens:
                if token in bark_tree_list:
                    selected_sense = synsets[0]
                    break
                elif token in bark_dog_list:
                    selected_sense = synsets[1]
                    break
                else:
                    selected_sense = None            
    
        case 'bass':            
            for token in tokens:
                if token in bass_music_list:
                    selected_sense = synsets[0]
                    break
                elif token in bass_fish_list:
                    selected_sense = synsets[4]
                    break
                else:
                    selected_sense = None      
    
        case 'stable':            
            for token in tokens:
                if token in stable_horse_list:
                    selected_sense = synsets[0]
                    break
                elif token in stable_condition_list:
                    selected_sense = synsets[4]
                    break
                else:
                    selected_sense = None

        case default:
            pass

    return selected_sense

In [23]:
# Checking the available synsets
word = 'stable'
synsets = wn.synsets(word)
print(f'All possible senses for {word}: ')
for i, synset in enumerate(synsets, 1):
    print(f'{i}: {synset.name()}: {synset.definition()}')

All possible senses for stable: 
1: stable.n.01: a farm building for housing horses or other livestock
2: stable.v.01: shelter in a stable
3: stable.a.01: resistant to change of position or condition
4: stable.s.02: firm and dependable; subject to little fluctuation
5: stable.s.03: not taking part readily in chemical change
6: stable.s.04: maintaining equilibrium
7: static.s.03: showing little if any change


In [24]:
def choose_df(df):
    predictions = []
    
    for index, row in df.iterrows():
        best_sense = get_best_sense(row[2], row[1])
        # print(row[2])
        # print(best_sense)
    
        # Check
        if best_sense:
            print(f"Sentence: {row[1]}")
            # print(f"Best sense of 'bank': {best_sense.name()}")
            # print(f"Definition: {best_sense.definition()}")
            # print()
            predictions.append(best_sense.name())
        else:    
            print(f"Sentence: {row[1]}")
            # print(f"No suitable sense found for '{row[2]}'.")
            # print()
            predictions.append(None)
        print(predictions)
        print()

    return predictions

In [34]:
word_sense = input('Choose a word from the following (bank, plant, bark, bass, stable): ')
print(word_sense)
match word_sense:
    case 'bank':
        y_test = bank_df['definition'].tolist()
        y_pred = choose_df(bank_df)
    
    case 'plant':
        y_test = plant_df['definition'].tolist()
        y_pred = choose_df(plant_df)   
    
    case 'bark':
        y_test = bark_df['definition'].tolist()
        y_pred = choose_df(bark_df)  
    
    case 'bass':            
        y_test = bass_df['definition'].tolist()
        y_pred = choose_df(bass_df)
    
    case 'stable':
        y_test = stable_df['definition'].tolist()
        y_pred = choose_df(stable_df)
    
    case default:
        print('Invalid word.')

print()
print(y_test)
print(y_pred)

Choose a word from the following (bank, plant, bark, bass, stable):  plant


plant
Sentence: They built a large plant to manufacture automobile.
['plant.n.01']

Sentence: He laid at that plant.
['plant.n.01', 'plant.n.01']

Sentence: Our company is planning to build a new chemical plant in Russia.
['plant.n.01', 'plant.n.01', 'plant.n.01']

Sentence: Fear of pollution discouraged people from building homes near power plant.
['plant.n.01', 'plant.n.01', 'plant.n.01', 'plant.n.01']

Sentence: Plant dies without water.
['plant.n.01', 'plant.n.01', 'plant.n.01', 'plant.n.01', 'plant.n.02']

Sentence: Take care of your plant in the garden.
['plant.n.01', 'plant.n.01', 'plant.n.01', 'plant.n.01', 'plant.n.02', 'plant.n.02']

Sentence: You have various plant flower.
['plant.n.01', 'plant.n.01', 'plant.n.01', 'plant.n.01', 'plant.n.02', 'plant.n.02', 'plant.n.02']

Sentence: Plant grows only in the tropical regions.
['plant.n.01', 'plant.n.01', 'plant.n.01', 'plant.n.01', 'plant.n.02', 'plant.n.02', 'plant.n.02', 'plant.n.02']


['plant.n.01', 'plant.n.01', 'plant.n.01

  best_sense = get_best_sense(row[2], row[1])
  print(f"Sentence: {row[1]}")


In [None]:
bank_sentences = [
    'The fisherman sat quietly on the grassy bank of the river, waiting for a bite',
    'The bank offered a low-interest loan to help small businesses during the economic downturn',
    'After the storm, the bank of the stream was eroded, making the water much deeper',
    'I need to visit the bank to open a new checking account and get a debit card',
    'The hiking trail wound along the bank of the creek, offering stunning views of the forest'    
]

bank_labels = [
    'bank.n.01',
    'depository_financial_institution.n.01',
    'bank.n.01',
    'depository_financial_institution.n.01',
    'bank.n.01',
]

plant_sentences = [
    'A fire broke out at the chemical plant, prompting an immediate evacuation',
    'The steel plant emits a lot of smoke, which is a concern for nearby residents',
    'The botanist studied the rare plant, which only grows in high-altitude regions',
    'The flower is a tall plant that turns its head to follow the sun',
    'They planted a row of young trees in their garden which they are hoping they would grow into a lush plant hedge.'
]

plant_labels = [
    'plant.n.01',
    'plant.n.02',
    'plant.n.01',
    'plant.n.02',
    'plant.n.02'
]

bark_sentences = [
    'The bark of the oak tree was rough to the touch, and small insects crawled over it',
    'The woodcutter examined the bark carefully to determine if the tree was healthy',
    'The loud bark of the dog startled everyone in the room, echoing through the house',
    'The dog let out a harsh bark when it heard a suspicious person',
    'The thick bark of the redwood tree protects it from forest fires'
]

bark_labels = [
    
]

# Training a model

In [29]:
df.head()

Unnamed: 0,sn,sentence/context,polysemy_word
0,1.0,I have bank account.,bank
1,2.0,Loan amount is approved by the bank.,bank
2,3.0,He returned to office after he deposited cash ...,bank
3,4.0,They started using new software in their bank.,bank
4,5.0,he went to bank balance inquiry.,bank


In [None]:
# TfidfVectorizer: Create a pipeline for classification
def clean_text(text):
    token = word_tokenize(text)    
    c_stopwords = set(stopwords.words('english'))    
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = []
    stemmed_words = []
    
    for word in token:
        lower_word = word.lower()
        if lower_word not in c_stopwords:
            lemmatized_words.append(lemmatizer.lemmatize(lower_word))
    
    for word in lemmatized_words:
        stemmed_words.append(stemmer.stem(word))
    
    clean_text = ' '.join(stemmed_words)
    
    return clean_text

clean_texts = []
sentences = df['sentence/context'].tolist()
labels = df[]
for index, row in df.iterrows():
    sentence = row[1].to
    text = clean_text(text)
    clean_texts.append(text)

model = make_pipeline(TfidfVectorizer(stop_words='english'), MultinomialNB())
model.fit(clean_texts, labels)

# Metric Evaluation

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

print(f'''EVALUATION METRICS
Accuracy: {accuracy}
Precision: {precision}
Recall Score: {recall}
F1 Score: {f1}''')

In [None]:
sns.set_theme(style='darkgrid', palette='mako', font='sans-serif', font_scale=1, color_codes=True, rc=None)
sns.heatmap(cm, annot=True, fmt='.2f', cmap='crest')