In [3]:
# Install NLTK into the environment of the running kernel. %pip (unlike !pip)
# targets the kernel's interpreter; the pin matches the version this notebook
# was originally run with, so a fresh run reproduces the same results.
%pip install nltk==3.8.1


Collecting nltk
  Using cached nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting click (from nltk)
  Using cached click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting joblib (from nltk)
  Downloading joblib-1.4.0-py3-none-any.whl.metadata (5.4 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.4.28-cp38-cp38-win_amd64.whl.metadata (41 kB)
     ---------------------------------------- 0.0/41.9 kB ? eta -:--:--
     ------------------ ------------------- 20.5/41.9 kB 320.0 kB/s eta 0:00:01
     -------------------------------------- 41.9/41.9 kB 502.9 kB/s eta 0:00:00
Collecting tqdm (from nltk)
  Using cached tqdm-4.66.2-py3-none-any.whl.metadata (57 kB)
Using cached nltk-3.8.1-py3-none-any.whl (1.5 MB)
Downloading regex-2024.4.28-cp38-cp38-win_amd64.whl (269 kB)
   ---------------------------------------- 0.0/269.0 kB ? eta -:--:--
   ---------------- ----------------------- 112.6/269.0 kB 2.2 MB/s eta 0:00:01
   ------------------------------------ --- 245

In [4]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Fetch every NLTK resource used below: sentence/word tokenizer models,
# WordNet, the POS tagger model, and the stop-word lists.
for resource in ('punkt', 'wordnet', 'averaged_perceptron_tagger', 'stopwords'):
    nltk.download(resource)

# Demo input: the text is lowercased first, then split into sentences.
text = 'I didn’t say he stole the money.'
sentences = nltk.sent_tokenize(text.lower())

def preprocess_text(sentences):
    """Strip punctuation, drop English stop words, and lemmatize each sentence.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A list with one entry per input sentence; each entry is the list of
        lemmatized tokens that remain after punctuation and stop-word removal.
        Token case is preserved in the output.
    """
    # \w+ keeps only runs of word characters, so punctuation never becomes a
    # token (and a contraction is split at its apostrophe).
    tokenizer = RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    processed_sentences = []
    for sentence in sentences:
        words = tokenizer.tokenize(sentence)

        # The stop-word list is lowercase, so fold case for the lookup only;
        # otherwise "The" or "I" would slip through whenever the caller has
        # not lowercased the text beforehand.
        # lemmatize() is called without a POS argument, i.e. with its default.
        lemmatized_words = [
            lemmatizer.lemmatize(word)
            for word in words
            if word.lower() not in stop_words
        ]
        processed_sentences.append(lemmatized_words)

    return processed_sentences

# Run the sample sentences through the cleaning step and show the tokens.
processed_sentences = preprocess_text(sentences)
print(f"Processed Sentences: {processed_sentences}")

def tag_pos(sentences):
    """Attach a part-of-speech tag to every token of every sentence.

    Each element of ``sentences`` is passed to ``nltk.pos_tag`` unchanged and
    the per-sentence results are returned in the same order.
    """
    tagged = []
    for tokens in sentences:
        tagged.append(nltk.pos_tag(tokens))
    return tagged

# POS-tag the cleaned token lists and show the (token, tag) pairs.
tagged_sentences = tag_pos(processed_sentences)
print(f"POS Tagged Sentences: {tagged_sentences}")

class SimplifiedLesk:
    """Simplified Lesk word-sense disambiguation over WordNet glosses.

    Every candidate sense is scored by how many tokens of its definition and
    usage examples (after removing English stop words) also occur in the
    context sentence; the highest-scoring sense wins.
    """

    def __init__(self):
        # Stop words are stripped from each gloss before the overlap is counted.
        self.stopwords = set(stopwords.words('english'))

    def disambiguate(self, word, sentence):
        """Return the best sense from wordnet for the word in the given sentence.

        Args:
            word: Word to look up in WordNet.
            sentence: Raw context sentence; it is tokenized here.

        Returns:
            The synset whose gloss shares the most tokens with the context.
            Ties (including the case of no overlap at all) go to the earliest
            sense returned by WordNet, which is assumed to be the most
            frequent one. ``None`` when WordNet has no sense for ``word``.
        """
        word_senses = wn.synsets(word)
        if not word_senses:
            # Unknown word: nothing to choose from (previously an IndexError).
            return None

        context = set(word_tokenize(sentence))

        def overlap_with_context(sense):
            return self.compute_overlap(self.tokenized_gloss(sense), context)

        # max() returns the first maximal element, so an earlier sense is only
        # displaced by a strictly larger overlap.
        return max(word_senses, key=overlap_with_context)

    def tokenized_gloss(self, sense):
        """Return the set of tokens in the sense's definition and examples."""
        tokens = set(word_tokenize(sense.definition()))
        for example in sense.examples():
            tokens.update(word_tokenize(example))
        return tokens

    def compute_overlap(self, signature, context):
        """Return the number of non-stop-word signature tokens found in context.

        Args:
            signature: Set of gloss tokens for one sense.
            context: Collection of tokens from the context sentence.
        """
        # NOTE(review): matching is exact-string and only the signature is
        # filtered, so case variants and punctuation tokens are compared as-is
        # -- confirm whether both sides should be normalised first.
        gloss = signature.difference(self.stopwords)
        return len(gloss.intersection(context))

# Demo of the Simplified Lesk algorithm: pick a WordNet sense for "bank"
# using a sentence as context, then show that sense's definition.
lesk = SimplifiedLesk()
sentence = "I banked on the wrong company."
word = "bank"
best_sense = lesk.disambiguate(word, sentence)
print(f"Best Sense: {best_sense.definition()}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aakas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aakas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\aakas\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aakas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Processed Sentences: [['say', 'stole', 'money']]
POS Tagged Sentences: [[('say', 'VB'), ('stole', 'JJ'), ('money', 'NN')]]
Best Sense: have confidence or faith in
