In [1]:
import nltk
from nltk.corpus import senseval
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
nltk.download('senseval')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package senseval to
[nltk_data]     /Users/akeresh/nltk_data...
[nltk_data]   Package senseval is already up-to-date!
[nltk_data] Downloading package punkt to /Users/akeresh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/akeresh/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
def load_senseval2_data(word):
    """
    Load Senseval-2 instances for a given word.
    :param word: The word to load instances for (e.g., 'hard.pos', 'interest.pos', 'line.pos', etc.)
    :return: List of instances for the specified word.
    """
    instances = senseval.instances(word)
    return instances

In [4]:
def refine_senseval_data(data_entries):
    """
    Enhance a collection of Senseval-2 data entries by including POS tagging and creating feature sets.
    
    :param data_entries: A collection of Senseval-2 data entries.
    :return: An enhanced list of data entries with added linguistic features.
    """
    def enhance_entry(entry):
        extracted_words = [word_data[0] if isinstance(word_data, tuple) else word_data for word_data in entry.context]
        tokenized_context = word_tokenize(' '.join(extracted_words))
        
        key_word = extracted_words[entry.position]
        key_word_index = tokenized_context.index(key_word)  
        
        tagged_tokens = pos_tag(tokenized_context)
        
        feature_set = {
            'word': tokenized_context[key_word_index],
            'pos': tagged_tokens[key_word_index][1],
            'prev_word_pos': tagged_tokens[key_word_index - 1][1] if key_word_index > 0 else 'START',
            'next_word_pos': tagged_tokens[key_word_index + 1][1] if key_word_index < len(tokenized_context) - 1 else 'END',
        }
        
        return {
            'features': feature_set,
            'senses': entry.senses
        }
    
    return [enhance_entry(entry) for entry in data_entries]


In [5]:
words = ['hard.pos','line.pos','serve.pos','interest.pos']
for word in words:
    instances = load_senseval2_data(word)
    processed_instances = refine_senseval_data(instances)
    
    # Displaying the number of processed instances and a single instance example
    print(f"Number of processed instances for {word}: {len(processed_instances)}")
    print("Example processed instance:", processed_instances[0])
    print("==========================================")

Number of processed instances for hard.pos: 4333
Example processed instance: {'features': {'word': 'hard', 'pos': 'JJ', 'prev_word_pos': 'VBZ', 'next_word_pos': 'TO'}, 'senses': ('HARD1',)}
Number of processed instances for line.pos: 4146
Example processed instance: {'features': {'word': 'lines', 'pos': 'NNS', 'prev_word_pos': 'NN', 'next_word_pos': 'IN'}, 'senses': ('cord',)}
Number of processed instances for serve.pos: 4378
Example processed instance: {'features': {'word': 'serve', 'pos': 'VB', 'prev_word_pos': 'TO', 'next_word_pos': 'PRP'}, 'senses': ('SERVE10',)}
Number of processed instances for interest.pos: 2368
Example processed instance: {'features': {'word': 'interest', 'pos': 'NN', 'prev_word_pos': 'IN', 'next_word_pos': 'NNS'}, 'senses': ('interest_6',)}


In [6]:
def compute_sense_frequencies(instances):
    """
    Computes the probability of each sense occurring in a list of processed instances.
    
    :param instances: A list of processed instances, each containing 'senses'.
    :return: A dictionary with senses as keys and their probabilities as values.
    """
    sense_frequency = {}
    total_sense_occurrences = 0
    
    for instance in instances:
        senses = instance['senses']
        for sense in senses:
            if sense not in sense_frequency:
                sense_frequency[sense] = 1
            else:
                sense_frequency[sense] += 1
            total_sense_occurrences += 1
    

    sense_probabilities = {sense: frequency / total_sense_occurrences for sense, frequency in sense_frequency.items()}
    
    return sense_probabilities

sense_probabilities = compute_sense_frequencies(processed_instances)
sense_probabilities

{'interest_6': 0.5287162162162162,
 'interest_5': 0.21114864864864866,
 'interest_4': 0.07516891891891891,
 'interest_1': 0.15244932432432431,
 'interest_3': 0.02787162162162162,
 'interest_2': 0.0046452702702702705}

In [7]:
def compute_probabilities_of_features_by_sense(processed_data):
    sense_to_feature_counts = {}
    sense_to_total_counts = {}

    for item in processed_data:
        for sense in item['senses']:
            if sense not in sense_to_feature_counts:
                sense_to_feature_counts[sense] = {}
            if sense not in sense_to_total_counts:
                sense_to_total_counts[sense] = 0
            
            for feature, _ in item['features'].items():
                if feature not in sense_to_feature_counts[sense]:
                    sense_to_feature_counts[sense][feature] = 1
                else:
                    sense_to_feature_counts[sense][feature] += 1
                
                sense_to_total_counts[sense] += 1

    probabilities = {}
    for sense, features in sense_to_feature_counts.items():
        probabilities[sense] = {}
        for feature, count in features.items():
            probabilities[sense][feature] = count / sense_to_total_counts[sense]

    return probabilities

feature_given_sense_probabilities = compute_probabilities_of_features_by_sense(processed_instances)
feature_given_sense_probabilities

{'interest_6': {'word': 0.25,
  'pos': 0.25,
  'prev_word_pos': 0.25,
  'next_word_pos': 0.25},
 'interest_5': {'word': 0.25,
  'pos': 0.25,
  'prev_word_pos': 0.25,
  'next_word_pos': 0.25},
 'interest_4': {'word': 0.25,
  'pos': 0.25,
  'prev_word_pos': 0.25,
  'next_word_pos': 0.25},
 'interest_1': {'word': 0.25,
  'pos': 0.25,
  'prev_word_pos': 0.25,
  'next_word_pos': 0.25},
 'interest_3': {'word': 0.25,
  'pos': 0.25,
  'prev_word_pos': 0.25,
  'next_word_pos': 0.25},
 'interest_2': {'word': 0.25,
  'pos': 0.25,
  'prev_word_pos': 0.25,
  'next_word_pos': 0.25}}

In [8]:
def load_and_prepare_senseval_data(words):
    """
    Load Senseval data for the specified words and preprocess it.
    Returns a DataFrame with features and labels.
    """
    data_frames = []
    for word in words:
        instances = senseval.instances(word)
        processed_instances = refine_senseval_data(instances)
        df = pd.DataFrame(processed_instances)
        data_frames.append(df)
    return pd.concat(data_frames, ignore_index=True)

In [9]:
# words = ['interest', 'line', 'serve']  # Example words
words =  ['hard.pos','line.pos','serve.pos','interest.pos']
data = load_and_prepare_senseval_data(words)

In [10]:
data["features"][0]

{'word': 'hard', 'pos': 'JJ', 'prev_word_pos': 'VBZ', 'next_word_pos': 'TO'}

In [11]:
def train_and_evaluate_model(data):
    X = data['features']
    y = data['senses'].map(lambda x: x[0])

    le = LabelEncoder()
    y_encoded = le.fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
    
    pipeline = Pipeline([
        ('dict_vectorizer', DictVectorizer(sparse=False)),  
        ('classifier', LogisticRegression(solver='liblinear'))
    ])
    
    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_test)
    
    print("Classification Report:")
    print(classification_report(y_test, y_pred, target_names=le.inverse_transform(range(len(le.classes_))), zero_division=0))

    print("==========================================")

train_and_evaluate_model(data)

Classification Report:
              precision    recall  f1-score   support

       HARD1       0.85      0.98      0.91       687
       HARD2       0.58      0.36      0.44       105
       HARD3       0.55      0.14      0.22        86
     SERVE10       0.77      0.86      0.81       380
     SERVE12       0.67      0.70      0.69       268
      SERVE2       0.52      0.39      0.44       175
      SERVE6       0.54      0.46      0.50        85
        cord       0.26      0.14      0.19        63
    division       0.56      0.06      0.11        82
   formation       0.63      0.22      0.33        76
  interest_1       0.47      0.69      0.56        55
  interest_2       0.00      0.00      0.00         3
  interest_3       0.00      0.00      0.00        12
  interest_4       0.50      0.30      0.38        33
  interest_5       0.72      0.74      0.73        88
  interest_6       0.91      0.92      0.91       240
       phone       0.32      0.08      0.13        85
    

## The classification report 

Model excels at identifying certain classes (e.g., HARD1, product) with high precision, recall, and f1-scores, indicating strong performance in these areas. Conversely, it struggles with classes like interest_2 and interest_3, showing zero precision and recall, likely due to insufficient training examples or class imbalance. Other classes such as cord and division also present challenges, with low scores across the board, which might be due to the model's difficulty in distinguishing them from similar classes.

The disparities in performance can be attributed to factors such as class imbalance, where some classes have significantly more examples than others, and limitations in feature representation that may not capture all nuances necessary for accurate classification across different senses. Additionally, the choice of model and its complexity may not be fully suited to the task's requirements.

Improving the model could involve addressing class imbalance through techniques like oversampling or undersampling, enhancing feature engineering to capture more contextual information, experimenting with more sophisticated models better suited for text classification, and tuning hyperparameters to optimize performance.