# Data Prep

In [1]:
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [2]:
# Example dataset
dataset = [
    # Sports-related sentences
    "The soccer team scored a goal in the final minute of the game.",
    "The basketball player set a new scoring record which helped the team win the game.",
    "The hockey goalie's quick reflexes prevented a goal, striking down the opponent's chances in the crucial game.",
    "The baseball pitcher's aim was to strike out the opponent and secure the game's result in their favor.",
    "The football coach praised the team's defensive positions, which helped achieve the desired result in the game.",

    # Politics-related sentences
    "The politician outlined his economic goals, striking a balance for the upcoming term.",
    "The party leader set a new fundraising record, which helped secure their position in the political game.",
    "The government's policies put it in a vulnerable position, striking a chord in the international arena.",
    "The president's speech during the rally set the goal for the political game ahead.",
    "The diplomat's strategic positioning helped achieve a positive result in negotiating a peace treaty."
]

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to get the WordNet POS tag
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

# Process the dataset
processed_words = set()
processed_dataset = []
for sentence in dataset:
    # Tokenize the sentence
    tokens = word_tokenize(sentence)
    # Remove punctuation and convert to lower-case
    tokens = [word.lower() for word in tokens if word.isalpha()]
    # Lemmatize the tokens
    lemmatized_tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in tokens]
    # Add to the set of unique words
    processed_words.update(lemmatized_tokens)
    # Add to process dataset
    processed_dataset.append(' '.join(lemmatized_tokens))

for processed_sentence in processed_dataset:
    print(processed_sentence)
# Print the unique words
print(processed_words)

the soccer team score a goal in the final minute of the game
the basketball player set a new score record which help the team win the game
the hockey goalie quick reflex prevent a goal strike down the opponent chance in the crucial game
the baseball pitcher aim be to strike out the opponent and secure the game result in their favor
the football coach praise the team defensive position which help achieve the desire result in the game
the politician outline his economic goal strike a balance for the upcoming term
the party leader set a new fundraise record which help secure their position in the political game
the government policy put it in a vulnerable position strike a chord in the international arena
the president speech during the rally set the goal for the political game ahead
the diplomat strategic position help achieve a positive result in negotiate a peace treaty
{'the', 'and', 'outline', 'team', 'political', 'ahead', 'peace', 'treaty', 'balance', 'goalie', 'upcoming', 'strike',

## Contexts Prep

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
# Step 2 & 3: Context Selection and Feature Extraction
target_words = ['goal']
context_size = 3  # Number of words to consider before and after the target word

contexts = []
for sentence in processed_dataset:
    tokens_list = sentence.split()
    for i, word in enumerate(tokens_list):
        if word in target_words:
            start_index = max(0, i - context_size)
            end_index = min(len(tokens_list), i + context_size + 1)
            context = ' '.join(tokens_list[start_index:i]) + f" {word} " + ' '.join(tokens_list[i+1:end_index])
            contexts.append(context)
print(contexts)


['team score a goal in the final', 'reflex prevent a goal strike down the', 'outline his economic goal strike a balance', 'rally set the goal for the political']


In [4]:
import pandas as pd
# Step 4: Model Training
vectorizer = CountVectorizer()
doc_vecs = vectorizer.fit_transform(contexts)
df = pd.DataFrame(doc_vecs.toarray(), columns=vectorizer.get_feature_names_out())
df

Unnamed: 0,balance,down,economic,final,for,goal,his,in,outline,political,prevent,rally,reflex,score,set,strike,team,the
0,0,0,0,1,0,1,0,1,0,0,0,0,0,1,0,0,1,1
1,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,1
2,1,0,1,0,0,1,1,0,1,0,0,0,0,0,0,1,0,0
3,0,0,0,0,1,1,0,0,0,1,0,1,0,0,1,0,0,2


# Init Parameters

In [49]:
# Initialize parameters
num_senses = 2
num_contexts, num_features = doc_vecs.shape
sense_probs = np.random.rand(num_senses)
sense_probs /= np.sum(sense_probs)
print(f"sense_probs \n{sense_probs}") # P(Y)
doc_given_sense_probs = np.random.rand(num_features, num_senses)
doc_given_sense_probs /= np.sum(doc_given_sense_probs, axis=1)[:, np.newaxis]
print(f"doc_given_sense_probs \n{doc_given_sense_probs}") # P(Y|X)

sense_probs 
[0.5341166 0.4658834]
doc_given_sense_probs 
[[0.88171699 0.11828301]
 [0.68808226 0.31191774]
 [0.19744135 0.80255865]
 [0.5669153  0.4330847 ]
 [0.57191202 0.42808798]
 [0.01617425 0.98382575]
 [0.64721788 0.35278212]
 [0.36913138 0.63086862]
 [0.05692972 0.94307028]
 [0.32734545 0.67265455]
 [0.49977954 0.50022046]
 [0.76381956 0.23618044]
 [0.70122699 0.29877301]
 [0.62079483 0.37920517]
 [0.28057818 0.71942182]
 [0.73193443 0.26806557]
 [0.46750064 0.53249936]
 [0.53204746 0.46795254]]


## E-Step
Using Bayes' theorem, the posterior probability of a sense given a document is calculated as:

$ P(\text{sense} | \text{doc}) = \frac{P(\text{doc} | \text{sense}) \cdot P(\text{sense})}{P(\text{doc})} $

where:

- $ P(\text{doc} | \text{sense}) $ is the likelihood of the document given the sense.
- $ P(\text{sense}) $ is the prior probability of the sense.
- $ P(\text{doc}) $ is the marginal probability of the document (a normalization factor).

### Calculation Steps in the E-Step

1. **Calculate the Joint Probability $ P(\text{doc}, \text{sense}) $**:
   - For each document and each sense, compute the product of the sense prior and the likelihood of the document given the sense.

2. **Compute the Marginal Probability $ P(\text{doc}) $**:
   - Sum the joint probabilities over all senses for each document. This serves as the normalization factor to convert joint probabilities into posterior probabilities.

3. **Compute the Posterior Probability $ P(\text{sense} | \text{doc}) $**:
   - Normalize the joint probabilities by dividing by the marginal probabilities.

**E-Step Code**

1. **Initialize `sense_given_doc_probs`**:
   - This matrix will store the joint probability of each document and each sense.

2. **Calculate Joint Probabilities**:
   - For each document and each sense, calculate the product of the sense prior (`sense_probs[s]`) and the likelihood of the document given the sense (`doc_given_sense_probs[t, s]` raised to the power of the word count in the document).

3. **Calculate Marginal Probabilities**:
   - Sum the joint probabilities over all senses for each document to get the marginal probability (`doc_probs`).

4. **Calculate Posterior Probabilities**:
   - Divide the joint probabilities by the marginal probabilities to get the posterior probabilities (`sense_given_doc_probs`).

In [40]:
def expectation(doc_vecs, doc_given_sense_probs, sense_probs):
  # E-Step
    sense_given_doc_probs = np.zeros((num_contexts, num_senses))

    for d in range(num_contexts):
        for s in range(num_senses):
            product = sense_probs[s]
            for t in range(num_features):
                if doc_vecs[d, t] > 0:
                    product *= doc_given_sense_probs[t, s] ** doc_vecs[d, t]
            sense_given_doc_probs[d, s] = product

    doc_probs = np.sum(sense_given_doc_probs, axis=1)[:, np.newaxis]
    sense_given_doc_probs = sense_given_doc_probs / doc_probs
    return sense_given_doc_probs # P(sense | doc)


## M-Step

The Bayesian formula states that:
$ P(\text{doc} | \text{sense}) = \frac{P(\text{sense} | \text{doc}) \cdot P(\text{doc})}{P(\text{sense})} $

In the context of the EM algorithm, we use the posterior probabilities from the E-Step (`P(sense | doc)`) to update our estimates of the parameters.

To update `sense_probs`:
$ P(\text{sense}) = \frac{1}{N} \sum_{d=1}^N P(\text{sense} | \text{doc}_d) $
where $ N $ is the number of documents.

To update `doc_given_sense_probs`:
$ P(\text{word}_t | \text{sense}) = \frac{\sum_{d=1}^N \text{count}(\text{word}_t \text{ in } \text{doc}_d) \cdot P(\text{sense} | \text{doc}_d)}{\sum_{t=1}^V \sum_{d=1}^N \text{count}(\text{word}_t \text{ in } \text{doc}_d) \cdot P(\text{sense} | \text{doc}_d)} $
where $ V $ is the vocabulary size.

In [50]:
def maximization(doc_vecs, sense_given_doc_probs):
  # M-Step
    sense_probs = np.sum(sense_given_doc_probs, axis=0) / num_contexts


    doc_given_sense_probs = np.zeros((num_features, num_senses))

    for s in range(num_senses):
        for t in range(num_features):
            numerator = 0
            denominator = 0
            for d in range(num_contexts):
                if doc_vecs[d, t] > 0:
                    numerator += doc_vecs[d, t] * sense_given_doc_probs[d, s]
                denominator += doc_vecs[d, t] * sense_given_doc_probs[d, s]
            doc_given_sense_probs[t, s] = numerator / np.sum(sense_given_doc_probs[:, s])

    doc_given_sense_probs /= np.sum(doc_given_sense_probs, axis=0)  # Normalize correctly across senses
    return doc_given_sense_probs, sense_probs


## EM Algorithm

In [51]:
# Number of iterations
num_iters = 1

for i in range(num_iters):
  sense_given_doc_probs = expectation(doc_vecs, doc_given_sense_probs, sense_probs)
  doc_given_sense_probs, sense_probs = maximization(doc_vecs, sense_given_doc_probs)
  #print(f"iteration {i}, sense_probs\n{sense_probs}")
  #print(f"doc_given_sense_probs\n{doc_given_sense_probs}") # P(X|Y)


In [52]:
# Final parameters
print("Final sense probabilities:", sense_probs)
print("Final word sense assignment probabilities:")
print(doc_given_sense_probs)
print(f"sense_given_doc_probs\n{sense_given_doc_probs}")

Final sense probabilities: [0.07133226 0.92866774]
Final word sense assignment probabilities:
[[0.00597378 0.04253212]
 [0.13418508 0.03299105]
 [0.00597378 0.04253212]
 [0.01330985 0.0419862 ]
 [0.01131253 0.04213483]
 [0.16478124 0.1596442 ]
 [0.00597378 0.04253212]
 [0.01330985 0.0419862 ]
 [0.00597378 0.04253212]
 [0.01131253 0.04213483]
 [0.13418508 0.03299105]
 [0.01131253 0.04213483]
 [0.13418508 0.03299105]
 [0.01330985 0.0419862 ]
 [0.01131253 0.04213483]
 [0.14015886 0.07552317]
 [0.01330985 0.0419862 ]
 [0.17012    0.1592469 ]]
sense_given_doc_probs
[[0.02304684 0.97695316]
 [0.23234986 0.76765014]
 [0.01034397 0.98965603]
 [0.01958835 0.98041165]]
