In [1]:
import re
import numpy as np
import pandas as pd
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.metrics.pairwise import euclidean_distances

In [None]:
Input (word indices) --> Embedding Layer --> Lambda Layer --> Dense Output Layer (Softmax)

Weights (W_emb, W_dense)       Biases (b_emb, b_dense)
    |                              |
    V                              V
Embedding Vectors                 logits
    |                              |
    V                              V
   Mean                           Softmax
    |                              |
    V                              V
  Loss                           Predicted Probabilities


In [2]:
# Download stopwords
#nltk.download('stopwords')
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

# Example corpus
corpus = ['The sky is blue and beautiful.',
          'Love this blue and beautiful sky!',
          'The quick brown fox jumps over the lazy dog.',
          "A king's breakfast has sausages, ham, bacon, eggs, toast and beans",
          'I love green eggs, ham, sausages and bacon!',
          'The brown fox is quick and the blue dog is lazy!',
          'The sky is very blue and the sky is very beautiful today',
          'The dog is lazy but the brown fox is quick!'    
]

In [3]:
# Function to normalize documents by lowercasing and removing special characters and stopwords
def normalize_document(doc):
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I | re.A)
    doc = doc.lower()
    doc = doc.strip()
    tokens = wpt.tokenize(doc)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    doc = ' '.join(filtered_tokens)
    return doc


In [4]:
# Normalize the corpus
normalize_corpus = np.vectorize(normalize_document)
corpus_df = normalize_corpus(corpus)

# Tokenize the normalized corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus_df)
word2id = tokenizer.word_index
word2id['PAD'] = 0  # Add a PAD token
id2word = {v: k for k, v in word2id.items()}  # Reverse mapping
vocab_size = len(word2id)  # Vocabulary size

# Convert corpus to sequences of word indices
wids = [tokenizer.texts_to_sequences([doc])[0] for doc in corpus_df]

In [5]:
print(wids)

[[1, 2, 3], [9, 2, 3, 1], [4, 5, 6, 14, 7, 8], [15, 16, 10, 11, 12, 13, 17, 18], [9, 19, 13, 11, 10, 12], [5, 6, 4, 2, 8, 7], [1, 2, 1, 3, 20], [8, 7, 5, 6, 4]]


In [5]:
# Function to generate context and target word pairs
def generate_context_word_pairs(corpus, window_size, vocab_size):
    context_length = window_size * 2
    for words in corpus:
        sentence_length = len(words)
        for index, word in enumerate(words):
            context_words = []
            label_word = []
            start = index - window_size
            end = index + window_size + 1

            context_words.append([words[i] 
                                  for i in range(start, end) 
                                  if 0 <= i < sentence_length 
                                  and i != index])
            label_word.append(word)

            x = pad_sequences(context_words, maxlen=context_length)
            y = to_categorical(label_word, num_classes=vocab_size)
            yield (x, y)

In [14]:
# Hyperparameters
embed_size = 100
window_size = 2
learning_rate = 0.001
epochs = 10

# Initialize embeddings randomly
np.random.seed(0)
W_emb = np.random.randn(vocab_size, embed_size)
print(W_emb.shape)

(21, 100)


In [45]:
# Training loop for the CBOW model
for epoch in range(1, epochs + 1):
    total_loss = 0
    for context, target in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
        # Get context embeddings by averaging the embeddings of context words
        context_emb = np.mean(W_emb[context.flatten()], axis=0, keepdims=True)
        
        # Calculate logits by dot product of context embedding and all word embeddings
        #Each value in logits represents the raw score for each word in the vocabulary given the current context.
        #log odds log-odds=log(p/ 1−p)
        logits = np.dot(context_emb, W_emb.T)

        # Calculate probabilities using softmax
        exp_logits = np.exp(logits - np.max(logits))
        probs = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)

        # Ensure target is an integer
        target = int(target[0].argmax())
        target_one_hot = np.zeros(vocab_size)
        target_one_hot[target] = 1
        # Calculate loss
        loss = -np.log(probs[0, target] + 1e-10)
        total_loss += loss

        # Calculate gradient
        dlogits = probs.copy()
        dlogits[0, target] -= 1
        dW_emb = np.dot(dlogits.T, context_emb)  # Shape (vocab_size, embed_size)
        # Update embeddings
        W_emb -= learning_rate * dW_emb  # Update embeddings directly
    
    print('Epoch:', epoch, '\tLoss:', total_loss)


Epoch: 1 	Loss: 944.5029015245902
Epoch: 2 	Loss: 943.2628341141923
Epoch: 3 	Loss: 941.9324901693252
Epoch: 4 	Loss: 940.4854422314074
Epoch: 5 	Loss: 938.8783850254699
Epoch: 6 	Loss: 937.0580778873933
Epoch: 7 	Loss: 934.9785617127623
Epoch: 8 	Loss: 932.6229004323948
Epoch: 9 	Loss: 930.0165375444623
Epoch: 10 	Loss: 927.2230335719763


In [8]:
for context, target in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
        # Get context embeddings by averaging the embeddings of context words
        context_emb = np.mean(W_emb[context.flatten()], axis=0, keepdims=True)

In [53]:
print(context_emb)

[[ 4.33869731e-01  2.44147504e-01  4.24064288e-01  8.71231078e-01
   3.12554584e-01 -6.68303417e-01 -3.37111166e-02 -7.64465459e-01
  -3.36393720e-01  3.68509214e-01 -1.83566443e-01  5.36875157e-01
   1.40642873e-01 -2.03243578e-01  6.93473014e-03  2.54619687e-01
   7.58215810e-01  2.56241496e-01  4.35380757e-01 -2.74402862e-01
  -8.70887122e-01 -5.46410039e-04 -1.06167793e-01 -5.74480867e-01
   9.37078459e-01 -1.06692945e+00  2.90726226e-01  5.58800012e-01
   4.72670650e-03  7.71244386e-01 -6.06658931e-01  2.42592162e-01
   5.91583650e-02 -8.39997310e-01 -4.95558546e-01 -2.06798838e-02
   2.01876937e-01  5.30379516e-01 -1.58785115e-02  1.62576903e-01
  -3.64268046e-01 -4.99421086e-01 -5.13209539e-01  9.02599573e-01
  -6.17507853e-01 -3.29466327e-01  6.97938007e-03 -3.60405844e-01
  -4.65876962e-01  1.65977000e-01 -2.15225918e-01 -3.48596904e-01
   9.05046911e-02 -3.97003512e-01 -1.43867961e-01  5.87948685e-01
  -5.07030848e-01 -1.28619730e-01 -6.00076356e-01 -5.43623444e-01
   3.15994

In [46]:
logits = np.dot(context_emb, W_emb.T)
print(logits)

[[ 26.28816868  -1.70639764   7.5373193    7.95847665   4.93819711
   22.49125996  17.4996644   -1.47959475   8.83300924  -2.28078976
    5.23002835  -2.87721974   0.50356768  -5.03261581  -2.09566607
   -6.35822399  -1.41635912  -5.43573681   8.65811141 -10.41213501
    0.98561404]]


In [47]:
exp_logits = np.exp(logits - np.max(logits))
print(exp_logits)

[[1.00000000e+00 6.95207305e-13 7.18802506e-09 1.09525665e-08
  5.34349193e-10 2.24400333e-02 1.52475859e-04 8.72194949e-13
  2.62615632e-08 3.91434407e-13 7.15427872e-10 2.15592053e-13
  6.33709282e-12 2.49779119e-14 4.71039356e-13 6.63516627e-15
  9.29129937e-13 1.66910210e-14 2.20477062e-08 1.15149121e-16
  1.02621922e-11]]


In [48]:
probs = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)

In [49]:
print(probs)

[[9.77906571e-01 6.79847791e-13 7.02921693e-09 1.07105867e-08
  5.22543586e-10 2.19442560e-02 1.49107144e-04 8.52925171e-13
  2.56813552e-08 3.82786279e-13 6.99621617e-10 2.10828885e-13
  6.19708470e-12 2.44260642e-14 4.60632481e-13 6.48857270e-15
  9.08602271e-13 1.63222591e-14 2.15605967e-08 1.12605082e-16
  1.00354651e-11]]


In [50]:
target = int(target[0].argmax())
target_one_hot = np.zeros(vocab_size)
target_one_hot[target] = 1

TypeError: 'int' object is not subscriptable

In [19]:
print(target)

4


In [22]:
total_loss = 0
loss = -np.log(probs[0, target] + 1e-10)
total_loss += loss

print(loss)
print(total_loss)

23.025850924677904
23.025850924677904


In [23]:
dlogits = probs.copy()
dlogits[0, target] -= 1
dW_emb = np.dot(dlogits.T, context_emb)

In [27]:
print(dW_emb)

[[ 5.90101944e-01  2.95847739e-01  5.27363581e-01 ...  6.40093299e-01
  -1.19591257e-01  4.26831213e-01]
 [ 1.03023080e-20  5.16506439e-21  9.20698890e-21 ...  1.11750832e-20
  -2.08788664e-21  7.45184231e-21]
 [ 9.50530767e-17  4.76548809e-17  8.49472390e-17 ...  1.03105638e-16
  -1.92636493e-17  6.87535780e-17]
 ...
 [ 1.26242165e-16  6.32915371e-17  1.12820371e-16 ...  1.36936956e-16
  -2.55844932e-17  9.13131993e-17]
 [ 4.41236978e-26  2.21214256e-26  3.94325616e-26 ...  4.78617018e-26
  -8.94219808e-27  3.19154540e-26]
 [ 9.79633744e-21  4.91139593e-21  8.75481202e-21 ...  1.06262486e-20
  -1.98534561e-21  7.08586480e-21]]


In [37]:
# After training, get the weights
weights = W_emb[:-1]  # Exclude the padding token, which is the last word
words = list(id2word.values())[1:]

print("Weights shape:", weights.shape)


Weights shape: (20, 100)


In [51]:
# Display the first few entries as an example
for i in range( len(words)):
    print(f"Word: {words[i]}, Embedding: {weights[i]}")


Word: blue, Embedding: [ 1.44260631  0.30760541  0.7554979   1.70227294  1.5020663  -0.7401356
  0.78597678  0.07032749 -0.16422806  0.27715857  0.05550069  1.02306317
  0.65419066  0.1274679   0.39378987  0.23704599  1.1199338  -0.14730263
  0.08443118 -0.67438012 -1.9437767   0.63271668  0.73128187 -0.55344176
  1.89746555 -1.0667663   0.01574703 -0.36101645  1.22670284  1.088376
  0.24332892  0.36991714 -0.71709415 -1.511144   -0.11273638  0.05238371
  0.95754241  1.00568487 -0.33394521 -0.29034411 -0.77541633 -1.04758325
 -1.38553675  1.53572921 -0.44099889 -0.2889349  -0.99868571  0.58629031
 -1.21847158 -0.1219166  -0.66243026  0.37028637 -0.44852086 -0.92269269
 -0.0734838   0.282887    0.18618687  0.39490858 -0.34361612 -0.2307234
 -0.52076858 -0.31440852 -0.65481723 -1.32746559  0.26391866 -0.32886124
 -1.26319437  0.35846627 -0.6822984   0.00813982  0.52266202  0.34638231
  0.80694934 -0.94885119  0.32875327 -0.5571705  -0.65449171 -0.35551423
 -0.13520017 -0.10015417 -0.8383

In [54]:
# Compute pairwise distance matrix
distance_matrix = euclidean_distances(weights)
print(distance_matrix)


[[ 0.         12.25736271 12.20502926 12.95156125 13.14771175 13.36592664
  13.65326276 12.17268405 11.57896117 12.31667388 11.59017851 13.38512061
  12.9427277  12.70576757 13.11573418 13.42631909 13.13213807 14.103094
  10.87713575 13.28708538]
 [12.25736271  0.         13.22602525 13.27733807 14.63631947 14.99867332
  14.91168111 13.29425771 13.64323293 14.55375831 12.95277941 14.85461051
  14.38479418 13.73352594 14.47679282 14.07924172 13.68015753 14.68109741
  13.88532565 12.86205633]
 [12.20502926 13.22602525  0.         12.21924085 13.23668663 12.61753303
  13.18679027 13.77409401 13.67952494 14.33103298 13.53656289 12.70272532
  13.67362371 12.80825614 13.95435007 13.83888391 14.52967521 13.63603339
  13.34513726 13.36703017]
 [12.95156125 13.27733807 12.21924085  0.         14.43814243 11.818124
  12.20248985 14.70427831 12.76990224 13.46125669 12.99453952 13.98159758
  13.54135769 13.21313697 13.30343575 13.83813515 13.95364023 13.64741459
  13.71603336 13.19488035]
 [13.147

In [55]:
def find_similar_words(search_term, top_n=5):
    if search_term not in word2id:
        return []  # Return an empty list if search_term is not found
    
    # Get the index of the search_term in the word2id dictionary
    word_idx = word2id[search_term]
    
    # Retrieve the indices of the top_n most similar words (excluding the search_term itself)
    similar_indices = distance_matrix[word_idx-1].argsort()[1:top_n + 1]
    
    # Return the top_n most similar words based on their indices in the id2word dictionary
    similar_words = [id2word[idx+1] for idx in similar_indices]
    
    return similar_words


In [56]:
# Find similar words for specific terms
similar_words = {search_term: find_similar_words(search_term) for search_term in ['fox', 'sky', 'breakfast']}

# Print similar words
print("Similar words:")
for term, similar in similar_words.items():
    print(f"{term}: {similar}")

Similar words:
fox: ['quick', 'beautiful', 'love', 'beans', 'bacon']
sky: ['green', 'love', 'ham', 'dog', 'beautiful']
breakfast: ['bacon', 'dog', 'toast', 'brown', 'today']
