In [1]:
import nltk
from nltk.tokenize import punkt
#import emoji
import re
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Adi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
corpus = 'I am happy because I am learning'

In [3]:
def preprocess(corpus):
    data = re.sub(r'[,!?;-]+','.', corpus)
    data = nltk.word_tokenize(data)
    data = [ch.lower() for ch in data if ch.isalpha() or ch == '.']
    return data

words = preprocess(corpus)
print(words)

['i', 'am', 'happy', 'because', 'i', 'am', 'learning']


In [4]:
def get_windows(words, C):
    windows = []
    for i in range(C,len(words)-C):
        center_word = words[i]
        context_words = words[(i-C):i] + words[(i+1):(i+C+1)]
        yield  context_words, center_word

print(get_windows(words,2))

<generator object get_windows at 0x000002A8EAA5DAD0>


In [5]:
def build_vocab(corpus):
    vocab = {}
    for word in corpus:
        vocab[word] = vocab.get(word,0)+1
    return sorted(vocab.keys())

In [6]:
vocab = build_vocab(words)

In [7]:
def one_hot_vector(sorted_vocab, word):
    vec = np.zeros(len(sorted_vocab))
    #print(vec)
    vec[sorted_vocab.index(word)] = 1
    #print(vec)
    return vec

In [8]:
print(one_hot_vector(vocab, "because"))

[0. 1. 0. 0. 0.]


In [9]:
def one_hot_context_words(context_words, vocab):
    vec = np.mean([one_hot_vector(vocab, word) for word in context_words ], axis=0)
    return vec

In [10]:
print(one_hot_context_words(["i","am","because","i"], vocab))

[0.25 0.25 0.   0.5  0.  ]


In [11]:
def get_windows(words, window_size):
    for i in range(len(words) - window_size):
        context = words[i:i + window_size]
        target = words[i + window_size]
        yield context, target

['i', 'am', 'because', 'i'],[0.25 0.25 0.   0.5  0.  ],happy, [0. 0. 1. 0. 0.]
['am', 'happy', 'i', 'am'],[0.5  0.   0.25 0.25 0.  ],because, [0. 1. 0. 0. 0.]
['happy', 'because', 'am', 'learning'],[0.25 0.25 0.25 0.   0.25],i, [0. 0. 0. 1. 0.]


In [20]:
context_vectors = []
target_vectors = []
for context, target in get_windows(words, 2):
    context_vectors.append(one_hot_context_words(context, vocab))
    target_vectors.append(one_hot_vector(vocab, target))

context_vectors = np.array(context_vectors)
target_vectors = np.array(target_vectors)

In [35]:
class Model(nn.Module):

    def __init__(self, in_features=5, h1=10, out_features=4):
        super().__init__()
        self.fc1 = nn.Linear(in_features, h1)
        self.fc2 = nn.Linear(h1, out_features)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x) 
        return x

In [50]:
context_tensors = torch.tensor(context_vectors, dtype=torch.float32)
target_tensors = torch.tensor(target_vectors, dtype=torch.float32)
model = Model()
epochs = 100
batch_size = 4
losses = []
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.01)


In [55]:
for j in range(epochs):
    epoch_loss = 0.0
    for i in range(0, len(context_tensors), batch_size):
        context_batch = context_tensors[i:i + batch_size]
        target_batch = target_tensors[i:i + batch_size]

        optimizer.zero_grad()
        output = model(context_batch)
        loss = criterion(output, torch.argmax(target_batch, axis=1))  # Assuming target_batch is one-hot encoded
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

        print(f'Epoch [{j+1}/{epochs}], Batch [{i//batch_size + 1}/{len(context_tensors)//batch_size}], Loss: {loss.item():.4f}')

    print(f'Epoch [{j+1}/{epochs}], Loss: {epoch_loss / len(context_tensors):.4f}')
    losses.append(epoch_loss / len(context_tensors))

Epoch [1/100], Batch [1/0], Loss: 0.0785
Epoch [1/100], Loss: 0.0262
Epoch [2/100], Batch [1/0], Loss: 0.0760
Epoch [2/100], Loss: 0.0253
Epoch [3/100], Batch [1/0], Loss: 0.0737
Epoch [3/100], Loss: 0.0246
Epoch [4/100], Batch [1/0], Loss: 0.0715
Epoch [4/100], Loss: 0.0238
Epoch [5/100], Batch [1/0], Loss: 0.0694
Epoch [5/100], Loss: 0.0231
Epoch [6/100], Batch [1/0], Loss: 0.0674
Epoch [6/100], Loss: 0.0225
Epoch [7/100], Batch [1/0], Loss: 0.0654
Epoch [7/100], Loss: 0.0218
Epoch [8/100], Batch [1/0], Loss: 0.0636
Epoch [8/100], Loss: 0.0212
Epoch [9/100], Batch [1/0], Loss: 0.0617
Epoch [9/100], Loss: 0.0206
Epoch [10/100], Batch [1/0], Loss: 0.0600
Epoch [10/100], Loss: 0.0200
Epoch [11/100], Batch [1/0], Loss: 0.0584
Epoch [11/100], Loss: 0.0195
Epoch [12/100], Batch [1/0], Loss: 0.0568
Epoch [12/100], Loss: 0.0189
Epoch [13/100], Batch [1/0], Loss: 0.0552
Epoch [13/100], Loss: 0.0184
Epoch [14/100], Batch [1/0], Loss: 0.0538
Epoch [14/100], Loss: 0.0179
Epoch [15/100], Batch [1

In [52]:
print(losses[len(losses)-1])

0.02698524296283722


In [57]:
test = torch.tensor([0.5,  0.0 ,  0.25, 0.25, 0.0  ], dtype=torch.float32)
idx = int(torch.argmax(model(test)))
print(vocab[idx])

because
