In [219]:
import nltk
from nltk.tokenize import punkt
#import emoji
import re
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Adi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [220]:
def read_corpus(filepath):
    file = open(filepath, encoding="utf8")
    return (file.read())

In [221]:
corpus = read_corpus("Data/shakespeare.txt")

In [222]:
def preprocess(corpus):
    data = re.sub(r'[,!?;-]+','.', corpus)
    data = nltk.word_tokenize(data)
    data = [ch.lower() for ch in data if ch.isalpha() or ch == '.']
    return data

words = preprocess(corpus)
print(words)



In [223]:
def get_windows(words, C):
    for i in range(C,len(words)-C):
        center_word = words[i]
        context_words = words[(i-C):i] + words[(i+1):(i+C+1)]
        yield  context_words, center_word

print(get_windows(words,2))

<generator object get_windows at 0x000002AB021F4A40>


In [224]:
def build_vocab(corpus):
    vocab = {}
    for word in corpus:
        vocab[word] = vocab.get(word,0)+1
    return sorted(vocab.keys()), vocab

In [225]:
vocab, v = build_vocab(words)
v["throw"]

5

In [226]:
def one_hot_vector(sorted_vocab, word):
    vec = np.zeros(len(sorted_vocab))
    #print(vec)
    vec[sorted_vocab.index(word)] = 1
    #print(vec)
    return vec

In [227]:
print(one_hot_vector(vocab, "because"))

[0. 0. 0. ... 0. 0. 0.]


In [228]:
def one_hot_context_words(context_words, vocab):
    vec = np.mean([one_hot_vector(vocab, word) for word in context_words ], axis=0)
    return vec

In [229]:
def get_windows(words, window_size):
    for i in range(len(words) - window_size):
        context = words[i:i + window_size]
        target = words[i + window_size]
        yield context, target

In [230]:
context_vectors = []
target_vectors = []
for context, target in get_windows(words, 2):
    context_vectors.append(one_hot_context_words(context, vocab))
    target_vectors.append(one_hot_vector(vocab, target))

context_vectors = np.array(context_vectors)
target_vectors = np.array(target_vectors)

In [231]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [232]:
class Model(nn.Module):

    def __init__(self, in_features=len(vocab), h1=10, out_features=len(vocab)):
        super().__init__()
        self.fc1 = nn.Linear(in_features, h1)
        self.fc2 = nn.Linear(h1, out_features)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x) 
        return x

In [233]:
context_tensors = torch.tensor(context_vectors, dtype=torch.float32)
target_tensors = torch.tensor(target_vectors, dtype=torch.float32)
model = Model().to(device)
epochs = 50
batch_size = 100
losses = []
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.01)


In [234]:

model.to(device)

context_tensors = context_tensors.to(device)
target_tensors = target_tensors.to(device)

for j in range(epochs):
    epoch_loss = 0.0
    for i in range(0, len(context_tensors), batch_size):
        context_batch = context_tensors[i:i + batch_size]
        target_batch = target_tensors[i:i + batch_size]

        optimizer.zero_grad()
        output = model(context_batch)
        loss = criterion(output, torch.argmax(target_batch, axis=1)) 
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        
    print(f'Epoch [{j+1}/{epochs}], Loss: {epoch_loss / len(context_tensors):.4f}')
    losses.append(epoch_loss / len(context_tensors))


Epoch [1/50], Loss: 0.0624
Epoch [2/50], Loss: 0.0578
Epoch [3/50], Loss: 0.0565
Epoch [4/50], Loss: 0.0554
Epoch [5/50], Loss: 0.0546
Epoch [6/50], Loss: 0.0539
Epoch [7/50], Loss: 0.0533
Epoch [8/50], Loss: 0.0528
Epoch [9/50], Loss: 0.0524
Epoch [10/50], Loss: 0.0521
Epoch [11/50], Loss: 0.0518
Epoch [12/50], Loss: 0.0515
Epoch [13/50], Loss: 0.0513
Epoch [14/50], Loss: 0.0511
Epoch [15/50], Loss: 0.0509
Epoch [16/50], Loss: 0.0507
Epoch [17/50], Loss: 0.0506
Epoch [18/50], Loss: 0.0505
Epoch [19/50], Loss: 0.0504
Epoch [20/50], Loss: 0.0503
Epoch [21/50], Loss: 0.0502
Epoch [22/50], Loss: 0.0501
Epoch [23/50], Loss: 0.0500
Epoch [24/50], Loss: 0.0499
Epoch [25/50], Loss: 0.0499
Epoch [26/50], Loss: 0.0498
Epoch [27/50], Loss: 0.0497
Epoch [28/50], Loss: 0.0497
Epoch [29/50], Loss: 0.0496
Epoch [30/50], Loss: 0.0496
Epoch [31/50], Loss: 0.0496
Epoch [32/50], Loss: 0.0495
Epoch [33/50], Loss: 0.0495
Epoch [34/50], Loss: 0.0494
Epoch [35/50], Loss: 0.0494
Epoch [36/50], Loss: 0.0494
E

In [235]:
print(losses[len(losses)-1])

0.04906345389520254


In [239]:
test = torch.tensor(one_hot_context_words(["and","monarchs","behold", "the"], vocab), dtype=torch.float32).to(device)

model.eval()

with torch.no_grad():
    output = model(test)
    idx = int(torch.argmax(output))    
predicted_word = vocab[idx]

print(predicted_word)

.


In [None]:
print(len(vocab))

7112
