In [36]:
from datasets import load_dataset
import random
import re
from tqdm import tqdm
import pickle
import pandas as pd
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import os
import torch.optim as optim
# Seed the global RNGs of Python's `random`, PyTorch and NumPy with the same fixed value.
# `random` is the generator embed_train_dataset.__getitem__ draws from when it picks a target.
random.seed(42)
torch.manual_seed(42)
np.random.seed(42)


In [2]:
# Load the 'v1.1' configuration of the 'ms_marco' dataset; it is iterated split by split further down.
marco_dataset = load_dataset('ms_marco', 'v1.1')

In [3]:
# Load the pre-built tokenizer; the cells below use it as a word -> id mapping
# (presumably integer ids, since they are later packed into tensors).
# NOTE(review): pickle.load can execute arbitrary code — only open a tokenizer.pickle you produced yourself.
with open('tokenizer.pickle', 'rb') as file:
    tokenizer = pickle.load(file)

In [4]:
def tokenize(word, tokenizer=tokenizer):
    """Look up the id of ``word`` in ``tokenizer``.

    Args:
        word: The key to look up.
        tokenizer: Mapping from words to ids. Defaults to the module-level
            ``tokenizer`` object as it was bound when this function was defined.

    Returns:
        ``tokenizer[word]`` when the word is present, otherwise ``0``.
    """
    return tokenizer.get(word, 0)


In [5]:
# Reverse lookup table (tokenizer value -> tokenizer key); filled in place by inverse().
inverse_tokens = {}


def inverse(tokenizer):
    """Add the reversed (value -> key) pairs of ``tokenizer`` to ``inverse_tokens``.

    Mutates the module-level ``inverse_tokens`` dict and returns ``None``.
    When several keys share a value, the key iterated last wins.
    """
    inverse_tokens.update((token_id, token) for token, token_id in tokenizer.items())


inverse(tokenizer)

In [6]:
def reverse_tokenize(word,inverse_tokens=inverse_tokens):
    """Map a token id back to its word.

    Args:
        word: Token id; anything ``int()`` accepts (plain int, NumPy integer,
            one-element tensor, numeric string).
        inverse_tokens: Mapping from id to word. Defaults to the module-level
            ``inverse_tokens`` dict as bound when this function was defined.

    Returns:
        The word stored under the id, or ``None`` for id ``0`` (the value
        ``tokenize`` returns for words missing from the tokenizer).

    Raises:
        KeyError: If the id is non-zero and absent from ``inverse_tokens``.
    """
    token_id = int(word)
    if token_id == 0:
        return None
    # Look up with the normalized int rather than the raw argument: the zero
    # check already converts with int(), and a tensor (or numeric string) would
    # otherwise pass that check and then miss the int keys of the table.
    return inverse_tokens[token_id]

In [7]:
# Flatten every passage of every split into one long list of token ids.
# Per passage: drop every character that is not an ASCII letter, '-' or whitespace,
# lower-case the rest, split on single spaces and run each piece through tokenize().
# NOTE(review): split(" ") yields empty strings for consecutive spaces and does not
# split on newlines/tabs; those pieces are tokenized like any other word.
tokenized_marco_text = []
for split in marco_dataset:
    tokenized_sample = []  # NOTE(review): reset for each split but never read in this cell
    for sample in marco_dataset[split]:
        for passage in sample['passages']['passage_text']:
            cleaned = re.sub(r'[^a-zA-Z-\s]', '', passage).lower()
            tokenized_marco_text.extend(tokenize(piece) for piece in cleaned.split(" "))

In [None]:
# Write the token-id corpus to disk, then reload it from that same file.
# The target directory is created first: open(..., "wb") does not create
# missing parent directories and would raise FileNotFoundError on a fresh checkout.
os.makedirs("tokenized_marco", exist_ok=True)
with open("tokenized_marco/tokenized_marco_text","wb") as file:
    pickle.dump(tokenized_marco_text,file)

# NOTE(review): pickle.load can execute arbitrary code — only load a cache file you wrote yourself.
with open("tokenized_marco/tokenized_marco_text","rb") as file:
    tokenized_marco_text = pickle.load(file)  #To load the file

In [9]:
class embed_train_dataset(Dataset):
    """Sliding-window dataset of (context, target) pairs for CBOW-style training.

    Item ``i`` is built from the ``2 * window + 1`` consecutive entries
    ``words[i : i + 2 * window + 1]``. One randomly chosen entry of that window
    becomes the target; the remaining ``2 * window`` entries, in their original
    order, form the context.

    NOTE(review): the target is drawn with the global ``random`` module, so the
    same index can yield a different pair on every access, and the target is
    not necessarily the centre word. Confirm this is intended.

    Args:
        words: Sequence of token ids supporting ``len()`` and slicing.
        window: Number of entries taken on each side of the window centre.

    Raises:
        ValueError: If ``window`` is smaller than 1 (there would be no context).
    """

    def __init__(self, words, window=2):
        if window < 1:
            raise ValueError(f"window must be >= 1, got {window}")
        self.data = words
        self.window = window

    def __len__(self):
        # One item per position that has `window` entries on both sides; derived
        # from the actual window size and clamped so short inputs give 0, not a
        # negative length.
        return max(0, len(self.data) - 2 * self.window)

    def __getitem__(self, idx):
        """Return ``(context, target)`` tensors for window number ``idx``.

        Returns:
            A 1-D tensor holding the ``2 * window`` context ids and a 0-D tensor
            holding the target id.

        Raises:
            IndexError: If ``idx`` is outside ``range(len(self))``.
        """
        # Raising here (instead of returning a short window or None) keeps every
        # item the same length for batching and lets plain iteration terminate.
        if not 0 <= idx < len(self):
            raise IndexError(f"index {idx} out of range for dataset of length {len(self)}")

        centre = idx + self.window
        # Copy into a fresh list so removing the target never touches self.data.
        sent = list(self.data[centre - self.window:centre + self.window + 1])
        target = sent.pop(random.randrange(len(sent)))
        return torch.tensor(sent), torch.tensor(target)
        



In [10]:
# Smoke test: wrap the tokenized corpus in the dataset (default window) and
# print the first shuffled batch of size 1, then stop.
dataset = embed_train_dataset(tokenized_marco_text)
dataloader = DataLoader(dataset, batch_size=1,shuffle=True)

for data in dataloader:
    print (data)
    break



[tensor([[   1, 4582,  277,   38]]), tensor([15])]


In [14]:
# Number of entries in the tokenizer; CBOW's default vocab_size is hard-coded to this value (76288).
len(tokenizer)

76288

In [38]:
class CBOW(nn.Module):
    """Continuous-bag-of-words model: averaged context embeddings -> vocabulary log-probabilities.

    Args:
        vocab_size: Number of rows in the embedding table and number of output classes.
            NOTE(review): must be larger than the largest token id fed in; confirm
            the tokenizer's ids stay below the default of 76288.
        embedding_dim: Size of each embedding vector.
    """

    def __init__(self, vocab_size = 76288, embedding_dim = 256):
        super().__init__()
        # Attribute names are part of the checkpoint format (state_dict keys).
        self.embed = nn.Embedding(vocab_size, embedding_dim)
        self.lin = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary for each row of ``inputs``.

        ``inputs`` is expected to hold integer token ids laid out as
        (batch, context) — the embeddings are averaged over dimension 1 and the
        log-softmax is taken over dimension 1 of the projected result.
        """
        pooled = self.embed(inputs).mean(dim=1)
        return F.log_softmax(self.lin(pooled), dim=1)



In [43]:
def train_loop(number_epochs=5, batch_size=128, lr=0.003):
    """Train a fresh CBOW model on ``tokenized_marco_text`` and checkpoint the best epoch.

    Builds the dataset and a shuffled DataLoader from the module-level
    ``tokenized_marco_text``, trains with Adam, prints the mean training loss
    after every epoch and saves the model's ``state_dict`` to
    ``checkpoints/best.pt`` whenever that mean improves on the best seen so far.

    Args:
        number_epochs: Number of passes over the dataset.
        batch_size: Number of (context, target) pairs per optimisation step.
        lr: Learning rate passed to Adam.

    Returns:
        None. Progress is printed; the best weights are written to disk.
    """
    os.makedirs("checkpoints", exist_ok=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print (device)
    dataset = embed_train_dataset(tokenized_marco_text)
    dataloader = DataLoader(dataset, batch_size=batch_size,shuffle=True)

    model = CBOW().to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    best_loss = float("inf")
    for epoch in range(number_epochs):
        model.train()
        epoch_loss = 0.0
        for X,Y in tqdm(dataloader):
            X = X.to(device)
            Y = Y.to(device)
            optimizer.zero_grad()
            pred = model(X)
            # CBOW.forward already returns log-probabilities, so use the
            # negative log-likelihood directly; cross_entropy would apply a
            # second, redundant log-softmax over the whole vocabulary.
            loss = F.nll_loss(pred,Y)
            loss.backward()
            optimizer.step()
            # Accumulate a plain Python float: adding the loss tensor itself
            # would keep autograd history alive and grow memory every step.
            epoch_loss += loss.item()
        epoch_loss = epoch_loss/len(dataloader)
        print(f"Epoch: {epoch}/{number_epochs}, loss: {epoch_loss} ")
        # Model selection is based on the mean *training* loss of the epoch.
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            torch.save(model.state_dict(), 'checkpoints/best.pt')
            print("Model improved. Saved.")


In [44]:
# Run training; the weights of the best epoch are written to checkpoints/best.pt.
train_loop()

cuda


  0%|          | 0/467576 [00:02<?, ?it/s]


KeyboardInterrupt: 