In [1]:
import torch
import torch.nn as nn
import wandb
import datetime
from dotenv import load_dotenv 
# import pandas as pd
import json
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np
import random
import re
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
import torch.optim as optim
import os

#from nltk.tokenize import word_tokenize
from collections import Counter

# from ...src.utils.wiki_tokens.wiki_cleaner import getMorphemeList
# import ../src/utiils/wiki_token/wiki_cleaner
# Load variables from a local .env file (the W&B key is read from the environment later on).
load_dotenv()

# Seed every random number generator this notebook draws from with the same value,
# in the order torch -> random -> numpy.
for seed_rng in (torch.manual_seed, random.seed, np.random.seed):
    seed_rng(42)


In [2]:
# Load the word -> token mapping that later cells index with `tokenizer[word]`.
# JSON is UTF-8 by specification, so the encoding is given explicitly instead of
# inheriting whatever the platform locale happens to be.
with open("../tokens/tokens.json", "r", encoding="utf-8") as file:
    tokenizer = json.load(file)

In [3]:
# Read the entire corpus file into memory as one string.
# NOTE(review): the file is decoded with the platform default encoding — confirm the
# corpus encoding and pass it explicitly if it is known.
with open("text") as corpus_file:
    wiki = corpus_file.read()

In [None]:
# Context radius (tokens on each side) read by embed_train_dataset.
window = 2

# Break the corpus on single spaces, show the first piece, then discard it.
words = wiki.split(" ")
print (words[0])
del words[0]

# Keep only the words that have a tokenizer entry, replacing each with its mapped value.
tokenized_wiki = [tokenizer[word] for word in words if word in tokenizer]
        

In [None]:
# Show the first 100 token values as a quick sanity check.
print(tokenized_wiki[:100])

In [6]:
class embed_train_dataset(Dataset):
    """Sliding-window dataset that yields ``(context, target)`` token tensors.

    Every item covers ``2 * window + 1`` consecutive tokens. One position inside
    that span is drawn at random as the target; the remaining ``2 * window``
    tokens, in their original order, form the context.

    NOTE(review): textbook CBOW predicts the *centre* token, whereas the target
    here is sampled from anywhere in the span. The sampling is kept unchanged —
    confirm it is intentional.

    Args:
        words: Sliceable sequence of token ids (e.g. a list of ints).
        window: Number of tokens taken on each side of the centre position.
            Defaults to 2, the value of the notebook-level ``window``.

    Raises:
        ValueError: If ``window`` is smaller than 1.
    """

    def __init__(self, words, window=2):
        if window < 1:
            raise ValueError(f"window must be >= 1, got {window}")
        self.data = words
        # Stored on the instance so the dataset no longer depends on notebook globals.
        self.window = window

    def __len__(self):
        # Only centres with a full span on both sides are served. Clamp at zero so
        # a too-short sequence reports an empty dataset instead of a negative length.
        return max(0, len(self.data) - 2 * self.window)

    def __getitem__(self, idx):
        """Return ``(context, target)`` for the span whose centre is ``idx + window``.

        Raises:
            IndexError: If ``idx`` is outside ``[0, len(self))``.
        """
        if not 0 <= idx < len(self):
            raise IndexError(
                f"index {idx} out of range for dataset of size {len(self)}"
            )

        centre = idx + self.window
        # Copy the span so removing the target never touches the underlying data.
        span = list(self.data[centre - self.window:centre + self.window + 1])

        # Drawn from the global `random` module, so random.seed(...) controls it.
        target_pos = random.randint(0, len(span) - 1)
        target = span.pop(target_pos)

        return torch.tensor(span), torch.tensor(target)
        

In [None]:
# Smoke test: wrap the full token stream in the dataset and print one shuffled sample.
dataset = embed_train_dataset(tokenized_wiki)
dataloader = DataLoader(dataset, shuffle=True, batch_size=1)

data = next(iter(dataloader), None)
if data is not None:
    print (data)



In [8]:
class CBOW(torch.nn.Module):
    """Bag-of-words model: averaged context embeddings projected to vocabulary logits.

    Args:
        vocab_size: Number of rows in the embedding table and number of output
            logits. When omitted it falls back to ``len(tokenizer)``, the
            notebook-level mapping, which is the original behaviour.
        embedding_dim: Width of each embedding vector. Defaults to 200.
    """

    def __init__(self, vocab_size=None, embedding_dim=200):
        super().__init__()
        if vocab_size is None:
            vocab_size = len(tokenizer)
        # NOTE(review): every id passed to forward() must lie in [0, vocab_size);
        # verify that the tokenizer's values are contiguous and zero-based.
        self.embeddings = torch.nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embedding_dim
        )
        self.linear = torch.nn.Linear(
            in_features=embedding_dim, out_features=vocab_size
        )

    def forward(self, inputs):
        """Return unnormalised logits for a batch of context windows.

        ``inputs`` is looked up in the embedding table and averaged over
        dimension 1, so it is expected to be an integer tensor laid out as
        (batch, context) — TODO confirm against callers. No softmax is applied
        here; the training loop feeds the output to ``F.cross_entropy``.
        """
        embeds = self.embeddings(inputs)
        pooled = embeds.mean(dim=1)
        return self.linear(pooled)

In [None]:
# Timestamp that names the W&B run; train_loop reuses it for checkpoint filenames.
ts = datetime.datetime.now().strftime('%Y_%m_%d__%H_%M_%S')

# Authenticate with the key taken from the environment, then open a run named after the timestamp.
wandb.login(relogin=True, key=os.environ["WANDB_API"])
wandb.init(name=ts, project='mlx7-week1-cbow')

In [10]:
def train_loop(number_epochs=5, batch_size=128, learning_rate=0.001, max_tokens=1280):
    """Train a CBOW model on a prefix of ``tokenized_wiki`` and checkpoint improvements.

    Each optimisation step logs its loss to the active W&B run. After every
    epoch the mean training loss is printed; whenever it beats the best value
    seen so far, the model weights are written to ``./checkpoints`` and uploaded
    as a W&B artifact. The W&B run is finished when training completes, so the
    setup cell has to be re-run before calling this function again.

    Reads the notebook-level names ``tokenized_wiki``, ``embed_train_dataset``,
    ``CBOW`` and ``ts``.

    Args:
        number_epochs: Number of passes over the dataset.
        batch_size: Samples per optimisation step.
        learning_rate: Adam learning rate.
        max_tokens: Length of the ``tokenized_wiki`` prefix to train on;
            ``None`` uses every token.

    NOTE: "best" is judged on training loss — there is no validation split.
    """
    os.makedirs("checkpoints", exist_ok=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    dataset = embed_train_dataset(tokenized_wiki[:max_tokens])
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    model = CBOW().to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    best_loss = float("inf")
    for epoch in range(number_epochs):
        model.train()
        epoch_loss = 0.0
        for X, Y in tqdm(dataloader):
            X = X.to(device)
            Y = Y.to(device)
            optimizer.zero_grad()
            pred = model(X)
            loss = F.cross_entropy(pred, Y)
            loss.backward()
            optimizer.step()
            # Convert to a Python float once: summing the tensor itself would keep
            # every step's autograd history alive for the whole epoch.
            batch_loss = loss.item()
            wandb.log({'loss': batch_loss})
            epoch_loss += batch_loss
        epoch_loss = epoch_loss / len(dataloader)
        # Epochs are reported 1-based, matching the checkpoint filename below.
        print(f"Epoch: {epoch + 1}/{number_epochs}, loss: {epoch_loss} ")
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            checkpoint_path = f'./checkpoints/{ts}.{epoch + 1}.cbow.pth'
            torch.save(model.state_dict(), checkpoint_path)
            artifact = wandb.Artifact('model-weights', type='model')
            artifact.add_file(checkpoint_path)
            wandb.log_artifact(artifact)
            print("Model improved. Saved.")

    wandb.finish()

In [None]:
# Run the training loop defined in the previous cell.
train_loop()