In [1]:
import warnings
warnings.filterwarnings('ignore')

In [24]:
import pandas as pd
import numpy as np
from numpy import genfromtxt
import torch
import tqdm
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
import os
import time
import gc
from torchtext.legacy import data 
from torch.utils.data import Dataset, DataLoader, DistributedSampler
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import torch.multiprocessing as mp
from multiprocessing import Queue

In [3]:
class CustomTextDataset(Dataset):
    def __init__(self, context_file, target_file):
        self.context = genfromtxt(context_file, delimiter=',', dtype=np.int64) 
        self.target =  genfromtxt(target_file, delimiter=',', dtype=np.int64)
    
    def __len__(self):
        return len(self.target)
    
    def __getitem__(self, idx):
        return (self.context[idx], self.target[idx])

temp_data = CustomTextDataset('CBOW/context.csv', 'CBOW/target.csv')
vocab_obj = torch.load('CBOW/vocab_obj.pth')

In [4]:
'''Training CBOW Embedding Model'''
class CBOW(nn.Module):

    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.fc = nn.Linear(embedding_dim, vocab_size)
        self.init_weights()
    
    def init_weights(self):
        initrange = 0.5
        self.embeddings.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, inputs):
        embeds = self.embeddings(inputs)
        mean = torch.mean(embeds, dim=1,keepdim=True)
        y_hat = self.fc(mean)
        y_hat = torch.squeeze(y_hat, dim=1)
        log_probs = F.log_softmax(y_hat, dim=1)
        return log_probs

In [46]:
Q = Queue()
def train(model, data):
    optimizer = optim.Adam(model.parameters())
    loss_function = nn.NLLLoss()
    total_loss = 0
    losses = []
    for idx, sample in enumerate(data):
        batch_one = sample
        optimizer.zero_grad()
        log_probs = model(batch_one[0])
        loss = loss_function(log_probs, batch_one[1])
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()
        total_loss += loss.item()
        losses.append(total_loss)
    Q.put(losses[-1])

num_processes = 4
EPOCHS = 5 # epoch
EMBEDDING_DIM = 16
BATCH_SIZE = 64
VOCAB_SIZE = len(vocab_obj.values())

model = CBOW(VOCAB_SIZE,EMBEDDING_DIM)
model.share_memory()
temp = []
loss = 0
for epoch in range(1, EPOCHS + 1):
    print('Epoch:'+str(epoch)+' |'+'■'*epoch+'| avg_loss:'+str(loss), end='')
    os.system("printf '\033c'")
    processes = []
    for rank in range(num_processes):
        data_loader = DataLoader(temp_data, sampler=DistributedSampler(dataset=temp_data,num_replicas=num_processes,rank=rank), batch_size=BATCH_SIZE)
        p = mp.Process(target=train, args=(model, data_loader))
        p.start()
        processes.append(p)
    for p in processes:
        temp.append(Q.get())
        p.join()
    loss=sum(temp)/len(temp)
torch.save(model.embeddings.weight, 'CBOW_NEWS.pth')

Epoch:1 |■| avg_loss:0Epoch:2 |■■| avg_loss:45.49118947982788Epoch:3 |■■■| avg_loss:45.399390161037445Epoch:4 |■■■■| avg_loss:45.30847100416819Epoch:5 |■■■■■| avg_loss:45.216905385255814