In [3]:
import warnings
# NOTE(review): a blanket 'ignore' filter hides every warning category for the
# rest of the kernel session, including deprecation notices that may matter
# here. Consider narrowing it to the specific warnings known to be noisy.
warnings.filterwarnings('ignore')

In [4]:
import pandas as pd
import numpy as np
from numpy import genfromtxt
import torch
import tqdm
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
import os
import time
import gc
from torch.utils.data import Dataset, DataLoader, DistributedSampler

In [38]:
class CustomTextDataset(Dataset):
    """In-memory dataset of (context, target) rows read from two text files.

    Both files are parsed with ``numpy.genfromtxt`` using a comma delimiter and
    an int64 dtype, then converted to tensors. Item ``idx`` is row ``idx`` of
    the context tensor together with element ``idx`` of the target tensor.

    Args:
        context_file: Path of the comma-delimited file holding the context rows.
        target_file: Path of the comma-delimited file holding the targets.
    """

    @staticmethod
    def _read_int_csv(path):
        """Parse a comma-delimited file of integers into an int64 tensor."""
        parsed = genfromtxt(path, delimiter=',', dtype=np.int64)
        return torch.tensor(parsed)

    def __init__(self, context_file, target_file):
        self.context = self._read_int_csv(context_file)
        self.target = self._read_int_csv(target_file)

    def __len__(self):
        # The number of samples is taken from the target tensor.
        n_samples = len(self.target)
        return n_samples

    def __getitem__(self, idx):
        context_row = self.context[idx]
        target_value = self.target[idx]
        return context_row, target_value

# Load the serialized vocabulary object; the path is relative to the current
# working directory.
# NOTE(review): torch.load unpickles the file, so only load files from a
# trusted source.
# Presumably a mapping (its .values() are counted later to size the model) —
# confirm against the script that wrote the file.
vocab_obj = torch.load('vocab_obj.pth')
# indextow = torch.load('indexDict.pth')

In [39]:
# Directories holding the target files and the CBOW context files.
# NOTE(review): these are absolute, machine-specific paths; consider moving
# them into a configurable base directory.
target_path = '/home/ichglaubeya/projects/NLP_Projects/Spacy/fake_or_real/target/'
context_path = '/home/ichglaubeya/projects/NLP_Projects/Spacy/fake_or_real/CBOW/'
# os.listdir returns entries in arbitrary order. The two listings are paired
# positionally further down, so sort both to make that pairing deterministic.
# NOTE(review): assumes matching context/target files sort into the same
# position — confirm against the file naming scheme.
targetlist = sorted(os.listdir(target_path))
contextlist = sorted(os.listdir(context_path))

In [40]:
# Build one CustomTextDataset per (context file, target file) pair.
# Sort locally so the positional pairing does not depend on directory
# listing order.
# NOTE(review): assumes matching context/target files sort into the same
# position — confirm against the file naming scheme.
context_files = sorted(contextlist)
target_files = sorted(targetlist)
# zip() would silently drop the surplus files (and, after sorting, a stray
# file would shift every later pair), so fail loudly on a count mismatch.
if len(context_files) != len(target_files):
    raise ValueError(
        'Found {} context files but {} target files; cannot pair them'.format(
            len(context_files), len(target_files)))

data_objects = []
for file_one, file_two in zip(context_files, target_files):
    # Join directory and file name as separate components so the result does
    # not depend on the directory string ending with a path separator.
    dirpath_one = os.path.join(context_path, file_one)
    dirpath_two = os.path.join(target_path, file_two)
    data_objects.append(CustomTextDataset(dirpath_one, dirpath_two))
print('Data has been loaded, starting Training')

In [27]:
'''Training CBOW Embedding Model'''
class CBOW(nn.Module):
    """Continuous bag-of-words model: embedding lookup, mean pool, linear layer.

    Args:
        vocab_size: Number of rows in the embedding table and number of output
            classes of the linear layer.
        embedding_dim: Width of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.fc = nn.Linear(embedding_dim, vocab_size)
        self.init_weights()

    def init_weights(self):
        """Re-draw both weight matrices from U(-0.5, 0.5) and zero the bias."""
        bound = 0.5
        # In-place initialisation must not be recorded by autograd.
        with torch.no_grad():
            self.embeddings.weight.uniform_(-bound, bound)
            self.fc.weight.uniform_(-bound, bound)
            self.fc.bias.zero_()

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary for each input row.

        Assumes ``inputs`` is a 2-D tensor of token indices shaped
        (batch, context_size); the pooling, squeeze and softmax below all act
        on dimension 1 — TODO confirm against the data files.
        """
        context_vectors = self.embeddings(inputs)
        # Average the context embeddings, keeping a singleton dim 1 that is
        # squeezed away again right after the linear projection.
        pooled = context_vectors.mean(dim=1, keepdim=True)
        logits = self.fc(pooled).squeeze(dim=1)
        return F.log_softmax(logits, dim=1)

In [None]:
def train(model, data, epoch=0, optimizer=None):
    """Run one pass over ``data``, updating ``model`` in place.

    Each element of ``data`` is treated as one batch: its ``context``
    attribute is fed to the model and the output is scored against its
    ``target`` attribute with negative log-likelihood loss. Gradients are
    clipped to a total norm of 1 before every optimizer step.

    Args:
        model: Module whose output for ``sample.context`` is accepted by
            ``nn.NLLLoss`` (i.e. log-probabilities).
        data: Sized iterable of objects exposing ``context`` and ``target``.
        epoch: Epoch number shown in the progress log. It is a parameter
            rather than a global so the function works outside the driver loop.
        optimizer: Optimizer to step. Pass the same instance on every epoch so
            its internal state (e.g. Adam moment estimates) carries over. When
            omitted, a fresh ``optim.Adam`` is created for this call only.

    Returns:
        Mean per-batch loss as a float, or NaN if ``data`` yielded nothing.
    """
    if optimizer is None:
        optimizer = optim.Adam(model.parameters())
    loss_function = nn.NLLLoss()
    log_interval = 1
    total_loss = 0.0
    batches_done = 0
    model.train()
    for sample in data:
        optimizer.zero_grad()
        log_probs = model(sample.context)
        loss = loss_function(log_probs, sample.target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()
        total_loss += loss.item()
        batches_done += 1
        # Count from 1 so the first batch is logged too, and report the
        # running mean rather than a sum that grows with the batch count.
        if batches_done % log_interval == 0:
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| loss {:6f} '.format(epoch, batches_done, len(data),
                                         total_loss / batches_done))
    return total_loss / batches_done if batches_done else float('nan')

EPOCHS = 5 # epoch
EMBEDDING_DIM = 16
# NOTE(review): assumes token indices run from 0 to len(vocab) - 1 so that
# every index is a valid embedding row — confirm against how the vocabulary
# was built.
VOCAB_SIZE = len(vocab_obj.values())

model = CBOW(VOCAB_SIZE,EMBEDDING_DIM)
# Create the optimizer once so its state persists across epochs.
optimizer = optim.Adam(model.parameters())
for epoch in range(1, EPOCHS + 1):
    train(model, data_objects, epoch=epoch, optimizer=optimizer)
torch.save(model.embeddings.weight, 'CBOW_NEWS.pth')