In [1]:
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): presumably meant to hide deprecation noise from torchtext.legacy;
# it also hides warnings from every other library — confirm this is intended.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
from numpy import genfromtxt
import torch
from torchtext.legacy import data 
from torch.utils.data import Dataset, DataLoader, DistributedSampler
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [4]:
# Input CSV file; its header row is skipped when the dataset is loaded below.
PATH = '/mnt/chromeos/GoogleDrive/MyDrive/Datasets/fake_real_data.csv'

# Tokens handed to the text field as stop words.
stop_words = ['the', '”', '\xa0', ' ']

# Text column: spaCy tokenisation, batch-first tensors, sequence lengths included.
TEXT = data.Field(
    tokenize='spacy',
    batch_first=True,
    include_lengths=True,
    stop_words=stop_words,
)

# Label column, stored as a float tensor.
LABEL = data.LabelField(dtype=torch.float, batch_first=True)

# Fields are listed as (text, label) — presumably the CSV column order; confirm
# against the file.
fields = [('text', TEXT), ('label', LABEL)]
train_data = data.TabularDataset(
    path=PATH,
    format='csv',
    fields=fields,
    skip_header=True,
)

In [5]:
# Build the vocabularies from the loaded dataset; min_freq=1 keeps every token.
TEXT.build_vocab(train_data, min_freq=1)
LABEL.build_vocab(train_data)

# Both directions of the text vocabulary: token -> index and index -> token.
word_to_ix = TEXT.vocab.stoi
ix_to_word = TEXT.vocab.itos

# Persist the two lookup tables next to the notebook.
torch.save(word_to_ix, 'wordDict.pth')
torch.save(ix_to_word, 'indexDict.pth')

In [5]:
# Partial implementation of the PyTorch word-embeddings tutorial:
#   https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html
# combined with Jake Wherlock's custom dataset loading tutorial:
#   https://towardsdatascience.com/how-to-use-datasets-and-dataloader-in-pytorch-for-custom-text-data-270eed7f7c00

# Default number of tokens taken on each side of a target word.
CONTEXT_SIZE = 2


class CustomTextDataset(Dataset):
    """Expose each example's token list as CBOW-style (context, target) pairs.

    Args:
        train_data: Object with an ``examples`` sequence; every example must
            carry a ``text`` attribute holding a list of tokens.
        context_size: Number of tokens taken on each side of the target.
            Defaults to the module-level ``CONTEXT_SIZE`` when omitted.

    Raises:
        ValueError: If ``context_size`` is negative.
    """

    def __init__(self, train_data, context_size=None):
        self.data = train_data
        # Resolve the window once so later changes to the global constant
        # cannot silently alter an already constructed dataset.
        self.context_size = CONTEXT_SIZE if context_size is None else context_size
        if self.context_size < 0:
            raise ValueError("context_size must be non-negative")

    def __len__(self):
        """Return the number of examples (documents), not the number of pairs."""
        return len(self.data.examples)

    def __getitem__(self, idx):
        """Return the list of ``(context, target)`` pairs for example ``idx``.

        ``context`` lists the preceding tokens nearest-first, followed by the
        succeeding tokens nearest-first. Examples with fewer than
        ``2 * context_size + 1`` tokens yield an empty list.
        """
        tokens = vars(self.data.examples[idx])['text']
        size = self.context_size
        pairs = []
        for center in range(size, len(tokens) - size):
            before = [tokens[center - offset - 1] for offset in range(size)]
            after = [tokens[center + offset + 1] for offset in range(size)]
            pairs.append((before + after, tokens[center]))
        return pairs

# Wrap the loaded dataset so each item is one document's (context, target) pairs.
temp_data = CustomTextDataset(train_data)

# batch_size=1 is the DataLoader default, spelled out here: one document per
# iteration.
data_loader = DataLoader(temp_data, batch_size=1)

In [6]:
def make_context_vector(context, word_to_ix):
    """Convert a context window into a 1-D ``torch.long`` tensor of indices.

    Args:
        context: Iterable of indexable entries; the first element of each
            entry is used as the lookup key.
            NOTE(review): presumably each entry is a 1-tuple produced by the
            DataLoader's default collation — a plain string entry would be
            looked up by its first character only.
        word_to_ix: Mapping from token to integer index.

    Returns:
        Tensor of dtype ``torch.long`` with one index per context entry.
    """
    indices = []
    for entry in context:
        indices.append(word_to_ix[entry[0]])
    return torch.tensor(indices, dtype=torch.long)

# Export every (context, target) pair as vocabulary indices, one CSV row per pair.
# Both files are opened once in write mode: re-running this cell regenerates
# them from scratch instead of appending duplicate rows to a previous run.
with open('context.csv', 'w') as context_file, open('target.csv', 'w') as target_file:
    for data_i in data_loader:
        # data_i is one collated batch from the loader; each x is a
        # (context, target) pair whose strings are wrapped by collation,
        # hence the [0] when reading the target word.
        context = []
        target = []
        for x in data_i:
            context.append(make_context_vector(x[0], word_to_ix).numpy())
            target.append([word_to_ix[x[1][0]]])
        np.savetxt(context_file, context, delimiter=',', fmt='%d')
        np.savetxt(target_file, target, delimiter=',', fmt='%d')