In [1]:
from __future__ import print_function, division
import os
import torch
import pandas as pd
from skimage import io, transform
import numpy as np
import spacy
import matplotlib.pyplot as plt
import threading
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

# Ignore warnings: the filter below has no category or module restriction,
# so every warning raised after this point is silenced.
import warnings
warnings.filterwarnings("ignore")

# Turn on matplotlib's interactive mode for the rest of the session.
plt.ion()   # interactive mode

<matplotlib.pyplot._IonContext at 0x2ea556d8b50>

In [2]:
import torchtext

class Vocabulary:
    """Word-level vocabulary mapping tokens to integer ids and back.

    Ids 0-3 are reserved for the special tokens ``<PAD>``, ``<SOS>``,
    ``<EOS>`` and ``<UNK>``. Corpus tokens receive the following ids in the
    order in which they reach ``freq_threshold`` occurrences.

    Attributes:
        itos (dict): id -> token.
        stoi (dict): token -> id.
        freq_threshold (int): occurrence count at which a token is admitted.
    """

    def __init__(self, freq_threshold):
        """
        Args:
            freq_threshold (int): number of occurrences a token needs in the
                corpus given to ``build_vocabulary`` before it gets an id.
                Counts start at 1 and are compared with ``==``, so a value
                below 1 admits no token at all.
        """
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>", }
        self.stoi = {"<PAD>":0, "<SOS>":1, "<EOS>":2, "<UNK>":3, }
        self.freq_threshold = freq_threshold

    def __len__(self):
        """Return the number of known tokens, special tokens included."""
        return len(self.itos)

    @staticmethod
    def tokenizer_eng(text):
        """Split ``text`` into tokens with torchtext's ``basic_english`` tokenizer."""
        tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
        return tokenizer(text)

    def build_vocabulary(self, sentence_list):
        """Add every token that occurs ``freq_threshold`` times to the vocabulary.

        Args:
            sentence_list (iterable of str): raw sentences to count tokens in.
        """
        frequencies = {}
        # Continue after the ids already handed out instead of assuming the
        # table only holds the four special tokens.
        idx = len(self.itos)
        for sentence in sentence_list:
            for word in self.tokenizer_eng(sentence):
                # dict.get supplies 0 for a token seen for the first time;
                # a bare ``+= 1`` raises KeyError on an empty dict.
                count = frequencies.get(word, 0) + 1
                frequencies[word] = count
                # Never reassign an existing entry: this protects the reserved
                # special-token ids and tokens added by an earlier call.
                if count == self.freq_threshold and word not in self.stoi:
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    idx += 1

    def numericalize(self, text):
        """Convert ``text`` to a list of token ids.

        Tokens missing from the vocabulary map to the ``<UNK>`` id.

        Args:
            text (str): raw text to tokenize and encode.

        Returns:
            list[int]: one id per token, in order.
        """
        unk_id = self.stoi["<UNK>"]
        return [self.stoi.get(token, unk_id) for token in self.tokenizer_eng(text)]




In [3]:
class EmbedClassifier(torch.nn.Module):
    """Bag-of-embeddings classifier: an ``EmbeddingBag`` followed by a linear layer."""

    def __init__(self, vocab_size, embed_dim, num_class):
        """
        Args:
            vocab_size (int): number of rows in the embedding table.
            embed_dim (int): size of each embedding vector.
            num_class (int): number of output scores per bag.
        """
        super(EmbedClassifier, self).__init__()
        # Layers are created in the same order as before so that seeded
        # parameter initialisation is unchanged.
        self.embedding = torch.nn.EmbeddingBag(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
        )
        self.fc = torch.nn.Linear(in_features=embed_dim, out_features=num_class)

    def forward(self, text, off):
        """Score each bag of token ids.

        Args:
            text: token ids passed to the ``EmbeddingBag`` as its input.
            off: offsets passed to the ``EmbeddingBag`` alongside ``text``.

        Returns:
            Output of the linear layer applied to the pooled embeddings.
        """
        return self.fc(self.embedding(text, off))

In [4]:
# Raise the threshold of the "spacy" logger to ERROR so that records below
# that level emitted through it are dropped.
import logging
logger = logging.getLogger("spacy")
logger.setLevel(logging.ERROR)

import dask.bag as db
import json
import pandas as pd

class ArxivDataset(Dataset):
    """Arxiv Papers Dataset.

    Every item is a ``(title, abstract)`` pair of 1-D integer tensors. Each
    tensor holds the vocabulary ids of the text wrapped in the ``<SOS>`` and
    ``<EOS>`` ids. The vocabulary is built from the abstracts only.
    """

    def __init__(self, json_file, freq_threshold=5, start=200000, stop=400001):
        """
        Args:
            json_file (string): path to the json_file containing the arxiv
                metadata; every line is parsed as one JSON record.
            freq_threshold (int): occurrence count a token needs in the
                abstracts to enter the vocabulary.
            start (int or None): index of the first record to keep.
            stop (int or None): index one past the last record to keep.

        Raises:
            ValueError: if the ``start:stop`` window selects no records.
        """
        # NOTE: the whole file is parsed and materialised as a list before the
        # window is applied.
        records = db.read_text(json_file).map(json.loads).compute()
        records = records[start:stop]
        if not records:
            raise ValueError(
                "no records selected from %r with start=%r, stop=%r"
                % (json_file, start, stop)
            )
        self.df = pd.DataFrame(records)
        self.abstracts = self.df["abstract"]
        self.titles = self.df["title"]
        self.vocab = Vocabulary(freq_threshold)
        self.vocab.build_vocabulary(self.abstracts.tolist())

    def __len__(self):
        """Return the number of records kept."""
        return len(self.df)

    def _encode(self, text):
        """Return ``text`` as a tensor of ids framed by ``<SOS>`` and ``<EOS>``."""
        stoi = self.vocab.stoi
        ids = [stoi["<SOS>"]]
        ids += self.vocab.numericalize(text)
        ids.append(stoi["<EOS>"])
        return torch.tensor(ids)

    def __getitem__(self, idx):
        """Return the encoded ``(title, abstract)`` pair at position ``idx``.

        Args:
            idx (int): zero-based position of the record.

        Returns:
            tuple: ``(title_ids, abstract_ids)`` tensors.
        """
        # Positional lookup: correct whatever index labels the frame carries.
        abstract = self.abstracts.iloc[idx]
        title = self.titles.iloc[idx]
        return self._encode(title), self._encode(abstract)
    
def padify(b):
    """Collate a minibatch into labels and zero-padded feature rows.

    Args:
        b: list of ``(label, text)`` tuples, one per sample. The label is
            shifted down by one; the text is passed through ``encode``.

    Returns:
        tuple: ``(labels, features)`` where ``labels`` is a ``LongTensor`` of
        ``label - 1`` values and ``features`` stacks every encoded sequence,
        right-padded with 0 to the longest sequence in the batch.
    """
    # NOTE(review): `encode` is not defined in the cells visible here, and
    # ArxivDataset items are (title, abstract) tensors rather than
    # (label, text) pairs. Confirm this collate function matches its dataset.
    encoded = [encode(sample[1]) for sample in b]
    longest = max(len(seq) for seq in encoded)
    labels = torch.LongTensor([sample[0] - 1 for sample in b])
    padded_rows = []
    for seq in encoded:
        row = torch.tensor(seq)
        missing = longest - len(seq)
        padded_rows.append(
            torch.nn.functional.pad(row, (0, missing), mode='constant', value=0)
        )
    return labels, torch.stack(padded_rows)

def offsetify(b):
    """Collate a minibatch into labels, one flat id tensor and start offsets.

    Args:
        b: list of ``(label, text)`` tuples, one per sample. The label is
            shifted down by one; the text is passed through ``encode``.

    Returns:
        tuple: ``(labels, text, offsets)`` where ``text`` is the concatenation
        of all encoded sequences and ``offsets[i]`` is the position in
        ``text`` at which sequence ``i`` starts.
    """
    # NOTE(review): `encode` is not defined in the cells visible here, and
    # ArxivDataset items are (title, abstract) tensors rather than
    # (label, text) pairs. Confirm this collate function matches its dataset.
    sequences = [torch.tensor(encode(sample[1])) for sample in b]
    lengths = [len(seq) for seq in sequences]
    # A sequence starts where the previous ones end: prepend 0, drop the last
    # length, then take the running sum.
    starts = ([0] + lengths)[:-1]
    offsets = torch.tensor(starts).cumsum(dim=0)
    labels = torch.LongTensor([sample[0] - 1 for sample in b])
    return labels, torch.cat(sequences), offsets

In [5]:
def get_loader(json_file, batch_size=100, num_workers=8, shuffle=True, pin_memory=True):
    """Build a ``DataLoader`` over an ``ArxivDataset`` read from ``json_file``.

    Args:
        json_file (string): path handed to ``ArxivDataset``.
        batch_size (int): number of samples per batch.
        num_workers (int): number of loader worker processes.
        shuffle (bool): whether the sample order is reshuffled.
        pin_memory (bool): forwarded to ``DataLoader``.

    Returns:
        DataLoader: loader that collates batches with ``padify``.
    """
    dataset = ArxivDataset(json_file)
    # Pass the function itself. Writing `padify()` calls it with no batch and
    # raises TypeError before the loader is even constructed.
    # NOTE(review): padify reads items as (label, text) pairs, while
    # ArxivDataset yields (title, abstract) tensors. Confirm they agree
    # before iterating.
    return DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=shuffle,
        pin_memory=pin_memory,
        collate_fn=padify,
    )

# Build the loader from the metadata snapshot file, using get_loader's
# default batch size, worker count, shuffling and pin_memory settings.
dataloader = get_loader("arxiv-metadata-oai-snapshot.json")
