In [1]:
"""Cell that handles basic imports for preprocessing"""
from __future__ import print_function, division
import os
import torch
import pandas as pd
from skimage import io, transform
import numpy as np
import spacy
import matplotlib.pyplot as plt
import threading
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

# Install a blanket "ignore" filter so no warning from any library is shown in
# later cell output.
# NOTE(review): this also hides warnings that may matter (deprecations, dtype
# issues); consider narrowing the filter to specific categories or modules.
import warnings
warnings.filterwarnings("ignore")

# Switch matplotlib to interactive mode. The object returned by this call is
# what the notebook echoes as the cell's output.
plt.ion()   # interactive mode

<matplotlib.pyplot._IonContext at 0x20cffd126a0>

In [2]:
'''Cell which creates Arxiv dataset using inheritance from the pytorch dataset model, and defines the padding function.'''
import logging
# Fetch the logger named "spacy" and raise its level to ERROR, so records below
# ERROR severity emitted through that logger are dropped.
logger = logging.getLogger("spacy")
logger.setLevel(logging.ERROR)

import dask.bag as db
import json
import pandas as pd
from Vocabulary import Vocabulary, save_vocab

class ArxivDataset(Dataset):
    """Arxiv Papers Dataset.

    Loads arXiv metadata records into a DataFrame, builds a vocabulary from
    every abstract, and serves ``(title, abstract)`` pairs as 1-D tensors of
    token ids, each wrapped in the vocabulary's ``<SOS>`` / ``<EOS>`` ids.
    """

    def __init__(self, json_file, freq_threshold=5, vocab_file='snapshot-vocab.txt'):
        """
        Args:
            json_file (string): path to the json_file containing the arxiv metadata.
                Every line of text read from it is parsed with ``json.loads``.
            freq_threshold (int): forwarded unchanged to ``Vocabulary``
                (presumably the minimum token frequency for a word to be kept;
                confirm in the Vocabulary module).
            vocab_file (string): path the built vocabulary is written to. Defaults
                to the previously hard-coded 'snapshot-vocab.txt'.
        """
        # Read the file through a dask bag and materialise the parsed records
        # as a plain list (dask parallelises the parsing).
        # NOTE(review): .compute() holds every record in memory at once.
        records = db.read_text(json_file).map(json.loads).compute()
        print('Loaded.')
        # Convert the list of records into a dataframe
        self.df = pd.DataFrame(records)
        print('Converted to dataframe.')
        # Keep direct handles on the two text columns the dataset serves
        self.abstracts = self.df["abstract"]
        print('Abstracts set.')
        self.titles = self.df["title"]
        print('Titles set.')
        # Build the vocabulary from all abstracts (titles are not included)
        self.vocab = Vocabulary(freq_threshold)
        self.vocab.build_vocabulary(self.abstracts.tolist())
        print('Vocabulary built')
        # Persist the vocabulary to a text file
        save_vocab(self.vocab, vocab_file)
        print('Vocabulary written.')

    def __len__(self):
        """Return the number of records (rows of the dataframe)."""
        return len(self.df)

    def _encode(self, text):
        """Return ``text`` as a list of token ids wrapped in <SOS> ... <EOS>."""
        return [
            self.vocab.stoi["<SOS>"],
            *self.vocab.numericalize(text),
            self.vocab.stoi["<EOS>"],
        ]

    def __getitem__(self, idx):
        """Return ``(title_tensor, abstract_tensor)`` for the record at position ``idx``.

        Args:
            idx: integer position of the record; a single-element tensor is
                also accepted and unwrapped to a Python number.
        """
        # Samplers may hand over a tensor index; pandas needs a plain number.
        if torch.is_tensor(idx):
            idx = idx.item()

        # Positional lookup: stays correct even if the dataframe index is not
        # the default 0..n-1 range (label-based Series[idx] would not).
        abstract = self.abstracts.iloc[idx]
        title = self.titles.iloc[idx]

        return torch.tensor(self._encode(title)), torch.tensor(self._encode(abstract))
    
def padify(b):
    """Collate a minibatch of variable-length id sequences into two padded tensors.

    Args:
        b: list of length batch_size whose items are 2-tuples of 1-D integer
            sequences, as produced by ``ArxivDataset.__getitem__``
            (first element = title ids, second element = abstract ids).

    Returns:
        Tuple ``(titles, abstracts)`` of LongTensors, each with one row per
        item, right-padded with 0 to the longest sequence of that kind in
        this minibatch.
    """
    # Local import keeps this function self-contained within the cell.
    from torch.nn.utils.rnn import pad_sequence

    # The items are already numericalized, so they are only coerced to long
    # tensors here. The previous version re-ran Vocabulary.numericalize on the
    # class (no instance) and treated the title tensor as an integer label,
    # which cannot work on this dataset's output.
    titles = [torch.as_tensor(item[0], dtype=torch.long) for item in b]
    abstracts = [torch.as_tensor(item[1], dtype=torch.long) for item in b]

    # Pad value 0 is kept from the original implementation.
    # NOTE(review): presumably 0 is the vocabulary's <PAD> id; confirm in Vocabulary.
    return (
        pad_sequence(titles, batch_first=True, padding_value=0),
        pad_sequence(abstracts, batch_first=True, padding_value=0),
    )


In [3]:
'''This cell creates the loader function, which is used to load the data into a usable form for Pytorch.'''
def get_loader(json_file, batch_size=100, num_workers=8, shuffle=True, pin_memory=True):
    """Build a ``DataLoader`` over the arXiv metadata stored in ``json_file``.

    An ``ArxivDataset`` is constructed from ``json_file`` with its default
    settings and wrapped in a ``DataLoader`` that collates batches with
    ``padify``.

    Args:
        json_file: path handed to ``ArxivDataset``.
        batch_size, num_workers, shuffle, pin_memory: forwarded unchanged to
            ``DataLoader``.

    Returns:
        The configured ``DataLoader``.

    NOTE(review): with num_workers > 0 the dataset and collate function must be
    usable from worker processes; confirm this holds when both are defined in
    a notebook cell.
    """
    loader_options = {
        "batch_size": batch_size,
        "num_workers": num_workers,
        "shuffle": shuffle,
        "pin_memory": pin_memory,
        "collate_fn": padify,
    }
    return DataLoader(dataset=ArxivDataset(json_file), **loader_options)

# Build the loader for the snapshot file using get_loader's default settings.
# This runs eagerly: constructing the dataset reads the whole file, builds the
# vocabulary and writes the vocabulary text file before the cell returns.
dataloader = get_loader("arxiv-snapshot.json")


Loaded.
Converted to dataframe.
Abstracts set.
Titles set.
  A fully differential calculation in perturbative quantum chromodynamics is
presented for the production of massive photon pairs at hadron colliders. All
next-to-leading order perturbative contributions from quark-antiquark,
gluon-(anti)quark, and gluon-gluon subprocesses are included, as well as
all-orders resummation of initial-state gluon radiation valid at
next-to-next-to-leading logarithmic accuracy. The region of phase space is
specified in which the calculation is most reliable. Good agreement is
demonstrated with data from the Fermilab Tevatron, and predictions are made for
more detailed tests with CDF and DO data. Predictions are shown for
distributions of diphoton pairs produced at the energy of the Large Hadron
Collider (LHC). Distributions of the diphoton pairs from the decay of a Higgs
boson are contrasted with those produced from QCD processes at the LHC, showing
that enhanced sensitivity to the signal can be obt

In [15]:
'''Cell used to create monash-info.txt.

   I manually went through the list of Monash classes in the print.json file and used this cell to write a new file
   that keeps every class belonging to a faculty relevant to arXiv. Each unit code is stored with its unit name, as
   well as the class description (which contains text I can use in future to compare bag-of-words representations of
   abstracts with class descriptions).
   There ended up being over 1000 classes, so in future development
   of this project I may have to shrink this list or take its size into account in the machine learning model I
   ultimately use to make connections between this data and the arXiv data.
'''

# import json 
# import pandas as pd
# with open('monScraper2/print.json', 'r') as class_source:  
#     file = class_source.read()
#     file = pd.DataFrame(json.loads(file)).values.tolist()
#     with open('monash-info.txt', 'x') as dest:
#         relevant_faculties = ['ASP', 'ATM', 'TRC', 'STA', 'SRU', 'SOI', 'SEM', 'SCM', 'SCI', 'SCE', 'SCC', 'SCB', 'RSE', 
#         'RAD', 'PSY', 'PSS', 'PSC', 'PRU', 'PLT', 'PHY', 'PHS', 'PHR', 'MTH', 'MTE', 'MSM', 'MEC', 'MAT', 'MAE', 'IMM', 'FIT',
#         'ESC', 'ERU', 'ENV', 'ENG', 'ECE', 'EAE', 'CHM', 'CHE', 'BTH', 'BMS', 'BIO', 'ATM']
#         exempt = ['Z', 'W', 'X', 'V', 'U', 'T', 'S', 'P', 'O', 'N', 'M', 'L', 'K', 'J', 'I', 'H', 'G', 'F', 'E', 'D', 'C', 'B', 'A']
#         for i in range(len(file)): 
#             unitCode = file[i][13]
#             if unitCode[0:3] in relevant_faculties or unitCode[0] not in exempt:
#                 if file[i][3]:
#                     #if unitCode[0:3] in relevant_faculties:
#                     unitName = file[i][15]
#                     description = str(file[i][3])
#                     dest.write(f"{unitCode}-{unitName}:")
#                     dest.write(f"{description}")
#                     dest.write('\n')
  
    