In [20]:
import torch
import numpy as np
import json
import pandas as pd
import re
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [2]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [3]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Fri Apr  5 09:47:27 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 552.12                 Driver Version: 552.12         CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce GTX 1660 Ti   WDDM  |   00000000:01:00.0  On |                  N/A |
| N/A   56C    P8             10W /   80W |     724MiB /   6144MiB |     12%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [16]:
def print_list(l, K=None):
	"""Print the items of ``l`` one per line, followed by a blank line.

	Printing stops once ``K`` items have been shown; with the default
	``K=None`` every item is printed.
	"""
	shown = 0
	for item in l:
		if shown == K:
			break
		print(item)
		shown += 1
	print()

def load_from_pickle(pickle_file):
	"""Load and return the object stored in the pickle file at ``pickle_file``.

	Args:
		pickle_file: path of the file to open in binary mode.

	Returns:
		Whatever object ``pickle.load`` reconstructs from the file.
	"""
	# Imported here because no visible cell of this notebook imports pickle;
	# without it the call below fails with NameError on a fresh kernel.
	import pickle

	# WARNING: unpickling can execute arbitrary code. Only load files that
	# come from a trusted source.
	with open(pickle_file, "rb") as pickle_in:
		return pickle.load(pickle_in)

# Data Loading and Exploration

In [11]:
# Fields kept from every arXiv metadata record, in DataFrame column order.
cols = ['id', 'title', 'abstract', 'categories']
data = []
file_name = './arxiv-metadata-oai-snapshot.json'

# Each line of the snapshot is parsed as one JSON document; reading stops
# after the first 10000 lines.
with open(file_name, encoding='latin-1') as f:
    for count, line in enumerate(f, start=1):
        doc = json.loads(line)
        data.append([doc[col] for col in cols])
        if count >= 10000:
            break

# Fixed-seed sample of 100 rows, used only for the preview below.
df = pd.DataFrame(data=data, columns=cols).sample(n=100, random_state=68)

df.head()

Unnamed: 0,id,title,abstract,categories
2007,704.2008,Observations of Manifestations of Skeletal Str...,The analysis of databases of photographic im...,astro-ph
7988,705.3986,Variable Electron-Phonon Coupling in Isolated ...,We report the existence of broad and weakly ...,cond-mat.mtrl-sci
9269,706.0583,Determination of InN-GaN heterostructure band ...,Band discontinuities at the InN-GaN heteroin...,cond-mat.mtrl-sci cond-mat.other
102,704.0103,Generalized regularly discontinuous solutions ...,The physical consistency of the match of pie...,gr-qc
1383,704.1384,Generalizing circles over algebraic extensions,This paper deals with a family of spatial ra...,math.AG


In [22]:
# Special tokens and the fixed ids they occupy at the start of the vocabulary.
pad_word, pad_id = "<pad>", 0  # filler used when padding sequences
bos_word, bos_id = "<s>", 1    # prepended to every encoded sentence
eos_word, eos_id = "</s>", 2   # appended to every encoded sentence
unk_word, unk_id = "<unk>", 3  # stands in for words missing from the vocabulary

def normalize_sentence(s):
    """Return ``s`` reduced to space-separated letter runs and ``.``/``!``/``?`` tokens.

    The substitutions run in order:
      1. put a space in front of every ``.``, ``!`` or ``?``;
      2. replace every run of characters other than ASCII letters and
         ``.!?`` with a single space;
      3. collapse whitespace runs to a single space.
    Leading and trailing whitespace is then stripped. Case is left untouched.
    """
    substitutions = (
        (r"([.!?])", r" \1"),
        (r"[^a-zA-Z.!?]+", r" "),
        (r"\s+", r" "),
    )
    for pattern, replacement in substitutions:
        s = re.sub(pattern, replacement, s)
    return s.strip()

class Vocabulary:
    """Two-way mapping between words and integer ids, with occurrence counts.

    A new vocabulary holds only the four special tokens (pad, bos, eos, unk)
    under their fixed ids; further words receive consecutive ids in the order
    they are first seen by ``add_words_from_sentence``.
    """

    def __init__(self):
        specials = [(pad_word, pad_id), (bos_word, bos_id), (eos_word, eos_id), (unk_word, unk_id)]
        self.word_to_id = dict(specials)
        # Counts are kept for added words only; special tokens have no entry.
        self.word_count = {}
        self.id_to_word = {token_id: token for token, token_id in specials}
        # Number of known words, which is also the next id to hand out.
        self.num_words = 4

    def get_ids_from_sentence(self, sentence):
        """Encode ``sentence`` as ``[bos_id, <word ids...>, eos_id]``.

        The sentence is normalized first; words missing from the vocabulary
        are encoded as ``unk_id``.
        """
        tokens = normalize_sentence(sentence).split()
        body = [self.word_to_id.get(token, unk_id) for token in tokens]
        return [bos_id, *body, eos_id]

    def tokenized_sentence(self, sentence):
        """Return the word strings for the ids of ``get_ids_from_sentence(sentence)``."""
        encoded = self.get_ids_from_sentence(sentence)
        return list(map(self.id_to_word.__getitem__, encoded))

    def decode_sentence_from_ids(self, sent_ids):
        """Join the words for ``sent_ids`` with spaces, dropping bos/eos/pad ids."""
        skipped = (bos_id, eos_id, pad_id)
        return ' '.join(
            self.id_to_word[word_id] for word_id in sent_ids if word_id not in skipped
        )

    def add_words_from_sentence(self, sentence):
        """Register every word of the normalized ``sentence`` and update its count."""
        for token in normalize_sentence(sentence).split():
            if token in self.word_to_id:
                # Already known: only bump the occurrence count.
                self.word_count[token] += 1
                continue
            # First occurrence: assign the next free id.
            new_id = self.num_words
            self.word_to_id[token] = new_id
            self.id_to_word[new_id] = token
            self.word_count[token] = 1
            self.num_words = new_id + 1

# Each record in `data` is [id, title, abstract, categories].
titles_only = [record[1] for record in data]
abstracts_only = [record[2] for record in data]

# Feed every title, then its abstract, to the vocabulary so word ids are
# assigned in that order.
vocab = Vocabulary()
for title, abstract in zip(titles_only, abstracts_only):
    vocab.add_words_from_sentence(title)
    vocab.add_words_from_sentence(abstract)
print(f"Total words in the vocabulary = {vocab.num_words}")

Total words in the vocabulary = 36292


In [17]:
# Show the 30 most frequent vocabulary words together with their counts.
words_by_count = sorted(vocab.word_count.items(), key=lambda pair: pair[1], reverse=True)
print_list(words_by_count, 30)

('the', 92050)
('of', 64248)
('.', 59238)
('and', 32690)
('a', 30864)
('in', 26941)
('to', 23517)
('is', 18678)
('for', 14358)
('with', 12928)
('that', 12731)
('We', 11592)
('The', 10923)
('are', 9531)
('on', 9162)
('by', 8526)
('we', 7280)
('as', 6171)
('an', 6127)
('from', 5913)
('be', 5863)
('at', 5715)
('this', 5527)
('which', 5012)
('In', 3955)
('model', 3682)
('can', 3608)
('two', 3510)
('A', 3418)
('field', 3107)



In [18]:
# Round-trip the abstracts of the first two records through the vocabulary.
for record in data[:2]:
    sentence = record[2]
    word_tokens = vocab.tokenized_sentence(sentence)

    # get_ids_from_sentence adds bos_id and eos_id around the word ids itself
    word_ids = vocab.get_ids_from_sentence(sentence)
    decoded = vocab.decode_sentence_from_ids(word_ids)
    print(sentence, word_tokens, word_ids, decoded, "", sep="\n")

# Single-word lookup and decode.
word = "the"
word_id = vocab.word_to_id[word]
print(
    f"Word = {word}",
    f"Word ID = {word_id}",
    f"Word decoded from ID = {vocab.decode_sentence_from_ids([word_id])}",
    sep="\n",
)

  A fully differential calculation in perturbative quantum chromodynamics is
presented for the production of massive photon pairs at hadron colliders. All
next-to-leading order perturbative contributions from quark-antiquark,
gluon-(anti)quark, and gluon-gluon subprocesses are included, as well as
all-orders resummation of initial-state gluon radiation valid at
next-to-next-to-leading logarithmic accuracy. The region of phase space is
specified in which the calculation is most reliable. Good agreement is
demonstrated with data from the Fermilab Tevatron, and predictions are made for
more detailed tests with CDF and DO data. Predictions are shown for
distributions of diphoton pairs produced at the energy of the Large Hadron
Collider (LHC). Distributions of the diphoton pairs from the decay of a Higgs
boson are contrasted with those produced from QCD processes at the LHC, showing
that enhanced sensitivity to the signal can be obtained with judicious
selection of events.

['<s>', 'A', 'fu

In [26]:
class ArXiv_dataset(Dataset):
    """ArXiv dataset of (abstract, title) pairs, tokenized once at construction."""

    def __init__(self, abstracts, titles, vocab, device):
        """
        Args:
            abstracts: iterable of abstract strings (the source side)
            titles: iterable of title strings (the target side), paired
                    with `abstracts` by position
            vocab: object whose get_ids_from_sentence(str) returns the
                    list of word ids for a string
            device: stored on the instance; not used by the methods of
                    this class
        """
        self.abstract_title = list(zip(abstracts, titles))
        self.vocab = vocab
        self.device = device

        # Encode every pair up front so __getitem__ is a plain lookup.
        to_ids = self.vocab.get_ids_from_sentence
        self.tokenized_abstract_title = [
            (to_ids(abstract), to_ids(title)) for abstract, title in self.abstract_title
        ]

    def __len__(self):
        """Number of (abstract, title) pairs."""
        return len(self.abstract_title)

    def __getitem__(self, idx):
        """Return {"conv_ids": (abstract_ids, title_ids), "conv": (abstract, title)}."""
        # A tensor index is converted to its plain Python value first.
        index = idx.tolist() if torch.is_tensor(idx) else idx
        return {
            "conv_ids": self.tokenized_abstract_title[index],
            "conv": self.abstract_title[index],
        }

def collate_fn(data):
    """Build one padded mini-batch from a list of dataset items.

    The default collate function cannot merge variable-length sequences, so
    this one pads each batch to the length of its own longest sequence
    (dynamic padding).

    Args:
        data: list of dicts {"conv_ids": (src_ids, tgt_ids), "conv": (src_str, tgt_str)}.
            - src_ids: list of source word ids; variable length.
            - tgt_ids: list of target word ids; variable length.
            - src_str: source string
            - tgt_str: target string
    Returns: dict { "conv_ids":     (src_ids, tgt_ids),
                    "conv":         (src_str, tgt_str),
                    "conv_tensors": (src_seqs, tgt_seqs)}
            Every entry is ordered by decreasing source length.
            src_ids / tgt_ids: tuples of 1-D LongTensors (unpadded).
            src_seqs: tensor of shape (src_padded_length, batch_size).
            tgt_seqs: tensor of shape (tgt_padded_length, batch_size).
            Both padded tensors are filled with pad_id and moved to the
            module-level `device`.
    """
    examples = [
        (
            torch.LongTensor(item["conv_ids"][0]),
            torch.LongTensor(item["conv_ids"][1]),
            item["conv"][0],
            item["conv"][1],
        )
        for item in data
    ]
    # Longest source first; the sort is stable, so ties keep their input order.
    examples = sorted(examples, key=lambda example: len(example[0]), reverse=True)
    src_ids, tgt_ids, src_str, tgt_str = zip(*examples)

    ### BEGIN YOUR CODE ###

    # Pad to the longest sequence of each side; time is the first dimension.
    src_seqs = pad_sequence(src_ids, batch_first=False, padding_value=pad_id)
    tgt_seqs = pad_sequence(tgt_ids, batch_first=False, padding_value=pad_id)
    ### END YOUR CODE ###

    return {
        "conv_ids": (src_ids, tgt_ids),
        "conv": (src_str, tgt_str),
        "conv_tensors": (src_seqs.to(device), tgt_seqs.to(device)),
    }

In [29]:
# Small batch size, used for the inspection cells that follow.
batch_size = 2

# Wrap the abstract/title pairs in a Dataset and a shuffling DataLoader that
# pads each batch with collate_fn.
dataset = ArXiv_dataset(abstracts_only, titles_only, vocab, device)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

In [30]:
# Pull one batch and inspect its source (abstract) side.
first_batch = next(iter(data_loader))
source_strings = first_batch["conv"][0]
source_ids = first_batch["conv_ids"][0]
source_tensor = first_batch["conv_tensors"][0]

print(f"Testing first training batch of size {len(source_strings)}")
print("List of source strings:")
print_list(source_strings)
print("Tokenized source ids:")
print_list(source_ids)
print(f"Padded source ids as tensor (shape {source_tensor.size()}):")
print(source_tensor)

Testing first training batch of size 2
List of source strings:
  We investigate analytic solutions to Witten's bosonic string field theory and
Berkovits' WZW-type superstring field theory. We construct solutions with
parameters out of simpler ones, using a commutative monoid that includes the
family of wedge states. Our solutions are generalizations of solutions for
marginal deformations by nonsingular currents, and can also reproduce Schnabl's
tachyon vacuum solution in bosonic string field theory. This implies that such
known solutions are generated from simple solutions which are based on the
identity state. We also discuss gauge transformations and induced field
redefinitions for our solutions in both bosonic and super string field theory.

  We study the AdS_5 x S^5 sigma-model truncated to the near-flat-space limit
to two-loops in perturbation theory. In addition to extending previously known
one-loop results to the full SU(2|2)^2 S-matrix we calculate the two-loop
correction to 

In [31]:
# Inspect the target (title) side of the batch fetched above.
target_strings = first_batch["conv"][1]
target_ids = first_batch["conv_ids"][1]
target_tensor = first_batch["conv_tensors"][1]

print(f"Testing first training batch of size {len(target_strings)}")
print("List of target strings:")
print_list(target_strings)
print("Tokenized target ids:")
print_list(target_ids)
print(f"Padded target ids as tensor (shape {target_tensor.size()}):")
print(target_tensor)

Testing first training batch of size 2
List of target strings:
Comments on Solutions for Nonsingular Currents in Open String Field
  Theories
World-sheet scattering in AdS_5 x S^5 at two loops

Tokenized target ids:
tensor([    1,  5395,   168,  6309,    26, 31260,  8651,    20, 12560,  3363,
          194,  7554,     2])
tensor([   1, 2607, 2904,  344,   20, 3918,  231,  685,   11,  294, 3638,    2])

Padded target ids as tensor (shape torch.Size([13, 2])):
tensor([[    1,     1],
        [ 5395,  2607],
        [  168,  2904],
        [ 6309,   344],
        [   26,    20],
        [31260,  3918],
        [ 8651,   231],
        [   20,   685],
        [12560,    11],
        [ 3363,   294],
        [  194,  3638],
        [ 7554,     2],
        [    2,     0]], device='cuda:0')


# Model Definitions

# Training

In [32]:
def train(model, data_loader, num_epochs, model_file, learning_rate=0.0001,
          decoder_learning_ratio=5.0, clip=50.0):
    """
    Train the model for the given number of epochs and save its state_dict
    to model_file once training has finished.

    Args:
        model: module providing named_parameters(), parameters(), train(),
            state_dict() and compute_loss(source, target), where
            compute_loss returns a scalar loss tensor.
        data_loader: sized iterable of batches; each batch is a dict whose
            "conv_tensors" entry is a (source, target) pair.
        num_epochs: number of passes over data_loader.
        model_file: destination passed to torch.save.
        learning_rate: AdamW learning rate for the encoder parameter group.
        decoder_learning_ratio: the decoder parameter group uses
            learning_rate * decoder_learning_ratio.
        clip: maximum gradient norm applied before every optimizer step.
    """
    ### BEGIN YOUR CODE ###

    encoder_parameter_names = ['embed', 'encoder', 'trans'] # <- Add a list of encoder parameter names here!

    ### END YOUR CODE ###

    # A parameter belongs to the encoder group when its name contains any of
    # the substrings above; everything else goes to the decoder group.
    encoder_params, decoder_params = [], []
    for name, param in model.named_parameters():
        is_encoder = any(key in name for key in encoder_parameter_names)
        (encoder_params if is_encoder else decoder_params).append(param)

    optimizer = torch.optim.AdamW([
        {'params': encoder_params},
        {
            'params': decoder_params,
            'lr': learning_rate * decoder_learning_ratio
        }
    ], lr=learning_rate)

    # NOTE(review): `tqdm` must already be imported (`import tqdm`) by an
    # earlier cell; none of the visible cells does so - confirm.
    for epoch in tqdm.trange(num_epochs, desc="training", unit="epoch"):
        with tqdm.tqdm(data_loader, desc=f"epoch {epoch + 1}", unit="batch", total=len(data_loader), position=0, leave=True) as batch_iterator:
            model.train()
            total_loss = 0.0
            for i, batch_data in enumerate(batch_iterator, start=1):
                source, target = batch_data["conv_tensors"]
                optimizer.zero_grad()
                loss = model.compute_loss(source, target)
                current_loss = loss.item()
                total_loss += current_loss
                loss.backward()

                # Gradient clipping before taking the step. Referenced through
                # `torch.nn` because only `torch` itself is imported in the
                # visible import cell (there is no `nn` alias).
                torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
                optimizer.step()

                batch_iterator.set_postfix(mean_loss=total_loss / i, current_loss=current_loss)

    # Save the model after training
    torch.save(model.state_dict(), model_file)

In [None]:
# Training configuration for the baseline model.
num_epochs = 50
batch_size = 512
model_file = "baseline_model.pt"

# Rebuild the data_loader so that training uses the larger batch_size.
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
baseline_model = Seq2seqBaseline(vocab).to(device)
train(baseline_model, data_loader, num_epochs, model_file)

In [None]:
# Restore previously trained weights from disk instead of retraining; run
# this when baseline_model.pt already exists.
baseline_model = Seq2seqBaseline(vocab).to(device)
saved_state = torch.load("baseline_model.pt", map_location=device)
baseline_model.load_state_dict(saved_state)