In [1]:
import torch
import numpy as np

In [2]:
class Embedder(torch.nn.Module):
    
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.embed = torch.nn.Embedding(vocab_size, d_model)
        
    def forward(self, x):
        # [123, 0, 23, 5] -> [[..512..], [...512...], ...]
        return self.embed(x)  

In [3]:
import math

class PositionalEncoder(torch.nn.Module):
    
    def __init__(self, d_model, max_seq_len=80):
        super().__init__()
        self.d_model = d_model
        
        # create constant positional encoding matrix
        pe_matrix = torch.zeros(max_seq_len, d_model)
        
        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe_matrix[pos, i] = math.sin(pos/10000**(2*i/d_model))
                pe_matrix[pos, i+1] = math.cos(pos/10000**(2*i/d_model))
        pe_matrix = pe_matrix.unsqueeze(0)     # Add one dimension for batch size
        self.register_buffer('pe', pe_matrix)  # Register as persistent buffer
        
    def forward(self, x):
        # x is a sentence after embedding with dim (batch, number of words, vector dimension)
        seq_len = x.size()[1]
        x = x + self.pe[:, :seq_len]
        return x

In [4]:
import math
import torch.nn.functional as F

# Given Query, Key, Value, calculate the final weighted value
def scaled_dot_product_attention(q, k, v, mask=None, dropout=None):
    # Shape of q and k are the same, both are (batch_size, seq_len, d_k)
    # Shape of v is (batch_size, seq_len, d_v)
    attention_scores = torch.matmul(q, k.transpose(-2, -1))/math.sqrt(q.shape[-1])  # size (batch_size, seq_len, seq_len)
    
    # Apply mask to scores
    # 
    if mask is not None:
        attention_scores = attention_scores.masked_fill(mask == 0, value=-1e9)
        
    # Softmax along the last dimension
    attention_weights = F.softmax(attention_scores, dim=-1)
    
    if dropout is not None:
        attention_weights = dropout(attention_weights)
        
    output = torch.matmul(attention_weights, v)
    return output

In [5]:
class MultiHeadAttention(torch.nn.Module):
    def __init__(self, n_heads, d_model, dropout=0.1):
        super().__init__()
        
        self.n_heads = n_heads
        self.d_model = d_model
        self.d_k = self.d_v = d_model//n_heads
        
        # self attention linear layers
        # Linear layers for q, k, v vectors generation in different heads
        self.q_linear_layers = []
        self.k_linear_layers = []
        self.v_linear_layers = []
        for i in range(n_heads):
            self.q_linear_layers.append(torch.nn.Linear(d_model, self.d_k))
            self.k_linear_layers.append(torch.nn.Linear(d_model, self.d_k))
            self.v_linear_layers.append(torch.nn.Linear(d_model, self.d_v))
        
        self.dropout = torch.nn.Dropout(dropout)
        self.out = torch.nn.Linear(n_heads*self.d_v, d_model)
        
    def forward(self, q, k, v, mask=None):
        multi_head_attention_outputs = []
        for q_linear, k_linear, v_linear in zip(self.q_linear_layers,
                                                self.k_linear_layers,
                                                self.v_linear_layers):
            new_q = q_linear(q)  # size: (batch_size, seq_len, d_k)
            new_k = k_linear(k)  # size: (batch_size, seq_len, d_k)
            new_v = v_linear(v)  # size (batch_size, seq_len, d_v)
            
            # Scaled Dot-Product attention
            head_v = scaled_dot_product_attention(new_q, new_k, new_v, mask, self.dropout)  # (batch_size, seq_len, d_v)
            multi_head_attention_outputs.append(head_v)
            
        # Concat
        #import pdb; pdb.set_trace()
        concat = torch.cat(multi_head_attention_outputs, -1)  # (batch_size, seq_len, n_heads*d_v)
        
        # Linear layer to recover to original shap
        output = self.out(concat)  # (batch_size, seq_len, d_model)
        
        return output

In [6]:
class FeedForward(torch.nn.Module):
    def __init__(self, d_model, d_ff=2048, dropout=0.1):
        super().__init__()
        
        self.linear_1 = torch.nn.Linear(d_model, d_ff)
        self.dropout = torch.nn.Dropout(dropout)
        self.linear_2 = torch.nn.Linear(d_ff, d_model)
        
    def forward(self, x):
        x = self.dropout(F.relu(self.linear_1(x)))
        x = self.linear_2(x)
        return x

In [7]:
class LayerNorm(torch.nn.Module):
    def __init__(self, d_model, eps=1e-6):
        super().__init__()
        self.d_model = d_model
        self.alpha = torch.nn.Parameter(torch.ones(self.d_model))
        self.beta = torch.nn.Parameter(torch.zeros(self.d_model))
        self.eps = eps
        
    def forward(self, x):
        # x size: (batch_size, seq_len, d_model)
        x_hat = (x - x.mean(dim=-1, keepdim=True))/(x.std(dim=-1, keepdim=True) + self.eps)
        x_tilde = self.alpha*x_hat + self.beta
        return x_tilde

In [8]:
import copy

def clone_layer(module, N):
    return torch.nn.ModuleList([copy.deepcopy(module) for i in range(N)])

In [9]:
class GPTBlock(torch.nn.Module):
    def __init__(self, d_model, n_heads, dropout=0.1):
        super().__init__()
        self.norm_1 = LayerNorm(d_model)
        self.norm_2 = LayerNorm(d_model)
        
        self.multi_head_attention = MultiHeadAttention(n_heads, d_model)
        
        self.feed_forward = FeedForward(d_model)
        
    def forward(self, x, mask):
        x = self.norm_1(x)
        x = x + self.multi_head_attention(x, x, x, mask)
        x = self.norm_2(x)
        x = x + self.feed_forward(x)
        return x

In [10]:
class GPT(torch.nn.Module):
    def __init__(self, d_model, N, n_heads, vocab_size, dropout=0.1):
        super().__init__()
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model)
        self.gpt_layers = clone_layer(GPTBlock(d_model, n_heads), N)
        self.norm = LayerNorm(d_model)

    def forward(self, x, mask):
        x = self.embed(x)
        x = self.pe(x)
        for block in self.gpt_layers:
            x = block(x, mask)
        return self.norm(x)

In [12]:
import re
import spacy
import itertools
from torchtext import data

def tokenizer(text):
    text = text.split('\n')
    exceptions = ['begin', 'end']
    ignorelist = ['&', '$', '#', '_', '{', '}', '~', '^']
    exceptions = ['\\\\'+tag+"{[^{}]*}" for tag in exceptions]
    pattern = exceptions + ["\\\\\w*|{|}|\$|\[|\]|<text>|\s*"]
    pattern = '|'.join(pattern)
    pattern = r"(%s)" % pattern
    
    tokenized = [list(filter(None, (lambda x: re.split(pattern, x))(line))) + ['\n'] for line in text]
    
    return list(itertools.chain.from_iterable(tokenized))

In [13]:
import glob
import os
import pandas as pd
from tqdm.auto import tqdm

PATH = "./../data/papers/"

tex_files = glob.glob(PATH + "*.tex", recursive = True)

wordCount = 0
sentCount = 0
docCount = 0
vocab = set()
vocabDict = {} # Vocabulary dictionary
vocabCount = 0
vocabCumSum = [] # To plot Vocab vs Words
wordCumSum = [] # To plot Vocab vs Words

for file in tqdm(tex_files):
    text = open(file, "rt", encoding = "utf8").read()
    tokens = tokenizer(text)
    docCount += 1
    
    for token in tokens:
        wordCount += 1
        token = token.lower()
        if token not in vocab:
            vocab.add(token)
            vocabDict[token] = 1
            vocabCount += 1
        else:
            vocabDict[token] += 1
    
    vocabCumSum += [vocabCount]
    wordCumSum += [wordCount]

  0%|          | 0/109877 [00:00<?, ?it/s]

In [15]:
vocabDF = pd.DataFrame(vocabDict.items())

len(vocabDF)

339216

In [11]:
d_model = 512
n_heads = 8
N = 12
vocab_size = 100

model = GPT(d_model, N, n_heads, vocab_size)

def create_mask(text):
    pad = 1 #SRC.vocab.stoi['<pad>']    
    # Input mask
    pad_mask = (text != pad).unsqueeze(1)
    seq_len = text.size(1)
    nopeak_mask = np.tril(np.ones((1, seq_len, seq_len)), k=0).astype('uint8')
    nopeak_mask = torch.from_numpy(nopeak_mask) != 0
    mask = pad_mask & nopeak_mask
    print(mask)
    return mask



x = torch.tensor([[0,0,0,0,0]])
                 
mask = create_mask(torch.tensor(x))
model(x, mask)

tensor([[[ True, False, False, False, False],
         [ True,  True, False, False, False],
         [ True,  True,  True, False, False],
         [ True,  True,  True,  True, False],
         [ True,  True,  True,  True,  True]]])


  mask = create_mask(torch.tensor(x))


tensor([[[ 0.5564, -2.1160, -2.1147,  ...,  1.0211, -2.6412, -1.4760],
         [ 0.7566, -2.2017, -1.9696,  ...,  0.8697, -2.6994, -1.1817],
         [ 0.7672, -1.9968, -2.1514,  ...,  0.9146, -2.2939, -1.0523],
         [ 0.5217, -1.8834, -1.9739,  ...,  0.7363, -2.4286, -0.7821],
         [ 0.6801, -2.0235, -1.6651,  ...,  0.5793, -2.2954, -1.1591]]],
       grad_fn=<AddBackward0>)