In [1]:
# referenced karpathy/minGPT when necessary for transformer implementation

import torch
import torch.nn as nn

from torch.utils.data import Dataset

import numpy as np

import re

import linecache

from tqdm import tqdm

import math

In [2]:
device = torch.device("cpu")

In [3]:
class GELU(nn.Module):
    """
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT).
    Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
    """
    def forward(self, x):
        return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))

In [8]:
class MultiHeadAttention(nn.Module):
    def __init__(self, emb_dim, num_heads, dropout):
        super(MultiHeadAttention, self).__init__()

        self.num_heads = num_heads

        self.dk = emb_dim//self.num_heads

        # learn projections in one linear operation
        self.qkv_proj = nn.Linear(emb_dim, emb_dim*3)
        self.out_proj = nn.Linear(emb_dim, emb_dim)

        self.scale = torch.sqrt(torch.tensor(self.dk, dtype=torch.float32))

        self.softmax = nn.Softmax()

        self.attention_dropout = nn.Dropout(dropout)
        self.residual_dropout = nn.Dropout(dropout)

    def forward(self, x):
        batch_size, seq_length, emb_dim = x.shape

        qkv_combined = self.qkv_proj(x)

        # batch size, seq length, num heads, head emb dim
        q, k, v = torch.split(qkv_combined, emb_dim, dim=2)

        # batch size, num heads, seq length, head emb dim
        q = q.view(batch_size, seq_length, self.num_heads, self.dk).transpose(1, 2)
        k = k.view(batch_size, seq_length, self.num_heads, self.dk).transpose(1, 2)
        v = v.view(batch_size, seq_length, self.num_heads, self.dk).transpose(1, 2)

        # create attention pattern - batch size, num heads, head emb dim, head emb dim
        attention_pattern = q @ k.transpose(-2, -1)
        attention_pattern = self.softmax(attention_pattern/self.scale)

        attention_pattern = self.attention_dropout(attention_pattern)

        out = (attention_pattern @ v).transpose(1, 2).contiguous().view(batch_size, seq_length, emb_dim)

        return self.residual_dropout(self.out_proj(out))

In [9]:
class TransformerBlock(nn.Module):
    def __init__(self, emb_dim, num_heads, dropout):
        super(TransformerBlock, self).__init__()

        self.num_heads = num_heads

        self.dk = emb_dim//self.num_heads

        self.layer_norm1 = nn.LayerNorm(emb_dim) # layer norm over the embedding dimension
        self.attention = MultiHeadAttention(emb_dim, num_heads, dropout)
        self.layer_norm2 = nn.LayerNorm(emb_dim)

        self.mlp = nn.Sequential(
            nn.Linear(emb_dim, emb_dim * 4), # fully connected
            nn.Linear(emb_dim * 4, emb_dim), # projection back into original embedding dims
            GELU(),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        x = self.layer_norm1(x)
        x = self.attention(x)
        x = self.layer_norm2(x)

        return self.mlp(x)


block = TransformerBlock(50, 2, 0.1)
# 4 batch size, 10 seq length, 50 emb dimension
sample_input = torch.randn((4, 10, 50))

output = block(sample_input)

print(output.shape)

torch.Size([4, 10, 50])


  attention_pattern = self.softmax(attention_pattern/self.scale)


In [28]:
class SimpleTransformer(nn.Module):
    def __init__(self, emb_dim, num_heads, dropout, embedding_path, vocab_path, num_layers=4):
        super(SimpleTransformer, self).__init__()

        self.num_layers = num_layers

        word_embeddings = np.load(embedding_path)
        self.vocab = np.load(vocab_path)

        self.embedding_layer = nn.Embedding.from_pretrained(torch.from_numpy(word_embeddings).float())
        
        self.transformer_blocks = []
        for i in range(num_layers):
            self.transformer_blocks += [TransformerBlock(emb_dim, num_heads, dropout)]
        
        self.transformer_blocks = nn.ModuleList(self.transformer_blocks)

        self.pool = nn.AdaptiveMaxPool1d(1)
        self.sigmoid = nn.Sigmoid()


    def forward(self, x):
        embeddings = self.embedding_layer(x)

        final_output = torch.zeros_like(embeddings)
        
        for layer in self.transformer_blocks:
            final_output = layer(final_output)
        
        final_output = self.pool(final_output)
        final_output = self.sigmoid(final_output)

        return final_output.squeeze(2)


    def get_indices(self, string):
        indices = []

        string_arr = string.split(" ")

        for word in string_arr:
            if np.where(self.vocab == word)[0].shape[0] != 0:
                indices += [np.where(self.vocab == word)[0]]
            else:
                indices += [np.array([0])]
        
        return torch.tensor(indices).squeeze(1)

In [29]:
class EntityDataset(Dataset):
    # overriden methods
    def __init__(self, file_path, model):
        self.file_path = file_path

        self.model = model


    def __len__(self):
        with open(self.file_path, "rbU") as f:
            num_lines = sum(1 for _ in f)
        
        # don't count first line
        return num_lines - 1


    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        particular_line = linecache.getline(self.file_path, idx+1)
        cleaned_sample = self.clean_up(particular_line)

        input_sentence = cleaned_sample[4]
        entity1 = cleaned_sample[1]
        entity2 = cleaned_sample[2]

        sentence_arr = self.model.get_indices(input_sentence)
        entity1_arr = self.model.get_indices(entity1)
        entity2_arr = self.model.get_indices(entity2)

        '''labels = torch.zeros_like(sentence_arr)

        for _, word_index in enumerate(sentence_arr):
            for entity_index in entity1_arr:
                if entity_index == word_index:
                    labels[_] = 1
            for entity_index in entity2_arr:
                if entity_index == word_index:
                    labels[_] = 1'''

        labels1 = self.generate_labels(sentence_arr, entity1_arr)
        labels2 = self.generate_labels(sentence_arr, entity2_arr)

        labels = labels1 | labels2

        return sentence_arr, labels.type(torch.float32)
    

    # first instance of entity in sentence
    def generate_labels(self, sentence_arr, entity_arr):
        correct = []

        for _, token in enumerate(sentence_arr):
            if _ < sentence_arr.shape[0] - entity_arr.shape[0]:
                if token == entity_arr[0]:
                    not_equal = False
                    for i, val in enumerate(entity_arr):
                        if sentence_arr[_+i] != val:
                            not_equal = True
                    if not not_equal:
                        correct += [_]
            else:
                break
        
        labels = torch.zeros_like(sentence_arr)
        for i in correct:
            labels[i:i+entity_arr.shape[0]] = 1
        
        return labels
        

    # helper
    def clean_up(self, line):
        remove_chars = ["<e1>", "</e1>", "<e2>", "</e2>"]

        line = line.strip()

        for char in remove_chars:
            line = line.replace(char, "")
        
        # string clean up
        line = re.sub(r'[^a-zA-Z1-9\s]', '', line)
        line = re.sub(' +', ' ', line)
        line = line.lower()

        line_data = line.split("\t")
        
        return line_data

In [30]:
test_string = "hello how are you"

model = SimpleTransformer(50, 5, 0.05, "utils/embs_npa.npy", "utils/vocab_npa.npy")

indices = model.get_indices(test_string)

print(indices.shape)

output = model(indices.unsqueeze(0))

print(output.shape)

torch.Size([4])
torch.Size([1, 4])


  attention_pattern = self.softmax(attention_pattern/self.scale)


In [31]:
batch_size = 1

dataset = EntityDataset('data/en_corpora_test.txt', model)

train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

  with open(self.file_path, "rbU") as f:


In [32]:
sample = next(iter(train_loader))

print(sample)

  with open(self.file_path, "rbU") as f:


[tensor([[   518,   1527,     16,      9,   2843,    284,      8,     89,   2539,
            984,     23,  12072,  13037,   1016,     23,    281,   2412,      7,
           1683,     23,   3022, 150915]]), tensor([[1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.]])]


In [33]:
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

In [34]:
EPOCHS = 1

for epoch in range(EPOCHS):
    for _, sample in tqdm(enumerate(train_loader), total=len(train_loader)):
        optimizer.zero_grad()

        input = sample[0]
        labels = sample[1]

        output = model(input)

        loss = criterion(output, labels)

        loss.backward()

        optimizer.step()

        if _ % 50 == 0:
            print(loss.item())

  with open(self.file_path, "rbU") as f:
  attention_pattern = self.softmax(attention_pattern/self.scale)
  0%|          | 1/5461 [00:00<30:42,  2.96it/s]

0.8730854392051697


  1%|          | 51/5461 [00:13<20:23,  4.42it/s]

0.7030960917472839


  2%|▏         | 102/5461 [00:28<25:25,  3.51it/s]

0.6933606863021851


  3%|▎         | 151/5461 [00:42<23:25,  3.78it/s]

0.6939178109169006


  4%|▎         | 201/5461 [00:56<23:05,  3.80it/s]

0.6913807988166809


  5%|▍         | 251/5461 [01:09<21:27,  4.05it/s]

0.6931471824645996


  6%|▌         | 301/5461 [01:23<24:26,  3.52it/s]

0.6931471824645996


  6%|▋         | 352/5461 [01:37<19:22,  4.40it/s]

0.6915850043296814


  7%|▋         | 401/5461 [01:50<23:34,  3.58it/s]

0.6892816424369812


  8%|▊         | 451/5461 [02:05<32:26,  2.57it/s]

0.6933079957962036


  9%|▉         | 501/5461 [02:19<23:33,  3.51it/s]

0.6889674067497253


 10%|█         | 552/5461 [02:34<16:49,  4.86it/s]

0.6892901062965393


 11%|█         | 601/5461 [02:49<23:31,  3.44it/s]

0.6829128265380859


 12%|█▏        | 651/5461 [03:05<25:01,  3.20it/s]

0.6906524300575256


 13%|█▎        | 702/5461 [03:19<22:56,  3.46it/s]

0.6931471228599548


 14%|█▍        | 751/5461 [03:34<21:59,  3.57it/s]

0.6934482455253601


 15%|█▍        | 801/5461 [03:48<23:12,  3.35it/s]

0.6940341591835022


 16%|█▌        | 851/5461 [04:03<22:53,  3.36it/s]

0.6900690793991089


 16%|█▋        | 901/5461 [04:17<22:31,  3.37it/s]

0.6838020086288452


 17%|█▋        | 952/5461 [04:31<24:42,  3.04it/s]

0.6932098865509033


 18%|█▊        | 1001/5461 [04:45<21:47,  3.41it/s]

0.6771064400672913


 18%|█▊        | 1005/5461 [04:47<21:13,  3.50it/s]


KeyboardInterrupt: 

In [27]:
test_string = "I am a very cool person"

indices = model.get_indices(test_string)

output = model(indices.unsqueeze(0))

print(output)

tensor([[0.5000, 0.5000, 0.5113, 0.5000, 0.5027, 0.5000]],
       grad_fn=<SqueezeBackward1>)


  attention_pattern = self.softmax(attention_pattern/self.scale)
