# Data Gathering and Installations

In [2]:
import os

# Download the wiki103 dataset
!wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip?ref=blog.salesforceairesearch.com
# Unzip the file
if 'KAGGLE_KERNEL_RUN_TYPE' in os.environ and os.environ['KAGGLE_KERNEL_RUN_TYPE'] == 'Interactive':
    # For kaggle notebook
    !unzip /kaggle/working/wikitext-103-raw-v1.zip?ref=blog.salesforceairesearch.com
else:
    !unzip wikitext-103-raw-v1.zip?ref=blog.salesforceairesearch.com

--2023-12-11 15:56:31--  https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip?ref=blog.salesforceairesearch.com
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.88.254, 52.217.171.240, 54.231.204.168, ...
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.88.254|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 191984949 (183M) [application/zip]
Saving to: ‘wikitext-103-raw-v1.zip?ref=blog.salesforceairesearch.com’


2023-12-11 15:56:36 (42.9 MB/s) - ‘wikitext-103-raw-v1.zip?ref=blog.salesforceairesearch.com’ saved [191984949/191984949]

Archive:  wikitext-103-raw-v1.zip?ref=blog.salesforceairesearch.com
   creating: wikitext-103-raw/
  inflating: wikitext-103-raw/wiki.test.raw  
  inflating: wikitext-103-raw/wiki.valid.raw  
  inflating: wikitext-103-raw/wiki.train.raw  


In [3]:
!pip install transformers torchinfo



# Imports

In [4]:
import torch, os, math, random, transformers, matplotlib.pyplot as plt, torch.nn.functional as F
from torch import optim, nn
from transformers import BertTokenizerFast
from torchinfo import summary
from tqdm.notebook import tqdm
from time import time

from torch.utils.data import DataLoader, TensorDataset, Dataset

# BERT Tokenizer

In [5]:
# Load pre-trained model tokenizer (vocabulary)
bert = False
if bert:
  tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
# In fine tuning phase. [INS] [/INS] tokens are to be added.
# This can be done by tokenizer.add_tokens([list of tokens])
# Then the model can be updated as model.resize_token_embeddings(len(tokenizer))

In [6]:
class Tokenizer:
    def __init__(self,vocab_size = 1000,special_tokens=["<PAD>","<START>","<OOV>","<END>"]):
        self.vocab_size = vocab_size
        # Special Tokens should be in same order
        self.token_dict = dict(zip(special_tokens, range(len(special_tokens))))

    def fits_on_texts(self,texts):
        word_counts = Counter()
        for text in texts:
            word_counts.update([word.lower() for word in text.split()])

        for word,_ in word_counts.most_common(self.vocab_size):
            if len(self.token_dict) < self.vocab_size:
                self.token_dict[word] = len(self.token_dict)
            else:
                break

    def texts_to_sequences(self,texts):
        seq_list = []
        for text in texts:
            seq = []
            for word in text.split():
                seq.append(self.token_dict.get(word.lower(),2))  # 2 is the <OOV> token
            seq_list.append(seq.copy())
        return seq_list

    def sequences_to_texts(self,seq_list):
        texts = []
        temp_token_dict = {value: key for key,value in self.token_dict.items()}
        for seq in seq_list:
            text = ""
            for token in seq:
                # Assumed that we are getting corrected sequences.
                if token not in (0,1,2,3):  text += temp_token_dict.get(token,"<UNK>") + " "
            texts.append(text.strip())
        return texts

    def token_of(self,word):
        return self.token_dict.get(word,2)  # 2 is the <OOV> token

    def value_of(self,token):
        for key,value in self.token_dict.items():
            if (token == value):  return key
        return "<UNK>"

# Hyper params

In [22]:
hparams = {
    "seq_len": 5, "vocab_size" : tokenizer.vocab_size if bert else 10_000,  # Vocab_size here is a temporary
    "d_model" : 60, "d_ff" : 720, "num_heads" : 6, "dropout" : 0.15,
    "num_layers" : 1,
    "epochs" : 10, "batch_size" : 5120,
}

print(f'Vocab Size is :{hparams["vocab_size"]}')

# Experiment with num_layers : 1, dropout : 0.1, d_model : 60, d_ff : 360 (hopping to have more memory)

Vocab Size is :10000


### Extracting the paragraphs from the wiki103 dataset

In [8]:
"""
The headings are like
=<text>=
Basically it means the if we use this as seperator.
We can get the resultant string as meaningful context.

We then use that context word by word to get the next word.
"""
from collections import Counter

def wiki103_data_extraction(path="wikitext-103-raw/", files=["train", 'test', 'valid']):
    return "".join([open(f"{path}wiki.{filename}.raw", 'r', encoding='utf-8').read() for filename in files])

def wiki103_pre_processing(content, start_index = 0, length = None):
    # Split the contexts
    context,context_list = "",[]
    for line in content.split("\n"):
        if (len(line) > 3):  # Removing the empty lines
            if line[1] == '=' and line[-2] == '=' and context != "":
                # Heading found
                if start_index != 0: start_index -= 1
                else:
                   context_list.append(context)
                   if length is not None and len(context_list) >= length:
                      return context_list
                context = ""
            else:   context += line
    context_list.append(context)   # For last context
    return context_list

### Converting paragraphs to inputs and targets

In [7]:
# def single_token_encode_bert(inp):
#   text, seq_len, toTensor = inp
#   token_seq =  tokenizer.encode(text, add_special_tokens = True)

#   inp_list, target_list = [], []
#   for i in range(2, len(token_seq)):
#     # Ignore the [CLS] and do not consider the last token input.
#     seq, target = token_seq[:i], token_seq[i]

#     # Check for the input seq_len
#     if len(seq) > seq_len:  seq = seq[-seq_len:]
#     else: seq += [0]*max(0,seq_len - len(seq))

#     inp_list.append(seq)
#     target_list.append(target)
#   return ((torch.tensor(inp_list) if toTensor else inp_list),
#           (torch.tensor(target_list) if toTensor else target_list))

# def get_tokens_slowly_bert(texts, seq_len, toTensor = True):
#     inp_dataset, target_dataset = [], []
#     for text in tqdm(texts,desc='Extracting',colour = 'green'):
#         inp_data, target_data = single_token_encode_bert((text,seq_len, toTensor))
#         inp_dataset.extend(inp_data)
#         target_dataset.extend(target_data)

#     print('Converting to tensors')
#     return [(torch.stack(inp_dataset) if toTensor else inp_dataset),
#             (torch.tensor(target_dataset) if toTensor else target_dataset)]

# def get_slicing_tokens_slowly_bert(texts, dataset, seq_len, toTensor = True, batch_size = 5000):
#   prev_index = 0

#   while (prev_index < len(texts)):
#     print(f'Iteration:{int(prev_index/batch_size)+1}')
#     res = get_tokens_slowly_bert(texts[prev_index:prev_index + batch_size], hparams['seq_len'], toTensor)
#     dataset[0].append(res[0])
#     dataset[1].append(res[1])
#     del res
#     prev_index += batch_size

In [9]:
def single_token_encode(inp):
  text, seq_len, toTensor, tokenizer = inp
  token_seq =  tokenizer.texts_to_sequences([text])[0]

  inp_list, target_list = [], []
  for i in range(2, len(token_seq)):
    # Ignore the [CLS] and do not consider the last token input.
    seq, target = token_seq[:i], token_seq[i]

    # Check for the input seq_len
    if len(seq) > seq_len:  seq = seq[-seq_len:]
    else: seq += [0]*max(0,seq_len - len(seq))

    inp_list.append(seq)
    target_list.append(target)
  return ((torch.tensor(inp_list) if toTensor else inp_list),
          (torch.tensor(target_list) if toTensor else target_list))

def get_tokens_slowly(tokenizer, texts, seq_len, toTensor = True):
    inp_dataset, target_dataset = [], []
    for text in tqdm(texts,desc='Extracting',colour = 'green'):
        inp_data, target_data = single_token_encode((text,seq_len, toTensor, tokenizer))
        inp_dataset.extend(inp_data)
        target_dataset.extend(target_data)

    print('Converting to tensors')
    return [(torch.stack(inp_dataset) if toTensor else inp_dataset),
            (torch.tensor(target_dataset) if toTensor else target_dataset)]

def get_slicing_tokens_slowly(tokenizer, texts, dataset, seq_len, toTensor = True, batch_size = 5000):
  prev_index = 0

  while (prev_index < len(texts)):
    print(f'Iteration:{int(prev_index/batch_size)+1}')
    res = get_tokens_slowly(tokenizer, texts[prev_index:prev_index + batch_size], hparams['seq_len'], toTensor)
    dataset[0].append(res[0])
    dataset[1].append(res[1])
    del res
    prev_index += batch_size

In [None]:
# from multiprocessing import Pool

# def get_tokens(texts, seq_len, toTensor = True, num_processes = 4):
#   inp_dataset, target_dataset = [], []

#   with Pool(num_processes) as pool:
#     processed_data = list(tqdm(pool.imap(single_token_encode,
#                                          [(text, seq_len, toTensor) for text in texts]),
#                                 total=len(texts), desc='Processing', colour='green'))
#   for data in processed_data:
#     inp_dataset.extend(data[0])
#     target_dataset.extend(data[1])

#   return [(torch.stack(inp_dataset) if toTensor else inp_dataset),
#           (torch.tensor(target_dataset) if toTensor else target_dataset)]

### Custom Dataset for effective RAM management in kaggle

In [10]:
class customDataset(Dataset):
    """
    A custom dataset to mimic the action of on fly data loading with RAM usage.
    """
    total_examples = 0
    dataset = [[],[]]
    def __init__(self, texts, vocab_size, seq_len, batch_size):
        """
        Attributes
        ----------
        texts [List[str]] : The sequences of paragraphs
        seq_len : The sequence length needed.
        batch_size : batches to be processed
        """
        self.tokenizer = Tokenizer(vocab_size = vocab_size)
        self.tokenizer.fits_on_texts(texts)
        get_slicing_tokens_slowly(self.tokenizer, texts, self.dataset, seq_len, batch_size = batch_size)
        self.total_examples += sum([t.shape[0] for t in self.dataset[0]])
    def __len__(self):
        return self.total_examples

    def update_dataset(self, texts, seq_len, batch_size):
        """
        Attributes
        ----------
        texts [List[str]] : The sequences of paragraphs
        seq_len : The sequence length needed.
        batch_size : batches to be processed
        """
        get_slicing_tokens_slowly(texts, self.dataset, seq_len, batch_size = batch_size)
        self.total_examples += sum([t.shape[0] for t in self.dataset[0]])


    def __getitem__(self, idx):
        """
        Attributes
        ----------
        idx : Actual index of the tensor
        """
        stk_idx = 0
        while (idx >= self.dataset[0][stk_idx].shape[0]):
            idx -= self.dataset[0][stk_idx].shape[0]
        return self.dataset[0][stk_idx][idx], self.dataset[1][stk_idx][idx]

# Model

### Basic Layers

In [11]:
class InputEmbeddings(nn.Module):
    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.d_model, self.vocab_size = d_model, vocab_size
        self.embedding = nn.Embedding(vocab_size,d_model)

    def forward(self,x):   # x: (batch_size, seq_len)
        return self.embedding(x)*math.sqrt(self.d_model)  # (batch_size, seq_len, d_model)
        # math.sqrt() is suggested from research paper.

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, seq_len=1000, dropout = 0.1):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(0.1)  # A dropout layer to prevent overfitting

        pe = torch.zeros(seq_len, d_model)  # Empty Matrix with zeros
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)  # (max_seq_len) -> (max_seq_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)  # Filling even indices with sin function
        pe[:, 1::2] = torch.cos(position * div_term)  # Filling odd indices with cos function

        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: (batch_size, seq_len, d_model)
        seq_len = x.size(1)
        # Add positional encodings to the input
        x = x + self.pe[:seq_len, :].unsqueeze(0)
        return self.dropout(x)

class LayerNormalization(nn.Module):
    def __init__(self,eps: float = 10**-6) -> None:
        super().__init__()
        self.eps = eps  # CPU and GPU cannot go to very very small precisions.
        self.alpha = nn.Parameter(torch.ones(1))   # Weight (Multiplied)
        self.bias = nn.Parameter(torch.zeros(1))   # Bias (Added)

    def forward(self,x):  # x: (batch_size, seq_len, d_model)
        mean = x.mean(dim = -1, keepdim = True)
        std = x.std(dim = -1, keepdim = True)
        return self.alpha*(x - mean)/(std + self.eps) + self.bias

class FeedForward(nn.Module):
    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)  # W1 and B1
        self.linear2 = nn.Linear(d_ff, d_model) # W2 and B2
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # (batch_size,seq_len,d_model) -> (batch_size,seq_len,d_ff) -> (batch_size,)
        return self.linear2(self.dropout(self.linear1(x)))

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads, dropout=0.1):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        self.out_proj = nn.Linear(d_model, d_model, bias=True)  # Wo matrix
        assert d_model % num_heads == 0  # Ensure d_model is divisible by num_heads
        self.head_dim = d_model // num_heads

    def forward(self, q, k, v, causal_mask=False):
        batch_size, seq_len_q, d_model = q.shape
        _, seq_len_kv, _ = k.shape

        interim_shape_q = (batch_size, seq_len_q, self.num_heads, self.head_dim)
        interim_shape_kv = (batch_size, seq_len_kv, self.num_heads, self.head_dim)

        q = q.view(interim_shape_q).transpose(1, 2)  # Shape: (batch_size, num_heads, seq_len_q, head_dim)
        k = k.view(interim_shape_kv).transpose(1, 2)  # Shape: (batch_size, num_heads, seq_len_kv, head_dim)
        v = v.view(interim_shape_kv).transpose(1, 2)  # Shape: (batch_size, num_heads, seq_len_kv, head_dim)

        weight = q @ k.transpose(-1, -2)  # Shape: (batch_size, num_heads, seq_len_q, seq_len_kv)
        if causal_mask:
            weight.masked_fill_(torch.ones_like(weight, dtype=torch.bool).triu(1), -torch.inf)
        weight = F.softmax(weight / math.sqrt(self.head_dim), dim=-1)
        self.attention_scores = weight

        output = weight @ v  # Shape: (batch_size, num_heads, seq_len_q, head_dim)
        # Reshape back to (batch_size, seq_len_q, d_model)
        output = output.transpose(1, 2).reshape(batch_size, seq_len_q, d_model)

        return self.out_proj(output)

class SelfAttention(nn.Module):
  def __init__(self,d_model,num_heads,dropout=0.1):
      super().__init__()
      self.mha = MultiHeadAttention(d_model=d_model,num_heads=num_heads,dropout=dropout)
  def forward(self,x,causal_mask=False):      # x: (Batch_size, Seq_Len, Dim)
      return self.mha(x,x,x,causal_mask)    # (Batch_Size, Seq_Len, Dim)

class ProjectionLayer(nn.Module):
    # The last linear and softmax layer of Transformer.
    def __init__(self, d_model: int, vocab_size: int) -> None:
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, x): # x: (Batch_size,seq_len,d_model)
        # The output should be (Batch_size,seq_len, vocab_size)
        return torch.log_softmax(self.proj(x), dim = -1)

### Model Blocks

In [12]:
class DecoderBlock(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super(DecoderBlock, self).__init__()
        self.self_attn = SelfAttention(d_model, num_heads, dropout)
        self.norm1, self.norm2, self.norm3 = [LayerNormalization() for _ in range(3)]
        self.feed_forward = FeedForward(d_model, d_ff, dropout)

    def forward(self, x):
        out1 = self.norm1(x + self.self_attn(x, causal_mask = True))   # Self Attention and Layer Normalization (Add and Norm)
        return self.norm3(out1 + self.feed_forward(out1))              # Feed Forward Layer Normalization (Add and Norm)

class Transformer(nn.Module):
    def __init__(self, vocab_size, seq_len, d_model, num_heads, d_ff, num_layers, dropout=0.1):
        super(Transformer, self).__init__()
        self.input_embedding = InputEmbeddings(d_model, vocab_size)
        self.pos_encodings = PositionalEncoding(d_model, seq_len, dropout)
        self.decoder = nn.ModuleList([DecoderBlock(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.out_proj = ProjectionLayer(d_model, vocab_size)

    def forward(self, input_seq):
        # Embedding and positional encoding
        input_enc = self.pos_encodings(self.input_embedding(input_seq))

        # Decoder blocks
        for decoder_block in self.decoder:
            input_enc = decoder_block(input_enc)

        input_enc = self.out_proj(input_enc)
        return input_enc

### Verifying the model architecture

In [23]:
model = Transformer(hparams['vocab_size'], hparams['seq_len'], hparams['d_model'],
                     hparams['num_heads'], hparams['d_ff'], hparams['num_layers'],
                     hparams['dropout'])
summary(model,input_sizes=(hparams['batch_size'],hparams['seq_len']))

Layer (type:depth-idx)                        Param #
Transformer                                   --
├─InputEmbeddings: 1-1                        --
│    └─Embedding: 2-1                         600,000
├─PositionalEncoding: 1-2                     --
│    └─Dropout: 2-2                           --
├─ModuleList: 1-3                             --
│    └─DecoderBlock: 2-3                      --
│    │    └─SelfAttention: 3-1                3,660
│    │    └─LayerNormalization: 3-2           2
│    │    └─LayerNormalization: 3-3           2
│    │    └─LayerNormalization: 3-4           2
│    │    └─FeedForward: 3-5                  87,180
├─ProjectionLayer: 1-4                        --
│    └─Linear: 2-4                            610,000
Total params: 1,300,846
Trainable params: 1,300,846
Non-trainable params: 0

# Training

In [24]:
def train_step(model, optimizer, hparams, trainLoader, validLoader, evalPerEpoch = False, Device = 'cpu'):
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    # optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.98), eps=1e-9)
    model.cuda() if Device == 'cuda' else model.cpu()

    for epoch in range(hparams["epochs"]):
        model.train()
        total_loss, acc,total_count = 0.0, 0.0, 0.0
        pbar = tqdm(trainLoader,desc='Training', colour = 'green', leave = False)
        for input_batch, targets_batch in pbar:
            # Get the data to the Device
            input_batch, targets_batch = input_batch.to(Device), targets_batch.to(Device)

            optimizer.zero_grad()
            output = model(input_batch)
            # Calculate the loss
            # The last token of output should be the next word of the sequence.
            # For getting an Auto Completion model
            loss = criterion(output[:,-1], targets_batch)

            # Backward pass
            loss.backward()
            optimizer.step()
            # Update the loss
            total_loss += loss.item()
            acc += (output[:,-1].argmax(-1) == targets_batch).sum().item()
            total_count += input_batch.shape[0]
            pbar.set_postfix({'loss':total_loss/len(trainLoader),'acc':acc*100/total_count})
            del input_batch, targets_batch, output, loss

        torch.cuda.empty_cache()
        print(f"Epoch: {epoch+1}, Loss: {(total_loss/len(trainLoader)):.4f}, acc: {(acc*100/(len(trainLoader)*hparams['batch_size'])):.4f}, ",end="")
        model.eval()
        total_loss, acc = 0.0, 0.0
        pbar = tqdm(validLoader, desc = 'Validating', leave = False)
        for input_batch, targets_batch in pbar:
            # Get the data to the Device
            input_batch, targets_batch = input_batch.to(Device), targets_batch.to(Device)

            # Clearing the gradiants
            optimizer.zero_grad()

            # Find the outputs
            output = model(input_batch)

            # Calculate the loss
            # The last token of output should be the next word of the sequence.
            # For getting an Auto Generation model
            loss = criterion(output[:,-1], targets_batch)

            # Update the loss
            total_loss += loss.item()
            acc += (output[:,-1].argmax(-1) == targets_batch).sum().item()
            pbar.set_postfix({'loss':total_loss/len(validLoader),'acc':acc*100/(len(validLoader)*hparams['batch_size'])})
            del input_batch, targets_batch, output, loss
        torch.cuda.empty_cache()
        print(f"\tVal_Loss: {(total_loss/len(validLoader)):.4f}, Val_Acc: {(acc*100/(len(validLoader)*hparams['batch_size'])):.4f}")
    # Set the model to normal mode.
    model.cpu()

# Pipeline

In [None]:
# hparams['batch_size'] = 128

In [None]:
# import gc

# gc.collect()

In [25]:
start_time = time()
# Get train data as batches so as to reduce the momentary overload on RAM
train = wiki103_pre_processing(wiki103_data_extraction(files=['train']), length = 5_000)
print(f'Extracted {len(train)} paras')
train_dataset = customDataset(train, hparams['vocab_size'], hparams['seq_len'], batch_size = 5_000)
print(f'Train Dataset after 30K: {train_dataset.total_examples}')

# train = wiki103_pre_processing(wiki103_data_extraction(files=['train']), start_index = 75000, length = 75000)
# train_dataset.update_dataset(train, hparams['seq_len'], batch_size = 10000)
# print(f'Train Dataset after 200K: {train_dataset.total_examples}')
print(f'Time taken:{time() - start_time}')
del train

Extracted 5000 paras
Iteration:1


Extracting:   0%|          | 0/5000 [00:00<?, ?it/s]

Converting to tensors
Train Dataset after 30K: 1784533
Time taken:37.875691175460815


In [26]:
print(len(train_dataset.dataset[0]))
train_dataset.tokenizer.vocab_size

1


10000

In [27]:
# Get the data.
# train = wiki103_pre_processing(wiki103_data_extraction(files=['train']), length = 140000)
val = wiki103_pre_processing(wiki103_data_extraction(files=['valid']))
print(f"Data Extracted\ntVal paras:{len(val)}")

# Convert it to datasets
# train = get_slicing_tokens_slowly(train, hparams['seq_len'], toTensor = True)
val = get_tokens_slowly(train_dataset.tokenizer, val, hparams['seq_len'], toTensor = True)
print(f"Converted into datasets\n\tVal examples:{val[0].shape}")

# Prepare dataloaders
train = DataLoader(train_dataset, batch_size=hparams['batch_size'], shuffle=True)
val = DataLoader(TensorDataset(val[0],val[1]), batch_size=hparams['batch_size'], shuffle=True)
print("Dataloaders are prepared")

Data Extracted
tVal paras:549


Extracting:   0%|          | 0/549 [00:00<?, ?it/s]

Converting to tensors
Converted into datasets
	Val examples:torch.Size([208842, 5])
Dataloaders are prepared


In [28]:
# Create the model and Optimizer instance
model = Transformer(hparams['vocab_size'], hparams['seq_len'], hparams['d_model'],
                    hparams['num_heads'], hparams['d_ff'], hparams['num_layers'],
                    hparams['dropout'])
# adam_optimizer = torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
# optimizer = AdamWarmup(model_size = hparams['d_model'], warmup_steps = 4000, optimizer = adam_optimizer)

In [29]:
optimizer = optim.AdamW(model.parameters(),lr=0.02,betas=(0.9, 0.98), eps=1e-9)

# lr = 0.03 (3x more in hope of gain max accuracy in epoch 1)

In [None]:
# Select the appropriate device
Device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Selecting device as {Device}')

# Train the model.
print("Training the model")
train_step(model, optimizer, hparams, train, val, Device = Device)

Selecting device as cpu
Training the model


Training:   0%|          | 0/349 [00:00<?, ?it/s]

# Testing

In [20]:
val = wiki103_pre_processing(wiki103_data_extraction(files=['valid']))
val = get_tokens_slowly(train_dataset.tokenizer,val, hparams['seq_len'], toTensor = True)

Extracting:   0%|          | 0/549 [00:00<?, ?it/s]

Converting to tensors


In [21]:
model.eval()
model.cpu()

Transformer(
  (input_embedding): InputEmbeddings(
    (embedding): Embedding(10000, 30)
  )
  (pos_encodings): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (decoder): ModuleList(
    (0-1): 2 x DecoderBlock(
      (self_attn): SelfAttention(
        (mha): MultiHeadAttention(
          (out_proj): Linear(in_features=30, out_features=30, bias=True)
        )
      )
      (norm1): LayerNormalization()
      (norm2): LayerNormalization()
      (norm3): LayerNormalization()
      (feed_forward): FeedForward(
        (linear1): Linear(in_features=30, out_features=180, bias=True)
        (linear2): Linear(in_features=180, out_features=30, bias=True)
        (dropout): Dropout(p=0.15, inplace=False)
      )
    )
  )
  (out_proj): ProjectionLayer(
    (proj): Linear(in_features=30, out_features=10000, bias=True)
  )
)

In [33]:
# Run the pipeline cell before running this.
import random

match_count = 0
sample_size = 20
for iter in range(sample_size):
  i = random.randint(0,(val[1].shape)[0])
  test_input, test_target = val[0][i], val[1][i]

  pred = model(test_input.unsqueeze(0)).argmax(-1)
  print(pred.shape)
  print(f"\tMatch:{test_target == pred[0][-1]}")
  match_count += (test_target == pred[0][-1]).item()
#   print(f"Acutal Output:{tokenizer.sequences_to_texts(pred.tolist())[0]}\n")

print(f"Accuracy : {match_count*100/sample_size}\n\nNow Context is empty:")

torch.Size([1, 5])
	Match:False
torch.Size([1, 5])
	Match:True
torch.Size([1, 5])
	Match:True
torch.Size([1, 5])
	Match:True
torch.Size([1, 5])
	Match:False
torch.Size([1, 5])
	Match:False
torch.Size([1, 5])
	Match:True
torch.Size([1, 5])
	Match:False
torch.Size([1, 5])
	Match:False
torch.Size([1, 5])
	Match:False
torch.Size([1, 5])
	Match:True
torch.Size([1, 5])
	Match:False
torch.Size([1, 5])
	Match:False
torch.Size([1, 5])
	Match:False
torch.Size([1, 5])
	Match:False
torch.Size([1, 5])
	Match:False
torch.Size([1, 5])
	Match:False
torch.Size([1, 5])
	Match:False
torch.Size([1, 5])
	Match:False
torch.Size([1, 5])
	Match:False
Accuracy : 25.0

Now Context is empty:


# Testing with user input

In [34]:
model_context = ""
# Device = 'cuda' if torch.cuda.is_available() else 'cpu'
while True:
    ask = input('User:').lower()
    if (ask == 'exit'):
        break
    answer = ""
    for i in range(5):
        # (batch_size = 1, seq_len = hparams['inp_seq_len'],)
        model_inp = torch.tensor(tokenizer.encode(ask, add_special_tokens = True)[-hparams['seq_len']:]).unsqueeze(0)
        # This model variable holds the trained model. (defined in the training block)
        model_out = model(model_inp).argmax(-1)
        out_txt = tokenizer.decode(model_out[:,-1].item())
        # Adding the data to context
        ask += " " + out_txt
        answer += " " + out_txt
    print(f"Model:{answer}\n")
    ask = ""

User:this is a


IndexError: ignored

# Saving the model

In [None]:
model.cpu()
optimizer.cpu()

In [None]:
import json

# Save the model, optimizer, and hyperparameters
model_path = '/kaggle/working/model.pt'
optimizer_path = '/kaggle/working/optimizer.pth'
hparams_path = '/kaggle/working/hyperparameters.json'

# Save model state
torch.save(model.state_dict(), model_path)

# Save optimizer parameters
torch.save(optimizer.state_dict(), optimizer_path)

# Save hyperparameters
with open(hparams_path, 'w') as f:
    json.dump(hparams, f)

In [None]:
optimizer_path = '/kaggle/working/optimizer_state_dict.pth'
torch.save(optimizer.optimizer.state_dict(),optimizer_path)

# Loading the model

In [None]:
import torch
import json

# Load hyperparameters
# hparams_path = '/kaggle/input/base-model-false/hyperparameters.json'
model_path = '/kaggle/input/model-training-3/model.pt'
hparams_path = '/kaggle/input/model-training-3/hyperparameters.json'
optimizer_path = '/kaggle/input/model-training-3/optimizer.pth'

print('Getting the hyperparams')
with open(hparams_path, 'r') as f:
    hparams = json.load(f)

# Load the model state
# model_path = '/kaggle/input/base-model-false/model.pth'
print('Loading the model')
model = Transformer(hparams['vocab_size'], hparams['seq_len'], hparams['d_model'],
                    hparams['num_heads'], hparams['d_ff'], hparams['num_layers'],
                    hparams['dropout'])
model.load_state_dict(torch.load(model_path))
model.eval()
print(summary(model,input_sizes=(hparams['batch_size'],hparams['seq_len'])))

print('Getting the optimizer')
optimizer = optim.AdamW(model.parameters(),lr=0.0001,betas=(0.9, 0.98), eps=1e-9)
# optimizer_path = '/kaggle/working/optimizer_state_dict.pth'
optimizer.load_state_dict(torch.load(optimizer_path))

# Load optimizer parameters
# optimizer_path = '/kaggle/input/base-model-false/optimizer_state_dict.pth'
# adam_optimizer = torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
# optimizer = AdamWarmup(model_size = hparams['d_model'], warmup_steps = 4000, optimizer = adam_optimizer)
# optimizer.optimizer.load_state_dict(torch.load(optimizer_path))

In [None]:
print('Preparing the validation set')
val = wiki103_pre_processing(wiki103_data_extraction(files=['valid']))
val = get_tokens_slowly(val, hparams['seq_len'], toTensor = True)
val = DataLoader(TensorDataset(val[0],val[1]), batch_size=hparams['batch_size'], shuffle=True)

Device = 'cuda' if torch.cuda else 'cpu'
print(f'Device Selected as:{Device}')

model.to(Device)
model.eval()
criterion = nn.CrossEntropyLoss(ignore_index=0)

total_loss, acc = 0.0, 0.0
pbar = tqdm(val, desc = 'Validating', leave = False)
for input_batch, targets_batch in pbar:
    input_batch, targets_batch = input_batch.to(Device), targets_batch.to(Device)
    # Clearing the gradiants
    optimizer.zero_grad()

    # Find the outputs
    output = model(input_batch)

    # Calculate the loss
    # The last token of output should be the next word of the sequence.
    # For getting an Auto Generation model
    loss = criterion(output[:,-1], targets_batch)

    # Update the loss
    total_loss += loss.item()
    acc += (output[:,-1].argmax(-1) == targets_batch).sum().item()
    pbar.set_postfix({'loss':total_loss/len(val),'acc':acc/(len(val)*hparams['batch_size'])})
    del input_batch, targets_batch, output, loss

torch.cuda.empty_cache() if Device == 'cuda' else None
print(f"\tVal_Loss: {(total_loss/len(val)):.4f}, Val_Acc: {(acc/(len(val)*hparams['batch_size'])):.4f}")

In [None]:
del val

In [None]:
del optimizer, model, hparams

In [None]:
import gc

gc.collect()

torch.cuda.empty_cache()

In [None]:
# optimizer_path = '/kaggle/working/optimizer_state_dict.pth'
optimizer.load_state_dict(torch.load(optimizer_path))