In [1]:
import re
import torch
from torch.utils.data import Dataset
import pickle
import os
import time 
from time import sleep
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from logging import Logger as logger
import warnings
import gc
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
warnings.simplefilter('ignore')
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import AutoModel, AutoTokenizer, AutoModelWithLMHead
from tqdm.notebook import tqdm
from transformers import TextDataset,DataCollatorForLanguageModeling,AutoConfig

In [2]:
m = AutoModelWithLMHead.from_pretrained("../input/transformer-distilation-gpt-2/gpt2_6L")
config = AutoConfig.from_pretrained('../input/transformer-distilation-gpt-2/gpt2_6L')

In [3]:
state_dict = m.state_dict()

In [4]:
string = open('../input/mark-twain-books/MarkTwain_9_clean.txt',encoding='utf8',errors='ignore').read()
new_str = re.sub('�', '', string)
open('Test.txt', 'w').write(new_str)

136454

In [5]:
string = open('/kaggle/input/mark-twain-books/Combine.txt',encoding='utf8',errors='ignore').read()
new_str = re.sub('�', '', string)
open('Train.txt', 'w').write(new_str)

6588596

In [6]:
class MultiHeadAttention(nn.Module):
    def __init__(self,config,device):
        super(MultiHeadAttention,self).__init__()
        self.n_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.head_dim = self.hidden_size//self.n_heads
        self.q = nn.Linear(self.hidden_size,self.hidden_size)
        self.k = nn.Linear(self.hidden_size,self.hidden_size)
        self.v = nn.Linear(self.hidden_size,self.hidden_size)
        self.device = device

        self.fc = nn.Linear(self.hidden_size,self.hidden_size)
        self.dropout = nn.Dropout(0.1)
        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(self.device)

    def forward(self, query, key, value, mask = None):
        batch_size = query.shape[0]
        Q = self.q(query)
        K = self.k(key)
        V = self.v(value)
        # [batch size, query len, hid dim]
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        # [batch size, n heads, query len, head dim]
        score = torch.matmul(Q, K.permute(0, 1, 3, 2)) /self.scale
        if mask is not None:
            score = score.masked_fill(mask == 0, -1e10)
        attention = torch.softmax(score, dim = -1)
        x = torch.matmul(self.dropout(attention), V)
        x = x.permute(0, 2, 1, 3).contiguous()
        
        #x = [batch size, query len, n heads, head dim]
        
        x = x.view(batch_size, -1, self.hidden_size)
        
        #x = [batch size, query len, hid dim]
        
        x = self.fc(x)
        return x, attention
    
class PositionwiseFeedforwardLayer(nn.Module):
    def __init__(self,config):
        super(PositionwiseFeedforwardLayer,self).__init__()
        self.pf_dim = 3072
        self.hid_dim = config.hidden_size
        self.fc_1 = nn.Linear(self.hid_dim, self.pf_dim)
        self.fc_2 = nn.Linear(self.pf_dim, self.hid_dim)
        self.dropout = nn.Dropout(0.1)
        self.activation = nn.GELU()
        
    def forward(self, x):
        
        #x = [batch size, seq len, hid dim]
        
        x = self.dropout(self.activation(self.fc_1(x)))
        
        #x = [batch size, seq len, pf dim]
        
        x = self.fc_2(x)
        
        #x = [batch size, seq len, hid dim]
        
        return x
    
class DecoderLayer(nn.Module):
    def __init__(self,config,device):
        super(DecoderLayer,self).__init__()
        self.self_attn_layer_norm = nn.LayerNorm(config.hidden_size)
        self.enc_attn_layer_norm = nn.LayerNorm(config.hidden_size)
        self.ff_layer_norm = nn.LayerNorm(config.hidden_size)
        self.self_attention = MultiHeadAttention(config, device)
        self.encoder_attention = MultiHeadAttention(config, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(config)
        self.dropout = nn.Dropout(0.1)

    def forward(self, trg,trg_mask):
        
        #trg = [batch size, trg len, hid dim]
        #enc_src = [batch size, src len, hid dim]
        #trg_mask = [batch size, 1, trg len, trg len]
        #src_mask = [batch size, 1, 1, src len]
        
        #self attention
        _trg, _ = self.self_attention(trg, trg, trg, trg_mask)
        
        #dropout, residual connection and layer norm
        trg = self.self_attn_layer_norm(trg + self.dropout(_trg))
            
        #trg = [batch size, trg len, hid dim]
            
        #encoder attention cross attention
    
        
        #dropout, residual connection and layer norm
#         trg = self.enc_attn_layer_norm(trg + self.dropout(_trg))
                    
        #trg = [batch size, trg len, hid dim]
        
        #positionwise feedforward
        _trg = self.positionwise_feedforward(trg)
        
        #dropout, residual and layer norm
        trg = self.ff_layer_norm(trg + self.dropout(_trg))
        
        #trg = [batch size, trg len, hid dim]
        #attention = [batch size, n heads, trg len, src len]
        
        return trg
    
class Decoder(nn.Module):
    def __init__(self,config,device):
        super(Decoder,self).__init__()
        self.device = device
        self.vocab = config.vocab_size
        self.hid_dim = config.hidden_size
        self.max_length = config.max_position_embeddings
        self.tok_embedding =nn.Embedding.from_pretrained(state_dict['transformer.wte.weight'], freeze=False)                    #nn.Embedding(self.vocab, self.hid_dim)
        self.pos_embedding = nn.Embedding.from_pretrained(state_dict['transformer.wpe.weight'], freeze=False)                     #nn.Embedding(self.max_length, self.hid_dim)
        self.n_layers = 6 #config.num_hidden_layers
        self.layers = nn.ModuleList([DecoderLayer(config,device) 
                                     for _ in range(self.n_layers)])
        self.dropout = nn.Dropout(0.1)
        
        self.scale = torch.sqrt(torch.FloatTensor([config.hidden_size])).to(device)
#         self.dropout = nn.Dropout(0.1)
        self.fc_out = nn.Linear(self.hid_dim, self.vocab)
        
    def make_trg_mask(self,trg):
        
        
        #trg_pad_mask = [batch size, 1, 1, trg len]

#         trg_pad_mask = (trg).unsqueeze(1).unsqueeze(2)  #No padding#!= tokenizer.pad_token_id
        
        trg_len = trg.shape[1]
        
        trg_mask = torch.tril(torch.ones((trg_len, trg_len), device = self.device)).bool()
#         print(trg_pad_mask.size(),trg_sub_mask.size())
        #trg_sub_mask = [trg len, trg len]
            
#         trg_mask = trg_pad_mask & trg_sub_mask
        
        #trg_mask = [batch size, 1, trg len, trg len]
        
        return trg_mask
    
    def forward(self, trg):
        
        #trg = [batch size, trg len]
        #enc_src = [batch size, src len, hid dim]
        #trg_mask = [batch size, 1, trg len, trg len]
        #src_mask = [batch size, 1, 1, src len]
                
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        
        pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
                            
        #pos = [batch size, trg len]
            
        trg = self.dropout((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))
        trg_mask = self.make_trg_mask(trg)      
        #trg = [batch size, trg len, hid dim]
        
        for layer in self.layers:
            trg = layer(trg,trg_mask)
        
        #trg = [batch size, trg len, hid dim]
        #attention = [batch size, n heads, trg len, src len]
        
        output = self.fc_out(trg)
        
        #output = [batch size, trg len, output dim]
            
        return output
    
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
model  = Decoder(config,device)


In [8]:
#  = [3,324]
# model(x).size
# torch.Size([3, 324, 50257])

In [9]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")

train_path = 'Train.txt'
test_path = 'Test.txt'

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [10]:
class TextDataset(Dataset):

    def __init__(
        self,
        tokenizer,
        file_path: str,
        block_size: int):
        if os.path.isfile(file_path) is False:
            raise ValueError(f"Input file path {file_path} not found")

        block_size = block_size - tokenizer.num_special_tokens_to_add(pair=False)
        saved = False
        cache_dir = None
        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            cache_dir if cache_dir is not None else directory,
            f"cached_lm_{tokenizer.__class__.__name__}_{block_size}_{filename}",
        )

     
        if os.path.exists(cached_features_file) and saved :
                start = time.time()
                with open(cached_features_file, "rb") as handle:
                    self.examples = pickle.load(handle)
#                 logger.info(
#                     f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start
#                 )

        else:
#                 logger.info(f"Creating features from dataset file at {directory}")

                self.examples = []
                with open(file_path, encoding="utf-8") as f:
                    text = f.read()

                tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

                for i in range(0, len(tokenized_text) - block_size + 1, block_size):  # Truncate in block of block_size
                    self.examples.append(
                        tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size])
                    )
                # Note that we are losing the last truncated example here for the sake of simplicity (no padding)
                # If your dataset is small, first you should look for a bigger one :-) and second you
                # can change this behavior by adding (model specific) padding.

                start = time.time()
                with open(cached_features_file, "wb") as handle:
                    pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
                    saved = True
#                 logger.info(
#                     f"Saving features into cached file {cached_features_file} [took {time.time() - start:.3f} s]"
#                 )

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i) -> torch.Tensor:
        return {"input_ids":torch.tensor(self.examples[i], dtype=torch.long)}

In [11]:
train_loader = torch.utils.data.DataLoader(TextDataset(tokenizer,train_path,128),batch_size=24, shuffle=True, num_workers=2)
val_loader = torch.utils.data.DataLoader(TextDataset(tokenizer,test_path,128),batch_size=24, shuffle=False, num_workers=2)

Token indices sequence length is longer than the specified maximum sequence length for this model (1580900 > 1024). Running this sequence through the model will result in indexing errors


In [16]:
no_decay = ['bias', 'LayerNorm.weight','LayerNorm.bias']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)

In [17]:
EPOCHS = 3
accumulation_steps = 1
num_train_optimization_steps = int(EPOCHS * len(train_loader) / accumulation_steps)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0.05 * num_train_optimization_steps,
                                    num_training_steps=num_train_optimization_steps)

In [18]:
def loss_fn(labels,prediction_scores):
    shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
    labels = labels[:, 1:].contiguous()
    loss_fct = torch.nn.CrossEntropyLoss()
    lm_loss = loss_fct(shifted_prediction_scores.view(-1, config.vocab_size), labels.view(-1))
    return lm_loss

In [19]:
def valid_func(model,valid_loader):
    model.eval()
    bar = tqdm(valid_loader,file=sys.stdout)
#     loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-100)
    losses = []
    with torch.no_grad():
        for batch_idx, (data) in enumerate(bar):
            data =  collate(data)
            x = data["input_ids"].to(device)
            y = data['labels'].to(device)
            pred = model(x)  
            
            loss = loss_fn(y,pred)
            losses.append(loss.item())
           
            bar.set_description(f'loss: {loss.item():.5f}')
   
    loss_valid = np.mean(losses)
    return loss_valid

In [20]:
import sys
best_epoch_loss = np.inf
model.to(device)
for epoch in range(5):
    start_time = time.time()
    avg_loss = 0.0
    model.train()
    tbar = tqdm(train_loader, file=sys.stdout)
    loss_list = []
    val_loss_list = []
    tbar.set_description(f"Epoch {epoch+1}")
    for step, data in enumerate(tbar):
        data =  collate(data)
        x = data["input_ids"].to(device)
        y = data['labels'].to(device)
        optimizer.zero_grad()
        pred = model(x)
        loss = loss_fn(y,pred)
#         loss = loss_fn(pred, targets)
        loss.backward()
        optimizer.step()
        scheduler.step()
        tbar.set_postfix(loss=loss.item())
        sleep(0.1)
    loss_list.append(loss.detach().cpu().item())
    avg_loss = np.round(np.mean(loss_list), 4)
    tbar.set_description(f"Epoch {epoch + 1} Loss: {avg_loss} lr: {scheduler.get_last_lr()}")
    vloss = valid_func(model,val_loader)
#     log_df.loc[len(log_df.index)] = [epoch+1,avg_loss,vloss]
    print(f'Epoch--{epoch+1} ### Train loss---{avg_loss} ### Valid_Loss---{vloss}')
#     if (step%200)==0:
#         print(f'Train_loss={avg_loss}')
    if vloss<best_epoch_loss:
        best_epoch_loss = vloss
        PATH = f"gpt2_epoch__{epoch}.pth"
#         model.save_pretrained(PATH)
        torch.save(model.state_dict(), PATH)
#         print(f'Model Saved--epoch--{epoch+1}')
        
    
del train_loader
del model
del val_loader
gc.collect()

  0%|          | 0/515 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch--1 ### Train loss---6.8496 ### Valid_Loss---7.626103242238362


  0%|          | 0/515 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch--2 ### Train loss---6.4768 ### Valid_Loss---7.299758831659953


  0%|          | 0/515 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch--3 ### Train loss---6.5631 ### Valid_Loss---7.244344870249431


  0%|          | 0/515 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch--4 ### Train loss---6.6371 ### Valid_Loss---7.244344870249431


  0%|          | 0/515 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch--5 ### Train loss---6.3216 ### Valid_Loss---7.244344870249431


321

In [None]:
# !ls ../working/gpt2_epoch__2.pth

In [36]:
model = Decoder(config,device)
model.load_state_dict(torch.load('../working/gpt2_epoch__2.pth'))

<All keys matched successfully>

In [37]:
model.cuda()

Decoder(
  (tok_embedding): Embedding(50257, 768)
  (pos_embedding): Embedding(1024, 768)
  (layers): ModuleList(
    (0-5): 6 x DecoderLayer(
      (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (enc_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (ff_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (self_attention): MultiHeadAttention(
        (q): Linear(in_features=768, out_features=768, bias=True)
        (k): Linear(in_features=768, out_features=768, bias=True)
        (v): Linear(in_features=768, out_features=768, bias=True)
        (fc): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder_attention): MultiHeadAttention(
        (q): Linear(in_features=768, out_features=768, bias=True)
        (k): Linear(in_features=768, out_features=768, bias=True)
        (v): Linear(in_features=768, out_features=768, bias=True)
  

In [48]:
def generate(text, max_new_tokens=512, temperature=1.0, do_sample=True, top_k=5):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.
        """
        idx = tokenizer.encode(text,add_special_tokens=False, return_tensors="pt")
        idx = idx.to(device)
        for _ in range(max_new_tokens):
            # if the sequence context is growing too long we must crop it at block_size
            idx_cond = idx if idx.size(1) <= 128 else idx[:, -128:]
            # forward the model to get the logits for the index in the sequence
            logits = model(idx_cond)
            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, top_k)
                logits[logits < v[:, [-1]]] = -float('Inf')
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # either sample from the distribution or take the most likely element
            if do_sample:
                idx_next = torch.multinomial(probs, num_samples=1)
            else:
                _, idx_next = torch.topk(probs, k=1, dim=-1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)

        return idx

In [49]:
out = generate('this of the test')

In [50]:
# out

tensor([[5661,  286,  262, 1332,   11,  220,  262,  640,   11,  220,  284,  262,
          640,   11,  290,  262,  220,  290,  340,  373,  407,  257,  220,  290,
          339,  373,  257,  286,  340,  373,  407,   11,  290,  339,  373,  257,
          220,  262,  220,  290,  262,  220,  262,  220,  290,  339,  373,  407,
          284,  262,  220,  290,  257, 1310,   13,  314,  373,  257,  220,  257,
          220,  284,  307,  257,  640,   11,  290,  339,  550,  407,  220,  262,
          640,  286,  262,  220,  284,  262,  220,  290,  314,  550,  407,  284,
          262, 1310,  220,  314,  550,  587,  257,  922,   11,  290,  340,   13,
          314,  714,  307,  257,  220,  257,  220,  262,  584,  284,  307,  257,
         1310,   11,  290,  340,  318,  262,  220,  284,  262,  220,  314,  423,
          262,  220,  286,  465,  220,  262,  220,  314,  373,  257, 1310,  220,
          262,  220,  262,  220,  290,  314,  373,  257, 1310,  286,  262,  584,
           11,  290,  340,  

In [51]:
tokenizer.decode(out[0])

'this of the test,  the time,  to the time, and the  and it was not a  and he was a of it was not, and he was a  the  and the  the  and he was not to the  and a little. I was a  a  to be a time, and he had not  the time of the  to the  and I had not to the little  I had been a good, and it. I could be a  a  the other to be a little, and it is the  to the  I have the  of his  the  I was a little  the  the  and I was a little of the other, and it is, and he said to a and it.  to the other  of a time to be in it is a good of his  the little and he would be to be a  The was not  a time. The the time to the  I have not a good to the  and the  the other. The he was to be  of the other.  the a  to be a  the other. He was a little. He was a of the  and it was not to the  of the  of the  to have to be  I have  the a time, but it.  to a  the other of the  I had a good of the  the  the little, the  and he had been a  to the  and the  the the a of it was not the the old. I was in a little.  I have