In [1]:
import gc
import os
import pickle
import re
import sys
import time
import warnings
from logging import Logger as logger
from time import sleep

# Kept ahead of the transformers import, as in the original cell.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
warnings.simplefilter('ignore')

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from tqdm.notebook import tqdm
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import AutoModel, AutoTokenizer, AutoModelWithLMHead
from transformers import TextDataset,DataCollatorForLanguageModeling,AutoConfig

In [2]:
# Load the "gpt2_6L" checkpoint and its config from the Kaggle input directory.
# `m` is only used as a source of pretrained weights (see `state_dict` below);
# `config` supplies the sizes used to build the custom Decoder.
# NOTE(review): AutoModelWithLMHead is reported as deprecated in recent transformers
# releases (AutoModelForCausalLM is the suggested replacement) — confirm against the installed version.
m = AutoModelWithLMHead.from_pretrained("../input/transformer-distilation-gpt-2/gpt2_6L")
config = AutoConfig.from_pretrained('../input/transformer-distilation-gpt-2/gpt2_6L')

In [3]:
# Module-level weight dictionary of the pretrained model. Decoder.__init__ reads
# 'transformer.wte.weight' and 'transformer.wpe.weight' from this global.
state_dict = m.state_dict()

In [4]:
# Build Test.txt: read the held-out book (undecodable bytes are dropped by
# errors='ignore'), strip U+FFFD replacement characters, and write the result.
# Context managers close both handles deterministically, and the output encoding
# is pinned to UTF-8 because TextDataset reads the file back as UTF-8.
with open('../input/mark-twain-books/MarkTwain_9_clean.txt', encoding='utf8', errors='ignore') as src:
    string = src.read()
new_str = re.sub('�', '', string)
with open('Test.txt', 'w', encoding='utf8') as dst:
    dst.write(new_str)
# Display the number of characters written, as the original cell did.
len(new_str)

136454

In [5]:
# Build Train.txt: read the combined corpus (undecodable bytes are dropped by
# errors='ignore'), strip U+FFFD replacement characters, and write the result.
# Context managers close both handles deterministically, and the output encoding
# is pinned to UTF-8 because TextDataset reads the file back as UTF-8.
with open('/kaggle/input/mark-twain-books/Combine.txt', encoding='utf8', errors='ignore') as src:
    string = src.read()
new_str = re.sub('�', '', string)
with open('Train.txt', 'w', encoding='utf8') as dst:
    dst.write(new_str)
# Display the number of characters written, as the original cell did.
len(new_str)

6588596

In [18]:
class MultiHeadAttention(nn.Module):
    """Causal multi-head attention built on ``F.scaled_dot_product_attention``.

    Args:
        config: object exposing ``num_attention_heads`` and ``hidden_size``.
        device: device on which the (legacy) ``scale`` tensor is placed.

    Raises:
        ValueError: if ``hidden_size`` is not divisible by ``num_attention_heads``.
    """

    def __init__(self,config,device):
        super(MultiHeadAttention,self).__init__()
        self.n_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        if self.hidden_size % self.n_heads != 0:
            raise ValueError(
                f"hidden_size ({self.hidden_size}) must be divisible by "
                f"num_attention_heads ({self.n_heads})"
            )
        self.head_dim = self.hidden_size//self.n_heads
        self.q = nn.Linear(self.hidden_size,self.hidden_size)
        self.k = nn.Linear(self.hidden_size,self.hidden_size)
        self.v = nn.Linear(self.hidden_size,self.hidden_size)
        self.device = device

        self.fc = nn.Linear(self.hidden_size,self.hidden_size)
        # Holds the attention-dropout probability; the fused kernel applies it.
        self.dropout = nn.Dropout(0.1)
        # Not used by forward (the fused kernel scales internally); kept so the
        # attribute remains available to existing code.
        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(self.device)

    def _split_heads(self, projected, batch_size):
        """[batch, len, hid dim] -> [batch, n heads, len, head dim]."""
        return projected.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)

    def forward(self, query, key, value, mask = None):
        """Apply causal attention.

        Args:
            query, key, value: tensors of shape [batch size, len, hid dim].
            mask: accepted for interface compatibility and ignored; causality is
                enforced with ``is_causal=True``.

        Returns:
            Tensor of shape [batch size, query len, hid dim].
        """
        batch_size = query.shape[0]
        Q = self._split_heads(self.q(query), batch_size)
        K = self._split_heads(self.k(key), batch_size)
        V = self._split_heads(self.v(value), batch_size)

        # The functional API applies dropout whenever dropout_p > 0, regardless of
        # module mode, so it must be switched off explicitly for evaluation.
        attn_dropout = self.dropout.p if self.training else 0.0
        x = F.scaled_dot_product_attention(Q, K, V, dropout_p=attn_dropout, is_causal=True)

        # [batch, n heads, len, head dim] -> [batch, len, hid dim]
        x = x.permute(0, 2, 1, 3).contiguous().view(batch_size, -1, self.hidden_size)
        return self.fc(x)
    
class PositionwiseFeedforwardLayer(nn.Module):
    """Position-wise MLP: Linear -> GELU -> Dropout -> Linear.

    The inner width is fixed at 3072; the outer width is ``config.hidden_size``.
    """

    def __init__(self,config):
        super().__init__()
        inner_dim, model_dim = 3072, config.hidden_size
        self.pf_dim = inner_dim
        self.hid_dim = model_dim
        self.fc_1 = nn.Linear(model_dim, inner_dim)
        self.fc_2 = nn.Linear(inner_dim, model_dim)
        self.dropout = nn.Dropout(0.1)
        self.activation = nn.GELU()

    def forward(self, x):
        """Map [batch size, seq len, hid dim] to a tensor of the same shape."""
        expanded = self.fc_1(x)                 # [batch size, seq len, pf dim]
        activated = self.activation(expanded)
        regularised = self.dropout(activated)
        return self.fc_2(regularised)           # [batch size, seq len, hid dim]
    
class DecoderLayer(nn.Module):
    """One decoder block: self-attention then feed-forward, each followed by
    dropout, a residual connection and LayerNorm.

    ``enc_attn_layer_norm`` is created but never used in ``forward``; it is still
    a registered submodule, so its parameters are part of the state dict.
    """

    def __init__(self,config,device):
        super().__init__()
        width = config.hidden_size
        self.self_attn_layer_norm = nn.LayerNorm(width)
        self.enc_attn_layer_norm = nn.LayerNorm(width)
        self.ff_layer_norm = nn.LayerNorm(width)
        self.self_attention = MultiHeadAttention(config, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(config)
        self.dropout = nn.Dropout(0.1)

    def forward(self, trg,trg_mask):
        """Transform ``trg`` ([batch size, trg len, hid dim]); shape is preserved.

        ``trg_mask`` is forwarded to the attention module as its ``mask`` argument.
        """
        # Self-attention sub-layer: dropout, residual, layer norm.
        attended = self.self_attention(trg, trg, trg, trg_mask)
        hidden = self.self_attn_layer_norm(trg + self.dropout(attended))

        # Feed-forward sub-layer: dropout, residual, layer norm.
        transformed = self.positionwise_feedforward(hidden)
        return self.ff_layer_norm(hidden + self.dropout(transformed))
    
class Decoder(nn.Module):
    """Decoder-only language model: token + position embeddings, six
    ``DecoderLayer`` blocks and a linear projection to the vocabulary.

    The embeddings are initialised from the notebook-level ``state_dict``
    (keys ``'transformer.wte.weight'`` and ``'transformer.wpe.weight'``), which
    must therefore exist before this class is instantiated.

    Args:
        config: object exposing ``vocab_size``, ``hidden_size``,
            ``max_position_embeddings`` and the fields required by the layers.
        device: device handed to the layers and stored as ``self.device``.
    """

    def __init__(self,config,device):
        super(Decoder,self).__init__()
        self.device = device
        self.vocab = config.vocab_size
        self.hid_dim = config.hidden_size
        self.max_length = config.max_position_embeddings
        # clone() so that training this model does not write into the tensors
        # held by the global ``state_dict``.
        self.tok_embedding = nn.Embedding.from_pretrained(
            state_dict['transformer.wte.weight'].clone(), freeze=False)
        self.pos_embedding = nn.Embedding.from_pretrained(
            state_dict['transformer.wpe.weight'].clone(), freeze=False)
        self.n_layers = 6 #config.num_hidden_layers
        self.layers = nn.ModuleList([DecoderLayer(config,device)
                                     for _ in range(self.n_layers)])
        self.dropout = nn.Dropout(0.1)

        # Non-persistent buffer: follows .to()/.cuda() with the module but is not
        # written to the state dict, so existing checkpoints still load strictly.
        self.register_buffer(
            'scale', torch.sqrt(torch.FloatTensor([config.hidden_size])), persistent=False)
        self.fc_out = nn.Linear(self.hid_dim, self.vocab)

    def make_trg_mask(self,trg):
        """Return a lower-triangular boolean mask of shape [trg len, trg len],
        created on the same device as ``trg``."""
        trg_len = trg.shape[1]
        return torch.tril(torch.ones((trg_len, trg_len), device = trg.device)).bool()

    def forward(self, trg):
        """Compute next-token logits.

        Args:
            trg: token ids of shape [batch size, trg len].

        Returns:
            Logits of shape [batch size, trg len, vocab size].

        Raises:
            ValueError: if ``trg len`` exceeds the number of position embeddings.
        """
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        if trg_len > self.pos_embedding.num_embeddings:
            raise ValueError(
                f"sequence length {trg_len} exceeds the "
                f"{self.pos_embedding.num_embeddings} available position embeddings"
            )

        # Position ids are created directly on the input's device.
        pos = torch.arange(0, trg_len, device=trg.device).unsqueeze(0).repeat(batch_size, 1)

        # Scaled token embeddings plus position embeddings -> [batch, trg len, hid dim]
        trg = self.dropout((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))
        trg_mask = self.make_trg_mask(trg)

        for layer in self.layers:
            trg = layer(trg,trg_mask)

        # [batch size, trg len, vocab size]
        return self.fc_out(trg)
    
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [19]:
# Instantiate the custom decoder; it is moved to `device` in the training cell.
model  = Decoder(config,device)


In [20]:
# Shape sanity check: for token ids x of shape [3, 324],
# model(x).size() gives
# torch.Size([3, 324, 50257])

In [21]:
# GPT-2 tokenizer used for building the datasets and for generation.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Cleaned corpora written by the earlier cells.
train_path = 'Train.txt'
test_path = 'Test.txt'

In [22]:
class TextDataset(Dataset):
    """Fixed-length language-modelling examples cut from one text file.

    The file is tokenised as a whole and split into consecutive, non-overlapping
    blocks of ``block_size`` tokens (minus the tokenizer's special tokens). The
    trailing partial block is dropped, so no padding is needed.

    Tokenised blocks are cached with pickle next to the source file. The cache is
    reused only when it is at least as new as the source file and
    ``overwrite_cache`` is False.

    Note: this class shadows the ``TextDataset`` imported from ``transformers``.

    Args:
        tokenizer: object providing ``num_special_tokens_to_add``, ``tokenize``,
            ``convert_tokens_to_ids`` and ``build_inputs_with_special_tokens``.
        file_path: path of the UTF-8 text file.
        block_size: number of tokens per example, special tokens included.
        overwrite_cache: when True, always re-tokenise and rewrite the cache.

    Raises:
        ValueError: if ``file_path`` is not a file, or if ``block_size`` leaves no
            room for content tokens.
    """

    def __init__(
        self,
        tokenizer,
        file_path: str,
        block_size: int,
        overwrite_cache: bool = False):
        if not os.path.isfile(file_path):
            raise ValueError(f"Input file path {file_path} not found")

        block_size = block_size - tokenizer.num_special_tokens_to_add(pair=False)
        if block_size <= 0:
            raise ValueError("block_size must exceed the number of special tokens added by the tokenizer")

        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            directory,
            f"cached_lm_{tokenizer.__class__.__name__}_{block_size}_{filename}",
        )

        if not overwrite_cache and self._cache_is_fresh(cached_features_file, file_path):
            # SECURITY: pickle.load can execute arbitrary code. This is acceptable
            # only because the cache is produced by this class; never point it at
            # an untrusted directory.
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
            return

        with open(file_path, encoding="utf-8") as f:
            text = f.read()

        tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

        # Non-overlapping full blocks only; the last partial block is discarded.
        self.examples = [
            tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size])
            for i in range(0, len(tokenized_text) - block_size + 1, block_size)
        ]

        with open(cached_features_file, "wb") as handle:
            pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    @staticmethod
    def _cache_is_fresh(cache_path, source_path):
        """True when the cache exists and is not older than the source file."""
        return (
            os.path.exists(cache_path)
            and os.path.getmtime(cache_path) >= os.path.getmtime(source_path)
        )

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i) -> dict:
        """Return ``{"input_ids": LongTensor}`` for example ``i``."""
        return {"input_ids":torch.tensor(self.examples[i], dtype=torch.long)}

In [23]:
def collate(batch):
    """Add a ``labels`` entry to an already-batched dict and return that dict.

    ``labels`` is a copy of ``batch["input_ids"]``; when the global tokenizer
    defines a pad token id, those positions are set to -100. ``batch`` is
    modified in place.
    """
    targets = batch["input_ids"].clone()
    pad_id = tokenizer.pad_token_id
    if pad_id is not None:
        targets[targets == pad_id] = -100
    batch["labels"] = targets
    return batch

In [24]:
# Both loaders use 128-token blocks and batches of 24 examples with two worker
# processes; only the training loader shuffles.
train_loader = torch.utils.data.DataLoader(TextDataset(tokenizer,train_path,128),batch_size=24, shuffle=True, num_workers=2)
val_loader = torch.utils.data.DataLoader(TextDataset(tokenizer,test_path,128),batch_size=24, shuffle=False, num_workers=2)

Token indices sequence length is longer than the specified maximum sequence length for this model (1580900 > 1024). Running this sequence through the model will result in indexing errors


In [25]:
# Parameters whose names contain one of these substrings are excluded from weight
# decay. This model names its norms `*_layer_norm`, so the HuggingFace-style
# 'LayerNorm.*' patterns alone never matched and the LayerNorm weights were being
# decayed; 'layer_norm.weight' covers them ('bias' already covers every bias).
no_decay = ['bias', 'layer_norm.weight', 'LayerNorm.weight', 'LayerNorm.bias']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps=1e-6 keeps
# the epsilon that the transformers implementation used by default.
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=3e-4, eps=1e-6)

In [26]:
EPOCHS = 10
# Declared for gradient accumulation; the training loop steps the optimizer on
# every batch, which matches a value of 1.
accumulation_steps = 1
# Total number of optimizer/scheduler steps over the whole run.
num_train_optimization_steps = int(EPOCHS * len(train_loader) / accumulation_steps)
# Warm-up covers the first 5% of the steps.
# NOTE(review): num_warmup_steps is passed as a float here — confirm that the
# installed transformers version accepts a non-integer value.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0.05 * num_train_optimization_steps,
                                    num_training_steps=num_train_optimization_steps)

In [27]:
def loss_fn(labels,prediction_scores):
    """Next-token cross-entropy for a causal language model.

    The logits at position ``t`` are scored against the label at ``t + 1``;
    labels equal to -100 are ignored. The vocabulary size is taken from the
    logits themselves, so the function does not depend on any notebook global.

    Args:
        labels: integer tensor of shape [batch size, seq len].
        prediction_scores: logits of shape [batch size, seq len, vocab size].

    Returns:
        Scalar tensor: mean loss over the non-ignored target positions.
    """
    vocab_size = prediction_scores.size(-1)
    # Drop the last prediction (it has no target) and the first label (it has no
    # preceding context).
    shifted_prediction_scores = prediction_scores[:, :-1, :].reshape(-1, vocab_size)
    shifted_labels = labels[:, 1:].reshape(-1)
    return F.cross_entropy(shifted_prediction_scores, shifted_labels, ignore_index=-100)

In [28]:
def valid_func(model,valid_loader):
    """Return the mean validation loss, rounded to four decimals.

    The model is switched to eval mode and every batch of ``valid_loader`` is
    scored with ``loss_fn`` under ``torch.no_grad()``. The progress bar shows the
    loss of the current batch.
    """
    model.eval()
    progress = tqdm(valid_loader,file=sys.stdout)
    batch_losses = []
    with torch.no_grad():
        for batch in progress:
            batch = collate(batch)
            inputs = batch["input_ids"].to(device)
            targets = batch['labels'].to(device)
            batch_loss = loss_fn(targets, model(inputs))
            batch_losses.append(batch_loss.item())
            progress.set_description(f'loss: {batch_loss.item():.5f}')
    return np.round(np.mean(batch_losses),4)

In [29]:
# Training loop: one pass over train_loader per epoch, with an optimizer and
# scheduler step on every batch. A checkpoint of the weights is written after
# every epoch as gpt2_epoch__<0-based epoch>.pth. Validation is not run here.
best_epoch_loss = np.inf  # placeholder for best-checkpoint selection; not updated below
model.to(device)
for epoch in range(EPOCHS):
    model.train()
    tbar = tqdm(train_loader, file=sys.stdout)
    tbar.set_description(f"Epoch {epoch+1}")
    loss_list = []
    for step, data in enumerate(tbar):
        data = collate(data)
        x = data["input_ids"].to(device)
        y = data['labels'].to(device)
        optimizer.zero_grad()
        pred = model(x)
        loss = loss_fn(y,pred)
        loss.backward()
        optimizer.step()
        scheduler.step()
        # Read the scalar once; each .item() call copies the value back to the host.
        step_loss = loss.item()
        tbar.set_postfix(loss=step_loss)
        loss_list.append(step_loss)
    avg_loss = np.round(np.mean(loss_list), 4)
    print(f'Epoch--{epoch+1} ### Train loss---{avg_loss}')
    PATH = f"gpt2_epoch__{epoch}.pth"
    torch.save(model.state_dict(), PATH)

# Drop the references to the loaders and the trained model, then collect garbage.
del train_loader
del model
del val_loader
gc.collect()

  0%|          | 0/515 [00:00<?, ?it/s]

Epoch--1 ### Train loss---6.2484


  0%|          | 0/515 [00:00<?, ?it/s]

Epoch--2 ### Train loss---4.8753


  0%|          | 0/515 [00:00<?, ?it/s]

Epoch--3 ### Train loss---4.4331


  0%|          | 0/515 [00:00<?, ?it/s]

Epoch--4 ### Train loss---4.0991


  0%|          | 0/515 [00:00<?, ?it/s]

Epoch--5 ### Train loss---3.8031


  0%|          | 0/515 [00:00<?, ?it/s]

Epoch--6 ### Train loss---3.5282


  0%|          | 0/515 [00:00<?, ?it/s]

Epoch--7 ### Train loss---3.2725


  0%|          | 0/515 [00:00<?, ?it/s]

Epoch--8 ### Train loss---3.044


  0%|          | 0/515 [00:00<?, ?it/s]

Epoch--9 ### Train loss---2.8537


  0%|          | 0/515 [00:00<?, ?it/s]

Epoch--10 ### Train loss---2.7171


12855

In [30]:
# !ls ../working/gpt2_epoch__2.pth

In [31]:
# Rebuild the architecture and load the weights saved after the last epoch.
# map_location makes the checkpoint loadable on a machine without the device it
# was saved from (e.g. a GPU checkpoint on a CPU-only host).
model = Decoder(config,device)
model.load_state_dict(torch.load('../working/gpt2_epoch__9.pth', map_location=device))

<All keys matched successfully>

In [32]:
# Move the reloaded model to the selected device (works without a GPU, unlike
# .cuda()) and switch it to eval mode so dropout is disabled for generation.
model.to(device).eval()

Decoder(
  (tok_embedding): Embedding(50257, 768)
  (pos_embedding): Embedding(1024, 768)
  (layers): ModuleList(
    (0-5): 6 x DecoderLayer(
      (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (enc_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (ff_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (self_attention): MultiHeadAttention(
        (q): Linear(in_features=768, out_features=768, bias=True)
        (k): Linear(in_features=768, out_features=768, bias=True)
        (v): Linear(in_features=768, out_features=768, bias=True)
        (fc): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (positionwise_feedforward): PositionwiseFeedforwardLayer(
        (fc_1): Linear(in_features=768, out_features=3072, bias=True)
        (fc_2): Linear(in_features=3072, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=F

In [33]:
def generate(text, max_new_tokens=512, temperature=1.0, do_sample=True, top_k=10, block_size=128):
    """Continue ``text`` with the global ``model`` and return the token ids.

    The prompt is encoded with the global ``tokenizer``; one token is appended
    per step, and the model only ever sees the last ``block_size`` tokens.
    Generation runs in eval mode and without gradient tracking; the model's
    previous train/eval mode is restored afterwards.

    Args:
        text: prompt string; must encode to at least one token.
        max_new_tokens: number of tokens to append.
        temperature: positive divisor applied to the final-step logits.
        do_sample: sample from the distribution when True, otherwise take the
            most likely token.
        top_k: keep only the ``top_k`` largest logits (``None`` disables this).
        block_size: maximum context length fed to the model (defaults to the
            128-token block size used for the training data).

    Returns:
        LongTensor of shape (1, prompt length + max_new_tokens).

    Raises:
        ValueError: if ``temperature`` is not positive or the prompt is empty.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    idx = tokenizer.encode(text, add_special_tokens=False, return_tensors="pt").to(device)
    if idx.size(1) == 0:
        raise ValueError("text must encode to at least one token")

    was_training = model.training
    model.eval()  # disable dropout while sampling
    try:
        with torch.no_grad():
            for _ in range(max_new_tokens):
                # crop the running context to the last block_size tokens
                idx_cond = idx if idx.size(1) <= block_size else idx[:, -block_size:]
                # logits of the final position, scaled by the temperature
                logits = model(idx_cond)[:, -1, :] / temperature
                if top_k is not None:
                    # never ask for more candidates than the vocabulary holds
                    k = min(top_k, logits.size(-1))
                    kth_best, _unused = torch.topk(logits, k)
                    logits[logits < kth_best[:, [-1]]] = -float('Inf')
                probs = F.softmax(logits, dim=-1)
                if do_sample:
                    idx_next = torch.multinomial(probs, num_samples=1)
                else:
                    _unused, idx_next = torch.topk(probs, k=1, dim=-1)
                idx = torch.cat((idx, idx_next), dim=1)
    finally:
        model.train(was_training)

    return idx

In [34]:
# Sample a continuation of a short prompt with the default generation settings.
out = generate('this of the test')

In [35]:
# out

In [36]:
# Decode the first (only) generated sequence back to text.
tokenizer.decode(out[0])

"this of the test that can be a  noble soul, but you can do it.It shall be hard.I think it  of a man to make the most men that do the most noble and wise  admirers.The man who could not live to be hanged, said to them:  I have seen a good idea that they were a very person, and  you should say that that the king would be an ass who would be his  best, and so we might do it.I will say that a few moments  I have seen him, and then I will not tell you what he did.  The gentleman and his sons were full of self-complacent and gentle folk,  and were his work.He was very well, of course, and I mean, but  that they couldn't stand that man; and he couldn't seem as to have  been a person's.He looked up in his mind and said, but said,  I didn't seem to get him around him at all--I thought  I was ashamed of him to have him a given him, and asked him if he  would not have him.But that he couldn't have the trouble with us,  but he didn't quite what we were about.  He didn't.  I knew the matter what h

In [37]:
# Generate from a second prompt and show the decoded text.
out = generate('long before that I was')
tokenizer.decode(out[0])

'long before that I was  surprised to see a man, I had just as fresh a cat in a tree on the  floor of the ground that had been asleep, there was a man whose head  thawed his legs and his back. Then I said:  Now you know the marks of the first time you know it, and then Ive  got a been in my mind, Tom Sawyer.  Yes, he couldnt seem to tell about it now.  Well, that aint what to say, anyway. Well, then, when you was over, and  _you_ know a body, aint you ever so? Thats the way you  dont know. And I know about it a person it, I aint going to do  any of a robber. But its awful.  The professor was gone out of the place and held no longer, but that  didnt make no use. He was in the best, too, and I reckoned it would be a  bad little while, and you see it was just a little brick.  It was a mighty small boy, you know, and you know that if youve got to know the  thing you aint _do_?  You just waitIll tell me and see you do it, Tom said theres another thing youll be,  and that _you_ do, and you c

In [38]:
# Generate from a third prompt and show the decoded text.
out = generate('Well, sir, you could')
tokenizer.decode(out[0])

"Well, sir, you could have thought  that this miracle is so; that's the only one thing that does not know just  what.It has been done a great many months since then, and  I'd like to know that, if you were, or what it can help you,  or you must it I think, or maybe; that's to give you the right one  and take the trouble.It's a good time to be used at a time like that for  other.  Now, what's a general thing?The king and a magician's.He must  get his money and give up the chance, and so, I could make  it out, too.I said:  There was nothing wrong about it to be so much to you.  The reason why should it be done in a business, that there's.  I was a good deal confused.I mean, now, you know, what you do.  Well, now, I had to make out this thing with what is the  method of a magician who is a common way; I know it may be  not; yet I mean; if there was an error in my hand I could have been  to keep it up _that_ I would.  Well, I didn't.I mean to say it; it was just a fact; that that  _can_ wo

In [39]:
# Generate from a fourth prompt and show the decoded text.
out = generate('uncle into a thousand times')
tokenizer.decode(out[0])

"uncle into a thousand times more  --and I am satisfied again now--I mean to see you--and I do, don't like  any harm.You've got to go into a nice old age-turtle.  That's the very good. I don't quite think much of anything.  The old man, you're full of a hair--n't a hair off my ears, and a  minute's head's tail, and a little--he'll fetch it to the man's  and the old man, and he'll get it out on you till it goes  on his back and get the way.Well, it does seem so?  But I said the king.  That's a lie.  Well, I don't know, I didn't quite know.  Well, then, the man who can't go to the court that man that's  a-laughing and just at a time in time, and he's all at once  three days at a little time when that is finished.Now  I'll tell him what you've got to give you something to  give to give you a chance to do with that thing for?  You don't remember anything I've got a dirk, and he'll get out  and follow it; you'll get him the way you want to make him pay  it by one-dishing, or he can't.  You 