In [None]:
!pip install transformers



In [None]:
from transformers import BertTokenizer, BertModel
from math import ceil, floor
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook

import pandas as pd
import numpy as np
import torch as tt
from torch import nn

In [None]:
X = tt.randn(20,32,100)

In [None]:
print(X.device)

cpu


In [None]:
tt.optim.lr_scheduler.ReduceLROnPlateau

torch.optim.lr_scheduler.ReduceLROnPlateau

In [None]:
# Pretrained cased BERT tokenizer and encoder (fetched by name via from_pretrained).
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertModel.from_pretrained('bert-base-cased')
# Fall back to CPU so the notebook still runs on a machine without a GPU.
device = tt.device('cuda' if tt.cuda.is_available() else 'cpu')

In [None]:
class Embedder:
  """Turns token-id batches into detached embeddings using a wrapped model.

  The wrapped `model` is called with the token ids and the first element of
  its result is used. That element must be a rank-3 tensor, because its first
  two axes are swapped before it is returned.
  """

  def __init__(self, model, device):
    self.embedder = model
    self.device = device

  def _embed(self, token_ids):
    """Run the model, detach the result, move it to `self.device` and swap axes 0 and 1."""
    hidden = self.embedder(token_ids)[0]
    on_device = hidden.detach().to(self.device)
    # presumably (batch, seq, dim) -> (seq, batch, dim); confirm against the model used
    return on_device.permute(1, 0, 2)

  def embed_src(self, src):
    """Embed source token ids."""
    return self._embed(src)

  def embed_tgt(self, tgt):
    """Embed target token ids, with the first position overwritten by zeros."""
    embedded = self._embed(tgt)
    ## set start token vector to zero - for fair benchmarking
    embedded[0] = 0.0
    return embedded
  


class MyLinearModel(nn.Module):
  """Placeholder: no behaviour is defined yet beyond what nn.Module provides."""
  pass

class MyRNN(nn.Module):
  """Placeholder: no behaviour is defined yet beyond what nn.Module provides."""
  pass

class MySeq2SeqRNN(nn.Module):
  """Placeholder: no behaviour is defined yet beyond what nn.Module provides."""
  pass

class MySeq2SeqRNNWithAttn(nn.Module):
  """Placeholder: no behaviour is defined yet beyond what nn.Module provides."""
  pass

# Module-level embedder built from the BERT `model` and `device` defined earlier.
# It is also the default value of MyTransformer's `embedder_model` argument.
embedder = Embedder(model, device)

class MyTransformer(nn.Module):
  """Transformer encoder-decoder running on top of embeddings from an Embedder.

  Token ids are embedded by `embedder_model` inside `no_grad`, so only the
  encoder, decoder and output projection defined here receive gradients.

  Args:
    n_vocab: size of the output vocabulary (width of the logits).
    n_encoder_layers / n_decoder_layers: number of stacked layers.
    embedder_model: object exposing `embed_src(ids)` and `embed_tgt(ids)`.
    embed_dim: embedding width; must match what the embedder produces.
    n_encoder_head / n_decoder_head: attention heads per layer.
    device: device the trainable layers are moved to.
    sos_id: token id used to start generation in `forward_test`.

  NOTE(review): no padding masks are passed to the encoder or decoder, and
  target embeddings come from the embedder as-is. If the embedder is
  bidirectional, each target embedding may already contain information about
  later tokens, which the causal mask added here cannot remove - confirm.
  """

  def __init__(self, n_vocab: int, n_encoder_layers=1, n_decoder_layers=6,
               embedder_model=embedder, embed_dim=768, n_encoder_head=8, n_decoder_head=8,
               device=tt.device('cuda'), sos_id=101):
    super().__init__()
    # Honour the argument; previously the module-level `embedder` was always used.
    self.embedder = embedder_model
    encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=n_encoder_head)
    self.encoder = nn.TransformerEncoder(encoder_layer, n_encoder_layers)
    decoder_layer = nn.TransformerDecoderLayer(d_model=embed_dim, nhead=n_decoder_head)
    self.decoder = nn.TransformerDecoder(decoder_layer, n_decoder_layers)
    self.logits = nn.Linear(embed_dim, n_vocab)
    self.device = device
    self.embed_dim = embed_dim
    self.sos_id = sos_id
    self.to(device)

  @staticmethod
  def _causal_mask(embedded):
    """Additive attention mask: 0 on/below the diagonal, -inf above it.

    Sized from the first axis of `embedded` and created on its device, so
    position i can only attend to positions <= i.
    """
    length = embedded.size(0)
    blocked = tt.full((length, length), float('-inf'), device=embedded.device)
    return tt.triu(blocked, diagonal=1)

  def encode(self, batch):
    """Embed source token ids (without gradients) and run the encoder stack."""
    with tt.no_grad():
      embed_batch = self.embedder.embed_src(batch)
    return self.encoder(embed_batch)

  def decode(self, src, tgt):
    """Teacher-forced decoding: return logits for every target position but the first.

    `src` is the encoder output and `tgt` the target token ids. The last
    embedded position is dropped so that output i is scored against token i+1.
    """
    with tt.no_grad():
      embed_tgt = self.embedder.embed_tgt(tgt)
    embed_tgt = embed_tgt[:-1]
    # Without this mask, position i could attend to position i+1 - the very
    # token it is trained to predict.
    decoded = self.decoder(embed_tgt, src, tgt_mask=self._causal_mask(embed_tgt))
    return self.logits(decoded)

  def decode_test(self, src, tgt):
    """Decode the whole prefix `tgt` (nothing dropped), with the same causal mask as `decode`."""
    with tt.no_grad():
      embed_tgt = self.embedder.embed_tgt(tgt)
    decoded = self.decoder(embed_tgt, src, tgt_mask=self._causal_mask(embed_tgt))
    return self.logits(decoded)

  def forward(self, src, tgt):
    """Encode `src` and return teacher-forced logits for `tgt`."""
    inp = self.encode(src)
    outp = self.decode(inp, tgt)
    return outp

  def infer(self, batch, method='argmax'):
    """Pick token ids from logits by taking the argmax over axis 2.

    Raises:
      ValueError: for any `method` other than 'argmax' (this used to return
        None silently and fail later in the caller).
    """
    if method == 'argmax':
      return batch.argmax(2)
    raise ValueError('Unsupported inference method: %r' % (method,))

  def forward_test(self, src, max_len):
    """Greedy generation of `max_len - 1` tokens, starting from `sos_id`.

    Returns the logits of the final decoding step, which has one row per
    generated position (`max_len - 1` rows along axis 0).

    Raises:
      ValueError: if `max_len < 2`, since no decoding step would run.
    """
    if max_len < 2:
      raise ValueError('max_len must be at least 2, got %r' % (max_len,))
    # Keep generated ids on the same device as the source ids (the original
    # always used CPU, which is what this reduces to for CPU inputs).
    generated = tt.full((src.size(0), 1), self.sos_id, dtype=tt.long, device=src.device)
    inp = self.encode(src)
    for _ in range(max_len - 1):
      outp = self.decode_test(inp, generated)
      # infer -> (seq, batch); keep the last step and make it (batch, 1)
      new_prediction = self.infer(outp)[-1:].permute(1, 0).to(src.device)
      generated = tt.cat((generated, new_prediction), dim=1)
    return outp

In [None]:
tt.randn(size=(1,20,128))

tensor([[[-1.0907, -0.4004,  2.0348,  ..., -1.7408, -1.1427, -1.3007],
         [-1.3680, -1.0893,  0.6311,  ..., -0.3272, -0.0067, -0.9456],
         [-0.7941,  0.5127, -0.4766,  ..., -0.2680,  0.0287, -0.5691],
         ...,
         [-0.4428,  0.1501, -0.1062,  ..., -0.3498,  1.2442, -0.2496],
         [ 0.4898, -0.7990,  0.2871,  ..., -0.1896,  1.0046, -1.5094],
         [-0.1014,  0.3693, -1.0493,  ...,  1.8612, -0.2678, -0.6491]]])

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
os.chdir('drive/My Drive/FunnyHeadlineProject')

In [None]:
df = pd.read_csv("NewsPunchlinesLateNight.csv", index_col='Unnamed: 0')

In [None]:
df.head()

Unnamed: 0,Newsline,Punchline
0,"According to a new report, young Republican st...","According to some Republican staffers, they ar..."
1,President Trump is headed home after his trip ...,"And I saw that at one point, 2,000 protesters ..."
2,Scientists have developed a robot that can con...,"When the scientists unveiled the robot, it scr..."
3,"According to a new poll, one third of American...",The other two thirds have cats.
4,"My policy is, you should treat selfies like yo...","Try not to do it alone, definitely don't do it..."


In [None]:
df.iloc[:1]

Unnamed: 0,Newsline,Punchline
0,"According to a new report, young Republican st...","According to some Republican staffers, they ar..."


In [None]:
df.iloc[:10]

Unnamed: 0,Newsline,Punchline
0,"According to a new report, young Republican st...","According to some Republican staffers, they ar..."
1,President Trump is headed home after his trip ...,"And I saw that at one point, 2,000 protesters ..."
2,Scientists have developed a robot that can con...,"When the scientists unveiled the robot, it scr..."
3,"According to a new poll, one third of American...",The other two thirds have cats.
4,"My policy is, you should treat selfies like yo...","Try not to do it alone, definitely don't do it..."
5,It’s come out that Education Secretary Betsy D...,"Or as Betsy DeVos calls it, “75 percent of them.”"
6,"In recent years, the Tour de France has been t...","And by ""rumors of drug use"" I mean more drugs ..."
7,Happy Presidents Day.,This is a day when we celebrate history by get...
8,A militia group that is protesting the U.S. go...,Of course you can understand why they’re angry...
9,The Supreme Court ruled that a baker in Colora...,"Ladies and gentlemen, in my opinion, if there’..."


In [None]:
df.shape

(17545, 2)

In [None]:
class MyBatch:
  """Exposes each column of a DataFrame slice as an attribute holding that column's `.values`."""

  def __init__(self, df):
    column_values = {name: df[name].values for name in df.columns}
    for name, values in column_values.items():
      setattr(self, name, values)

class MyBatchIterator:
  """Walks a DataFrame in consecutive row slices of `batch_size`, yielding MyBatch objects.

  The object is its own iterator: `iter()` rewinds it to the first row, so it
  can be looped over once per epoch. `len()` is the number of batches per pass.
  """

  def __init__(self, df, batch_size):
    if batch_size <= 0:
      raise ValueError('batch_size must be positive, got %r' % (batch_size,))
    self.df = df
    self.batch_size = batch_size
    self.start = 0

  def __iter__(self):
    self.start = 0
    return self

  def __next__(self):
    if self.start >= self.df.shape[0]:
      raise StopIteration
    stop = self.start + self.batch_size
    batch = MyBatch(self.df.iloc[self.start:stop])
    # Advance the cursor; without this the first batch was returned forever.
    self.start = stop
    return batch

  def __len__(self):
    return ceil(self.df.shape[0]/self.batch_size)

In [None]:
def tokenize_batch(batch, tokenizer=tokenizer):
  """Tokenize a batch's `Newsline` and `Punchline` columns into padded id tensors.

  Each column is converted to a list and passed to `tokenizer` with padding
  and truncation enabled (at most 128 tokens). Returns the two `input_ids`
  tensors as `(newsline_ids, punchline_ids)`.
  """
  encode_options = dict(return_tensors='pt', padding=True, truncation=True, max_length=128)
  newsline_ids, punchline_ids = (
      tokenizer(texts.tolist(), **encode_options)['input_ids']
      for texts in (batch.Newsline, batch.Punchline)
  )
  return newsline_ids, punchline_ids

In [None]:
def embed_tokens(token_tensor, model=model):
  """Return the first output of `model` for `token_tensor`, detached from autograd."""
  # The result is detached anyway, so skip recording the autograd graph during
  # the forward pass instead of building it and throwing it away.
  with tt.no_grad():
    outp = model(token_tensor)[0]
  return outp.detach()

In [None]:
batch_size = 128

# Fixed seed so the train/test partition is the same after a kernel restart.
# The frames are named *_df because a function called `train` is defined
# further down and would otherwise shadow the training DataFrame.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_iter = MyBatchIterator(train_df, batch_size)
test_iter = MyBatchIterator(test_df, batch_size)

In [None]:
def fake_train(train_iter, n_epochs):
  """Dry run of the data pipeline: tokenize and embed every batch, discarding the results.

  Loops over `train_iter` `n_epochs` times with a progress bar; nothing is
  trained and nothing is returned.
  """
  for _ in range(n_epochs):
    progress = tqdm_notebook(train_iter, total=len(train_iter))
    for batch in progress:
      src_ids, tgt_ids = tokenize_batch(batch)
      src_embedded = embed_tokens(src_ids)
      tgt_embedded = embed_tokens(tgt_ids)

In [None]:
float('inf')

inf

In [None]:
help(tt.save)

Help on function save in module torch.serialization:

save(obj, f:Union[str, os.PathLike, BinaryIO], pickle_module=<module 'pickle' from '/usr/lib/python3.6/pickle.py'>, pickle_protocol=2, _use_new_zipfile_serialization=True) -> None
    Saves an object to a disk file.
    
    See also: `saving-loading-tensors`
    
    Args:
        obj: saved object
        f: a file-like object (has to implement write and flush) or a string or
           os.PathLike object containing a file name
        pickle_module: module used for pickling metadata and objects
        pickle_protocol: can be specified to override the default protocol
    
    .. note::
        A common PyTorch convention is to save tensors using .pt file extension.
    
    .. note::
        PyTorch preserves storage sharing across serialization. See
        `preserve-storage-sharing` for more details.
    
    .. note::
        The 1.6 release of PyTorch switched ``torch.save`` to use a new
        zipfile-based file format. ``

In [None]:
def train_epoch(train_iter, model, criterion, optimizer, device, epoch):
  """Run one optimisation pass over `train_iter`.

  Args:
    train_iter: sized iterable of batches accepted by `tokenize_batch`.
    model: called as `model(src, tgt)`; must return rank-3 logits.
    criterion: loss called as `criterion(pred, ground_truth)`.
    optimizer: optimizer over the model's parameters.
    device: device the flattened target ids are moved to.
    epoch: zero-based epoch index, used only for the progress-bar label.

  Returns:
    The mean training loss over the batches of this epoch (0 if there were none).
  """
  # Local import: tqdm.tqdm_notebook is deprecated in favour of tqdm.notebook.tqdm.
  from tqdm.notebook import tqdm

  model.train()
  batch_iter = tqdm(enumerate(train_iter), total=len(train_iter),
                    desc='Training epoch %d' % (epoch + 1), leave=True)
  running_loss = 0
  for i, batch in batch_iter:
    optimizer.zero_grad()
    src, tgt = tokenize_batch(batch)
    pred = model(src, tgt)
    ## From (seq_len, batch_size, vocab_len) to (seq_len * batch_size, vocab_len)
    pred = pred.reshape(-1, pred.size(2))
    ## From (batch_size, seq_len) to (seq_len * batch_size), dropping the first token
    ground_truth = tgt.permute(1, 0)[1:].reshape(-1).to(device)
    loss = criterion(pred, ground_truth)
    loss.backward()
    optimizer.step()

    curr_loss = loss.item()
    # Incremental mean: after batch i, running_loss is the average of batches 0..i.
    loss_smoothing = i / (i + 1)
    running_loss = loss_smoothing * running_loss + (1 - loss_smoothing) * curr_loss
    batch_iter.set_postfix(loss='%.4f' % running_loss)
  return running_loss


def test_epoch(test_iter, model, criterion, device):
  """Evaluate `model` on `test_iter` using `model.forward_test`.

  For each batch, `forward_test` is asked for as many positions as the
  tokenized target has columns, and the loss is computed against the target
  ids with the first token dropped.

  Returns:
    The unweighted mean of the per-batch losses.
  """
  # Local import: tqdm.tqdm_notebook is deprecated in favour of tqdm.notebook.tqdm.
  from tqdm.notebook import tqdm

  model.eval()
  batch_iter = tqdm(enumerate(test_iter), total=len(test_iter),
                    desc='Testing', leave=True)
  test_loss = []
  with tt.no_grad():
    for i, batch in batch_iter:
      src, tgt = tokenize_batch(batch)
      max_len = tgt.size(1)
      pred = model.forward_test(src, max_len)
      ## From (seq_len, batch_size, vocab_len) to (seq_len * batch_size, vocab_len)
      pred = pred.reshape(-1, pred.size(2))
      ## From (batch_size, seq_len) to (seq_len * batch_size), dropping the first token
      ground_truth = tgt.permute(1, 0)[1:].reshape(-1).to(device)
      test_loss.append(criterion(pred, ground_truth).item())
  return np.mean(test_loss)
    
def train(model, n_epochs, train_iter, test_iter, criterion, optimizer, device, scheduler=None):
  """Train for `n_epochs`, validating after each epoch and checkpointing on improvement.

  After every epoch the validation loss is printed and, if a scheduler is
  given, it is stepped (ReduceLROnPlateau receives the validation loss, any
  other scheduler is stepped without arguments). Whenever the validation loss
  is lower than every previous one, the model's state dict is written to the
  file 'my_model' in the current working directory.
  """
  best_test_loss = float('inf')
  for epoch in range(n_epochs):
    train_epoch(train_iter, model, criterion, optimizer, device, epoch)
    test_loss = test_epoch(test_iter, model, criterion, device)
    if isinstance(scheduler, tt.optim.lr_scheduler.ReduceLROnPlateau):
      scheduler.step(test_loss)
    elif scheduler is not None:
      scheduler.step()
    print('validation loss=%.4f' % test_loss)
    # Lower is better. The comparison used to be inverted and the best value was
    # never updated, so no checkpoint was ever written.
    if test_loss < best_test_loss:
      best_test_loss = test_loss
      print("New record! - Saving model")
      tt.save(model.state_dict(), 'my_model')

In [None]:
tokenizer.vocab_size

28996

In [None]:
# Fall back to CPU so the following cells still run on a machine without a GPU.
device = tt.device('cuda' if tt.cuda.is_available() else 'cpu')

In [None]:
my_model = MyTransformer(n_vocab=tokenizer.vocab_size, device=device)

In [None]:
optimizer = tt.optim.Adam(my_model.parameters())

In [None]:
# Targets are padded to a common length by the tokenizer; skip those padding
# positions so they do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id).to(device)

In [None]:
train(my_model, n_epochs=10, train_iter=train_iter, test_iter=test_iter, optimizer=optimizer, criterion=criterion, device=device)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, description='Training epoch 1', max=110.0, style=ProgressStyle(descrip…

KeyboardInterrupt: ignored

In [None]:
tt.cat((tt.randn((128,1)), tt.randn((128,1))), dim=1).size()