In [1]:
pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/fd/1a/41c644c963249fd7f3836d926afa1e3f1cc234a1c40d80c5f03ad8f6f1b2/transformers-4.8.2-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 4.1MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 26.1MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 28.0MB/s 
Collecting huggingface-hub==0.0.12
  Downloading https://files.pythonhosted.org/packages/2f/ee/97e253668fda9b17e968b3f97b2f8e53aa0127e8807d24a547687423

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset and checkpoint paths used
# later in this notebook live under /content/drive/MyDrive.
drive.mount('/content/drive')

In [None]:
import json
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers.data.data_collator import DataCollatorForLanguageModeling
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import numpy as np


In [4]:

import random
import numpy as np
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from tqdm.notebook import tqdm
import numpy as np
# Device used for the model and its inputs. Prefer the GPU, but fall back to
# the CPU so the notebook still runs on a runtime without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [5]:
%%capture
!pip install wandb --upgrade

In [6]:
import wandb

# Authenticate this session with Weights & Biases before any run is started.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mromanovand[0m (use `wandb login --relogin` to force relogin)


True

In [7]:
class TerraDataset(Dataset):
    """Premise/hypothesis pairs read from a JSON-lines file.

    Every non-blank line of ``file`` must be a JSON object with the keys
    ``'premise'``, ``'hypothesis'`` and ``'label'``. Each record becomes the
    string ``premise + ' ~> ' + hypothesis`` and a binary label
    (1 for ``'entailment'``, 0 for anything else).

    Attributes:
        file: path of the JSON-lines file.
        strings: list of the joined premise/hypothesis strings.
        predict: tensor holding the 0/1 labels, one per string.
        tokens: tokenizer output for every string (requested with
            ``return_length=True``).
    """

    def __init__(self, file):
        self.file = file
        # Parse the file once; the original parsed it twice (once per field).
        strings, labels = self.load()
        self.strings = strings
        self.predict = torch.tensor(labels)
        self.tokens = self.transform()

    def load(self):
        """Read ``self.file`` and return ``[strings, labels]``."""
        strings = []
        labels = []
        # The context manager guarantees the handle is closed after reading.
        with open(self.file, encoding="utf-8") as handle:
            for line in handle:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                record = json.loads(line)
                strings.append(record['premise'] + ' ~> ' + record['hypothesis'])
                labels.append(1 if record['label'] == 'entailment' else 0)
        return [strings, labels]

    def transform(self):
        """Tokenize every string in ``self.strings`` and return the encodings."""
        model_name_or_path = "sberbank-ai/rugpt3small_based_on_gpt2"
        tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)
        return [tokenizer(string, return_length=True) for string in self.strings]

    def __len__(self):
        return len(self.strings)

    def __getitem__(self, idx):
        """Return ``(encoding, label)`` for the sample at ``idx``."""
        return self.tokens[idx], self.predict[idx]

In [8]:
# Legacy alias kept for the cells that still refer to `cuda`; it follows
# `device` so the notebook has a single source of truth for the target device.
cuda = device

In [9]:
class LogisticRegression(torch.nn.Module):
    """Binary classifier: a GPT-2 backbone followed by one linear unit.

    The last hidden layer of the backbone is read at the final real token of
    every sequence (position ``length - 1``) and mapped through a linear layer
    and a sigmoid.
    """

    def __init__(self):
        super().__init__()
        self.gpt3 = GPT2LMHeadModel.from_pretrained("sberbank-ai/rugpt3small_based_on_gpt2").to(device)
        # Size the head from the backbone's config instead of hard-coding 768.
        self.linear = torch.nn.Linear(self.gpt3.config.n_embd, 1).to(device)

    def forward(self, sampler):
        """Score a collated batch.

        Args:
            sampler: mapping with ``'input_ids'``, ``'attention_mask'`` and
                ``'length'`` entries (tensors or anything ``torch.as_tensor``
                accepts). ``'length'`` holds the unpadded token count of each
                sequence.

        Returns:
            Tensor of shape ``(batch, 1, 1)`` with sigmoid outputs.
        """
        # Follow wherever the module's parameters live rather than a global.
        target_device = self.linear.weight.device
        # `as_tensor` avoids the copy (and the UserWarning) that
        # `torch.tensor(<tensor>)` produces when the input is already a tensor.
        input_ids = torch.as_tensor(sampler['input_ids']).to(target_device)
        attention_mask = torch.as_tensor(sampler['attention_mask']).to(target_device)
        hidden = self.gpt3(input_ids=input_ids, attention_mask=attention_mask,
                           return_dict=True, output_hidden_states=True)['hidden_states'][-1]
        # Index of the last real token in every row.
        last_index = torch.as_tensor(sampler['length']).to(target_device).long().reshape(-1) - 1
        rows = torch.arange(hidden.size(0), device=hidden.device)
        # (batch, hidden) -> (batch, 1, hidden)
        res = hidden[rows, last_index].unsqueeze(1)
        predicted = torch.sigmoid(self.linear(res))
        return predicted

In [10]:
# Hyperparameters for the fine-tuning run; handed to model_pipeline.
config = {
    "epochs": 20,
    "batch_size": 10,
    "learning_rate": 1e-5,
}

In [11]:
def model_pipeline(hyperparameters):
    """Run one tracked training session and return the trained model.

    Opens a wandb run in the "pytorch-demo" project with ``hyperparameters``
    as its config, builds the model and loaders through ``make``, trains them
    with ``train`` and returns the model once the run has been closed.
    """
    run = wandb.init(project="pytorch-demo", config=hyperparameters)
    with run:
        # Read the hyperparameters back from wandb.config.
        run_config = wandb.config
        components = make(run_config)
        model, train_loader, criterion, optimizer, val_loader, length = components
        train(model, train_loader, criterion, optimizer, run_config, val_loader, length)
    return model

In [12]:
def make(config):
    """Build everything a training run needs from ``config``.

    Reads ``config.batch_size`` and ``config.learning_rate``.

    Returns:
        ``(model, train_loader, criterion, optimizer, val_loader, length)``
        where ``length`` is the second value returned by ``valid_loader``.
    """
    bs = config.batch_size
    loader_train = make_loader(batch_size=bs)
    loader_val, n_val = valid_loader(batch_size=bs)

    net = LogisticRegression().to(device)
    loss_fn = torch.nn.BCELoss()
    opt = torch.optim.Adam(net.parameters(), lr=config.learning_rate)

    return net, loader_train, loss_fn, opt, loader_val, n_val

In [13]:
def _build_loader(path, batch_size, shuffle):
    """Create a DataLoader over the TerraDataset stored at ``path``.

    Returns ``(loader, dataset)``. Every batch produced by the loader is a pair
    ``(collated_encodings, labels)``: the tokenizer encodings are padded by a
    ``DataCollatorForLanguageModeling`` (``mlm=False``) and the labels are
    returned as a plain list.
    """
    dataset = TerraDataset(path)
    tokenizer = GPT2Tokenizer.from_pretrained("sberbank-ai/rugpt3small_based_on_gpt2")
    # NOTE(review): '[PAD]' is registered on the tokenizer only; nothing here
    # resizes the model's embedding matrix - confirm the id assigned to it is
    # inside the model's vocabulary.
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    def collate(samples):
        # samples is a list of (encoding, label) pairs from TerraDataset.
        encodings, labels = map(list, zip(*samples))
        return collator(encodings), labels

    loader = torch.utils.data.DataLoader(dataset,
                                         batch_size=batch_size,
                                         shuffle=shuffle,
                                         collate_fn=collate)
    return loader, dataset


def make_loader(batch_size, path='/content/drive/MyDrive/Colab Notebooks/TERRa/train.jsonl'):
    """Return a shuffled DataLoader over the training split.

    Args:
        batch_size: number of samples per batch.
        path: JSON-lines file to read; defaults to the notebook's train split.
    """
    loader, _ = _build_loader(path, batch_size, shuffle=True)
    return loader


def valid_loader(batch_size, path='/content/drive/MyDrive/Colab Notebooks/TERRa/val.jsonl'):
    """Return ``(loader, length)`` for the validation split.

    The loader is not shuffled, so evaluation visits the samples in file order
    on every call. ``length`` is the number of samples in the dataset.

    Args:
        batch_size: number of samples per batch.
        path: JSON-lines file to read; defaults to the notebook's val split.
    """
    loader, dataset = _build_loader(path, batch_size, shuffle=False)
    return loader, len(dataset)

In [18]:
def train(model, loader, criterion, optimizer, config, val_loader, length):
    """Train ``model`` for ``config.epochs`` epochs and checkpoint the best one.

    After every epoch the model is evaluated with ``val_test``. Whenever the
    validation accuracy improves, a snapshot of the weights is written to
    Google Drive, so the file on disk always holds the best epoch so far.

    Args:
        model: module to optimise.
        loader: iterable of ``(sample, target)`` training batches.
        criterion: loss passed through to ``train_batch`` / ``val_test``.
        optimizer: optimizer passed through to ``train_batch``.
        config: object exposing ``epochs``.
        val_loader: iterable of validation batches.
        length: number of validation samples (denominator of the accuracy).
    """
    batch_ct = 0
    # Start below any reachable accuracy so the first epoch is always saved;
    # with `best = 0` an initial accuracy of 0 left the state unbound.
    best = float('-inf')
    for epoch in tqdm(range(config.epochs)):
        model.train()
        for sample, target in loader:
            loss = train_batch(sample, target, model, optimizer, criterion)
            batch_ct += 1
            if ((batch_ct + 1) % 5) == 0:
                train_log(loss, epoch, batch_ct)
        accur, val_loss = val_test(model, val_loader, length, criterion)
        val_log(accur, val_loss, batch_ct)
        if accur > best:
            best = accur
            # state_dict() hands out references to the live parameters, so the
            # tensors must be copied or later epochs would overwrite the "best".
            best_model_state = {name: tensor.detach().cpu().clone()
                                for name, tensor in model.state_dict().items()}
            torch.save(best_model_state, '/content/drive/MyDrive/Colab Notebooks/finetunemodel.pth')

def train_batch(sample, target, model, optimizer, criterion):
    """Run one optimisation step on a single batch and return its loss.

    Args:
        sample: batch passed to ``model`` unchanged.
        target: labels for the batch (a tensor or a sequence of numbers).
        model: module producing one score per sample.
        optimizer: optimizer whose ``zero_grad`` / ``step`` are called.
        criterion: loss called as ``criterion(prediction, target)``.

    Returns:
        The batch loss as a tensor detached from the autograd graph.
    """
    # reshape(-1) keeps a 1-D tensor even for a batch of one sample, where
    # squeeze() would yield a 0-dim tensor that no longer matches the target.
    y_predicted = model(sample).float().reshape(-1)
    # Put the labels on whatever device the prediction lives on.
    target = torch.as_tensor(target, dtype=torch.float32, device=y_predicted.device)
    loss = criterion(y_predicted, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # Detach so callers that keep the value do not keep the graph alive.
    return loss.detach()

In [19]:
def val_test(modus, valid_dataloader, length, loss_func):
    """Evaluate ``modus`` on a validation loader.

    The model is switched to eval mode and autograd is disabled while the
    batches are scored; the model's previous train/eval mode is restored
    before returning.

    Args:
        modus: module producing one probability per sample.
        valid_dataloader: iterable of ``(batch, labels)`` pairs.
        length: total number of validation samples (accuracy denominator).
        loss_func: loss called as ``loss_func(prediction, target)``.

    Returns:
        ``(accuracy, mean_loss)`` where predictions above 0.5 count as class 1
        and ``mean_loss`` is the mean of the per-batch losses.
    """
    was_training = modus.training
    modus.eval()
    correct = 0.0
    val_loss = []
    try:
        with torch.no_grad():
            for valid, target in valid_dataloader:
                # reshape(-1) keeps the batch axis even for a single sample.
                y = modus(valid).float().reshape(-1)
                target = torch.as_tensor(target, dtype=torch.float32, device=y.device)
                answ = (y > 0.5).float()
                correct += (answ == target).sum().item()
                val_loss.append(float(loss_func(y, target)))
    finally:
        modus.train(was_training)
    result = correct / length
    return result, np.mean(val_loss)

In [20]:
def train_log(loss, epoch, batch_ct):
    """Send the training loss and epoch to wandb at step ``batch_ct``."""
    metrics = {"epoch": epoch, "loss": float(loss)}
    wandb.log(metrics, step=batch_ct)
def val_log(accur, val_loss, batch_ct):
    """Send the validation accuracy and loss to wandb at step ``batch_ct``."""
    payload = {"val_accur": accur, "val_loss": val_loss}
    wandb.log(payload, step=batch_ct)

In [None]:
# Launch the wandb-tracked training run with the hyperparameters in `config`
# and keep the model that model_pipeline returns.
mod = model_pipeline(config)

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

  import sys
