In [1]:
!pip install evaluate
!pip install sacrebleu

Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.0
[0mCollecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sacrebleu
Successfully installed sacrebleu-2.3.1
[0m

In [2]:
import math

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, EncoderDecoderModel, set_seed
import evaluate

from tqdm import tqdm

set_seed(42)

bleu = evaluate.load('sacrebleu')

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [3]:
class TData(Dataset):
    def __init__(self, df, encoder_tokenizer, decoder_tokenizer, device='cpu'):
        super(TData, self).__init__()

        self.df = df
        self.encoder_tokenizer = encoder_tokenizer
        self.decoder_tokenizer = decoder_tokenizer
        self.device= device

    def __getitem__(self, i):
        row = self.df.iloc[i]

        start = row['answer_start'] - row.context.find(row.cloze) - 1
        end = start + len(row['answer'])

        cloze = row.cloze[:start] + self.encoder_tokenizer.mask_token + row.cloze[end:]
        
        encoder_inputs = self.encoder_tokenizer(cloze, padding='max_length', max_length=256,
                                                truncation=True, return_tensors='pt')
        question = self.decoder_tokenizer.bos_token + row['question'] + self.decoder_tokenizer.eos_token
        decoder_outputs = self.decoder_tokenizer(question, padding='max_length', max_length=256,
                                                 truncation=True, return_tensors="pt")

        encoder_inputs = {k: v[0].to(self.device) for k, v in encoder_inputs.items()}
        decoder_outputs = {k: v[0].to(self.device) for k, v in decoder_outputs.items()}

        return {'encoder_inputs': encoder_inputs, 
                'decoder_outputs': decoder_outputs}

    def __len__(self):
        return len(self.df)

In [4]:
def train_step(model, train_loader, criterion, optimizer):
    model.train()

    losses, ppls, bleus = [], [], []
    pbar = tqdm(train_loader)
    for i, batch in enumerate(pbar):
        encoder_inputs = batch['encoder_inputs']
        decoder_outputs = batch['decoder_outputs']

        optimizer.zero_grad()
        h_state = model.encoder(**encoder_inputs).last_hidden_state
        out = model.decoder(**decoder_outputs, encoder_hidden_states=h_state, encoder_attention_mask=encoder_inputs['attention_mask']).logits
        loss = criterion(out[:, :-1].moveaxis(1, -1), decoder_outputs['input_ids'][:, 1:])
        
        loss.backward()
        optimizer.step()

        losses.append(loss.item())
        ppls.append(math.exp(loss.item()))
        
        references = [[o] for o in decoder_tokenizer.batch_decode(decoder_outputs['input_ids'], skip_special_tokens=True)]
        predictions = decoder_tokenizer.batch_decode(out.argmax(dim=-1), skip_special_tokens=True)
        results = bleu.compute(predictions=predictions, references=references)
        bleus.append(results['score'])

        pbar.set_description(f'Batch {i+1}/{len(train_loader)}: Loss: {np.mean(losses):.4f} - Perplexity: {np.mean(ppls):.4f} - Bleu: {np.mean(bleus):.4f}')

    return np.mean(losses), np.mean(ppls), np.mean(bleus)


def eval_step(model, val_loader, criterion):
    model.eval()

    losses, ppls, bleus = [], [], []
    with torch.no_grad():
        pbar = tqdm(val_loader)
        for i, batch in enumerate(pbar):
            encoder_inputs = batch['encoder_inputs']
            decoder_outputs = batch['decoder_outputs']

            h_state = model.encoder(**encoder_inputs).last_hidden_state
            out = model.decoder(**decoder_outputs, encoder_hidden_states=h_state, encoder_attention_mask=encoder_inputs['attention_mask']).logits
            loss = criterion(out[:, :-1].moveaxis(1, -1), decoder_outputs['input_ids'][:, 1:])
            
            losses.append(loss.item())
            ppls.append(math.exp(loss.item()))
            
            references = [[o] for o in decoder_tokenizer.batch_decode(decoder_outputs['input_ids'], skip_special_tokens=True)]
            predictions = decoder_tokenizer.batch_decode(out.argmax(dim=-1), skip_special_tokens=True)
            results = bleu.compute(predictions=predictions, references=references)
            bleus.append(results['score'])
            
            pbar.set_description(f'Batch {i+1}/{len(val_loader)}: Loss: {np.mean(losses):.4f} - Perplexity: {np.mean(ppls):.4f} - Bleu: {np.mean(bleus):.4f}')

    return np.mean(losses), np.mean(ppls), np.mean(bleus)


def train(model, train_loader, criterion, optimizer, val_loader=None, device='cpu'):
    train_losses, train_ppls, train_bleus = [], [], []
    val_losses, val_ppls, val_bleus = [], [], []

    for epoch in range(EPOCHS):
        print(f'Epoch {epoch+1}/{EPOCHS}')

        train_loss, train_ppl, train_bleu = train_step(model, train_loader, criterion, optimizer)
        train_losses.append(train_loss)
        train_ppls.append(train_ppl)
        train_bleus.append(train_bleu)

        if val_loader is not None:
            val_loss, val_ppl, val_bleu = eval_step(model, val_loader, criterion)
            val_losses.append(val_loss)
            val_ppls.append(val_ppl)
            val_bleus.append(val_bleu)
        
        test_case = val_df.iloc[0]
        question = ask(model, test_case.answer, test_case.cloze, device='cuda')
        print(f'GENERATED -> {question}')

    return train_losses, train_ppls, train_bleus, val_losses, val_ppls, val_bleus

@torch.no_grad()
def ask(model, answer, cloze, answer_start=None, device='cpu'):
    if answer_start is None:
        answer_start = cloze.find(answer)

    answer_end = answer_start + len(answer)
    
    cloze = cloze[:answer_start] + encoder_tokenizer.mask_token + cloze[answer_end:]

    encoder_inputs = encoder_tokenizer(cloze, return_tensors='pt', truncation=True).input_ids.to(device)

    bad_word_ids = decoder_tokenizer(answer, add_special_tokens=False).input_ids
    generated_ids = model.generate(encoder_inputs,
                                        decoder_start_token_id=decoder_tokenizer.bos_token_id,
                                        num_beams=5, max_length=50, do_sample=True,
                                        top_k=50, top_p=0.95, early_stopping=True,
                                        pad_token_id=decoder_tokenizer.eos_token_id,
                                        #no_repeat_ngram_size=2,
                                        bad_words_ids=[bad_word_ids],
                                        num_return_sequences=1)
    
    return decoder_tokenizer.batch_decode(generated_ids)

In [5]:
TRAIN_DIR = '/kaggle/input/my-quad/my_quad.csv'

trainval_df = pd.read_csv(TRAIN_DIR)

train_df = trainval_df.sample(frac=.85, random_state=42)
val_df = trainval_df.drop(train_df.index)

In [6]:
BATCH_SIZE = 2
EPOCHS = 10
DEVICE = 'cuda'

In [7]:
encoder_tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")
decoder_tokenizer = AutoTokenizer.from_pretrained("redrussianarmy/gpt2-turkish-cased")
decoder_tokenizer.add_special_tokens({'bos_token': '<BOS>', 'pad_token': '<PAD>', 'eos_token': '<EOS>'})

encoder_model = AutoModel.from_pretrained("dbmdz/bert-base-turkish-cased").to(DEVICE)
decoder_model = AutoModelForCausalLM.from_pretrained("redrussianarmy/gpt2-turkish-cased", add_cross_attention=True).to(DEVICE)
decoder_model.resize_token_embeddings(len(decoder_tokenizer))
decoder_model.config.add_cross_attention = True

model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) 
optimizer = optim.Adadelta(model.parameters())
criterion = nn.CrossEntropyLoss()

Downloading (…)okenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/251k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/595 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/720 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/594k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-base-turkish-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/510M [00:00<?, ?B/s]

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at redrussianarmy/gpt2-turkish-cased and are newly initialized: ['transformer.h.0.ln_cross_attn.weight', 'transformer.h.2.crossattention.c_proj.bias', 'transformer.h.9.crossattention.c_proj.bias', 'transformer.h.5.crossattention.bias', 'transformer.h.5.crossattention.masked_bias', 'transformer.h.10.crossattention.bias', 'transformer.h.11.crossattention.c_proj.bias', 'transformer.h.4.crossattention.c_attn.weight', 'transformer.h.0.crossattention.bias', 'transformer.h.9.crossattention.bias', 'transformer.h.3.crossattention.masked_bias', 'transformer.h.3.crossattention.c_attn.weight', 'transformer.h.0.crossattention.c_attn.weight', 'transformer.h.10.crossattention.q_attn.weight', 'transformer.h.1.ln_cross_attn.weight', 'transformer.h.2.crossattention.c_proj.weight', 'transformer.h.10.crossattention.c_proj.bias', 'transformer.h.11.ln_cross_attn.weight', 'transformer.h.10.crossattention.c_attn.weight', 'transform

In [14]:
for name, param in model.named_parameters():
    if 'decoder' not in name:
        param.requires_grad = False

In [16]:
for name, param in model.named_parameters():
    param.requires_grad =True

In [9]:
train_data = TData(train_df, encoder_tokenizer, decoder_tokenizer, device=DEVICE)
val_data = TData(val_df, encoder_tokenizer, decoder_tokenizer, device=DEVICE)

In [10]:
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_data, batch_size=BATCH_SIZE)

In [11]:
ex = val_df.iloc[0]
print(f'answer -> {ex.answer}\ncloze -> {ex.cloze}\nquestion -> {ex.question}')

answer -> 136
cloze ->  Panthers hattında ayrıca, sadece 9 başlangıçta 5 sack eden uç çizgi savunmacısı Kony Ealy ile birlikte 136 kez ile NFL'nin aktif kariyer sack lideri ve 5 kez profesyonel bir top atıcısı olan Jared Allen öne çıkmaktadır.
question -> Jared Allen'ın kaç tane kariyer sack edişi vardır?


In [17]:
train_losses, train_ppls, train_bleus, val_losses, val_ppls, val_bleus = train(model, train_loader, criterion, optimizer, val_loader=val_loader, device=DEVICE)

Epoch 1/10


Batch 566/566: Loss: 0.6356 - Perplexity: 324179258968372.5625 - Bleu: 1.5252: 100%|██████████| 566/566 [03:19<00:00,  2.84it/s] 
Batch 100/100: Loss: 0.3915 - Perplexity: 1.4866 - Bleu: 2.2747: 100%|██████████| 100/100 [00:13<00:00,  7.54it/s]


GENERATED -> ['<BOS>KKK anlama anlama anlama anlama yapmıştır?<EOS><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>']
Epoch 2/10


Batch 566/566: Loss: 0.3696 - Perplexity: 1.4540 - Bleu: 2.5621: 100%|██████████| 566/566 [03:19<00:00,  2.84it/s]
Batch 100/100: Loss: 0.3541 - Perplexity: 1.4312 - Bleu: 3.1969: 100%|██████████| 100/100 [00:13<00:00,  7.59it/s]


GENERATED -> ["<BOS>Los'ın yılda yılda yılda yılda yılda yılda kaç kaç kaç kaç kaç kaç kaç sürdü?<EOS><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>"]
Epoch 3/10


Batch 566/566: Loss: 0.3338 - Perplexity: 1.4022 - Bleu: 3.1967: 100%|██████████| 566/566 [03:19<00:00,  2.84it/s]
Batch 100/100: Loss: 0.3307 - Perplexity: 1.3976 - Bleu: 3.4835: 100%|██████████| 100/100 [00:13<00:00,  7.57it/s]


GENERATED -> ["<BOS>Publ'nin ilk ilk son ilk ilk son adı nedir?<EOS><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>"]
Epoch 4/10


Batch 566/566: Loss: 0.2937 - Perplexity: 1.3459 - Bleu: 3.7305: 100%|██████████| 566/566 [03:19<00:00,  2.83it/s]
Batch 100/100: Loss: 0.2946 - Perplexity: 1.3479 - Bleu: 3.5558: 100%|██████████| 100/100 [00:13<00:00,  7.62it/s]


GENERATED -> ["<BOS>DEC'in ardından ne olarak adlandırılır?<EOS><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>"]
Epoch 5/10


Batch 566/566: Loss: 0.2454 - Perplexity: 1.2814 - Bleu: 4.2281: 100%|██████████| 566/566 [03:18<00:00,  2.84it/s]
Batch 100/100: Loss: 0.2674 - Perplexity: 1.3119 - Bleu: 4.3089: 100%|██████████| 100/100 [00:13<00:00,  7.62it/s]


GENERATED -> ["<BOS>Temuçin'in karısı kimdir?<EOS><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>"]
Epoch 6/10


Batch 566/566: Loss: 0.1974 - Perplexity: 1.2202 - Bleu: 6.0859: 100%|██████████| 566/566 [03:19<00:00,  2.84it/s]
Batch 100/100: Loss: 0.2629 - Perplexity: 1.3072 - Bleu: 5.1362: 100%|██████████| 100/100 [00:13<00:00,  7.59it/s]


GENERATED -> ['<BOS>Fransız Protestanları ne anlama gelmektedir?<EOS><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>']
Epoch 7/10


Batch 14/566: Loss: 0.1230 - Perplexity: 1.1315 - Bleu: 17.5414:   2%|▏         | 14/566 [00:04<03:15,  2.82it/s]


KeyboardInterrupt: 

In [13]:
ask(model, ex.answer, ex.cloze, device=DEVICE)

['<BOS>Broncos Panthers, panthersleri nasıl kullanmıştır?<EOS><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>']

In [14]:
model.save_pretrained('best_bert_gpt2')

In [15]:
!zip -r best_bert_gpt2.zip best_bert_gpt2/

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  adding: best_bert_gpt2/ (stored 0%)
  adding: best_bert_gpt2/config.json (deflated 76%)
  adding: best_bert_gpt2/pytorch_model.bin (deflated 9%)
  adding: best_bert_gpt2/generation_config.json (deflated 24%)


In [None]:
while True:
    pass

In [None]:
!cp /content/attacker_bert_gpt2.pt /content/drive/MyDrive/adversarial-taboo/adversarial-taboo-models