In [1]:
import pandas as pd
from datasets import Dataset

# Location of the source corpus and the number of leading rows to keep.
DATA_PATH = '/kaggle/input/toxicity-original/original.parquet'
MAX_ROWS = 12000

# Read the parquet file and keep only its first MAX_ROWS rows.
dataframe = pd.read_parquet(DATA_PATH)[:MAX_ROWS]

In [2]:
# Auxiliary columns that are removed before tokenization.
columns_to_drop = ['__index_level_0__', 'similarity', 'lenght_diff', 'ref_tox', 'trn_tox']

dataset_raw = Dataset.from_pandas(dataframe)
# Drop only the columns that are actually present, so the cell does not fail
# when the pandas index was not materialised as `__index_level_0__`.
dataset_raw = dataset_raw.remove_columns(
    [name for name in columns_to_drop if name in dataset_raw.column_names]
)

# A fixed seed makes the 90/10 split identical on every Restart & Run All.
# The splits are read by key instead of relying on the order of `.values()`.
dataset_splits = dataset_raw.train_test_split(test_size=0.1, seed=42)
train_data = dataset_splits['train']
validation_data = dataset_splits['test']

In [3]:
from transformers import AutoTokenizer

# Hub identifier of the pretrained checkpoint used throughout the notebook.
model_name = 'facebook/bart-base'

# Tokenizer belonging to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [4]:
def preprocess(examples):
    """Tokenize a batch of source/target texts for seq2seq training.

    The ``'reference'`` column is tokenized as the model input and the
    ``'translation'`` column as the target, each truncated to at most 512
    tokens.

    Args:
        examples: Batch mapping that provides the ``'reference'`` and
            ``'translation'`` columns.

    Returns:
        The tokenized inputs with the target token ids stored under
        ``"labels"``.
    """
    source_texts, target_texts = examples['reference'], examples['translation']

    encoded = tokenizer(source_texts, max_length=512, truncation=True)
    encoded_targets = tokenizer(
        text_target=target_texts,
        max_length=512,
        truncation=True,
    )

    # The target token ids become the labels the model is trained to produce.
    encoded["labels"] = encoded_targets["input_ids"]
    return encoded

In [5]:
# Tokenize the training split first, then the validation split, in batches.
train_tokenized, validation_tokenized = (
    split.map(preprocess, batched=True)
    for split in (train_data, validation_data)
)

  0%|          | 0/11 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [6]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained seq2seq model from the checkpoint named by `model_name`.
model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path=model_name)



Downloading model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [7]:
# Per-device batch size shared by training and evaluation.
batch_size = 16

args = Seq2SeqTrainingArguments(
    output_dir="finetuned",
    # Optimisation settings.
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    fp16=True,
    # Evaluation runs once per epoch and uses generation for the predictions.
    evaluation_strategy="epoch",
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,
    # Limit on the number of checkpoints kept in the output directory.
    save_total_limit=3,
)

In [8]:
!pip install evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.1


In [9]:
import numpy as np
import evaluate

# BLEU scorer used by `compute_metrics`.
metric = evaluate.load(path='bleu')


def postprocess_text(preds, labels):
    """Strip surrounding whitespace and shape the texts for the metric.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple: a list of stripped prediction strings and
        a list in which every stripped reference is wrapped in its own
        one-element list.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        # One reference per prediction, wrapped in a single-item list.
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels


def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for an evaluation run.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            itself be a tuple, in which case its first element is used.

    Returns:
        Dict with ``"bleu"`` and ``"gen_len"``, both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is a padding sentinel, not a real token id. Labels were already
    # sanitised this way; predictions can carry the same sentinel when batches
    # of different generated lengths are concatenated, and decoding it fails.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["bleu"]}

    # Generated length = number of non-padding tokens in each prediction.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [10]:
# Batch collator for seq2seq training, built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [11]:
# Wire the model, training arguments, tokenized splits, collator and metric
# function into a single trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_tokenized,
    eval_dataset=validation_tokenized,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [12]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,1.928,1.563107,0.2521,13.7483
2,1.6358,1.524177,0.2597,13.6117
3,1.4426,1.514447,0.2561,13.5542
4,1.3355,1.520117,0.2624,13.565
5,1.2743,1.543389,0.2626,13.5925
6,1.1599,1.547353,0.26,13.5008
7,1.1098,1.567129,0.2636,13.5683
8,1.0798,1.575826,0.2609,13.5308
9,1.0268,1.58818,0.2597,13.5258
10,1.0008,1.595776,0.2578,13.52


TrainOutput(global_step=6750, training_loss=1.2799327256944444, metrics={'train_runtime': 1460.0322, 'train_samples_per_second': 73.971, 'train_steps_per_second': 4.623, 'total_flos': 2326763851776000.0, 'train_loss': 1.2799327256944444, 'epoch': 10.0})

In [15]:
# Archive the checkpoint-6500 directory into checkpoint.zip.
# NOTE(review): training finished at global_step=6750, so checkpoint-6500 is
# not the final model state — confirm this is the intended artifact.
!zip -r checkpoint.zip /kaggle/working/finetuned/checkpoint-6500

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  adding: kaggle/working/finetuned/checkpoint-6500/ (stored 0%)
  adding: kaggle/working/finetuned/checkpoint-6500/merges.txt (deflated 53%)
  adding: kaggle/working/finetuned/checkpoint-6500/scheduler.pt (deflated 49%)
  adding: kaggle/working/finetuned/checkpoint-6500/tokenizer_config.json (deflated 50%)
  adding: kaggle/working/finetuned/checkpoint-6500/config.json (deflated 64%)
  adding: kaggle/working/finetuned/checkpoint-6500/trainer_state.json (deflated 79%)
  adding: kaggle/working/finetuned/checkpoint-6500/special_tokens_map.json (deflated 52%)
  adding: kaggle/working/finetuned/checkpoint-6500/vocab.json (deflated 59%)
  adding: kaggle/working/finetuned/checkpoint-6500/training_args.bin (deflated 