In [1]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [2]:
import torch
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
)
from enum import Enum
from transformers import Trainer, TrainingArguments
from datasets import load_dataset

# Use the GPU when PyTorch reports one is available, otherwise the CPU.
# NOTE(review): `device` is not referenced by any later cell shown in this
# notebook — confirm whether it is still needed.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
class OutputEnum(Enum):
    """Target labels of the classifier.

    Each member's value is the name of the dataset column that holds that
    label (the values are used as column keys when building label vectors).
    """

    MISLEADING_MISSING_IMPORTANT_CONTEXT = "misleadingMissingImportantContext"


# Label name -> integer id, following the enum's declaration order.
class2id = {member.value: index for index, member in enumerate(OutputEnum)}
# Inverse lookup: integer id -> label name.
id2class = {index: name for name, index in class2id.items()}

In [4]:
# Split name -> tab-separated file for that split.
datafiles = dict(
    train="/kaggle/input/twitter-community-notes/train.tsv",
    test="/kaggle/input/twitter-community-notes/test.tsv",
    validation="/kaggle/input/twitter-community-notes/validation.tsv",
)
# The generic "csv" loader is used with a tab delimiter to read the .tsv files.
dataset = load_dataset("csv", data_files=datafiles, delimiter="\t")

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [5]:
# Tokenizer for the "distilbert-base-cased" checkpoint; it is shared by
# `preprocess`, the data collator and the Trainer in the cells below.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")


def preprocess(example):
    """Turn one dataset row into tokenized model inputs plus a label vector.

    Args:
        example: A single row; it must provide a "summary" field and one field
            per key of ``class2id``.

    Returns:
        The tokenizer output for the summary (truncation enabled) with an
        extra "labels" entry: a list of floats, one per key of ``class2id``
        in that mapping's order.
    """
    summary = example["summary"]
    # Labels are cast to float — presumably because the multi-label loss
    # expects float targets; confirm against the model configuration.
    targets = list(map(float, (example[label_name] for label_name in class2id)))
    encoded = tokenizer(summary, truncation=True)
    encoded["labels"] = targets
    return encoded


# Upper bounds on how many shuffled examples to keep from each split.
TRAIN_SUBSET_SIZE = 500_000
TEST_SUBSET_SIZE = 75_000

# Shuffle with a fixed seed, then keep the first N rows. The `range` is passed
# straight to `select` (no throwaway index list), and N is clamped to the split
# length so a smaller split does not fail with an out-of-range index.
shuffled_train = dataset["train"].shuffle(seed=42)
small_dataset_train = shuffled_train.select(
    range(min(TRAIN_SUBSET_SIZE, len(shuffled_train)))
)
shuffled_test = dataset["test"].shuffle(seed=42)
small_dataset_test = shuffled_test.select(
    range(min(TEST_SUBSET_SIZE, len(shuffled_test)))
)

# Tokenize each split row by row; the validation split is used in full.
tokenized_train = small_dataset_train.map(preprocess)
tokenized_test = small_dataset_test.map(preprocess)
tokenized_validation = dataset["validation"].map(preprocess)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Map:   0%|          | 0/500000 [00:00<?, ? examples/s]

Map:   0%|          | 0/75000 [00:00<?, ? examples/s]

Map:   0%|          | 0/154651 [00:00<?, ? examples/s]

In [6]:
# Batch collator passed to the Trainer below; it is built around the same
# tokenizer that produced the examples so padding uses that tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [7]:
# Load the classifier weights from a checkpoint directory written by an earlier
# training run (training itself is commented out further down), so this path
# must already exist when the cell is executed.
model = AutoModelForSequenceClassification.from_pretrained(
    "/kaggle/working/results/checkpoint-62500",
    problem_type="multi_label_classification",
    num_labels=len(OutputEnum),
    label2id=class2id,
    id2label=id2class,
)

In [8]:
import numpy as np
import evaluate


# Run configuration shared by training and evaluation.
training_args = TrainingArguments(
    output_dir="./results",
    # Optimisation settings.
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate every 15k steps, checkpoint once per epoch.
    eval_strategy="steps",
    eval_steps=15_000,
    save_strategy="epoch",
    # No external experiment-tracking integration.
    report_to="none",
)


# Bundle the four metrics so a single `compute` call in `computer_metrics`
# reports accuracy, F1, precision and recall together.
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])


def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))``, computed stably.

    The textbook formula evaluates ``exp(-x)``, which overflows for large
    negative ``x`` and makes NumPy emit a RuntimeWarning. Here the exponential
    is only ever taken of a non-positive number, so it stays within (0, 1].

    Args:
        x: Scalar or array-like of real numbers (e.g. raw logits).

    Returns:
        A NumPy scalar for scalar input, otherwise an array with the same
        shape as ``x``; every value lies in [0, 1].
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # exponent is <= 0, so this cannot overflow
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) — algebraically identical.
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () unwraps a 0-d array to a scalar and is a no-op otherwise.
    return result[()]


def computer_metrics(eval_pred):
    """Compute the combined classification metrics for one evaluation pass.

    Args:
        eval_pred: A pair ``(predictions, labels)`` of NumPy arrays; the
            predictions are passed through ``sigmoid`` before thresholding.

    Returns:
        Whatever ``clf_metrics.compute`` returns for the flattened 0/1
        predictions and integer references.
    """
    logits, references = eval_pred
    probabilities = sigmoid(logits)
    # Threshold at 0.5 and flatten, so every (example, label) cell is scored
    # as one binary decision.
    hard_predictions = (probabilities > 0.5).astype(int).ravel()
    flat_references = references.astype(int).ravel()
    return clf_metrics.compute(
        predictions=hard_predictions, references=flat_references
    )


# Wire the model, run configuration, data and metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    # Data: training subset, plus the test subset as the default eval set.
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    # Batching / tokenization helpers.
    processing_class=tokenizer,
    data_collator=data_collator,
    # Metrics reported at each evaluation.
    compute_metrics=computer_metrics,
)


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.56k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

In [9]:
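# Training is disabled for this run: the model above is loaded from an existing
# checkpoint instead. Uncomment the next line to run training.
# trainer.train()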

In [10]:
# Score the loaded model on the full validation split; the returned metrics
# dict is this cell's displayed output.
trainer.evaluate(eval_dataset=tokenized_validation)

{'eval_loss': 0.4770198166370392,
 'eval_model_preparation_time': 0.002,
 'eval_accuracy': 0.756917187732378,
 'eval_f1': 0.7514594558857559,
 'eval_precision': 0.7158277912132205,
 'eval_recall': 0.79082420717198,
 'eval_runtime': 431.5472,
 'eval_samples_per_second': 358.364,
 'eval_steps_per_second': 44.797}

In [11]:
# Pack the checkpoint directory into a gzip-compressed tarball in the current
# working directory. Because an absolute path is given, tar stores the members
# without the leading "/" (see the message at the top of the output).
!tar -czvf check_point_500k_single.tar.gz /kaggle/working/results/checkpoint-62500

tar: Removing leading `/' from member names
/kaggle/working/results/checkpoint-62500/
/kaggle/working/results/checkpoint-62500/trainer_state.json
/kaggle/working/results/checkpoint-62500/model.safetensors
/kaggle/working/results/checkpoint-62500/tokenizer.json
/kaggle/working/results/checkpoint-62500/tokenizer_config.json
/kaggle/working/results/checkpoint-62500/optimizer.pt
/kaggle/working/results/checkpoint-62500/special_tokens_map.json
/kaggle/working/results/checkpoint-62500/rng_state.pth
/kaggle/working/results/checkpoint-62500/training_args.bin
/kaggle/working/results/checkpoint-62500/config.json
/kaggle/working/results/checkpoint-62500/scheduler.pt
/kaggle/working/results/checkpoint-62500/vocab.txt
