In [None]:
!pip install evaluate

In [12]:
import torch
import os
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
)
from enum import Enum
from transformers import Trainer, TrainingArguments
from datasets import load_dataset


# Export WANDB_DISABLED=true for this process (presumably read by the
# Weights & Biases integration to stay off -- confirm against the installed
# transformers version).
os.environ.update(WANDB_DISABLED="true")

# Prefer the GPU whenever torch reports that CUDA is usable.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [13]:
class OutputEnum(Enum):
    """Names of the labels predicted by the multi-label classifier.

    Each member's value is the string used as a key in ``class2id`` and is
    looked up as a field of every dataset row in ``preprocess``.
    Definition order matters: ``class2id`` / ``id2class`` assign label
    indices by enumerating this enum, so reordering members changes the
    position of every label in the target vector.
    """

    # Labels whose names start with "misleading".
    MISLEADING_OTHER = "misleadingOther"
    MISLEADING_FACTUAL_ERROR = "misleadingFactualError"
    MISLEADING_MANIPULATED_MEDIA = "misleadingManipulatedMedia"
    MISLEADING_OUTDATED_INFORMATION = "misleadingOutdatedInformation"
    MISLEADING_MISSING_IMPORTANT_CONTEXT = "misleadingMissingImportantContext"
    MISLEADING_UNVERIFIED_CLAIM_AS_FACT = "misleadingUnverifiedClaimAsFact"
    MISLEADING_SATIRE = "misleadingSatire"
    TRUSTWORTHY_SOURCES = "trustworthySources"
    # Labels whose names start with "notMisleading".
    NOT_MISLEADING_FACTUALLY_CORRECT = "notMisleadingFactuallyCorrect"
    NOT_MISLEADING_OUTDATED_BUT_NOT_WHEN_WRITTEN = (
        "notMisleadingOutdatedButNotWhenWritten"
    )
    NOT_MISLEADING_CLEARLY_SATIRE = "notMisleadingClearlySatire"
    NOT_MISLEADING_PERSONAL_OPINION = "notMisleadingPersonalOpinion"


# Number the label names once, in enum definition order, then invert that
# mapping so the two lookups can never disagree.
id2class = dict(enumerate(member.value for member in OutputEnum))
class2id = {label_name: index for index, label_name in id2class.items()}

In [14]:
# One tab-separated file per split, keyed by split name.
datafiles = dict(
    train="/kaggle/input/twitter-community-notes/train.tsv",
    test="/kaggle/input/twitter-community-notes/test.tsv",
    validation="/kaggle/input/twitter-community-notes/validation.tsv",
)

# The "csv" loader is pointed at the TSV files via an explicit tab delimiter.
dataset = load_dataset("csv", data_files=datafiles, delimiter="\t")

In [15]:
# Tokenizer loaded from the same checkpoint name the classifier is built from.
tokenizer_checkpoint = "distilbert/distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)


def preprocess(example):
    """Tokenize one row's summary text and attach its label vector.

    Args:
        example: A mapping holding a ``"summary"`` entry plus one entry for
            every key of ``class2id``; each label entry must be convertible
            with ``float()``.

    Returns:
        The tokenizer output for the summary (truncation enabled) with an
        extra ``"labels"`` entry: a list of floats, one per class, in
        ``class2id`` iteration order.
    """
    summary_text = example["summary"]
    # Floats rather than ints: the labels feed a multi-label model
    # (assumes the loss expects float targets -- confirm).
    label_vector = [float(example[label_name]) for label_name in class2id]
    encoded = tokenizer(summary_text, truncation=True)
    encoded["labels"] = label_vector
    return encoded


# Upper bounds on how many shuffled rows are kept from each split.
TRAIN_SUBSET_SIZE = 500_000
TEST_SUBSET_SIZE = 75_000

# Shuffle with a fixed seed, then keep the first N rows. `select` accepts a
# `range` directly, so no throwaway index lists are built, and clamping to the
# split length avoids an out-of-range index error on smaller copies of the data.
small_dataset_train = (
    dataset["train"]
    .shuffle(seed=42)
    .select(range(min(TRAIN_SUBSET_SIZE, len(dataset["train"]))))
)
small_dataset_test = (
    dataset["test"]
    .shuffle(seed=42)
    .select(range(min(TEST_SUBSET_SIZE, len(dataset["test"]))))
)

# Tokenize each subset row by row with `preprocess`.
tokenized_train = small_dataset_train.map(preprocess)
tokenized_test = small_dataset_test.map(preprocess)

Map:   0%|          | 0/500000 [00:00<?, ? examples/s]

Map:   0%|          | 0/75000 [00:00<?, ? examples/s]

In [16]:
# Collator that pads each batch using the tokenizer defined above.
data_collator = DataCollatorWithPadding(tokenizer)

In [20]:
# Classification-head settings: one output per OutputEnum member, with the
# index <-> label-name mappings stored alongside the model config.
classifier_head_options = dict(
    num_labels=len(OutputEnum),
    id2label=id2class,
    label2id=class2id,
    problem_type="multi_label_classification",
)

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-cased",
    **classifier_head_options,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
import numpy as np
import evaluate


training_args = TrainingArguments(
    # Where outputs are written and which reporting backends are used.
    output_dir="./results",
    report_to="none",
    # Optimisation schedule.
    num_train_epochs=2,
    learning_rate=2e-5,
    # Batch sizes per device.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluation runs on a step interval; checkpoints are saved per epoch.
    eval_strategy="steps",
    eval_steps=15_000,
    save_strategy="epoch",
)

# Metrics bundled into one object so a single compute() call reports them all.
metric_names = ["accuracy", "f1", "precision", "recall"]
clf_metrics = evaluate.combine(metric_names)


def sigmoid(x):
    """Return the element-wise logistic function ``1 / (1 + exp(-x))``.

    Uses the identity ``sigmoid(x) == 0.5 * (1 + tanh(x / 2))``. ``tanh``
    saturates at +/-1 instead of overflowing, so extreme logits yield exactly
    0.0 or 1.0 without the RuntimeWarning that ``np.exp(-x)`` raises for large
    negative inputs.

    Args:
        x: A scalar, sequence or NumPy array of real values.

    Returns:
        A NumPy scalar or array, shaped like ``x``, with values in [0, 1].
    """
    # asarray lets plain lists and Python scalars through as well as arrays.
    x = np.asarray(x)
    return 0.5 * (1.0 + np.tanh(0.5 * x))


def computer_metrics(eval_pred):
    """Score a batch of predictions with the combined classification metrics.

    Args:
        eval_pred: A pair ``(predictions, labels)`` that unpacks into two
            arrays (assumed to have matching shapes, e.g. one column per
            class -- confirm against the Trainer output).

    Returns:
        Whatever ``clf_metrics.compute`` returns for the flattened inputs.
    """
    logits, references = eval_pred
    probabilities = sigmoid(logits)
    # Threshold at 0.5, then flatten so every (example, class) cell is scored
    # as an independent binary decision.
    flat_predictions = (probabilities > 0.5).astype(int).reshape(-1)
    flat_references = references.astype(int).reshape(-1)
    return clf_metrics.compute(
        predictions=flat_predictions,
        references=flat_references,
    )


# Wire the model, data, collator and metric callback into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=computer_metrics,
)


In [23]:
# Start fine-tuning with the settings in `training_args`; the bare call is
# left as the last expression so the notebook displays its result.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
15000,0.2349,0.23158,0.899269,0.744605,0.804878,0.69273
30000,0.2264,0.226051,0.901321,0.750817,0.807809,0.701337
45000,0.2259,0.224181,0.902299,0.75193,0.81416,0.698538
60000,0.2238,0.222256,0.903138,0.754967,0.813952,0.703953
75000,0.2134,0.222132,0.904004,0.760347,0.807497,0.718399
90000,0.2149,0.221367,0.904279,0.758586,0.815004,0.709472
105000,0.2114,0.220389,0.905029,0.761072,0.815348,0.713571
107445,0.2089,0.220387,0.904769,0.759698,0.814895,0.711504


KeyboardInterrupt: 

In [24]:
# No dataset is passed here, so the Trainer is expected to fall back to the
# `eval_dataset` it was constructed with (`tokenized_test`).
trainer.evaluate()

{'eval_loss': 0.2200288325548172,
 'eval_accuracy': 0.9050633333333333,
 'eval_f1': 0.7612263547217898,
 'eval_precision': 0.8152504429440215,
 'eval_recall': 0.7139172961101181}

In [25]:
# Tokenize the full validation split (no subsampling) and score the model on
# it; the call stays last so the metrics dict is displayed.
validation_split = dataset["validation"]
tokenized_validation = validation_split.map(preprocess)
trainer.evaluate(eval_dataset=tokenized_validation)

Map:   0%|          | 0/154651 [00:00<?, ? examples/s]

{'eval_loss': 0.22038666903972626,
 'eval_accuracy': 0.9047694486294948,
 'eval_f1': 0.7596981439934734,
 'eval_precision': 0.8148954106709994,
 'eval_recall': 0.7115041272638826}