In [24]:
import torch
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
)
from enum import Enum
from transformers import Trainer, TrainingArguments
from datasets import load_dataset

device = "cuda" if torch.cuda.is_available() else "cpu"

In [25]:
class OutputEnum(Enum):
    MISLEADING_OTHER = "misleadingOther"
    MISLEADING_FACTUAL_ERROR = "misleadingFactualError"
    MISLEADING_MANIPULATED_MEDIA = "misleadingManipulatedMedia"
    MISLEADING_OUTDATED_INFORMATION = "misleadingOutdatedInformation"
    MISLEADING_MISSING_IMPORTANT_CONTEXT = "misleadingMissingImportantContext"
    MISLEADING_UNVERIFIED_CLAIM_AS_FACT = "misleadingUnverifiedClaimAsFact"
    MISLEADING_SATIRE = "misleadingSatire"
    TRUSTWORTHY_SOURCES = "trustworthySources"
    NOT_MISLEADING_FACTUALLY_CORRECT = "notMisleadingFactuallyCorrect"
    NOT_MISLEADING_OUTDATED_BUT_NOT_WHEN_WRITTEN = (
        "notMisleadingOutdatedButNotWhenWritten"
    )
    NOT_MISLEADING_CLEARLY_SATIRE = "notMisleadingClearlySatire"
    NOT_MISLEADING_PERSONAL_OPINION = "notMisleadingPersonalOpinion"


class2id = {cls.value: i for i, cls in enumerate(OutputEnum)}
id2class = {i: cls.value for i, cls in enumerate(OutputEnum)}

In [26]:
datafiles = {
    "train": "train.tsv",
    "test": "test.tsv",
    "validation": "validation.tsv",
}
dataset = load_dataset("csv", delimiter="\t", data_files=datafiles)

In [32]:
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-cased"
    )


def preprocess(example):
    text = example["summary"]
    labels = [float(example[class_]) for class_ in class2id]
    example = tokenizer(text, truncation=True)
    example["labels"] = labels
    return example


small_dataset_train = (
    dataset["train"].shuffle(seed=42).select([i for i in list(range(60_000))])
)
small_dataset_test = (
    dataset["test"].shuffle(seed=42).select([i for i in list(range(6_000))])
)

tokenized_train = small_dataset_train.map(preprocess)
tokenized_test = small_dataset_test.map(preprocess)

Map: 100%|██████████| 60000/60000 [00:41<00:00, 1442.46 examples/s]
Map: 100%|██████████| 6000/6000 [00:04<00:00, 1475.70 examples/s]


In [33]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [34]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-cased",
    num_labels=len(OutputEnum),
    id2label=id2class,
    label2id=class2id,
    problem_type="multi_label_classification",
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
import numpy as np
import evaluate


training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_strategy="epoch",
    eval_strategy="epoch",
    report_to="none"
)

clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])


def sigmoid(x):
    return 1.0 / (1 + np.exp(-x))


def computer_metrics(eval_pred):
    pred, labels = eval_pred
    pred = sigmoid(pred)
    pred = (pred > 0.5).astype(int).reshape(-1)
    return clf_metrics.compute(
        predictions=pred, references=labels.astype(int).reshape(-1)
    )


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=computer_metrics,
    processing_class=tokenizer,
)


In [36]:
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [86]:
trainer.evaluate()

{'eval_loss': 0.3207530975341797,
 'eval_accuracy': 0.8833333333333333,
 'eval_f1': 0.7033898305084746,
 'eval_precision': 0.7614678899082569,
 'eval_recall': 0.6535433070866141,
 'eval_runtime': 13.8239,
 'eval_samples_per_second': 21.702,
 'eval_steps_per_second': 1.374,
 'epoch': 10.0}

In [87]:
from transformers import pipeline

classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
)

classifier(
     "This is clearly taken out of context. The author did not mean to say that, as you can hear in this previous recording"
)

Device set to use cuda:0


[[{'label': 'misleadingOther', 'score': 0.013595220632851124},
  {'label': 'misleadingFactualError', 'score': 0.03300686925649643},
  {'label': 'misleadingManipulatedMedia', 'score': 0.009008774533867836},
  {'label': 'misleadingOutdatedInformation', 'score': 0.010411562398076057},
  {'label': 'misleadingMissingImportantContext', 'score': 0.1889786422252655},
  {'label': 'misleadingUnverifiedClaimAsFact', 'score': 0.038660287857055664},
  {'label': 'misleadingSatire', 'score': 0.010134004056453705},
  {'label': 'trustworthySources', 'score': 0.8479096293449402},
  {'label': 'notMisleadingFactuallyCorrect', 'score': 0.26700371503829956},
  {'label': 'notMisleadingOutdatedButNotWhenWritten',
   'score': 0.004100655671209097},
  {'label': 'notMisleadingClearlySatire', 'score': 0.01117636077105999},
  {'label': 'notMisleadingPersonalOpinion', 'score': 0.008002033457159996}]]