In [1]:
import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from transformers import  DataCollatorWithPadding
from transformers import  TrainingArguments, Trainer
import numpy as np
from datasets import load_metric
import tensorflow as tf

In [2]:
# Import a fake news dataset from HuggingFace repository:

raw_datasets = load_dataset("GonzaloA/fake_news")

Repo card metadata block was not found. Setting CardData to empty.


In [3]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'title', 'text', 'label'],
        num_rows: 24353
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'title', 'text', 'label'],
        num_rows: 8117
    })
    test: Dataset({
        features: ['Unnamed: 0', 'title', 'text', 'label'],
        num_rows: 8117
    })
})

In [4]:
# Load the distilBERT base uncased model and tokenize the data:

checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, truncation=True,padding=True)

In [5]:
# GLUE's classification tasks, selecting sst2 (Standford Sentiment Treebank) for sentiment (positive or negative) classification task:

task_to_keys = {
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "mnli-mm": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "qqp": ("question1", "question2"),
    "rte": ("sentence1", "sentence2"),
    "sst2": ("sentence", None),
    "stsb": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}

In [6]:
task = "sst2"

In [8]:
# Functionalize the tokenizer process and ensure a maximum length on an input sentence that the model can handle is trundicated. All other samples will be padded to the maxiumum length.

sentence1_key, sentence2_key = task_to_keys[task]

def preprocess_function(examples):
    if sentence2_key is None:
        return tokenizer(examples['title'], truncation=True,padding=True)
    return tokenizer(examples[sentence1_key], examples[sentence2_key], truncation=True)

In [9]:
encoded_dataset = raw_datasets.map(preprocess_function, batched=True)

In [10]:
encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'title', 'text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 24353
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'title', 'text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 8117
    })
    test: Dataset({
        features: ['Unnamed: 0', 'title', 'text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 8117
    })
})

In [11]:
# Finetune the model with training arguments:

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
training_args = TrainingArguments("test-trainer", num_train_epochs=3, max_steps=100)

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.weight', 'pre_classifier.we

In [12]:
# Benchmark the model with an accuracy metric using the predictions (logits) and labels:

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

  metric = load_metric("accuracy")


In [13]:
# Compile, train and evaluate:

trainer = Trainer(
    model,
    training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics)

In [14]:
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


TrainOutput(global_step=100, training_loss=0.35274051666259765, metrics={'train_runtime': 357.196, 'train_samples_per_second': 2.24, 'train_steps_per_second': 0.28, 'total_flos': 12685823224416.0, 'train_loss': 0.35274051666259765, 'epoch': 0.03})

In [15]:
trainer.evaluate()

{'eval_loss': 0.21315033733844757,
 'eval_accuracy': 0.9223851176543058,
 'eval_runtime': 776.4398,
 'eval_samples_per_second': 10.454,
 'eval_steps_per_second': 1.307,
 'epoch': 0.03}

In [16]:
# Performing some prediction to classify some samples with the trained model:

raw_train_dataset = raw_datasets["train"]

In [17]:
# Example of a fake news exerpt:
raw_train_dataset[0]['title']

' ‘Maury’ Show Official Facebook Posts F*CKED UP Caption On Guest That Looks Like Ted Cruz (IMAGE)'

In [18]:
# The classification of the fake news exerpt:

inputs = tokenizer(raw_train_dataset[0]['title'], return_tensors="pt")
labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
outputs = model(**inputs, labels=labels)

# Pass the logit outputs into a softmax function to get the probabilities of being fake and true:

tf.nn.softmax(outputs[1].detach().numpy())

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0.98396933, 0.0160307 ]], dtype=float32)>

In [19]:
# Example of a real news exerpt:
raw_train_dataset[210]['title']

'U.N. says it believes Afghanistan air strike killed civilians'

In [20]:
# The classification of the real news exerpt:

inputs = tokenizer(raw_train_dataset[210]['title'], return_tensors="pt")
labels = torch.tensor([1]).unsqueeze(0)
outputs = model(**inputs, labels=labels)

tf.nn.softmax(outputs[1].detach().numpy())

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0.02911408, 0.97088593]], dtype=float32)>