In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import torch
from transformers.file_utils import is_tf_available, is_torch_available
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, RobertaTokenizerFast, RobertaForSequenceClassification, BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset

In [2]:
data_news = pd.read_csv('fake_or_real_news.csv')
print(data_news.shape)
data_news[:15]

(6335, 4)


Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
5,6903,"Tehran, USA","\nI’m not an immigrant, but my grandparents ...",FAKE
6,7341,Girl Horrified At What She Watches Boyfriend D...,"Share This Baylee Luciani (left), Screenshot o...",FAKE
7,95,‘Britain’s Schindler’ Dies at 106,A Czech stockbroker who saved more than 650 Je...,REAL
8,4869,Fact check: Trump and Clinton at the 'commande...,Hillary Clinton and Donald Trump made some ina...,REAL
9,2909,Iran reportedly makes new push for uranium con...,Iranian negotiators reportedly have made a las...,REAL


In [3]:
# This code segment parses the data_news dataset into a more manageable format

titles = []
tokenized_titles = []
sequence_labels = data_news['label']

title, tokenized_title =  [], []
for news in data_news['title']:
    title.append(news)
    tokenized_title.append(news.split(' '))

In [4]:
# Python list for each news
title[0], tokenized_title[0], sequence_labels[0]

('You Can Smell Hillary’s Fear',
 ['You', 'Can', 'Smell', 'Hillary’s', 'Fear'],
 'FAKE')

In [5]:
unique_sequence_labels = list(set(sequence_labels))
unique_sequence_labels

['FAKE', 'REAL']

In [6]:
# Convert sequence_labels to indices:
label_indices = [unique_sequence_labels.index(l) for l in sequence_labels]

news_dataset = Dataset.from_dict(
    dict(
        titles=title, 
        label=label_indices,
        tokens=tokenized_title,
    )
)
news_dataset = news_dataset.train_test_split(test_size=0.2)

news_dataset['train'][0]


{'titles': 'One chart that shows why the Republican Party was ready for Donald Trump',
 'label': 1,
 'tokens': ['One',
  'chart',
  'that',
  'shows',
  'why',
  'the',
  'Republican',
  'Party',
  'was',
  'ready',
  'for',
  'Donald',
  'Trump']}

In [7]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')


In [8]:
def preprocess_function(examples):
    return tokenizer(examples["titles"], truncation=True, padding=True)# truncation=True makes sure to exludes instances with more 512 tokens

In [9]:
# go over all our data set, tokenize them
seq_clf_tokenized_news = news_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/5068 [00:00<?, ? examples/s]

Map:   0%|          | 0/1267 [00:00<?, ? examples/s]

In [10]:
seq_clf_tokenized_news['train'][0]


{'titles': 'One chart that shows why the Republican Party was ready for Donald Trump',
 'label': 1,
 'tokens': ['One',
  'chart',
  'that',
  'shows',
  'why',
  'the',
  'Republican',
  'Party',
  'was',
  'ready',
  'for',
  'Donald',
  'Trump'],
 'input_ids': [101,
  2028,
  3673,
  2008,
  3065,
  2339,
  1996,
  3951,
  2283,
  2001,
  3201,
  2005,
  6221,
  8398,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]}

In [11]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [12]:
sequence_clf_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', 
                                                                         num_labels=len(unique_sequence_labels),)

# set an index -> label dictionary
sequence_clf_model.config.id2label = {i: l for i, l in enumerate(unique_sequence_labels)}

In [13]:
sequence_clf_model.config


DistilBertConfig {
  "_attn_implementation_autoset": true,
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "FAKE",
    "1": "REAL"
  },
  "initializer_range": 0.02,
  "label2id": null,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3",
  "vocab_size": 30522
}

In [14]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):  # common method to take in logits and calculate accuracy of the eval set
    logits, labels = eval_pred   # logit and label are returning from training loop
    predictions = np.argmax(logits, axis=-1) 
    return metric.compute(predictions=predictions, references=labels) # compute the accuracy

metric = evaluate.load("accuracy")
from sklearn.metrics import roc_auc_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

#####################################################

def compute_metrics_binary(eval_pred):
    """metrics for binary classification"""
    
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    # Calculate the AUC score
    auc_score = roc_auc_score(labels, preds)

    # Calculate the accuracy, true positive, false positive, false negative, and true negative values
    acc = metric.compute(predictions=preds, references=labels)
    tp = ((preds >= 0.5) & (labels == 1)).sum()
    fp = ((preds >= 0.5) & (labels == 0)).sum()
    fn = ((preds < 0.5) & (labels == 1)).sum()
    tn = ((preds < 0.5) & (labels == 0)).sum()

    # Calculate the precision, recall, and F1 score
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1_score = 2 * (precision * recall) / (precision + recall)

    return {
        'Validation Accuracy': acc['accuracy'],
        'Validation Precision': auc_score,
        'Validation AUC': precision,
        'Validation Recall': recall,
        'Validation F1_Score': f1_score,
        'Validation TP': tp,
        'Validation FP': fp,
        'Validation FN': fn,
        'Validation TN': tn,
    }

#####################################################

from sklearn.metrics import classification_report

def compute_metrics_multiclass(eval_pred):
    """metrics for multiclass classification"""
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    report = classification_report(labels, preds, output_dict=True)
    acc_score = report['accuracy']
    pre_score = report['macro avg']['precision']
    rcl_score = report['macro avg']['recall']
    f1_score = report['macro avg']['f1-score']

    return {
        'Validation Accuracy': acc_score,
        'Validation Macro Recall': rcl_score,
        'Validation Macro Precision': pre_score,        
        'Validation Macro F1_Score': f1_score,
        }

In [15]:
epochs = 10

# Training argument
training_args = TrainingArguments(
    output_dir="./distilbert_news_clf/results", # Local directory to save check point of our model as fitting
    num_train_epochs=epochs,         # minimum of two epochs
    per_device_train_batch_size=32,  # batch size for training and evaluation, it common to take around 32, 
    per_device_eval_batch_size=32,   # sometimes less or more, The smaller batch size, the more change model update 
    load_best_model_at_end=True,     # Even if we overfit the model by accident, load the best model through checkpoint
    
    # some deep learning parameters that the trainer is able to take in
    warmup_steps = len(seq_clf_tokenized_news['train']) // 5,  # learning rate scheduler by number of warmup steps
    weight_decay = 0.05,    # weight decay for our learning rate schedule (regularization)
    
    logging_steps = 1,  # Tell the model minimum number of steps to log between (1 means logging as much as possible)
    log_level = 'info',
    eval_strategy = 'epoch', # It is "steps" or "epoch", we choose epoch: how many times to stop training to test
    eval_steps = 50,
    save_strategy = 'epoch'  # save a check point of our model after each epoch
)

# Define the trainer:
trainer = Trainer(
    model=sequence_clf_model,   # take our model (sequence_clf_model)
    args=training_args,         # we just set it above
    train_dataset=seq_clf_tokenized_news['train'], # training part of dataset
    eval_dataset=seq_clf_tokenized_news['test'],   # test (evaluation) part of dataset
    compute_metrics=compute_metrics_binary,    # This part is optional but we want to calculate accuracy of our model 
    data_collator=data_collator         # data colladior with padding. Infact, we may or may not need a data collator
                                        # we can check the model to see how it lookes like with or without the collator
)

In [None]:
# Get initial metrics: evaluation on test set
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: tokens, titles. If tokens, titles are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 1267
  Batch size = 32


{'eval_loss': 2.0193748474121094,
 'eval_model_preparation_time': 0.001,
 'eval_Validation Accuracy': 0.1239147592738753,
 'eval_Validation Precision': 0.1243366848205558,
 'eval_Validation AUC': 0.12912912912912913,
 'eval_Validation Recall': 0.1396103896103896,
 'eval_Validation F1_Score': 0.13416536661466458,
 'eval_Validation TP': 86,
 'eval_Validation FP': 580,
 'eval_Validation FN': 530,
 'eval_Validation TN': 71,
 'eval_runtime': 14.4483,
 'eval_samples_per_second': 87.692,
 'eval_steps_per_second': 2.768}

In [17]:
trainer.train()


The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: tokens, titles. If tokens, titles are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 5,068
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1,590
  Number of trainable parameters = 66,955,010


KeyboardInterrupt: 

In [None]:
trainer.evaluate()


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: titles, tokens. If titles, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 1267
  Batch size = 32


{'eval_loss': 0.359478235244751,
 'eval_model_preparation_time': 0.002,
 'eval_Validation Accuracy': 0.8389897395422258,
 'eval_Validation Precision': 0.838791277258567,
 'eval_Validation AUC': 0.8328267477203647,
 'eval_Validation Recall': 0.8535825545171339,
 'eval_Validation F1_Score': 0.8430769230769231,
 'eval_Validation TP': 548,
 'eval_Validation FP': 110,
 'eval_Validation FN': 94,
 'eval_Validation TN': 515,
 'eval_runtime': 14.7476,
 'eval_samples_per_second': 85.912,
 'eval_steps_per_second': 2.712,
 'epoch': 10.0}

In [None]:
trainer.save_model()
model_path = "./distilbert-base-uncased"
sequence_clf_model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

Saving model checkpoint to ./distilbert_news_clf/results
Configuration saved in ./distilbert_news_clf/results\config.json
Model weights saved in ./distilbert_news_clf/results\model.safetensors
Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`
tokenizer config file saved in ./distilbert_news_clf/results\tokenizer_config.json
Special tokens file saved in ./distilbert_news_clf/results\special_tokens_map.json
Configuration saved in ./distilbert-base-uncased\config.json
Model weights saved in ./distilbert-base-uncased\model.safetensors
tokenizer config file saved in ./distilbert-base-uncased\tokenizer_config.json
Special tokens file saved in ./distilbert-base-uncased\special_tokens_map.json


('./distilbert-base-uncased\\tokenizer_config.json',
 './distilbert-base-uncased\\special_tokens_map.json',
 './distilbert-base-uncased\\vocab.txt',
 './distilbert-base-uncased\\added_tokens.json',
 './distilbert-base-uncased\\tokenizer.json')

In [None]:
from transformers import pipeline
pipe = pipeline("text-classification", "./distilbert-base-uncased", tokenizer=tokenizer)


loading configuration file ./distilbert-base-uncased\config.json
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "REAL",
    "1": "FAKE"
  },
  "initializer_range": 0.02,
  "label2id": null,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3",
  "vocab_size": 30522
}

loading configuration file ./distilbert-base-uncased\config.json
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_di

In [None]:
text = "skibidi toilet just died"
pipe(text)

[{'label': 'FAKE', 'score': 0.922602653503418}]