In [None]:
import os
import torch
import pandas as pd
import evaluate
import numpy as np
import seaborn as sns
import time
import wandb
import warnings
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, AutoConfig, TrainingArguments, Trainer, pipeline
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from datetime import datetime
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from dotenv import load_dotenv
from huggingface_hub import login

## Fine-Tuning DistilBERT, MobileBERT and TinyBERT
## for Fake News Detection

In [None]:
# Silence every warning for cleaner notebook output. This hides all warnings raised
# by later cells, so it should not be carried over to production code.
warnings.filterwarnings("ignore") #Don't do in production

In [None]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
device

In [None]:
# Release cached GPU memory held by PyTorch's allocator before loading models.
torch.cuda.empty_cache()

In [None]:
#load_dotenv()

In [None]:
# Run identifiers. RUN_NAME is a timestamp taken when this cell executes, so
# re-running the cell produces a new run name (and new model save paths).
PROJECT_NAME = "FakeNewsClassification"
RUN_NAME =  f"{datetime.now():%Y-%m-%d_%H.%M.%S}"
HF_USER = "CharlesMac"
PROJECT_RUN_NAME = f"{PROJECT_NAME}-{RUN_NAME}"
# In the visible cells HUB_MODEL_NAME is only referenced by a commented-out hub_model_id.
HUB_MODEL_NAME = f"{HF_USER}/{PROJECT_RUN_NAME}"

# Training hyperparameters consumed by TrainingArguments.
EPOCHS = 3
LEARNING_RATE = 2e-5
# In the visible cells LOG_INTO_WANDB is only referenced by commented-out W&B code.
LOG_INTO_WANDB = True
# Used as both the per-device train and eval batch size.
BATCH_SIZE = 32
# Passed to TrainingArguments as output_dir.
TRAINING_DIR = "train_dir"

In [None]:
# Load the raw dataset from an Excel workbook; the path is relative to the
# notebook's working directory. Later cells read the 'title', 'text' and 'label' columns.
df = pd.read_excel('Data/fake_news.xlsx')
df.head()

In [None]:
df.isnull().sum()

In [None]:
# Discard every row that has at least one missing value.
df = df.dropna()

In [None]:
df.shape

In [None]:
df['label'].value_counts()

## Dataset Analysis

In [None]:
# Horizontal bar chart of how many rows belong to each label (smallest class first).
label_count = df['label'].value_counts(ascending=True)
ax = label_count.plot.barh()
ax.set_title("Frequency of Classes")
plt.show()

In [None]:
# Rough sequence-length estimate: assume ~1.5 tokens per whitespace-separated word
# on average (a heuristic, not an actual tokenizer count).
TOKENS_PER_WORD = 1.5
df['title_tokens'] = df['title'].apply(lambda x: len(x.split()) * TOKENS_PER_WORD)
df['text_tokens'] = df['text'].apply(lambda x: len(x.split()) * TOKENS_PER_WORD)

# Side-by-side histograms of the estimated token counts for titles and article bodies.
fig, ax = plt.subplots(1, 2, figsize=(15,5))
ax[0].hist(df['title_tokens'], bins=50, color='b')
ax[0].set_title("Title Tokens")
ax[1].hist(df['text_tokens'], bins=50, color='b')
ax[1].set_title("Text Tokens")
for axis in ax:
    axis.set_xlabel("Estimated tokens")
    axis.set_ylabel("Count")
plt.show()

## Dataloader and Train Test Split

In [None]:
# Stratified 70 / 20 / 10 split: hold out 30% of the rows, then give one third
# of that hold-out to validation and keep the remaining two thirds as the test set.
split_options = dict(random_state=42, shuffle=True)

train, test = train_test_split(df, test_size=0.3, stratify=df['label'], **split_options)

test, validation = train_test_split(test, test_size=1/3, stratify=test['label'], **split_options)

In [None]:
train.shape, test.shape, validation.shape

In [None]:
# Wrap each pandas split in a Hugging Face Dataset, dropping the pandas index.
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame, preserve_index=False)
    for split_name, frame in (("train", train), ("test", test), ("validation", validation))
})

dataset

## Data Tokenization

In [None]:
text = "Machine learning is awesome"

In [None]:
# Checkpoint ids of the three compact models being compared.
model_ckpt = "distilbert-base-uncased"
mobile_model_ckpt = "google/mobilebert-uncased"
tiny_model_ckpt = "huawei-noah/TinyBERT_General_4L_312D"

# Load one tokenizer per checkpoint.
distilbert_tokenizer, mobilebert_tokenizer, tinybert_tokenizer = (
    AutoTokenizer.from_pretrained(ckpt)
    for ckpt in (model_ckpt, mobile_model_ckpt, tiny_model_ckpt)
)

# Split the same sample sentence with each tokenizer for comparison.
distilbert_tokens, mobilebert_tokens, tinybert_tokens = (
    tok.tokenize(text)
    for tok in (distilbert_tokenizer, mobilebert_tokenizer, tinybert_tokenizer)
)

In [None]:
distilbert_tokenizer, mobilebert_tokenizer, tinybert_tokenizer

In [None]:
def tokenize(batch):
    """Encode the ``title`` entries of a batch with ``distilbert_tokenizer``.

    Args:
        batch: Mapping with a ``'title'`` key holding the texts to encode.

    Returns:
        Whatever ``distilbert_tokenizer`` returns when called with padding and
        truncation enabled. Only the title is encoded; other keys are ignored.
    """
    return distilbert_tokenizer(batch['title'], truncation=True, padding=True)

# Sanity check: encode the first two training rows and inspect the tokenizer output.
print (tokenize(dataset['train'][:2]))

In [None]:
# Apply tokenize() to every split. NOTE(review): with batched=True and batch_size=None
# each split is presumably passed as a single batch, so padding=True in tokenize()
# pads to the longest title of that split — confirm against the datasets docs.
encoded_dataset = dataset.map(tokenize, batch_size=None, batched=True)

## Model Building

In [None]:
# Class-name <-> class-id mapping stored on the model config.
label2id = {"Real": 0, "Fake": 1}
id2label = {idx: name for name, idx in label2id.items()}
num_labels = len(label2id)

# DistilBERT with a sequence-classification head, placed on the selected device.
model_ckpt = "distilbert-base-uncased"
config = AutoConfig.from_pretrained(model_ckpt, label2id=label2id, id2label=id2label)
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, config=config)
model = model.to(device)

In [None]:
model.config.id2label

## Fine Tuning

In [None]:
# Log in to HuggingFace

# hf_token = os.environ['HF_TOKEN']
# login(hf_token, add_to_git_credential=True)

In [None]:
# wandb_api_key = os.environ['WANDB_API_KEY']
# os.environ["WANDB_API_KEY"] = wandb_api_key
# wandb.login()
# #
# # # Configure Weights & Biases to record against our project
# os.environ["WANDB_PROJECT"] = PROJECT_NAME
# os.environ["WANDB_LOG_MODEL"] = "checkpoint" if LOG_INTO_WANDB else "end"
# os.environ["WANDB_WATCH"] = "gradients"

In [None]:
# if LOG_INTO_WANDB:
#     wandb.init(project=PROJECT_NAME, name=RUN_NAME)

In [None]:
# Accuracy metric from the `evaluate` library; used by compute_metrics_evaluate().
accuracy = evaluate.load("accuracy")

In [None]:
def compute_metrics_evaluate(eval_pred):
    """Compute accuracy with the module-level ``accuracy`` metric.

    Args:
        eval_pred: A pair that unpacks into ``(predictions, labels)``, where
            ``predictions`` holds per-class scores along axis 1.

    Returns:
        The result of ``accuracy.compute`` for the arg-max class of every row.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [None]:
def compute_metrics(pred):
    """Compute accuracy and weighted F1 for a prediction object.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is the predicted class).

    Returns:
        dict with keys ``"accuracy"`` and ``"f1"`` (weighted average).
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_hat),
        "f1": f1_score(y_true, y_hat, average='weighted'),
    }

In [None]:
# Shared training configuration. train_model() reuses this same object for every
# benchmarked checkpoint, so all runs write checkpoints under TRAINING_DIR.
training_args = TrainingArguments(output_dir=TRAINING_DIR,
                                  overwrite_output_dir=True,
                                  num_train_epochs=EPOCHS,
                                  learning_rate=LEARNING_RATE,
                                  per_device_train_batch_size=BATCH_SIZE,
                                  per_device_eval_batch_size=BATCH_SIZE,
                                  # The string "none" disables all logging integrations.
                                  # Passing None does NOT: transformers v4 treats None as
                                  # "use the default", i.e. every installed integration
                                  # (including wandb, which this notebook imports).
                                  report_to="none",
#                                  report_to="wandb" if LOG_INTO_WANDB else "none",
                                  run_name=RUN_NAME,
                                  weight_decay=0.01,
                                  # Evaluate on the eval dataset at the end of every epoch.
                                  # NOTE(review): newer transformers releases rename this
                                  # argument to `eval_strategy` — confirm against the installed version.
                                  evaluation_strategy='epoch',
                                  disable_tqdm=False,
                                  # hub_model_id=HUB_MODEL_NAME,
                                  # hub_private_repo=True
                                  )

In [None]:
# Trainer for the stand-alone DistilBERT run: fits on the encoded train split and
# evaluates on the validation split, reporting accuracy only.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['validation'],
    tokenizer=distilbert_tokenizer,
    compute_metrics=compute_metrics_evaluate,
)

In [None]:
# Run fine-tuning; this is the long-running cell of the stand-alone DistilBERT run.
trainer.train()

In [None]:
# Persist the fine-tuned model under Models/, suffixed with this run's timestamp.
trainer.save_model(f"Models/fake_news{RUN_NAME}")

## Model Evaluation

In [None]:
# Score the held-out test split; the result exposes .predictions and .metrics,
# both of which are used by the following cells.
preds_output = trainer.predict(encoded_dataset['test'])

In [None]:
preds_output.metrics

In [None]:
# Predicted class = index of the highest score in each row; ground truth comes
# from the test split's 'label' column.
y_pred = preds_output.predictions.argmax(axis=1)
y_true = encoded_dataset['test'][:]['label']

In [None]:
# target_names follows label2id's insertion order ("Real", "Fake").
# NOTE(review): this assumes the dataset's label column encodes Real as 0 and Fake as 1 — confirm.
print(classification_report(y_true, y_pred, target_names=list(label2id)))

In [None]:
# Confusion matrix heatmap: rows are the actual classes, columns the predicted ones.
cm = confusion_matrix(y_true, y_pred)
fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Reds',
    cbar=False,
    xticklabels=label2id.keys(),
    yticklabels=label2id.keys(),
    ax=ax,
)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

## Model benchmarking

In [None]:
# Short display name -> Hugging Face checkpoint id for every model that
# train_model() benchmarks. Keys are also used as keys of model_performance.
model_dict = {
    "bert-base":"bert-base-uncased",
    "distilbert":"distilbert-base-uncased",
    "mobilebert":"google/mobilebert-uncased",
    "tinybert":"huawei-noah/TinyBERT_General_4L_312D"
}

In [None]:
def train_model(model_name):
    """Fine-tune one checkpoint from ``model_dict`` and return its test-set metrics.

    The ``title`` column of ``dataset`` is encoded with the checkpoint's own
    tokenizer, instead of reusing ``encoded_dataset`` (which was produced by
    ``distilbert_tokenizer``). The model is built from a config carrying the
    ``label2id`` / ``id2label`` mapping. Training uses the shared module-level
    ``training_args`` and reports accuracy and weighted F1 via ``compute_metrics``.

    Args:
        model_name: Key of ``model_dict`` selecting the checkpoint to train.

    Returns:
        The ``metrics`` dict of ``trainer.predict`` on the test split.

    Raises:
        KeyError: If ``model_name`` is not a key of ``model_dict``.
    """
    model_ckpt = model_dict[model_name]
    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
    config = AutoConfig.from_pretrained(model_ckpt, label2id=label2id, id2label=id2label)
    # Pass the config so the label mapping is actually attached to the model.
    model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, config=config)

    def local_tokenizer(batch):
        # Call the tokenizer itself (not .tokenize(), which only splits a single
        # string into token strings) to get padded, truncated model inputs.
        return tokenizer(batch['title'], padding=True, truncation=True)

    # Encode the raw splits with this model's tokenizer, mirroring how
    # encoded_dataset is built for the stand-alone DistilBERT run.
    model_dataset = dataset.map(local_tokenizer, batched=True, batch_size=None)

    trainer = Trainer(model=model,
                      args=training_args,
                      compute_metrics=compute_metrics,
                      train_dataset=model_dataset['train'],
                      eval_dataset=model_dataset['validation'],
                      tokenizer=tokenizer)

    trainer.train()

    preds = trainer.predict(model_dataset['test'])
    # Use the short model name in the path: checkpoint ids such as
    # "google/mobilebert-uncased" contain "/" and would create nested directories.
    trainer.save_model(os.path.join("Models", f"fake_news_{model_name}_{RUN_NAME}"))
    return preds.metrics


In [None]:
# Train every model in model_dict in turn, recording its test metrics and the
# wall-clock seconds spent inside train_model().
model_performance = {}
for model_name in model_dict:
    print("\n\n")
    print("Training Model: ", model_name)
    started_at = time.time()
    result = train_model(model_name)
    elapsed = time.time() - started_at
    model_performance[model_name] = {model_name: result, "time taken": elapsed}

In [None]:
model_performance

{'bert-base': {'test_loss': 0.14568275213241577,
  'test_accuracy': 0.9606126914660832,
  'test_f1': 0.960656369998893,
  'test_runtime': 2.4963,
  'test_samples_per_second': 1464.584,
  'test_steps_per_second': 46.069},
 'distilbert': {'test_loss': 0.1263875514268875,
  'test_accuracy': 0.962527352297593,
  'test_f1': 0.9625578288439263,
  'test_runtime': 1.3223,
  'test_samples_per_second': 2764.84,
  'test_steps_per_second': 86.968},
 'mobilebert': {'test_loss': 0.13580088317394257,
  'test_accuracy': 0.9488512035010941,
  'test_f1': 0.9489814357607576,
  'test_runtime': 1.4697,
  'test_samples_per_second': 2487.511,
  'test_steps_per_second': 78.245},
 'tinybert': {'test_loss': 0.1453436017036438,
  'test_accuracy': 0.9447483588621444,
  'test_f1': 0.9446144684798617,
  'test_runtime': 0.4243,
  'test_samples_per_second': 8616.846,
  'test_steps_per_second': 271.044}}

In [None]:
# if LOG_INTO_WANDB:
#     wandb.finish()