In this notebook, we use the [Hugging Face](https://huggingface.co/) Transformers library to solve the stance detection task of the Fake News Challenge Stage 1 ([FNC-I](http://www.fakenewschallenge.org/)).

In [1]:
import os
# Start from /content so the repository is cloned to /content/FakeNewsChallenge.
%cd /content
# --recursive also fetches any git submodules of the repository.
# The clone provides the csv files under /content/FakeNewsChallenge/data that are read below.
!git clone --recursive https://github.com/BiteKirby3/FakeNewsChallenge
# Later cells depend on this working directory: the Trainer writes to the relative
# output_dir MODEL_NAME, and the prediction cell reads the model back from
# '/content/FakeNewsChallenge/fnc-1-baseline/' + MODEL_NAME.
root_dir = "/content/FakeNewsChallenge/fnc-1-baseline"
os.chdir(root_dir)

/content
Cloning into 'FakeNewsChallenge'...
remote: Enumerating objects: 36, done.[K
remote: Counting objects: 100% (36/36), done.[K
remote: Compressing objects: 100% (27/27), done.[K
remote: Total 36 (delta 7), reused 31 (delta 4), pack-reused 0[K
Unpacking objects: 100% (36/36), 4.30 MiB | 6.29 MiB/s, done.


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import sklearn
import tensorflow as tf
import tqdm
import scipy 
import nltk
from datetime import date
import csv

In [3]:
!pip install transformers datasets evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m53.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m47.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m107.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hu

In [19]:
from huggingface_hub import notebook_login
# Interactive login to the Hugging Face Hub. The training cell sets push_to_hub=True,
# so this cell should be run before training starts.
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Data Loading for Hugging Face

In [4]:
from datasets import Dataset

In [5]:
#map of the expected ids to their labels (id2label) and its inverse (label2id)
id2label = {0: "unrelated", 1: "discuss", 2: "agree", 3: "disagree"}
label2id = {name: idx for idx, name in id2label.items()}

def _stances_with_bodies(stances, bodies):
  """Join each stance row with its article body and encode the stance as an integer label.

  The join key "Body ID" is dropped afterwards and "Stance" is renamed to "label".
  """
  return (
    stances.merge(bodies, how="inner", on="Body ID")
    .drop(columns="Body ID")
    .rename(columns={"Stance": "label"})
    .replace({"label": label2id})
  )

#raw csv files: train split
train_bodies = pd.read_csv('/content/FakeNewsChallenge/data/train_bodies.csv')
train_stances = pd.read_csv('/content/FakeNewsChallenge/data/train_stances.csv')
#raw csv files: competition test split
test_bodies = pd.read_csv('/content/FakeNewsChallenge/data/competition_test_bodies.csv')
test_stances = pd.read_csv('/content/FakeNewsChallenge/data/competition_test_stances.csv')

full_train = _stances_with_bodies(train_stances, train_bodies)
test = _stances_with_bodies(test_stances, test_bodies)

#hold out 20% of the training rows (fixed seed) as the validation split
validation = full_train.sample(frac=0.2, random_state=25)
train = full_train.drop(index=validation.index)

In [6]:
#wrap each pandas split in a Hugging Face Dataset
dataset_train, dataset_validation, dataset_test = (
    Dataset.from_pandas(frame) for frame in (train, validation, test)
)

# Select a pre-trained model

In [7]:
#@markdown In this notebook, the following model architectures can be selected for the stance classification task:
#@markdown [BERT](https://huggingface.co/docs/transformers/model_doc/bert),
#@markdown [RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta),
#@markdown [DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert),
#@markdown [GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt),
#@markdown [LLaMA](https://huggingface.co/docs/transformers/model_doc/llama) (placeholder only: no tokenizer or model is loaded for it in this notebook; needs High-RAM).
MODEL_NAME = 'RoBERTa' #@param ['BERT','DistilBERT','RoBERTa','GPT','LLaMA']

# Fine-tuning a pre-trained model

## Tokenize the input

In [9]:
from transformers import AutoTokenizer

In [10]:
#load a Model tokenizer to preprocess the text field
#reset first so a tokenizer left over from a previous run is never reused by accident
tokenizer = None
#Hub checkpoint for each MODEL_NAME choice that has loading code
_TOKENIZER_CHECKPOINTS = {
  'BERT': "bert-base-uncased",
  'RoBERTa': "roberta-base",
  'GPT': "openai-gpt",
}
if MODEL_NAME == 'LLaMA':
  #LLaMA is selectable in the form but nothing is loaded for it; stop here with a clear
  #message instead of leaving tokenizer = None and failing later inside the tokenization cell
  raise NotImplementedError("MODEL_NAME 'LLaMA' is not supported yet: no tokenizer is configured for it.")
#any other value (i.e. 'DistilBERT') keeps the original DistilBERT fallback
tokenizer = AutoTokenizer.from_pretrained(_TOKENIZER_CHECKPOINTS.get(MODEL_NAME, "distilbert-base-uncased"))

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [11]:
#tokenize input:<Headline><SEP><Body>
def tokenize_input(dataset):
  """Tokenize every (Headline, articleBody) pair of ``dataset`` as one sequence pair.

  Uses the module-level ``tokenizer``. Pairs are truncated by the tokenizer
  (``truncation=True``) but not padded.

  Args:
    dataset: Hugging Face Dataset with "Headline", "articleBody" and "label" columns.

  Returns:
    A Hugging Face Dataset containing the tokenizer outputs and the "label" column;
    every other input column is dropped.
  """
  def encode_batch(batch):
    #batched call: one tokenizer invocation per chunk of rows instead of one per row
    return tokenizer(batch["Headline"], batch["articleBody"], truncation=True)

  #keep only the label next to the tokenizer outputs, as the original row-by-row version did
  columns_to_drop = [name for name in dataset.column_names if name != "label"]
  return dataset.map(encode_batch, batched=True, remove_columns=columns_to_drop)

In [12]:
#run tokenize_input over the train and validation splits (the test split is tokenized at prediction time)
tokenized_dataset_train, tokenized_dataset_validation = (
    tokenize_input(split) for split in (dataset_train, dataset_validation)
)

In [13]:
#create a batch of examples using DataCollatorWithPadding
#tokenize_input only truncates, so padding is left to this collator when batches are built
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Evaluation metric

We are also going to compute metrics while training. For this, we need to define a `compute_metrics` function, that returns a dictionary with the desired metric values.

In [14]:
#metric during training
#the loaded "accuracy" metric object is used by compute_metrics through its .compute(...) method
import evaluate
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [15]:
#metric callback handed to the Trainer: scores predictions against labels with the accuracy metric
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: pair ``(predictions, labels)``; ``predictions`` holds one row of
            per-class scores per example, ``labels`` the reference class ids.

    Returns:
        The result of ``accuracy.compute`` for the highest-scoring class of each row.
    """
    class_scores, references = eval_pred
    #pick the highest-scoring class for every example
    predicted_ids = np.argmax(class_scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

## Train the model!

We are going to train the model using HuggingFace's Trainer API. This requires us to define 3 things: 

* a pre-trained model
* `TrainingArguments`, which specify the training hyperparameters. All options can be found in the [docs](https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments). Below, for example, we specify that the model is evaluated and saved after every epoch, and we set the learning rate, the batch sizes used for training and evaluation, the number of training epochs, and so on.
* a `Trainer` object (docs can be found [here](https://huggingface.co/transformers/main_classes/trainer.html#id1)).

In [16]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [17]:
#define the model
#reset first so a model left over from a previous run is never reused by accident
model = None
#Hub checkpoint for each MODEL_NAME choice that has loading code
_MODEL_CHECKPOINTS = {
  'BERT': "bert-base-uncased",
  'RoBERTa': "roberta-base",
  'GPT': "openai-gpt",
}
if MODEL_NAME == 'LLaMA':
  #a LLaMA llama-7b style configuration is not implemented here (it would exceed the Colab RAM);
  #fail with a clear message instead of handing model = None to the Trainer
  raise NotImplementedError("MODEL_NAME 'LLaMA' is not supported yet: no model is configured for it.")
#any other value (i.e. 'DistilBERT') keeps the original DistilBERT fallback;
#the classification head gets one output per stance and carries the id <-> label maps in its config
model = AutoModelForSequenceClassification.from_pretrained(
  _MODEL_CHECKPOINTS.get(MODEL_NAME, "distilbert-base-uncased"),
  num_labels=len(id2label),
  id2label=id2label,
  label2id=label2id,
)

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

In [20]:
#train
#optimisation settings
optimisation_settings = dict(
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
)
#evaluate and checkpoint once per epoch, reload the best checkpoint at the end, push to the Hub
checkpoint_settings = dict(
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)
#output_dir is relative: checkpoints are written to ./<MODEL_NAME> under the current working directory
training_args = TrainingArguments(output_dir=MODEL_NAME, **optimisation_settings, **checkpoint_settings)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_validation,
    compute_metrics=compute_metrics,
)

trainer.train()

Cloning https://huggingface.co/sxie3333/RoBERTa into local empty directory.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1382,0.098588,0.971783


TrainOutput(global_step=2499, training_loss=0.2063636672930891, metrics={'train_runtime': 4318.4443, 'train_samples_per_second': 9.258, 'train_steps_per_second': 0.579, 'total_flos': 1.0518776877732864e+16, 'train_loss': 0.2063636672930891, 'epoch': 1.0})

## Predict the stance

In [21]:
import torch

In [22]:
#(tokenizer, model) pairs already loaded, keyed by model_path.
#Re-running this cell empties the cache, which forces a reload after retraining.
_STANCE_MODEL_CACHE = {}

def classify_stance(headline, body, model_path):
  """Predict the stance label of one (headline, body) pair.

  The tokenizer and model stored at ``model_path`` are loaded on the first call only
  and reused afterwards, so classifying many rows does not reload them for every row.

  Args:
    headline: headline text.
    body: article body text.
    model_path: directory or Hub id of a fine-tuned sequence classification model.

  Returns:
    The label name that ``model.config.id2label`` assigns to the highest-scoring class.
  """
  if model_path not in _STANCE_MODEL_CACHE:
    _STANCE_MODEL_CACHE[model_path] = (
      AutoTokenizer.from_pretrained(model_path),
      AutoModelForSequenceClassification.from_pretrained(model_path),
    )
  tokenizer, model = _STANCE_MODEL_CACHE[model_path]

  inputs = tokenizer(headline, body, truncation=True, return_tensors="pt")
  with torch.no_grad():
    logits = model(**inputs).logits
  #get the class with the highest score (argmax over the class axis of the single-example batch)
  predicted_class_id = logits.argmax(dim=-1).item()

  return model.config.id2label[predicted_class_id]

In [None]:
#Classify the test dataset and write the responses to an output csv file.
filename = "/content/FakeNewsChallenge/result/"+MODEL_NAME+str(date.today())+".csv"
#fine-tuned model directory (the Trainer's output_dir); it is the same for every row
model_path = '/content/FakeNewsChallenge/fnc-1-baseline/'+MODEL_NAME
#make sure the result folder exists before the file is opened for writing
os.makedirs(os.path.dirname(filename), exist_ok=True)
with open(filename, 'w', newline='') as csvfile:
    fieldnames = ["STANCE_INDEX","ACTUAL_STANCE","PREDICT_STANCE"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    #iterate the rows once instead of indexing dataset_test[i] several times per row
    for i, row in enumerate(dataset_test):
      predict_stance = classify_stance(row['Headline'], row['articleBody'], model_path)
      writer.writerow({'STANCE_INDEX': str(i), 'ACTUAL_STANCE': str(id2label[row['label']]), 'PREDICT_STANCE': str(predict_stance)})
      #push each row out of the Python buffer right away during this long-running loop
      csvfile.flush()

# Scoring the classifier for the FNC competition

In [35]:
from utils.score import report_score

In [38]:
#read the generated csv
#filename is the path set in the classification cell; columns are STANCE_INDEX, ACTUAL_STANCE, PREDICT_STANCE
output = pd.read_csv(filename)  

In [37]:
#score the predicted stances against the actual ones with the report_score helper imported from utils.score
#(gold labels first, predictions second)
report_score(output["ACTUAL_STANCE"], output["PREDICT_STANCE"])

-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |   1187    |    36     |    632    |    48     |
-------------------------------------------------------------
| disagree  |    288    |    52     |    290    |    67     |
-------------------------------------------------------------
|  discuss  |    537    |    54     |   3716    |    157    |
-------------------------------------------------------------
| unrelated |    52     |    20     |    303    |   17974   |
-------------------------------------------------------------
Score: 9907.75 out of 11651.25	(85.03594034974788%)


85.03594034974788