# BERT for fake news detection


Make sure that you are using GPU

In [2]:
import torch 

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

You can see the type of gpu available for you:

In [3]:
!nvidia-smi

Wed Mar 23 20:04:02 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P8    27W / 149W |      3MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## 1. Load the data

In [4]:
# Load the Drive helper and mount
from google.colab import drive
drive.mount('/content/drive')
root='drive/My Drive/Colab Notebooks/fakeddit/'
paht_data = 'drive/My Drive/Colab Notebooks/docencia/DeepNLP/'

Mounted at /content/drive


In [5]:
import pandas as pd
train = pd.read_csv(paht_data+"data/fakeddit/all_train.tsv", sep='\t')
val  = pd.read_csv(paht_data+"data/fakeddit/all_validate.tsv", sep='\t')
test  = pd.read_csv(paht_data+"data/fakeddit/all_test_public.tsv", sep='\t')

print('size training: ', len(train))
print('size validation: ', len(val))
print('size test: ', len(test))


size training:  878218
size validation:  92444
size test:  92444


To make faster experimentation, we reduce the size of the datasets (of course, the results will be worse)

In [6]:
import numpy as np

print(f"Training patterns before reduction: {len(train)}")
train = train.sample(int(len(train)/100), random_state=12345)
print(f"Training patterns after reduction:  {len(train)}")

print(f"Training patterns before reduction: {len(val)}")
val = val.sample(int(len(val)/100), random_state=12345)
print(f"Training patterns after reduction:  {len(val)}")


print(f"Test patterns before reduction: {len(test)}")
test = test.sample(int(len(test)/100), random_state=12345)
print(f"Test patterns after reduction:  {len(test)}")

Training patterns before reduction: 878218
Training patterns after reduction:  8782
Training patterns before reduction: 92444
Training patterns after reduction:  924
Test patterns before reduction: 92444
Test patterns after reduction:  924


In [7]:
train.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,author,clean_title,created_utc,domain,hasImage,id,image_url,linked_submission_id,num_comments,score,subreddit,title,upvote_ratio,2_way_label,3_way_label,6_way_label
23458,23458,23458,,,Joeness84,my gf said post to reddit things with faces,1553744000.0,imgur.com,True,b6dx9r,https://external-preview.redd.it/G6wbn3f5V2TJ5...,,0.0,3,pareidolia,"My gf said ""Post to Reddit things with faces""",0.67,0,2,2
445541,445541,445541,,,ChemicalHoneydew,hamster wearing a dprinted cubone skull,1552924000.0,i.redd.it,False,b2ka7a,,,3.0,4,photoshopbattles,PsBattle: Hamster Wearing A 3d-printed Cubone ...,0.83,1,0,0
65122,65122,65122,,,beancounter2885,king george vi enjoying a slide,1398807000.0,i.imgur.com,True,24b9d7,https://external-preview.redd.it/J-1hwAg8VViY6...,,6.0,20,photoshopbattles,King George VI enjoying a slide (x/post r/Hist...,0.85,1,0,0
77341,77341,77341,47807.0,219079.0,ApiContraption,,1517682000.0,,True,dtonon3,https://i.imgur.com/PSgjr8f%2ejpg,7v114x,,1,psbattle_artwork,,,0,2,4
483082,483082,483082,,,drsizzl,you didnt think anyone would notice huh,1560052000.0,i.redd.it,True,byg98w,https://preview.redd.it/ax3u6iox59331.png?widt...,,0.0,4,mildlyinteresting,"You didn't think anyone would notice, huh?",0.61,1,0,0


We only keep the clean title and the columns for classification:

In [8]:
train_data = train[["clean_title", "2_way_label", "3_way_label", "6_way_label"]]
val_data=val[["clean_title", "2_way_label", "3_way_label", "6_way_label"]]
test_data=test[["clean_title", "2_way_label", "3_way_label", "6_way_label"]]
# remove records with Nan values
train_data = train_data.dropna()
val_data = val_data.dropna()
test_data = test_data.dropna()


In [9]:
train_data.head()

Unnamed: 0,clean_title,2_way_label,3_way_label,6_way_label
23458,my gf said post to reddit things with faces,0,2,2
445541,hamster wearing a dprinted cubone skull,1,0,0
65122,king george vi enjoying a slide,1,0,0
483082,you didnt think anyone would notice huh,1,0,0
715222,woah that red bull stuff really does give you ...,0,2,4


We save the texts:

In [10]:
X_train = train_data["clean_title"].values.tolist()
X_val = val_data["clean_title"].values.tolist()
X_test = test_data["clean_title"].values.tolist()
print(type(X_train))

<class 'list'>


In [11]:
print(X_train[0])
print(X_val[0])
print(X_test[0])


my gf said post to reddit things with faces
look at this beautiful flower that landed in my garden pool
dog in a guitar case


We decide what type of classication we will address: 
- binary (2_way_label)
- tres classes (3_way_label)
- 6 classes (6_way_label)

In [12]:
TYPE_CLASSIFICATION = 2 # 3 OR 6

In [13]:
y_train = train_data[str(TYPE_CLASSIFICATION) + '_way_label'].values.tolist()
y_val = val_data[str(TYPE_CLASSIFICATION) + '_way_label'].values.tolist()
y_test = test_data[str(TYPE_CLASSIFICATION) + '_way_label'].values.tolist()


We do not need to encoder the labels because they are already provided as numbers. 

That is, in the 6_way_label, the labels are 0, 1, 2, 3, 4, 5. 
- 0: true
- 1: Satire/Parody:
- 2: Misleading Content
- 3: Imposter Content
- 4: False Connection
- 5: Manipulated Content

In the 3_way_label, the labels are 0, 1, 2:
- 0: true
- 1: the sample is fake and contains text that is true (i.e. direct quotes from propaganda posters)
- 2: False

In the 2_way_label, the labels are 0, 1:
- 0: true
- 1: False


In [14]:
labels_dict = {}
if TYPE_CLASSIFICATION == 2:
    labels_dict[0] = "True"
    labels_dict[1] = "False"
elif TYPE_CLASSIFICATION == 3:
    labels_dict[0] = "True"
    labels_dict[1] = "Fake contains True"
    labels_dict[2] = "False"
elif TYPE_CLASSIFICATION == 6:
    labels_dict[0] = "True"
    labels_dict[1] = "Satire/Parody"
    labels_dict[2] = "Misleading Content"
    labels_dict[3] = "Imposter Content"
    labels_dict[4] = "False Connection"
    labels_dict[5] = "Manipulated Content"

print(labels_dict)

{0: 'True', 1: 'False'}


In [15]:
print(X_train[0], y_train[0], labels_dict[y_train[0]])
print(X_val[0], y_val[0], labels_dict[y_val[0]])
print(X_test[0], y_test[0], labels_dict[y_test[0]])




my gf said post to reddit things with faces 0 True
look at this beautiful flower that landed in my garden pool 0 True
dog in a guitar case 1 False


## Data Visualization

Here we should provide more detail about the dataset (class distribution, average lenght of the texts, etc)

In [16]:
# we will do it after

## Transformers

First, we install the libraries:

In [17]:
!pip3 install transformers numpy torch sklearn


Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 12.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 4.4 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 36.9 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 37.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 40.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found e

To make our experiments reproducible, we set a random seed:


In [18]:
import torch
import numpy as np
torch.manual_seed(0)
np.random.seed(0)

Import the necessary modules:

In [19]:
import torch
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import numpy as np
import random
from sklearn.model_selection import train_test_split

we'll be using the BERT model. More specifically, we'll be using bert-base-uncased pre-trained weights from the library.

In [20]:
# the model we gonna train, base uncased BERT
# check text classification models here: https://huggingface.co/models?filter=text-classification
model_name = "bert-base-uncased"
# max sequence length for each document/sentence sample
max_length = 512

Download the tokenizer provided by this model:

In [21]:
# load the tokenizer
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Now let's use our tokenizer to encode our corpus. We set truncation to True so that we eliminate tokens that go above max_length, we also set padding to True to pad documents that are less than max_length with empty tokens.

In [22]:
# tokenize the dataset, truncate when passed `max_length`, 
# and pad with 0's when less than `max_length`
train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=max_length)
val_encodings = tokenizer(X_val, truncation=True, padding=True, max_length=max_length)
test_encodings = tokenizer(X_test, truncation=True, padding=True, max_length=max_length)


The below code wraps our tokenized text data into a torch Dataset. Since we gonna use Trainer from Transformers library, it expects our dataset as a torch.utils.data.Dataset, so we made a simple class that implements the len() method that returns the number of samples, and getitem() method to return a data sample at a specific index.

In [30]:
class TorchDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)

# convert our tokenized data into a torch Dataset
train_torch_dataset = TorchDataset(train_encodings, y_train)
val_torch_dataset = TorchDataset(val_encodings, y_val)
test_torch_dataset = TorchDataset(test_encodings, y_test)

In [31]:
print(len(train_encodings.encodings), len(y_train))

8031 8031


## Training the model

Now that we have our data prepared, let's download and load our BERT model and its pre-trained weights. We're using BertForSequenceClassification class from Transformers library, we set num_labels to the length of our available labels, in this case, 2.

We also cast our model to our CUDA GPU. If you're on CPU (not suggested), then just delete to() method.

In [32]:
# load the model and pass to CUDA
model = BertForSequenceClassification.from_pretrained(model_name, num_labels = TYPE_CLASSIFICATION).to("cuda")

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/bert-base-uncased/resolve/main/pytorch_model.bin from cache 

Before we start fine-tuning our model, let's make a simple function to compute the metrics we want. In this case, accuracy.
You're free to include any metric you want, I've included accuracy, but you can add precision, recall, etc.

In [33]:
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  # calculate accuracy using sklearn's function
  acc = accuracy_score(labels, preds)
  return {
      'accuracy': acc,
  }

In [34]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=20,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)
    # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
    logging_steps=400,               # log & save weights each logging_steps
    save_steps=400,
    evaluation_strategy="steps",     # evaluate each `logging_steps`
)

using `logging_steps` to initialize `eval_steps` to 400
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [35]:
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset= train_torch_dataset,         # training dataset
    eval_dataset= val_torch_dataset,          # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
)

# train the model
trainer.train()

***** Running training *****
  Num examples = 8031
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3012


Step,Training Loss,Validation Loss,Accuracy
400,0.5602,0.501498,0.767497
800,0.4865,0.533748,0.80427
1200,0.4099,0.501628,0.806643
1600,0.3644,0.619363,0.798339


***** Running Evaluation *****
  Num examples = 843
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-400
Configuration saved in ./results/checkpoint-400/config.json
Model weights saved in ./results/checkpoint-400/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 843
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-800
Configuration saved in ./results/checkpoint-800/config.json
Model weights saved in ./results/checkpoint-800/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 843
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-1200
Configuration saved in ./results/checkpoint-1200/config.json
Model weights saved in ./results/checkpoint-1200/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 843
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-1600
Configuration saved in ./results/checkpoint-1600/config.json
Model weights saved in ./results/checkpoint-1600/pytorch_model.bin


Step,Training Loss,Validation Loss,Accuracy
400,0.5602,0.501498,0.767497
800,0.4865,0.533748,0.80427
1200,0.4099,0.501628,0.806643
1600,0.3644,0.619363,0.798339
2000,0.344,0.608961,0.794781
2400,0.1978,0.842004,0.807829
2800,0.1865,0.842761,0.807829


***** Running Evaluation *****
  Num examples = 843
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-2000
Configuration saved in ./results/checkpoint-2000/config.json
Model weights saved in ./results/checkpoint-2000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 843
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-2400
Configuration saved in ./results/checkpoint-2400/config.json
Model weights saved in ./results/checkpoint-2400/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 843
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-2800
Configuration saved in ./results/checkpoint-2800/config.json
Model weights saved in ./results/checkpoint-2800/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./results/checkpoint-400 (score: 0.5014976263046265).


TrainOutput(global_step=3012, training_loss=0.3478923432855492, metrics={'train_runtime': 904.2623, 'train_samples_per_second': 26.644, 'train_steps_per_second': 3.331, 'total_flos': 780010709722020.0, 'train_loss': 0.3478923432855492, 'epoch': 3.0})

In [37]:
# evaluate the current model after training
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 843
  Batch size = 20


{'epoch': 3.0,
 'eval_accuracy': 0.767497034400949,
 'eval_loss': 0.5014976263046265,
 'eval_runtime': 6.3238,
 'eval_samples_per_second': 133.306,
 'eval_steps_per_second': 6.8}

In [36]:
# saving the fine tuned model & tokenizer
model_path = "fake-bert-base-uncased"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

Configuration saved in fake-bert-base-uncased/config.json
Model weights saved in fake-bert-base-uncased/pytorch_model.bin
tokenizer config file saved in fake-bert-base-uncased/tokenizer_config.json
Special tokens file saved in fake-bert-base-uncased/special_tokens_map.json


('fake-bert-base-uncased/tokenizer_config.json',
 'fake-bert-base-uncased/special_tokens_map.json',
 'fake-bert-base-uncased/vocab.txt',
 'fake-bert-base-uncased/added_tokens.json',
 'fake-bert-base-uncased/tokenizer.json')

In [38]:
def get_prediction(text):
    # prepare our text into tokenized sequence
    inputs = tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to("cuda")
    # perform inference to our model
    outputs = model(**inputs)
    # get output probabilities by doing softmax
    probs = outputs[0].softmax(1)
    # executing argmax function to get the candidate label
    # return probs.argmax() is a tensor. We have to return its item
    return probs.argmax().item()


In [39]:
y_pred=[]
for text in X_test:
    y_pred.append(get_prediction(text))


In [40]:
from sklearn.metrics import confusion_matrix, classification_report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.77      0.86      0.81       444
           1       0.83      0.72      0.77       407

    accuracy                           0.79       851
   macro avg       0.80      0.79      0.79       851
weighted avg       0.80      0.79      0.79       851



In [42]:
text = "Donald Trump sent his own plane to transport 200 stranded marines."

pred=get_prediction(text)
print('class:', labels_dict[pred])

text = "FBI director received millions from Clinton Foundation, his brother’s law firm does Clinton’s taxes"

pred=get_prediction(text)
print('class:', labels_dict[pred])

class: True
class: False
