# GPT-2 for fake news detection


Make sure that you are using GPU

In [None]:
import torch 
# Pick the compute device: the CUDA GPU when one is visible, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device  # last expression of the cell, so the notebook displays the chosen device

You can check which type of GPU has been assigned to you:

In [None]:
# Shell escape: print the NVIDIA status report for the GPU(s) of this runtime.
!nvidia-smi

## 1. Load the data

In [None]:
# Mount Google Drive inside the Colab runtime so the dataset files can be read.
from google.colab import drive

MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

# Project folder inside the mounted Drive. It is built from the absolute mount
# point so that later reads do not depend on the kernel's current working
# directory (a relative 'drive/...' path only resolves while the cwd is /content).
root = MOUNT_POINT + '/My Drive/TFG/'

In [None]:
# --- Experiment configuration -------------------------------------------------
# MULTIMODAL_ONLY selects the file-name prefix of the dataset splits to load:
# True -> files starting with 'multimodal_', False -> files starting with 'all_'.
MULTIMODAL_ONLY = True
nameFile = 'multimodal_' if MULTIMODAL_ONLY else 'all_'

# Granularity of the classification task; it selects the '<N>_way_label' column.
NUM_CLASSES = 6 # 2, 3 or 6

# Class names per task as (labels, reorder): `labels` holds the full display
# names, `reorder` holds short aliases of the same classes in the same order.
# NOTE(review): position i is used as the display name of integer class id i
# (confusion matrix, classification report). Confirm that this order matches
# the encoding of the '<N>_way_label' columns, in particular for the 2-way task.
_CLASS_NAMES = {
    2: (["True", "False"],
        ["True", "False"]),
    3: (["True", "Fake contains True", "False"],
        ["True", "Fake contains True", "False"]),
    6: (["True", "Satire/Parody", "Misleading Content", "Imposter Content", "False Connection", "Manipulated Content"],
        ["True", "Satire", "Misleading", "Imposter", "False", "Manipulated"]),
}

# Fail early on an unsupported value instead of continuing with empty labels.
if NUM_CLASSES not in _CLASS_NAMES:
    raise ValueError("NUM_CLASSES must be 2, 3 or 6, got {!r}".format(NUM_CLASSES))

labels, reorder = (list(names) for names in _CLASS_NAMES[NUM_CLASSES])

In [None]:
import pandas as pd


def _read_split(split_name):
    """Read one tab-separated split file of the selected dataset variant."""
    return pd.read_csv(root+"data/fakeddit/" + nameFile + split_name + ".tsv", sep='\t')


train = _read_split("train")
val = _read_split("validate")
test = _read_split("test_public")

# Report which variant was loaded and how many rows each split has.
print("Dataset: ", nameFile)
for caption, frame in (('size training: ', train),
                       ('size validation: ', val),
                       ('size test: ', test)):
    print(caption, len(frame))


In [None]:
# Preview the first rows of the training split to inspect its columns.
train.head()

In [None]:
import numpy as np
reduction = 10 #100, 10 or 1

print("reduction applied: ", reduction)


def _reduce(frame, split_name):
    """Randomly keep 1/`reduction` of the rows of `frame`, printing both sizes.

    The sample is drawn with a fixed seed, so the same input frame always
    yields the same subset.
    """
    print(f"{split_name} patterns before reduction: {len(frame)}")
    kept = frame.sample(int(len(frame)/reduction), random_state=12345)
    print(f"{split_name} patterns after reduction:  {len(kept)}")
    return kept


# NOTE: the three splits are overwritten in place, so executing this cell a
# second time shrinks the already reduced frames again.
train = _reduce(train, "Training")
val = _reduce(val, "Validation")
test = _reduce(test, "Test")

Visualize data

In [None]:
import matplotlib.pyplot as plt

# One bar chart of class frequencies per split, drawn side by side.
label_column = str(NUM_CLASSES)+'_way_label'
splits = (('Training Dataset', train),
          ('Validation Dataset', val),
          ('Testing Dataset', test))

fig, axes = plt.subplots(1, 3 , figsize=(10,5))
print(labels)  # class names, printed for reference next to the charts

for ax, (title, frame) in zip(axes, splits):
    frame.value_counts(label_column).plot(kind='bar', ax=ax)
    ax.set_title(title)
    plt.setp(ax.get_xticklabels(), rotation=45, horizontalalignment='right')

# Only the left-most panel carries the shared y-axis caption.
axes[0].set_ylabel('Counts')
fig.tight_layout()
plt.show()

Extract the titles (inputs) and the labels (targets) of each split. The maximum length (in tokens) of the titles is obtained afterwards, looking only at the training and validation datasets:

In [None]:
# Inputs: the cleaned post titles of each split.
x_train, x_val, x_test = (frame["clean_title"] for frame in (train, val, test))

# Targets: the label column that matches the configured number of classes.
label_column = str(NUM_CLASSES)+'_way_label'
y_train, y_val, y_test = (frame[label_column] for frame in (train, val, test))

We need to know the maximum length of the input sequences (from the training and validation datasets) to set the parameter MAX_LENGTH. Here the length of a title is measured as its number of whitespace-separated words.
If the maximum length is greater than 512 (the maximum length for BERT), we will set MAX_LENGTH to 512.

In [None]:
def _max_word_count(titles):
    """Return the largest number of whitespace-separated words in any title."""
    return max(len(title.split()) for title in titles)


# Longest title of the training and of the validation split, counted in words.
# NOTE(review): these are word counts, while the tokenizer later pads and
# truncates to MAX_LENGTH in its own token units -- confirm that titles are
# not being cut short.
max_train = _max_word_count(x_train)
max_val = _max_word_count(x_val)

MAX_LENGTH = max(max_train, max_val)
print("The maximum length of the input sequences is {} tokens".format(MAX_LENGTH))

# Never exceed 512 positions.
MAX_LENGTH=min(512,MAX_LENGTH)
print("MAX_LENGTH = {}".format(MAX_LENGTH))


Install the transformers library

In [None]:
!pip install transformers

Load the GPT-2 tokenizer

In [None]:
import transformers
from transformers import GPT2Tokenizer, TFGPT2Model, GPT2Config
from tqdm import tqdm # Progress Bar
import numpy as np


MODEL_NAME = 'gpt2'

# Load the pretrained GPT-2 tokenizer. Special tokens, maximum length and
# padding are encode-time options: they are requested on every encode call in
# `tokenize`, so none of them is passed at loading time.
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)

# Pad on the left and reuse the end-of-sequence token as the padding token, so
# that padding='max_length' has a token to pad with.
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

def tokenize(sentences, tokenizer, max_length=None):
    """Encode sentences into fixed-length id and attention-mask arrays.

    Args:
        sentences: iterable of strings to encode.
        tokenizer: tokenizer object exposing `encode_plus`.
        max_length: number of positions every sequence is padded or truncated
            to. Defaults to the notebook-level MAX_LENGTH.

    Returns:
        Tuple `(input_ids, attention_masks)` of int32 NumPy arrays with one
        row per sentence.
    """
    if max_length is None:
        max_length = MAX_LENGTH

    input_ids, input_masks = [], []
    for sentence in tqdm(sentences):
        # Token type ids are not requested: only ids and masks are returned.
        inputs = tokenizer.encode_plus(sentence,
                                       add_special_tokens=True,
                                       max_length=max_length,
                                       padding='max_length',
                                       return_attention_mask=True,
                                       truncation=True)
        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])

    return np.asarray(input_ids, dtype='int32'), np.asarray(input_masks, dtype='int32')

Tokenize datasets (it takes a long time to load the training dataset):

In [None]:
# Encode every split with the same tokenizer (training, then test, then
# validation); each result is an (input_ids, attention_masks) pair.
X_train, X_test, X_val = [tokenize(texts, tokenizer) for texts in (x_train, x_test, x_val)]

In [None]:
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
import warnings
from transformers import logging as hf_logging
hf_logging.set_verbosity_error() # Hide Hugging Face log messages below the error level
warnings.filterwarnings("ignore") # Ignore all Python warnings from here on

Define the model

In [None]:
# Pretrained GPT-2 encoder, loaded with its configuration.
config = GPT2Config.from_pretrained(MODEL_NAME, output_hidden_states=True, output_attentions=True)
GPT2 = TFGPT2Model.from_pretrained(MODEL_NAME, config=config)

# Freeze the encoder explicitly, by reference. Selecting it through a slice of
# `model.layers` would depend on the order in which Keras lists the layers.
GPT2.trainable = False

# Two integer inputs of MAX_LENGTH positions each: token ids and attention mask.
input_ids_in = tf.keras.layers.Input(shape=(MAX_LENGTH,), name='input_token', dtype='int32')
input_masks_in = tf.keras.layers.Input(shape=(MAX_LENGTH,), name='masked_token', dtype='int32') 

# Output index 0 of the encoder is the sequence that feeds the classifier head.
embedding_layer = GPT2(input_ids = input_ids_in, attention_mask = input_masks_in)[0]

# Classifier head: BiLSTM over the sequence -> max over positions ->
# dense layer with dropout -> softmax over the NUM_CLASSES classes.
X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True))(embedding_layer)
X = tf.keras.layers.GlobalMaxPool1D()(X)
X = tf.keras.layers.Dense(64, activation='relu')(X)
X = tf.keras.layers.Dropout(0.2)(X)
X = tf.keras.layers.Dense(NUM_CLASSES, activation='softmax')(X)

model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs = X)

model.summary()

In [None]:
import os

# Directory for the files this notebook writes.
output_dir = './model1_outputs'

# exist_ok=True turns the call into a no-op when the directory already exists,
# so no separate (and race-prone) existence check is needed.
os.makedirs(output_dir, exist_ok=True)

Define callbacks

In [None]:
# Weights-only checkpoints; the file name carries the epoch number zero-padded
# to two digits (weights.01.hdf5, weights.02.hdf5, ...).
model_checkpoint = ModelCheckpoint(filepath=output_dir+'/weights.{epoch:02d}.hdf5',
                                  save_weights_only=True)

# Stop training once the validation loss has stopped improving.
early_stopping = EarlyStopping(patience=3, # Stop after 3 epochs of no improvement
                               monitor='val_loss', # Watch the validation loss
                               min_delta=0, # No minimum change required
                               mode='min', # Lower values of the monitored quantity are better
                               restore_best_weights=False, # Don't restore the best weights
                               verbose=1) 

# Lower the learning rate when the validation loss plateaus.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', # Watch the validation loss
                              min_lr=0.000001, # Lower bound of the learning rate
                              patience=1, # Reduce after 1 epoch with too little change
                              mode='min', # Lower values of the monitored quantity are better
                              factor=0.1, # Multiply the learning rate by 0.1
                              min_delta=0.01, # Minimum change needed
                              verbose=1)

In [None]:
# Softmax output trained with sparse categorical cross-entropy and Adam.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train with checkpointing, early stopping and learning-rate reduction.
training_callbacks = [model_checkpoint, early_stopping, reduce_lr]
history = model.fit(x=X_train,
                    y=y_train,
                    validation_data=(X_val, y_val),
                    batch_size=16,
                    epochs=10,
                    callbacks=training_callbacks)

Epoch 1/10
 522/3525 [===>..........................] - ETA: 8:11 - loss: 1.3036 - accuracy: 0.5220

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


def plot_history(history):
    """Plot the training curves and report the best validation epochs.

    Draws accuracy (left panel) and loss (right panel) for training and
    validation against the 1-based epoch number, then prints the epoch with
    the lowest validation loss and the epoch with the highest validation
    accuracy.

    Args:
        history: object whose `history` attribute maps 'accuracy',
            'val_accuracy', 'loss' and 'val_loss' to per-epoch values.
    """
    record = history.history
    train_acc, valid_acc = record['accuracy'], record['val_accuracy']
    train_loss, valid_loss = record['loss'], record['val_loss']
    epochs = range(1, len(train_acc) + 1)

    # (training curve, its legend label, validation curve, its legend label, title)
    panels = (
        (train_acc, 'Training acc', valid_acc, 'Validation acc', 'Training and validation accuracy'),
        (train_loss, 'Training loss', valid_loss, 'Validation loss', 'Training and validation loss'),
    )

    plt.figure(figsize=(12, 5))
    for column, (train_curve, train_label, valid_curve, valid_label, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, column)
        plt.plot(epochs, train_curve, 'b', label=train_label)
        plt.plot(epochs, valid_curve, 'r', label=valid_label)
        plt.title(title)
        plt.legend()

    # argmin / argmax are 0-based positions, hence the +1 to get epoch numbers.
    print("Lowest Validation Loss: epoch {}".format(np.argmin(valid_loss)+1))
    print("Highest Validation Accuracy: epoch {}".format(np.argmax(valid_acc)+1))

plot_history(history)


In [None]:
def get_min_val_loss_epoch(history):
    """Return the epoch with the lowest validation loss as a two-digit string.

    Epochs are numbered from 1 and zero-padded to two digits (epoch 3 -> "03",
    epoch 10 -> "10"), the same form a `{epoch:02d}` file-name field produces.

    Args:
        history: object whose `history` attribute maps 'val_loss' to the
            per-epoch validation losses.
    """
    best_epoch = int(np.argmin(history.history['val_loss'])) + 1
    return "{:02d}".format(best_epoch)

def get_max_val_acc_epoch(history):
    """Return the epoch with the highest validation accuracy as a two-digit string.

    Epochs are numbered from 1 and zero-padded to two digits (epoch 3 -> "03",
    epoch 10 -> "10"), the same form a `{epoch:02d}` file-name field produces.

    Args:
        history: object whose `history` attribute maps 'val_accuracy' to the
            per-epoch validation accuracies.
    """
    best_epoch = int(np.argmax(history.history['val_accuracy'])) + 1
    return "{:02d}".format(best_epoch)

In [None]:
# Restore the checkpoint of the epoch with the highest validation accuracy.
epoch_num = get_max_val_acc_epoch(history)
best_weights_path = "{}/weights.{}.hdf5".format(output_dir, epoch_num)
model.load_weights(best_weights_path) # Load in model weights


In [None]:
# Model outputs for the test split: one row of class probabilities per example.
y_test_probs = model.predict(X_test)

# Turn each row of probabilities into an integer class prediction.
y_hat = [np.argmax(prob) for prob in y_test_probs]

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

def print_cf2(y_test, y_hat, class_names=None):
    """Plot the confusion matrix of the predictions as an annotated heatmap.

    Args:
        y_test: true class ids.
        y_hat: predicted class ids.
        class_names: display names, position i naming class id i. Defaults to
            the notebook-level `labels`.
    """
    if class_names is None:
        class_names = labels
    class_names = list(class_names)

    # Request every class id explicitly so the matrix always has one row and
    # one column per class name, whatever the number of classes and even when
    # a class is missing from both y_test and y_hat.
    # Assumes class ids are the integers 0..len(class_names)-1 -- TODO confirm.
    class_ids = list(range(len(class_names)))
    cm = confusion_matrix(y_test, y_hat, labels=class_ids)

    sns.set(font_scale = 1.4, color_codes=True, palette="deep")
    # Naming both axes of the frame lets seaborn centre the tick labels on the
    # heatmap cells.
    sns.heatmap(pd.DataFrame(cm, index=class_names, columns=class_names),
                annot = True,
                annot_kws = {"size":16},
                fmt="d",
                cmap="YlGnBu")
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted Value")
    plt.xticks(rotation=45, horizontalalignment='right')
    plt.ylabel("True Value")
    plt.show()


print("Accuracy:", accuracy_score(y_test, y_hat))
print_cf2(y_test, y_hat)


In [None]:
# Per-class precision / recall / F1. The class ids are passed explicitly so the
# report has exactly one row per entry of `labels`, even if a class never
# occurs in y_test or y_hat (otherwise `target_names` would not line up with
# the classes that were found).
# Assumes class ids are the integers 0..len(labels)-1 -- TODO confirm.
print(classification_report(y_test, y_hat,
                            labels=list(range(len(labels))),
                            target_names=labels))
