# Configuration

In [5]:
# Mount Google Drive so the files stored under /content/drive (models, test dataset) can be read
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Install all the needed packages
# NOTE(review): no versions are pinned, so a fresh runtime may install releases that differ
# from the ones used for the recorded outputs — consider pinning them
!pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [7]:
# Import all the needed libraries
import numpy as np
import torch

from statistics import mode

from sklearn.metrics import confusion_matrix

from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, Trainer

from datasets import Dataset, load_metric

In [8]:
# Fail fast unless PyTorch can see at least one CUDA device
if torch.cuda.device_count() < 1:
  raise Exception("Currently using CPU, change the type of the runtime in the 'runtime' tab")

print(f'GPU detected. Currently using: "{torch.cuda.get_device_name(0)}"')

GPU detected. Currently using: "Tesla T4"


# Variables and Parameters

In [9]:
# File paths
# Checkpoints the tokenizers are loaded from (models 2 and 3 use the same checkpoint)
TOKENIZER_CHECKPOINT1 = "PlanTL-GOB-ES/roberta-large-bne"
TOKENIZER_CHECKPOINT2 = "bertin-project/bertin-roberta-base-spanish"
TOKENIZER_CHECKPOINT3 = "bertin-project/bertin-roberta-base-spanish"

# Locations on the mounted Drive the three sequence-classification models are loaded from
MODEL_PATH1 = "/content/drive/MyDrive/Colab Notebooks/TFG/models/rlb_v.2.6"
MODEL_PATH2 = "/content/drive/MyDrive/Colab Notebooks/TFG/models/btin_v.1.1"
MODEL_PATH3 = "/content/drive/MyDrive/Colab Notebooks/TFG/models/btin_v.0.5"

# Directory read with Dataset.load_from_disk to obtain the test split
TEST_DATASET_PATH = "/content/drive/MyDrive/Colab Notebooks/TFG/datasets/fakeNews_spanish/test_dataset/"

# Notebook options
# TRUNCATION_LEN is passed as max_length to every pipeline call in get_scores
# WEIGHT_RESULT selects the weighted combination (True) or the plain mean (False) in predict
TRUNCATION_LEN = 256
WEIGHT_RESULT = True

# Weights for the weighted predictions
# WEIGHTn multiplies the scores of model n inside predict
WEIGHT1 = 0.4
WEIGHT2 = 0.4
WEIGHT3 = 0.2

# Preprocessing per model
# Each value selects a branch of concat_data:
#   1 = Source + Headline + Text, 2 = Source + Topic + Link + Text, 3 = Source + Link + Text
PREPROCESS_MODEL1 = 3
PREPROCESS_MODEL2 = 2
PREPROCESS_MODEL3 = 1

# Define metrics

In [10]:
# Metric objects used by compute_metric
# NOTE(review): load_metric is deprecated in newer `datasets` releases in favour of the
# separate `evaluate` package — confirm the installed version still provides it
accuracy = load_metric('accuracy')
f1 = load_metric('f1')

Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

In [11]:
def compute_metric(eval_pred, test=False):
  """Compute accuracy and F1 for a set of predictions.

  Args:
    eval_pred: Pair ``(predictions, labels)``.
    test: When falsy, ``predictions`` is first reduced with
      ``np.argmax(predictions, axis=1)``; when truthy, ``predictions`` is
      used exactly as given (the notebook passes already-rounded 0/1 values).

  Returns:
    A dict with the keys ``'accuracy'`` and ``'f1-score'``.
  """
  predictions, labels = eval_pred

  # Only un-reduced outputs need the argmax; test-time predictions are used as-is
  if not test:
    predictions = np.argmax(predictions, axis=1)

  return {
    'accuracy': accuracy.compute(predictions=predictions, references=labels)['accuracy'],
    'f1-score': f1.compute(predictions=predictions, references=labels)['f1'],
  }

# Load the chosen models

In [12]:
# For every model: the tokenizer comes from its TOKENIZER_CHECKPOINT and the
# sequence-classification weights from its MODEL_PATH

# MODEL 1
tokenizer1 = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT1)
model1 = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH1)

# MODEL 2
tokenizer2 = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT2)
model2 = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH2)

# MODEL 3
tokenizer3 = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT3)
model3 = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH3)

Downloading:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/614 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.12M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/504k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.41M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/831k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/497k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [13]:
# One text-classification pipeline per (model, tokenizer) pair, all created with device=0
pipe1, pipe2, pipe3 = (
  pipeline("text-classification", model=classifier, tokenizer=tokenizer, device=0)
  for classifier, tokenizer in (
    (model1, tokenizer1),
    (model2, tokenizer2),
    (model3, tokenizer3),
  )
)

# Load the test data

In [14]:
# Read the test split from TEST_DATASET_PATH; every model-specific dataset below is derived from it
test_dataset = Dataset.load_from_disk(TEST_DATASET_PATH)

# Preprocess the data

In [15]:
# FUNCTIONS FOR THE PREPROCESSING (Define if they are not already defined)

# Concatenate the record fields selected by PREPROCESS; the result is the text to be tokenized
def concat_data(records, preprocess=None):
  """Build the text fed to the tokenizer by joining fields of one record.

  Args:
    records: Mapping holding the fields required by the selected scheme
      (among 'Source', 'Headline', 'Topic', 'Link' and 'Text').
    preprocess: Scheme to apply (1-4). When omitted, the module-level
      ``PREPROCESS`` variable is used.

  Returns:
    ``{'Data': text}`` where ``text`` is the selected fields converted with
    ``str`` and joined with ``'. '``.

  Raises:
    ValueError: If the scheme is not one of 1, 2, 3 or 4.
    KeyError: If ``records`` lacks a field required by the scheme.
  """
  scheme = PREPROCESS if preprocess is None else preprocess

  fields_by_scheme = {
    1: ('Source', 'Headline', 'Text'),
    2: ('Source', 'Topic', 'Link', 'Text'),
    3: ('Source', 'Link', 'Text'),
    4: ('Source', 'Link', 'Headline', 'Text'),
  }

  # An unknown scheme used to surface as an UnboundLocalError on `data`; report it explicitly
  if scheme not in fields_by_scheme:
    raise ValueError(f'Unknown preprocessing scheme {scheme!r}; expected one of 1, 2, 3 or 4')

  return {'Data': '. '.join(str(records[field]) for field in fields_by_scheme[scheme])}

# Set a numeric label depending on the Category
#   Label = 0 --> True
#   Label = 1 --> Fake
def set_labels(records):
  """Return the numeric label of a record: 0 when its 'Category' is 'True', 1 otherwise."""
  is_true = records['Category'] == 'True'
  return {'labels': 0 if is_true else 1}

# Normalize labels for the test split
def set_correct_category(records):
  """Normalize the 'Category' of a test-split record to 'Fake' or 'True'.

  'FALSO' becomes 'Fake' and every other value becomes 'True'. A value that
  is already 'Fake' is kept, so applying the function more than once to the
  same data does not turn fake records into 'True'.

  Args:
    records: Mapping with a 'Category' entry.

  Returns:
    ``{'Category': 'Fake'}`` or ``{'Category': 'True'}``.
  """
  if records['Category'] in ('FALSO', 'Fake'):
    return {'Category': 'Fake'}
  return {'Category': 'True'}

**Preprocess for Model 1**

In [16]:
# Model 1: normalize the categories, build the input text and attach the numeric labels
# (concat_data reads the global PREPROCESS, so it has to be set before mapping)
PREPROCESS = PREPROCESS_MODEL1
test_dataset1 = (
  test_dataset
  .map(set_correct_category)
  .map(concat_data)
  .map(set_labels)
)



  0%|          | 0/572 [00:00<?, ?ex/s]

  0%|          | 0/572 [00:00<?, ?ex/s]

  0%|          | 0/572 [00:00<?, ?ex/s]

In [17]:
# Show the text built for the first record with model 1's preprocessing
test_dataset1[0]['Data']

'El Economista. https://www.eleconomista.com.mx/opinion/Covid-19-mentiras-que-matan-20210212-0029.html. El control de la Covid-19 no es sólo un tema de médicos y el resto del personal sanitario y científico. Por desgracia o por fortuna, es un asunto esencialmente político que se decide por hombres y mujeres que se dedican a la política. De las creencias y opiniones de estos últimos, depende el éxito o el fracaso de las acciones que se implementen.\r\n\r\nLos éxitos en la toma de decisiones salvan vidas y naciones; obviamente, los errores matan y más si están acompañados de mentiras y medias verdades. En este sentido, durante el pasado Pulso de la Salud (9 de febrero) el presidente López rompió un récord: en los primeros diez minutos había dicho tres mentiras graves o medias verdades, que también son mentiras. El problema con esto es que las mentiras matan.\r\n\r\nEn esa ocasión, López Obrador dijo que “afortunadamente” se estaban reduciendo los contagios en todo el país. Poco después, 

**Preprocess for Model 2**

In [18]:
# Model 2: normalize the categories, build the input text and attach the numeric labels
# (concat_data reads the global PREPROCESS, so it has to be set before mapping)
PREPROCESS = PREPROCESS_MODEL2
test_dataset2 = (
  test_dataset
  .map(set_correct_category)
  .map(concat_data)
  .map(set_labels)
)

  0%|          | 0/572 [00:00<?, ?ex/s]

  0%|          | 0/572 [00:00<?, ?ex/s]

  0%|          | 0/572 [00:00<?, ?ex/s]

In [19]:
# Show the text built for the first record with model 2's preprocessing
test_dataset2[0]['Data']

'El Economista. Covid-19. https://www.eleconomista.com.mx/opinion/Covid-19-mentiras-que-matan-20210212-0029.html. El control de la Covid-19 no es sólo un tema de médicos y el resto del personal sanitario y científico. Por desgracia o por fortuna, es un asunto esencialmente político que se decide por hombres y mujeres que se dedican a la política. De las creencias y opiniones de estos últimos, depende el éxito o el fracaso de las acciones que se implementen.\r\n\r\nLos éxitos en la toma de decisiones salvan vidas y naciones; obviamente, los errores matan y más si están acompañados de mentiras y medias verdades. En este sentido, durante el pasado Pulso de la Salud (9 de febrero) el presidente López rompió un récord: en los primeros diez minutos había dicho tres mentiras graves o medias verdades, que también son mentiras. El problema con esto es que las mentiras matan.\r\n\r\nEn esa ocasión, López Obrador dijo que “afortunadamente” se estaban reduciendo los contagios en todo el país. Poco

**Preprocess for Model 3**

In [20]:
# Model 3: normalize the categories, build the input text and attach the numeric labels
# (concat_data reads the global PREPROCESS, so it has to be set before mapping)
PREPROCESS = PREPROCESS_MODEL3
test_dataset3 = (
  test_dataset
  .map(set_correct_category)
  .map(concat_data)
  .map(set_labels)
)

  0%|          | 0/572 [00:00<?, ?ex/s]

  0%|          | 0/572 [00:00<?, ?ex/s]

  0%|          | 0/572 [00:00<?, ?ex/s]

In [21]:
# Show the text built for the first record with model 3's preprocessing
test_dataset3[0]['Data']

'El Economista. Covid-19: mentiras que matan. El control de la Covid-19 no es sólo un tema de médicos y el resto del personal sanitario y científico. Por desgracia o por fortuna, es un asunto esencialmente político que se decide por hombres y mujeres que se dedican a la política. De las creencias y opiniones de estos últimos, depende el éxito o el fracaso de las acciones que se implementen.\r\n\r\nLos éxitos en la toma de decisiones salvan vidas y naciones; obviamente, los errores matan y más si están acompañados de mentiras y medias verdades. En este sentido, durante el pasado Pulso de la Salud (9 de febrero) el presidente López rompió un récord: en los primeros diez minutos había dicho tres mentiras graves o medias verdades, que también son mentiras. El problema con esto es que las mentiras matan.\r\n\r\nEn esa ocasión, López Obrador dijo que “afortunadamente” se estaban reduciendo los contagios en todo el país. Poco después, el subsecretario López-Gatell fue por este camino y comple

# Get the predictions of every model

In [22]:
def get_scores(records):
  """Classify one record with the pipeline selected by the global USE_PIPE.

  Args:
    records: Mapping whose 'Data' entry is the text to classify.

  Returns:
    ``{'Score': value}`` where ``value`` is the reported score, or its
    complement (``1 - score``) when the reported label is 'LABEL_0'.
    Presumably this makes the value always read as the LABEL_1 score —
    TODO confirm the models only emit LABEL_0 / LABEL_1.
  """
  # USE_PIPE values other than 1 or 2 fall through to the third pipeline
  if USE_PIPE == 1:
    classifier = pipe1
  elif USE_PIPE == 2:
    classifier = pipe2
  else:
    classifier = pipe3

  top = classifier(records['Data'], truncation=True, max_length=TRUNCATION_LEN)[0]

  if top['label'] == 'LABEL_0':
    return {'Score': 1 - top['score']}
  return {'Score': top['score']}

def predict(results, weights=None):
  """Combine the per-model scores into one rounded prediction per sample.

  Args:
    results: Sequence with one list of scores per model; all lists must
      have the same length.
    weights: Optional sequence with one weight per model. When omitted, the
      notebook configuration decides: ``(WEIGHT1, WEIGHT2, WEIGHT3)`` if
      ``WEIGHT_RESULT`` is truthy, otherwise the plain mean of the scores.

  Returns:
    A list with ``round(combined_score)`` for every sample.

  Raises:
    ValueError: If the score lists differ in length, or the number of
      weights does not match the number of models.
  """
  if len({len(scores) for scores in results}) > 1:
    raise ValueError('Every model must provide the same number of scores')

  # Unweighted ensemble: plain mean of the scores of each sample
  if weights is None and not WEIGHT_RESULT:
    return [round(sum(sample) / len(sample)) for sample in zip(*results)]

  if weights is None:
    weights = (WEIGHT1, WEIGHT2, WEIGHT3)

  if len(weights) != len(results):
    raise ValueError(f'Expected {len(results)} weights, got {len(weights)}')

  return [
    round(sum(weight * score for weight, score in zip(weights, sample)))
    for sample in zip(*results)
  ]

In [23]:
# Score the model-1 inputs; get_scores reads the global USE_PIPE to choose pipe1
USE_PIPE = 1
test_dataset1 = test_dataset1.map(get_scores)

  0%|          | 0/572 [00:00<?, ?ex/s]



In [24]:
# Score the model-2 inputs; get_scores reads the global USE_PIPE to choose pipe2
USE_PIPE = 2
test_dataset2 = test_dataset2.map(get_scores)

  0%|          | 0/572 [00:00<?, ?ex/s]



In [None]:
# Score the model-3 inputs; get_scores reads the global USE_PIPE to choose pipe3
USE_PIPE = 3
test_dataset3 = test_dataset3.map(get_scores)

  0%|          | 0/572 [00:00<?, ?ex/s]



In [None]:
# Reference labels (taken from the model-1 dataset) and the ensemble predictions
test_labels = test_dataset1['labels']

results = [scored['Score'] for scored in (test_dataset1, test_dataset2, test_dataset3)]
predictions = predict(results)

eval_pred = [predictions, test_labels]

# Show the results

**Results of the ensemble**

In [None]:
# Accuracy and F1 of the ensemble; test=True because the predictions are already rounded (no argmax needed)
compute_metric(eval_pred, test=True)

In [None]:
# Confusion matrix of the ensemble: test_labels is passed as the true labels, predictions as the predicted ones
print(confusion_matrix(test_labels, predictions))

[[246  40]
 [ 30 256]]


**Results of each model**

In [None]:
# Round each model's scores on their own to get per-model predictions.
# Each 'Score' column is read once and iterated, instead of being fetched again for every index.
predictions1 = [round(score) for score in test_dataset1['Score']]
predictions2 = [round(score) for score in test_dataset2['Score']]
predictions3 = [round(score) for score in test_dataset3['Score']]

In [None]:
# Accuracy and F1 of model 1 alone
compute_metric([predictions1, test_labels], test=True)

{'accuracy': 0.8671328671328671, 'f1-score': 0.8633093525179856}

In [None]:
# Accuracy and F1 of model 2 alone
compute_metric([predictions2, test_labels], test=True)

{'accuracy': 0.8531468531468531, 'f1-score': 0.8604651162790697}

In [None]:
# Accuracy and F1 of model 3 alone
compute_metric([predictions3, test_labels], test=True)

{'accuracy': 0.8548951048951049, 'f1-score': 0.8561525129982669}