# Notebook configuration

In [1]:
# Mount Google Drive; the dataset CSVs loaded below live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
# Install needed libraries
!pip install transformers datasets
!pip install sentencepiece
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 16.0 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.3.2-py3-none-any.whl (362 kB)
[K     |████████████████████████████████| 362 kB 74.3 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 14.8 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 61.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 62.5 MB/s 
Collecting aiohttp
  Downl

In [3]:
# Import all the needed libraries
import numpy as np
import pandas as pd
import torch
import functools
import wandb
import random
import os

from datasets import Dataset, DatasetDict, load_metric

from sklearn.metrics import confusion_matrix

from keras.callbacks import EarlyStopping

from transformers import AutoTokenizer, AutoModelForSequenceClassification, \
 TrainingArguments, Trainer, pipeline, EarlyStoppingCallback, \
 RobertaTokenizerFast, EncoderDecoderModel, BertTokenizer, \
 BertForSequenceClassification

from transformers.tokenization_utils import PreTrainedTokenizer


# Make the environment deterministic: seed every RNG the notebook can touch
# (Python's `random`, NumPy, and PyTorch on CPU and on all GPUs).
TENSORS_SEED = 42

random.seed(TENSORS_SEED)
np.random.seed(TENSORS_SEED)
torch.manual_seed(TENSORS_SEED)
torch.cuda.manual_seed_all(TENSORS_SEED)
# Seeding alone does not make cuDNN reproducible: disable the autotuner and
# request deterministic kernels as well.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
# TensorFlow-side switch; has no effect on PyTorch but is kept from the original setup.
os.environ["TF_DETERMINISTIC_OPS"] = "1"

# Load Data and Model

In [4]:
# Other checkpoints, left commented out for reference:
#model_checkpoint = "bert-base-uncased"
#model_checkpoint = "bertin-project/bertin-roberta-base-spanish"
#model_checkpoint = "PlanTL-GOB-ES/roberta-large-bne"
# Hugging Face checkpoint used for both the tokenizer and the classifier.
MODEL_CHECKPOINT = "dccuchile/bert-base-spanish-wwm-uncased"

# Maximum number of tokens kept per tokenization pass (passed as `max_length`
# to the tokenizer). The head and tail passes are concatenated afterwards.
TRUNCATION_LEN = 256

In [5]:
# Load the data
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/TFG/datasets/fakeNews_spanish'
train_data_path = f'{DATA_DIR}/train.csv'
valid_data_path = f'{DATA_DIR}/development.csv'
test_data_path = f'{DATA_DIR}/test.csv'

# All three files are read as ';'-separated UTF-8 with the first column as index.
train_df = pd.read_csv(train_data_path, encoding='UTF-8', sep=';', index_col=0)
valid_df = pd.read_csv(valid_data_path, encoding='UTF-8', sep=';', index_col=0)
test_df = pd.read_csv(test_data_path, encoding='UTF-8', sep=';', index_col=0)

# Since the test dataset was taken under different conditions, we have to change
# it a bit so it looks like the other two:
# ------------------------------------------------------------------------------

test_df.index.names = ['Id']
test_df = test_df.rename(columns={'CATEGORY': 'Category',
                                  'TOPICS': 'Topic',
                                  'SOURCE': 'Source',
                                  'HEADLINE': 'Headline',
                                  'TEXT': 'Text',
                                  'LINK': 'Link',
                                  })

# Assign the result back instead of calling `.replace(..., inplace=True)` on the
# selected column: that is a chained in-place modification, which pandas warns
# about and which no longer updates `test_df` once copy-on-write is enabled.
test_df["Category"] = test_df["Category"].replace({"FALSO": "Fake", "VERDADERO": "True"})

# ------------------------------------------------------------------------------

# Wrap each frame as a Hugging Face Dataset and group the three splits.
train_dataset = Dataset.from_pandas(train_df, split='train')
valid_dataset = Dataset.from_pandas(valid_df, split='valid')
test_dataset = Dataset.from_pandas(test_df, split='test')

dataset = DatasetDict({'train': train_dataset, 'valid': valid_dataset, 'test': test_dataset})

In [6]:
# FUNCTIONS FOR THE PREPROCESSING

# Concatenate the selected fields; the result is the text that will be tokenized
def concat_data(records, fields=('Source', 'Headline', 'Text')):
  """Join the chosen fields of one record into a single 'Data' string.

  Args:
    records: mapping holding the fields of a single example.
    fields: names of the fields to join, in order. The default reproduces the
      Source + Headline + Text input. The former "preprocessing 3" variant
      (Source + Link + Text) is ``fields=('Source', 'Link', 'Text')``.

  Returns:
    ``{'Data': text}``, where ``text`` is ``str()`` of each selected field
    joined with '. ' (so a value of ``None`` is rendered as 'None').
  """
  return {'Data': '. '.join(str(records[name]) for name in fields)}

# Map the Category of a record to a numeric label:
#   'True'          --> 0
#   anything else   --> 1 (Fake)
def set_labels(records):
  """Return ``{'labels': 0}`` when Category is exactly 'True', else ``{'labels': 1}``."""
  is_true_news = records['Category'] == 'True'
  return {'labels': 0 if is_true_news else 1}

In [7]:
# Add the concatenated 'Data' text, then the numeric 'labels', to every split
dataset = dataset.map(concat_data).map(set_labels)

print(dataset)



  0%|          | 0/676 [00:00<?, ?ex/s]

  0%|          | 0/295 [00:00<?, ?ex/s]

  0%|          | 0/572 [00:00<?, ?ex/s]

  0%|          | 0/676 [00:00<?, ?ex/s]

  0%|          | 0/295 [00:00<?, ?ex/s]

  0%|          | 0/572 [00:00<?, ?ex/s]

DatasetDict({
    train: Dataset({
        features: ['Category', 'Topic', 'Source', 'Headline', 'Text', 'Link', 'Id', 'Data', 'labels'],
        num_rows: 676
    })
    valid: Dataset({
        features: ['Category', 'Topic', 'Source', 'Headline', 'Text', 'Link', 'Id', 'Data', 'labels'],
        num_rows: 295
    })
    test: Dataset({
        features: ['Category', 'Topic', 'Source', 'Headline', 'Text', 'Link', 'Id', 'Data', 'labels'],
        num_rows: 572
    })
})


# Build the embedding

First, we build the left side (head) of the embedding. To do this, we truncate the right side of the news article. We first define the function that tokenizes the data; it is the same for both sides of the article, and only the truncation side changes.

In [8]:
# Tokenization function to be mapped over the dataset
def tokenize_data(records):
  """Tokenize the 'Data' field, keeping at most TRUNCATION_LEN tokens.

  Uses the module-level `tokenizer`, so which end of a long text is cut off
  depends on the truncation side that tokenizer was created with.
  """
  tokenizer_options = {'truncation': True, 'max_length': TRUNCATION_LEN}
  return tokenizer(records['Data'], **tokenizer_options)

In [9]:
# Truncating on the right keeps the beginning (head) of each text
TRUNCATION_SIDE = 'right'

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, truncation_side=TRUNCATION_SIDE)

# Every original column except 'labels' is dropped once the text is tokenized
columns = [name for name in dataset['train'].column_names if name != 'labels']

# Tokenize all splits in batches, removing the columns we no longer need
dataset_left = dataset.map(tokenize_data, batched=True, remove_columns=columns)

Downloading:   0%|          | 0.00/310 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/650 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/242k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/475k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/134 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

We can see that we get the left side of the news article: the decoded text starts with the leading fields (source and headline) followed by the beginning of the body.

In [10]:
# Decode the first test example to check that the beginning of the text was kept
tokenizer.decode(dataset_left['test'][0]['input_ids'])

'[CLS] el economista. covid - 19 : mentiras que matan. el control de la covid - 19 no es sólo un tema de médicos y el resto del personal sanitario y científico. por desgracia o por fortuna, es un asunto esencialmente político que se decide por hombres y mujeres que se dedican a la política. de las creencias y opiniones de estos últimos, depende el éxito o el fracaso de las acciones que se implementen. los éxitos en la toma de decisiones salvan vidas y naciones ; obviamente, los errores matan y más si están acompañados de mentiras y medias verdades. en este sentido, durante el pasado pulso de la salud ( 9 de febrero ) el presidente lópez rompió un récord : en los primeros diez minutos había dicho tres mentiras graves o medias verdades, que también son mentiras. el problema con esto es que las mentiras matan. en esa ocasión, lópez obrador dijo que [UNK] afortunadamente [UNK] se estaban reduciendo los contagios en todo el país. poco después, el subsecretario lópez - gatell fue por este ca

Now we change the truncation side. We want the right side (tail) of the news article, so we truncate the left side.

In [11]:
# Truncating on the left keeps the end (tail) of each text
TRUNCATION_SIDE = 'left'

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, truncation_side=TRUNCATION_SIDE)

# Compute the columns to drop here instead of reusing the global `columns`:
# that name is reassigned further down to the token column names, so re-running
# this cell after that point would otherwise remove the wrong columns.
raw_columns = [name for name in dataset['train'].column_names if name != 'labels']

# Tokenize all splits in batches, removing the columns we no longer need
dataset_right = dataset.map(tokenize_data, batched=True, remove_columns=raw_columns)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [12]:
# Decode the first test example to check that the end of the text was kept
tokenizer.decode(dataset_right['test'][0]['input_ids'])

'[CLS] que nunca había sido rebasada la capacidad de camas y equipo. cualquier revisión a la prensa de los últimos meses relata una o varias historias de personas que buscaron durante días un lugar para un familiar enfermo. algunos lo lograron, otros no. también de personas conocidas que murieron en sus casas porque las rechazaron en los hospitales por no estar [UNK] suficientemente [UNK] enfermos. ese es el panorama y esas son las mentiras y medias verdades. es cierto que no somos el único país en donde faltan suministros, camas, equipo y muere personal médico, pero somos una nación que no está evolucionando favorablemente ante la covid - 19. desde septiembre, amnistía internacional y la publicación the lancet dieron cuenta de que muere más personal médico en méxico que en cualquier otra parte del mundo. la demagogia, la insensibilidad y las mentiras son algo peor que la infodemia y más cuando se da con la complicidad de todo el gabinete. la esperanza de las vacunas está disminuyendo 

Next, we need to build a dataset that contains both embeddings: we concatenate them to form a single embedding of up to 512 tokens.

## Concatenate both embeddings

In [13]:
# Materialize every split of both tokenized datasets as pandas DataFrames.
# The 'pandas' format is enabled only while slicing and reset right after.
dataset_left.set_format('pandas')
df_train_left, df_valid_left, df_test_left = [
    dataset_left[split_name][:] for split_name in ('train', 'valid', 'test')
]
dataset_left.reset_format()

dataset_right.set_format('pandas')
df_train_right, df_valid_right, df_test_right = [
    dataset_right[split_name][:] for split_name in ('train', 'valid', 'test')
]
dataset_right.reset_format()

This part of the code is a bit dense, so here is the same logic unrolled into explicit loops to make it clearer:

```
for col in columns:
  for row in range(n_rows):
    left_value = df_DS_left[col][row]
    right_value = df_DS_right[col][row]
    df_DS[col][row] = np.append(left_value, right_value)
```

In [14]:
def _concat_head_tail(left_df, right_df, token_columns):
  """Build one frame whose token columns are the head row followed by the tail row.

  Args:
    left_df: frame tokenized with right-side truncation (head of each text).
    right_df: frame tokenized with left-side truncation (tail of each text).
    token_columns: names of the per-token columns to concatenate.

  Returns:
    A DataFrame with the 'labels' of `left_df` plus, for every name in
    `token_columns`, the row-wise concatenation of the left and right values.
  """
  assert len(left_df) == len(right_df), 'head and tail frames must have the same number of rows'

  merged = pd.DataFrame()
  # The labels are the same in both frames, so take them from the head side
  merged['labels'] = left_df['labels']
  for col in token_columns:
    # Pair rows by position rather than looking the tail row up by index label
    merged[col] = [np.append(head, tail) for head, tail in zip(left_df[col], right_df[col])]
  return merged


# Per-token columns produced by the tokenizer (everything except the labels).
# A distinct name is used so the earlier `columns` list is not overwritten.
token_columns = [name for name in df_train_left.columns if name != 'labels']

# NOTE(review): each half was tokenized on its own, so both carry their own
# special tokens (both decoded samples above start with [CLS]) and the merged
# sequence has special tokens in the middle. Texts shorter than TRUNCATION_LEN
# presumably end up duplicated (head == tail) - TODO confirm this is intended.
df_train = _concat_head_tail(df_train_left, df_train_right, token_columns)
df_valid = _concat_head_tail(df_valid_left, df_valid_right, token_columns)
df_test = _concat_head_tail(df_test_left, df_test_right, token_columns)

In [None]:
# Inspect the merged training frame (labels plus the concatenated token columns)
df_train

Unnamed: 0,labels,input_ids,token_type_ids,attention_mask
0,1,"[4, 1039, 20184, 25251, 30962, 1008, 29047, 22...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,1,"[4, 1311, 7094, 1008, 1032, 3269, 1100, 1075, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,1,"[4, 1039, 20184, 25251, 30962, 1008, 1252, 269...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,0,"[4, 1039, 6542, 1008, 17412, 5107, 4388, 1012,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,1,"[4, 17790, 2948, 1008, 23118, 11610, 4752, 136...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...,...
671,1,"[4, 1039, 1111, 30984, 1207, 1008, 22291, 2119...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
672,0,"[4, 3787, 6208, 1008, 2702, 2389, 4405, 23566,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
673,1,"[4, 1032, 4025, 3557, 1008, 25845, 18545, 1414...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
674,1,"[4, 1039, 1111, 30984, 1207, 1008, 6651, 30746...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [15]:
# Convert the merged frames back into Datasets, one per split
train_dataset, valid_dataset, test_dataset = [
    Dataset.from_pandas(frame, split=split_name)
    for split_name, frame in (('train', df_train), ('valid', df_valid), ('test', df_test))
]

# Group the splits under the same keys used by the original DatasetDict
head_tail_embeddings = DatasetDict(
    {'train': train_dataset, 'valid': valid_dataset, 'test': test_dataset}
)

# Train the model

## Define metrics

In [16]:
# Metric objects used by compute_metric
accuracy, f1 = [load_metric(metric_name) for metric_name in ('accuracy', 'f1')]

Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

In [17]:
def compute_metric(eval_pred, test=False):
  """Compute accuracy and F1 for a pair of predictions and reference labels.

  Args:
    eval_pred: pair ``(predictions, labels)``.
    test: when falsy (the default), `predictions` holds per-class scores and is
      reduced to class ids with an argmax over axis 1. When truthy,
      `predictions` is used as-is, i.e. it must already contain class ids.

  Returns:
    Dict with the keys 'accuracy' and 'f1-score'.
  """
  predictions, labels = eval_pred

  # Plain truthiness check instead of comparing against the literal False
  if not test:
    predictions = np.argmax(predictions, axis=1)

  result_acc = accuracy.compute(predictions=predictions, references=labels)['accuracy']
  result_f1 = f1.compute(predictions=predictions, references=labels)['f1']

  return {'accuracy': result_acc, 'f1-score': result_f1}

## Fine-tune the model

In [18]:
# Load the checkpoint with a sequence-classification head for the 2 labels (0 = True, 1 = Fake)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=2)

Downloading:   0%|          | 0.00/419M [00:00<?, ?B/s]

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuc

In [19]:
# Define the Hyperparameters
EPOCHS = 10  # upper bound on training epochs (early stopping may end training sooner)
LEARNING_RATE = 2e-5
BATCH_SIZE = 6  # per-device batch size, used for both training and evaluation
METRIC_FOR_BEST_MODEL = 'eval_loss'  # metric used to select the best checkpoint
METRIC_CONDITION = False  # passed as `greater_is_better`: a lower eval_loss is better
WEIGHT_DECAY = 0.01
RANDOM_SEED = 42  # seed used when shuffling the training split
ES_PATIENCE = 3  # passed as `early_stopping_patience` to EarlyStoppingCallback

In [20]:
# Define the training parameters
num_train_samples = head_tail_embeddings['train'].num_rows
# Shuffle reproducibly; `select` keeps the first `num_train_samples` shuffled
# rows (currently all of them, lower the number to train on a subset).
train_dataset = head_tail_embeddings['train'].shuffle(seed=RANDOM_SEED).select(range(num_train_samples))
valid_dataset = head_tail_embeddings['valid']
# The floor division yields 0 when the dataset is small relative to
# BATCH_SIZE * EPOCHS, which is not a valid logging interval: clamp to 1.
logging_steps = max(1, len(train_dataset) // (2 * BATCH_SIZE * EPOCHS))

training_args = TrainingArguments(
    output_dir='results',
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Evaluate and save once per epoch so the best checkpoint can be restored
    load_best_model_at_end=True,
    metric_for_best_model=METRIC_FOR_BEST_MODEL,
    greater_is_better=METRIC_CONDITION,
    weight_decay=WEIGHT_DECAY,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_steps=logging_steps,
    save_total_limit=3,
    # Make the Trainer's own seeding explicit and tied to the notebook constant
    seed=RANDOM_SEED,
    #report_to=report_option,
    push_to_hub=False
)

In [21]:
# Build the Trainer that runs the fine-tuning loop.
# Training stops early once the tracked metric has not improved for ES_PATIENCE evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=ES_PATIENCE)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metric,
    callbacks=[early_stopping],
)

In [22]:
# Fine-tune the model; evaluation and checkpointing happen once per epoch
trainer.train()

***** Running training *****
  Num examples = 676
  Num Epochs = 10
  Instantaneous batch size per device = 6
  Total train batch size (w. parallel, distributed & accumulation) = 6
  Gradient Accumulation steps = 1
  Total optimization steps = 1130
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit: ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy,F1-score
1,0.582,0.449236,0.80678,0.798587
2,0.5488,0.439068,0.864407,0.865772
3,0.0012,0.481902,0.908475,0.901099
4,0.0005,0.454838,0.918644,0.913043
5,0.0002,0.48548,0.922034,0.916364


***** Running Evaluation *****
  Num examples = 295
  Batch size = 6
Saving model checkpoint to results/checkpoint-113
Configuration saved in results/checkpoint-113/config.json
Model weights saved in results/checkpoint-113/pytorch_model.bin
tokenizer config file saved in results/checkpoint-113/tokenizer_config.json
Special tokens file saved in results/checkpoint-113/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 295
  Batch size = 6
Saving model checkpoint to results/checkpoint-226
Configuration saved in results/checkpoint-226/config.json
Model weights saved in results/checkpoint-226/pytorch_model.bin
tokenizer config file saved in results/checkpoint-226/tokenizer_config.json
Special tokens file saved in results/checkpoint-226/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 295
  Batch size = 6
Saving model checkpoint to results/checkpoint-339
Configuration saved in results/checkpoint-339/config.json
Model weights saved in results/checkp

TrainOutput(global_step=565, training_loss=0.17813103012143083, metrics={'train_runtime': 482.1599, 'train_samples_per_second': 14.02, 'train_steps_per_second': 2.344, 'total_flos': 889315367116800.0, 'train_loss': 0.17813103012143083, 'epoch': 5.0})

# Get predictions

In [23]:
# Run the trained model on the test split (it is not used during training)
predictions = trainer.predict(head_tail_embeddings['test'])

***** Running Prediction *****
  Num examples = 572
  Batch size = 6


In [24]:
# Test-split metrics: accuracy and F1 from compute_metric, plus loss and runtime figures
predictions.metrics

{'test_accuracy': 0.756993006993007,
 'test_f1-score': 0.7754442649434572,
 'test_loss': 0.7967976331710815,
 'test_runtime': 22.0336,
 'test_samples_per_second': 25.96,
 'test_steps_per_second': 4.357}

In [25]:
# Reduce the per-class scores to class ids (0 = True, 1 = Fake)
predicted_labels = np.argmax(predictions.predictions, axis=1)

In [26]:
# Rows are the true labels and columns the predicted labels, in class order 0 (True), 1 (Fake)
print(confusion_matrix(predictions.label_ids, predicted_labels))

[[193  93]
 [ 46 240]]


In [27]:
#model.save_pretrained('/content/drive/MyDrive/Colab Notebooks/TFG/models/btin_v.0.4/')