In [1]:
### IMPORTS ###
import pandas as pd
import numpy as np
import pathlib as pl
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import classification_report

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')
nltk.download('punkt')
import re

!pip install datasets
from datasets.dataset_dict import DatasetDict
from datasets import Dataset

!pip install transformers
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import get_scheduler
from transformers import TrainingArguments

import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW

!pip install evaluate
import evaluate

from transformers import Trainer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# Mount Google Drive so the files stored there are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Directory prefix of the Fakeddit TSV files. File names are concatenated directly
# onto this string later on, so the trailing slash is required.
PATH = "/content/drive/My Drive/TFM/Data/Fakeddit/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
### TRAINING, VALIDATION AND TEST SETS ###
def _load_split(filename, discard_fraction=0.95, seed=55):
    """Load one Fakeddit TSV file and return a stratified subsample of it.

    Args:
        filename: Name of the TSV file, appended to the global ``PATH`` prefix.
        discard_fraction: Share of rows handed to the discarded side of
            ``train_test_split``; the remaining rows are returned.
        seed: Random state of the split, so the subsample is reproducible.

    Returns:
        A ``(texts, labels)`` pair of pandas Series holding the kept rows.
    """
    frame = pd.read_csv(PATH + filename, sep='\t', on_bad_lines='skip')
    frame = frame[['clean_title', '6_way_label']].rename(
        columns={"clean_title": "Text", "6_way_label": "Label"}
    )
    # NOTE(review): this fills missing values in BOTH columns, so a missing label
    # would become the string " " — confirm the label column has no gaps.
    frame = frame.fillna(" ")

    # Only the "train" side of the split is kept; stratifying on the label keeps
    # the class proportions of the full file in the subsample.
    texts, _, labels, _ = train_test_split(
        frame['Text'],
        frame['Label'],
        test_size=discard_fraction,
        random_state=seed,
        stratify=frame['Label'],
    )
    return texts, labels


X_train, y_train = _load_split("multimodal_train.tsv")
X_val, y_val = _load_split("multimodal_validate.tsv")
X_test, y_test = _load_split("multimodal_test_public.tsv")

print('Tamaño conjunto de Entrenamiento:', len(X_train))
print('Tamaño conjunto de Validación:', len(X_val))
print('Tamaño conjunto de Evaluación:', len(X_test))

Tamaño conjunto de Entrenamiento: 28200
Tamaño conjunto de Validación: 2967
Tamaño conjunto de Evaluación: 2965


# Limpieza y Representación de Textos

In [4]:
### TEXT CLEANING ###
stopwords_en = stopwords.words("english")

# Built once at cell level instead of on every call / every token.
_STOPWORDS_EN = frozenset(stopwords_en)   # set membership instead of a list scan
_STEMMER = PorterStemmer()
_ALPHA_TOKEN = re.compile('^[a-zA-Z]+$')  # letters only: rejects digits and symbols
_MIN_TOKEN_LENGTH = 2


def clean_text(text):
    """Normalize a text into a list of stemmed, alphabetic tokens.

    Steps, in order: lowercase, tokenize, drop English stop words, stem each
    remaining token, then keep only stems that have at least
    ``_MIN_TOKEN_LENGTH`` characters and consist solely of ASCII letters.

    Args:
        text: Input value; it is converted with ``str()`` before processing.

    Returns:
        List of the stems that passed the filters, in their original order.
    """
    tokens = word_tokenize(str(text).lower())
    # Stop words are removed before stemming, so the comparison is done on the
    # surface form of each word.
    stems = (_STEMMER.stem(token) for token in tokens if token not in _STOPWORDS_EN)
    return [
        stem for stem in stems
        if len(stem) >= _MIN_TOKEN_LENGTH and _ALPHA_TOKEN.match(stem)
    ]

# Keep the cleaned token lists. The previous loop only rebound its own loop
# variable, so every result of clean_text was thrown away.
# X_train itself is deliberately left untouched: the datasets built later in
# this notebook are created from the raw titles in X_train.
X_train_clean = [clean_text(text) for text in X_train]

# Data Augmentation

No es necesario ya que el dataset es suficientemente grande

# Label Encoding

No es necesario hacer un encoding ya que viene etiquetado de manera numérica

# Crear Diccionario

In [5]:
# One (split name, labels, texts) triple per partition; each becomes a Dataset
# with a 'Label' and a 'Text' column.
_split_columns = (
    ('train', y_train, X_train),
    ('val', y_val, X_val),
    ('test', y_test, X_test),
)
d = {
    split_name: Dataset.from_dict({'Label': labels, 'Text': texts})
    for split_name, labels, texts in _split_columns
}

# Group the three splits under a single DatasetDict.
dict_dataset = DatasetDict(d)

# Tokenization

In [6]:
# Number of tokens every title is padded / truncated to by `tokenize`.
MAX_LENGTH = 28

# Checkpoint name used for the tokenizer here and for the classifier further down.
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [7]:
def tokenize(examples):
    """Tokenize the "Text" field of `examples` to exactly MAX_LENGTH tokens.

    Longer inputs are truncated and shorter ones padded, so every encoded
    example has the same length.
    """
    texts = examples["Text"]
    return tokenizer(
        texts,
        max_length=MAX_LENGTH,
        truncation=True,
        padding="max_length",
    )


# batched=True: `tokenize` is called on batches of examples rather than one by one.
data_encodings = dict_dataset.map(tokenize, batched=True)
data_encodings

Map:   0%|          | 0/28200 [00:00<?, ? examples/s]

Map:   0%|          | 0/2967 [00:00<?, ? examples/s]

Map:   0%|          | 0/2965 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Label', 'Text', 'input_ids', 'attention_mask'],
        num_rows: 28200
    })
    val: Dataset({
        features: ['Label', 'Text', 'input_ids', 'attention_mask'],
        num_rows: 2967
    })
    test: Dataset({
        features: ['Label', 'Text', 'input_ids', 'attention_mask'],
        num_rows: 2965
    })
})

# Modelo

Necesitamos hacer algunas modificaciones para preparar el dataset para el modelo:

- Eliminamos la columna 'Text', porque es un campo que el modelo no espera.
- Renombramos 'Label' a 'labels', porque es el nombre que espera el modelo.
- El dataset debe devolver un objeto Tensor en lugar de una lista.

Para pasarle los datos al modelo debemos guardarlos en objetos DataLoader

In [8]:
# Drop the raw text column and rename the target column to 'labels'.
data_encodings = (
    data_encodings
    .remove_columns('Text')
    .rename_column('Label', 'labels')
)
# Switch the output format to torch; the call's return value is not needed.
data_encodings.set_format("torch")

# Batches of 8 examples; only the training loader shuffles.
val_dataloader = DataLoader(data_encodings['val'], batch_size=8)
train_dataloader = DataLoader(data_encodings['train'], batch_size=8, shuffle=True)

Para definir el modelo, hay que establecer el número de etiquetas:

In [9]:
# Size of the classification head: one output per class of the 6-way label.
NUM_LABELS = 6
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=NUM_LABELS,
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.weight', 'pre_classifier.

Como optimizador vamos a utilizar el optimizador Adam, implementado en AdamW, y definimos el learning rate:

In [10]:
# AdamW over all model parameters with a learning rate of 5e-5.
# NOTE(review): the Trainer built later in this notebook is created without an
# `optimizers=` argument, so this object does not appear to be used by it — confirm.
optimizer = AdamW(model.parameters(), lr=5e-5)

# Hiperparámetros

In [11]:
#!pip uninstall -y accelerate
!pip install accelerate
args = TrainingArguments(output_dir="./outputs")
args.evaluation_strategy="epoch"
args.per_device_train_batch_size = 32
args.per_device_eval_batch_size = 32
args.num_train_epochs = 5

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Métrica

In [12]:
def compute_metrics(pred):
    """Score a batch of model predictions against the gold labels.

    Args:
        pred: Object exposing ``label_ids`` (gold labels) and ``predictions``
            (per-class scores; the highest-scoring class is taken as the
            predicted label).

    Returns:
        Dict with keys 'accuracy', 'f1', 'precision' and 'recall'. The last
        three are macro-averaged across classes.
    """
    gold = pred.label_ids
    predicted = pred.predictions.argmax(-1)

    metrics = {'accuracy': accuracy_score(gold, predicted)}

    # Several classes are involved, so the per-class scores are macro-averaged.
    macro_precision, macro_recall, macro_f1, _support = precision_recall_fscore_support(
        gold, predicted, average='macro'
    )
    metrics['f1'] = macro_f1
    metrics['precision'] = macro_precision
    metrics['recall'] = macro_recall
    return metrics

# Training

In [13]:
trainer = Trainer(
    # model to be fine-tuned
    model=model,
    # hyperparameters
    args=args,
    # training split
    train_dataset=data_encodings['train'],
    # validation split
    eval_dataset=data_encodings['val'],
    # function used to compute the reported metrics
    compute_metrics=compute_metrics,
)


In [14]:
# Run the fine-tuning loop with the model, arguments and datasets given to `trainer`.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.9387,0.76774,0.734749,0.527394,0.693151,0.490408
2,0.541,0.750005,0.762049,0.584488,0.699544,0.549492
3,0.2949,0.886196,0.738456,0.616817,0.639302,0.608762
4,0.1544,1.031743,0.752275,0.618609,0.665317,0.592945
5,0.0909,1.159266,0.751264,0.625871,0.66728,0.599127


TrainOutput(global_step=4410, training_loss=0.3845897479933135, metrics={'train_runtime': 441.397, 'train_samples_per_second': 319.44, 'train_steps_per_second': 9.991, 'total_flos': 1021520696112000.0, 'train_loss': 0.3845897479933135, 'epoch': 5.0})

In [15]:
# Evaluate the trained model; no dataset is passed here, and `trainer` was
# configured with the validation split as its eval_dataset.
trainer.evaluate()

{'eval_loss': 1.159266471862793,
 'eval_accuracy': 0.7512639029322548,
 'eval_f1': 0.6258709358119409,
 'eval_precision': 0.6672795389464997,
 'eval_recall': 0.5991271471292088,
 'eval_runtime': 2.5066,
 'eval_samples_per_second': 1183.685,
 'eval_steps_per_second': 37.102,
 'epoch': 5.0}

# Evaluación

In [16]:
def get_prediction(text):
    """Return the index of the class the model predicts for a single text.

    The text is tokenized with the same settings as `tokenize` (padding and
    truncation to MAX_LENGTH), so inference sees inputs shaped like the ones
    used for training and validation.

    Args:
        text: A single input string.

    Returns:
        The predicted class index as a Python int.
    """
    # Inference only: make sure dropout is disabled.
    model.eval()
    inputs = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,  # without this, padding/truncation use the tokenizer default length
        return_tensors="pt",
    ).to(model.device)          # follow the model's device instead of assuming CUDA
    # No gradients are needed for prediction; skipping them saves memory and time.
    with torch.no_grad():
        logits = model(**inputs)[0]
    # Softmax is monotonic, so the argmax of the logits is already the most
    # probable class; .item() unwraps the one-element tensor.
    return logits.argmax(dim=-1).item()

In [17]:
# Predict every test title one by one, then print the per-class report.
test_split = dict_dataset['test']
y_pred = [get_prediction(title) for title in test_split['Text']]

report = classification_report(
    y_true=test_split['Label'],
    y_pred=y_pred,
    target_names=['0', '1', '2', '3', '4', '5'],
)
print(report)

              precision    recall  f1-score   support

           0       0.78      0.84      0.81      1175
           1       0.63      0.45      0.52       176
           2       0.63      0.62      0.62       565
           3       0.42      0.34      0.38        61
           4       0.78      0.79      0.79       873
           5       0.71      0.58      0.64       115

    accuracy                           0.74      2965
   macro avg       0.66      0.60      0.63      2965
weighted avg       0.73      0.74      0.73      2965

