In [1]:
### IMPORTS ###
import pandas as pd
import numpy as np
import pathlib as pl
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import classification_report

!pip install datasets
from datasets.dataset_dict import DatasetDict
from datasets import Dataset

!pip install transformers
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import get_scheduler
from transformers import TrainingArguments

import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW

!pip install evaluate
import evaluate

from transformers import Trainer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# Mount Google Drive inside the Colab runtime so the CSV files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

# Base folder on the mounted Drive; later cells build file names by appending to it
# (e.g. PATH + "train.csv"), so the trailing slash is required.
PATH = "/content/drive/My Drive/PLN/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
### TRAINING, VALIDATION AND TEST SETS ###
# Load the annotated tweets and keep only the text and the regression target.
df = (
    pd.read_csv(PATH + "train.csv", sep=',', on_bad_lines='skip', encoding='utf-8', encoding_errors='ignore')
    .loc[:, ['tweet', 'mean_prejudice']]
    .rename(columns={"tweet": "Text", "mean_prejudice": "Label"})
    # A row without a target cannot be used for regression. Filling it with " "
    # (as a frame-wide fillna would do) turns the numeric column into mixed
    # str/float values, so such rows are dropped instead.
    .dropna(subset=['Label'])
    # A missing tweet becomes a single space so the tokenizer always receives a string.
    .assign(Text=lambda frame: frame['Text'].fillna(" "))
)

# 70 % of the rows go to training; the remaining 30 % is split again into
# validation (34 % of it) and test (66 % of it). Both splits use the same seed.
X_train, X_aux, y_train, y_aux = train_test_split(df['Text'], df['Label'], test_size=0.3, random_state=55)
X_val, X_test, y_val, y_test = train_test_split(X_aux, y_aux, test_size=0.66, random_state=55)

# Give every split a clean 0..n-1 index (previously only the training split was reset).
X_train, y_train, X_val, y_val, X_test, y_test = (
    part.reset_index(drop=True)
    for part in (X_train, y_train, X_val, y_val, X_test, y_test)
)

print('Tamaño conjunto de Entrenamiento:', len(X_train))
print('Tamaño conjunto de Validación:', len(X_val))
print('Tamaño conjunto de Evaluación:',len(X_test))

# Wrap the three splits as Hugging Face datasets with 'Label' and 'Text' columns.
d = {'train':Dataset.from_dict({'Label':y_train, 'Text':X_train}),
     'val':Dataset.from_dict({'Label':y_val, 'Text':X_val}),
     'test':Dataset.from_dict({'Label':y_test, 'Text':X_test})
     }

dict_dataset = DatasetDict(d)

Tamaño conjunto de Entrenamiento: 1869
Tamaño conjunto de Validación: 272
Tamaño conjunto de Evaluación: 530


# Tokenization

In [4]:
# Hugging Face checkpoint name; the same name is reused later when the model itself is loaded.
model_name ='bertin-project/bertin-roberta-base-spanish'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Fixed sequence length in tokens: tokenize() and get_prediction() pad and truncate every text to this size.
MAX_LENGTH = 55

In [5]:
def tokenize(examples):
    """Tokenize the ``"Text"`` entry of ``examples`` with the notebook's tokenizer.

    Every text is padded to, and truncated at, ``MAX_LENGTH`` tokens.
    Returns whatever the tokenizer call returns for that input.
    """
    encoding_options = dict(padding="max_length", truncation=True, max_length=MAX_LENGTH)
    return tokenizer(examples["Text"], **encoding_options)


# Tokenize the train/val/test splits in batches; the tokenizer outputs are added as new columns.
data_encodings = dict_dataset.map(tokenize, batched=True)
data_encodings

Map:   0%|          | 0/1869 [00:00<?, ? examples/s]

Map:   0%|          | 0/272 [00:00<?, ? examples/s]

Map:   0%|          | 0/530 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Label', 'Text', 'input_ids', 'attention_mask'],
        num_rows: 1869
    })
    val: Dataset({
        features: ['Label', 'Text', 'input_ids', 'attention_mask'],
        num_rows: 272
    })
    test: Dataset({
        features: ['Label', 'Text', 'input_ids', 'attention_mask'],
        num_rows: 530
    })
})

# Modelo

Necesitamos hacer algunas modificaciones para preparar el dataset para el modelo:

- Eliminamos la columna 'Text', porque es un campo que el modelo no espera.
- Renombramos 'Label' a 'labels', porque es el nombre que espera el modelo.
- El dataset debe devolver un objeto Tensor en lugar de una lista.

Para pasarle los datos al modelo debemos guardarlos en objetos DataLoader

In [6]:
# Keep only what the model consumes: drop the raw text and rename the target column to 'labels'.
data_encodings = (
    data_encodings
    .remove_columns('Text')
    .rename_column('Label', 'labels')
)
# Make the datasets return torch tensors instead of Python lists.
data_encodings.set_format("torch")

# Batches of 8 examples; only the training loader shuffles.
train_dataloader = DataLoader(data_encodings['train'], batch_size=8, shuffle=True)
val_dataloader = DataLoader(data_encodings['val'], batch_size=8)

Para definir el modelo, hay que establecer el número de etiquetas:

In [7]:
# A single output unit: the head emits one continuous score per input text
# (the evaluation log further down reports a loss identical to the MSE).
NUM_LABELS = 1
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=NUM_LABELS) 

Some weights of the model checkpoint at bertin-project/bertin-roberta-base-spanish were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at bertin-project/bertin-roberta-base-spanish and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier

Como optimizador vamos a utilizar AdamW (la variante de Adam con decaimiento de pesos desacoplado), y definimos el learning rate:

In [8]:
# NOTE(review): this optimizer is never handed to the Trainer created further down
# (no `optimizers=` argument there), so it does not appear to influence training —
# confirm whether it should be passed in or removed.
optimizer = AdamW(model.parameters(), lr=5e-5)

# Hiperparámetros

In [9]:
#!pip uninstall -y transformers accelerate
#!pip install transformers accelerate

In [10]:
#!pip uninstall -y transformers accelerate
#!pip install transformers==4.28.0 accelerate

# Training hyperparameters. They are passed to the constructor rather than
# assigned afterwards, so TrainingArguments can validate and normalize them
# when the object is built.
args = TrainingArguments(
    output_dir="./outputs",           # checkpoints and logs are written here
    evaluation_strategy="epoch",      # run evaluation at the end of every epoch
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
)

# Métrica

In [11]:
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_error, mean_absolute_error
from scipy import stats

def compute_metrics(eval_pred):
    """Compute regression metrics for one evaluation pass.

    Parameters
    ----------
    eval_pred : pair ``(logits, labels)``
        Model predictions and reference values. Both are flattened to 1-D
        before any arithmetic, so a ``(N, 1)`` prediction array and a ``(N,)``
        label array line up element by element.

    Returns
    -------
    dict
        ``{"mse", "rmse", "mae", "r2", "smape"}``; ``smape`` is a percentage.
    """
    logits, labels = eval_pred
    # Flatten both sides: mixing (N, 1) with (N,) would silently broadcast to (N, N).
    logits = np.asarray(logits).reshape(-1)
    labels = np.asarray(labels).reshape(-1)

    # loss metrics
    mse = mean_squared_error(labels, logits)
    # Square root of the MSE instead of `squared=False`, which newer
    # scikit-learn releases no longer accept; print_metrics does the same.
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(labels, logits)

    # SMAPE: a term whose prediction and label are both exactly 0 has a 0/0
    # ratio; it counts as 0 error instead of turning the whole metric into NaN.
    denominator = np.abs(labels) + np.abs(logits)
    ratio = np.divide(
        2 * np.abs(logits - labels),
        denominator,
        out=np.zeros_like(denominator, dtype=float),
        where=denominator != 0,
    )
    smape = np.mean(ratio) * 100

    # performance metrics
    r2 = r2_score(labels, logits)

    # we return a dictionary with all metrics
    return {"mse": mse, "rmse": rmse, "mae": mae, "r2": r2, "smape": smape}

# Training

In [12]:
# Bundle the model, hyperparameters, data splits and metric function into a Trainer.
trainer = Trainer(
    model=model,            # model to be fine-tuned
    args = args,     # hyperparameters
    train_dataset=data_encodings['train'], # training set
    eval_dataset=data_encodings['val'],   # validation set
    compute_metrics=compute_metrics,    # function used to compute the metrics
)


In [13]:
# Fine-tune the model; the table below shows the validation metrics logged after each epoch.
trainer.train()



Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2,Smape
1,No log,0.602902,0.602902,0.776468,0.63545,-0.016114,22.117471
2,No log,0.557228,0.557228,0.746477,0.592072,0.060864,20.701308
3,No log,0.423182,0.423182,0.650524,0.514107,0.286782,18.187114
4,No log,0.413525,0.413525,0.643059,0.506035,0.303058,18.046087
5,No log,0.422433,0.422433,0.649949,0.513275,0.288044,18.243769


TrainOutput(global_step=295, training_loss=0.6172497248245498, metrics={'train_runtime': 100.7437, 'train_samples_per_second': 92.76, 'train_steps_per_second': 2.928, 'total_flos': 264123614219850.0, 'train_loss': 0.6172497248245498, 'epoch': 5.0})

In [14]:
# Evaluate the fine-tuned model on the Trainer's eval_dataset (the validation split).
trainer.evaluate()

{'eval_loss': 0.42243313789367676,
 'eval_mse': 0.42243313789367676,
 'eval_rmse': 0.6499485373497009,
 'eval_mae': 0.5132746696472168,
 'eval_r2': 0.28804357926665425,
 'eval_smape': 18.243769028607538,
 'eval_runtime': 0.8196,
 'eval_samples_per_second': 331.867,
 'eval_steps_per_second': 10.981,
 'epoch': 5.0}

# Evaluación

In [15]:
def get_prediction(text):
    """Return the model's predicted score for a single text as a Python float.

    The text is tokenized exactly as during training (padded and truncated to
    ``MAX_LENGTH`` tokens) and moved to whatever device the model lives on, so
    the function works on CPU as well as on GPU.
    """
    inputs = tokenizer(
        text,
        max_length=MAX_LENGTH,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    ).to(model.device)  # follow the model instead of assuming a CUDA device

    # Inference only: switch off dropout and skip building the autograd graph.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    # The first element of the output holds the single predicted value; .item() unwraps it.
    return outputs[0].item()

# Predict the test tweets one at a time, in the order they appear in the test split.
y_pred=[get_prediction(text) for text in dict_dataset['test']['Text']]

In [16]:
def print_metrics(y_test, y_pred, lang=''):
    """Print one Markdown table row of regression metrics for ``y_pred`` against ``y_test``.

    The row holds MSE, RMSE, MAE, R2, SMAPE (as a percentage) and the Pearson
    correlation, each formatted with two decimals. When ``lang`` is empty the
    table header and separator lines are printed first; otherwise only the row
    is printed, with ``lang`` in its first column.
    """
    # The header is only emitted for the unlabeled (overall) row.
    if not lang:
        print('|   |MSE|RMSE|MAE|R2|SMAPE|PEARSON|')
        print('|---|---|---|---|---|---|---|')

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Symmetric MAPE, computed from the element-wise errors.
    errors = [target - estimate for target, estimate in zip(y_test, y_pred)]
    smape = 1/len(y_test) * np.sum(2 * np.abs(errors) / (np.abs(y_test) + np.abs(y_pred))*100)

    pearson = stats.pearsonr(y_test, y_pred)[0]

    # Build the row: label cell first, then the six metrics with two decimals.
    cells = ["{:.2f}".format(value) for value in (mse, rmse, mae, r2, smape, pearson)]
    print('|' + lang + '|' + '|'.join(cells) + '|')



In [17]:
# Report the metrics of the per-tweet predictions against the held-out test labels.
print_metrics(y_test, y_pred)

|   |MSE|RMSE|MAE|R2|SMAPE|PEARSON|
|---|---|---|---|---|---|---|
||0.48|0.69|0.54|0.27|19.02|0.53|


# Resultados Finales

In [19]:
# Load the unlabeled tweets that have to be scored for the final submission.
df_test = pd.read_csv(PATH + "test.csv",  sep=',', on_bad_lines='skip', encoding='utf-8', encoding_errors='ignore', index_col=False)
df_test = df_test[['index', 'tweet']]

# Same placeholder as in the training data: a missing tweet becomes a single
# space so the tokenizer always receives a string.
d_final = {'Final':Dataset.from_dict({'ID':df_test['index'], 'Text':df_test['tweet'].fillna(" ")})}
dict_dataset_final = DatasetDict(d_final)

# Run inference before touching the output file, so a failure during
# prediction cannot leave a partially written file behind.
final_predictions = [get_prediction(text) for text in dict_dataset_final['Final']['Text']]

# "w" rather than "a": re-running this cell must replace the previous results
# instead of appending a second copy. The context manager closes the file
# even if a write fails.
with open("Salida_Tarea3.txt", "w") as file:
    file.writelines(str(result) + "\n" for result in final_predictions)

final_predictions

[3.2164525985717773,
 3.871439218521118,
 3.1323091983795166,
 2.7728374004364014,
 3.8060803413391113,
 2.759145975112915,
 3.136350154876709,
 3.1202423572540283,
 3.485654592514038,
 2.871103525161743,
 2.4517650604248047,
 3.7380475997924805,
 3.460862636566162,
 3.1160385608673096,
 3.277242422103882,
 2.224963903427124,
 3.0722625255584717,
 3.0218021869659424,
 2.555720090866089,
 3.284144163131714,
 3.0717554092407227,
 3.5350263118743896,
 3.215956211090088,
 3.4929890632629395,
 2.499182939529419,
 2.835115671157837,
 2.5954301357269287,
 3.3513455390930176,
 3.3435006141662598,
 4.062155246734619,
 3.064563751220703,
 4.1803975105285645,
 3.3906049728393555,
 4.030496120452881,
 3.2520525455474854,
 3.782454252243042,
 3.5768065452575684,
 3.5394630432128906,
 3.7723214626312256,
 2.861758232116699,
 4.225076198577881,
 3.636939764022827,
 2.9226744174957275,
 3.1850221157073975,
 2.4847660064697266,
 4.230284690856934,
 3.35744309425354,
 4.385348320007324,
 3.3574430942535