# Fine-tuning of a BERT model para hacer sentence classification


Clasificar las thesis en subjects


In [34]:
!pip install datasets --quiet
!pip install accelerate -U --quiet

In [1]:
import datasets
import transformers

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import random
import json
import pickle

In [3]:
# Set the seed for reproducibility
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
transformers.set_seed(seed)

In [4]:
# Check if GPU is available and set device accordingly
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [5]:
# carpeta para guardar las predicciones
results_path = "./Predictions/"

# carpeta donde se toman los datos
carpeta_datos = ".\\Data\\Text_classification_data\\"
carpeta_datos = ".\\"

# nombre archivos de datos
nombre_train = "train_df.csv"
nombre_val = "val_df.csv"

In [40]:
# nombre de carpeta donde se guarda toda la info del entrenamiento
model_output_dir = "./Models/BERT_fine_tuning_info/"

# path para guardar el modelo
model_path = "./Models/BERT/"

Opciones de modelos bert

https://huggingface.co/google-bert/bert-base-multilingual-cased
    
    
https://huggingface.co/google-bert/bert-base-uncased

Considerar tambien usar distilbert:

https://huggingface.co/distilbert/distilbert-base-multilingual-cased

In [41]:
# delimitar el nombre del modelo a usar
nombre_modelo_bert = "google-bert/bert-base-multilingual-uncased"
#nombre_modelo_bert = "bert-base-uncased"

## Hyper parametros

In [42]:
# archivo donde se ponen los hyperparameters
hyperparameters_file = "./Hyperparameters/Bert.csv"

In [67]:
LEARNING_RATE = 5e-5            # 5e-5, 3e-5, 2e-5
WEIGHT_DECAY = 0.01             # 0.01, 0.1
ADAM_EPSILON = 1e-8             # 1e-8
CLASS_WEIGHTS = None      # "balanced" para tener pesos en las clases, None para tener pesos uniformes
NUM_EPOCHS = 5

# Cargar datos y procesar

In [68]:
# cargar dfs
df_train = pd.read_csv( nombre_train, encoding='utf-8-sig', index_col=0)
df_val = pd.read_csv( nombre_val, encoding='utf-8-sig', index_col=0)
#df_test = pd.read_csv(carpeta_datos + nombre_test, encoding='utf-8-sig', index_col=0)

# ver numero de datos
print(f"Datos de train: {df_train.shape[0]}")
print(f"Datos de val: {df_val.shape[0]}")
#print(f"Datos de test: {df_test.shape[0]}")

Datos de train: 141345
Datos de val: 15705


In [69]:
df_train.head()

Unnamed: 0,thesis,subject
287537,Analyzing undergraduate admissions criteria (t...,"91—Game theory, economics, social and behavior..."
123686,Aproximações markovianas e reamostragem para c...,62—Statistics
288977,Imperfect information in spatial elections: An...,"91—Game theory, economics, social and behavior..."
153883,A Refined Gross-Prasad Conjecture for Unitary ...,11—Number theory
297075,Real-time Detection and Suppression of Malicio...,68—Computer science


In [70]:
# tomar todas las etiquietas
subjects_datos = df_train['subject'].values

# tomar todas las etiquetas unicas en df de entrenamiento
subjects_unicos = sorted(list(df_train['subject'].unique()))

# ver cuantos son
num_clases = len(subjects_unicos)

print(f"Se tienen {num_clases} subjects diferentes")

Se tienen 63 subjects diferentes


In [71]:
# hacer el mapeo de indice a subject y viceversa

encode_subj_2_idx = {subj:idx for idx, subj in enumerate(subjects_unicos)}
decode_idx_2_subj = {idx:subj for idx, subj in enumerate(subjects_unicos)}

In [72]:
# mapear los subjects con sus respectivos indices

df_train['indice_subject'] = df_train['subject'].map(lambda x: encode_subj_2_idx[x])
df_val['indice_subject'] = df_val['subject'].map(lambda x: encode_subj_2_idx[x])
#df_test['indice_subject'] = df_test['subject'].map(lambda x: encode_subj_2_idx[x])

In [73]:
# unicamente conservar columnas de interes en ambos df
df_train = df_train[["thesis", "indice_subject"]]
df_val = df_val[["thesis", "indice_subject"]]
#df_test = df_test[["thesis", "indice_subject"]]

In [74]:
df_train.head()

Unnamed: 0,thesis,indice_subject
287537,Analyzing undergraduate admissions criteria (t...,58
123686,Aproximações markovianas e reamostragem para c...,44
288977,Imperfect information in spatial elections: An...,58
153883,A Refined Gross-Prasad Conjecture for Unitary ...,6
297075,Real-time Detection and Suppression of Malicio...,46


# Procesar los datasets

In [75]:
from datasets import Dataset


# poner los datos en formato para ser usandos por un modelo de hugging face
train_dataset_text = Dataset.from_pandas(df_train)
val_dataset_text = Dataset.from_pandas(df_val)
#test_dataset_text = Dataset.from_pandas(df_test)

# renombrar las columnas de los datasets
train_dataset_text = train_dataset_text.rename_columns({"thesis": "text", "indice_subject": "label"})
val_dataset_text = val_dataset_text.rename_columns({"thesis": "text", "indice_subject": "label"})
#test_dataset_text = test_dataset_text.rename_columns({"thesis": "text", "indice_subject": "label"})


# ver el numero de datos en cada categoria
print(f"Train data: {train_dataset_text.num_rows}")
print(f"Val data:   {val_dataset_text.num_rows}")
#print(f"Test data:  {test_dataset_text.num_rows}")

Train data: 141345
Val data:   15705


In [76]:
# ver ejemplos
print(train_dataset_text[51])
print(val_dataset_text[10])

{'text': 'Three Essays in Matching Mechanism Design', 'label': 58, '__index_level_0__': 219382}
{'text': 'Essays in East African Development', 'label': 58, '__index_level_0__': 237324}


In [77]:
from transformers import BertTokenizer
from transformers import DataCollatorWithPadding


# inicializar el tokenizador
tokenizer = BertTokenizer.from_pretrained(nombre_modelo_bert)

# hacer el data collector con el tokenizador
# este se va a usar para el entrenamiento
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# funcion de procesamiento con truncation y padding
def preprocess_function(examples):

    # tomar el texto
    texts = examples['text']

    return tokenizer(texts, truncation=True, padding=True)



In [78]:
# ejemplo de la funcion de preprocesamiento
textos = train_dataset_text[10:12]
print(textos)

# ver el resultado
print(preprocess_function(textos))

{'text': ['Decay rates for the damped wave equation on the torus', 'Subsets of Groups Exhibiting Regularity in Differences'], 'label': [24, 3], '__index_level_0__': [265043, 257211]}
{'input_ids': [[101, 81960, 31178, 10139, 10103, 12235, 18490, 21560, 43008, 10125, 10103, 87797, 10107, 102], [101, 13241, 14639, 10107, 10108, 15468, 57930, 10285, 15430, 12705, 10104, 30980, 102, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]]}


In [79]:
# tokenizar los datasets
train_dataset = train_dataset_text.map(preprocess_function, batched=True)
val_dataset = val_dataset_text.map(preprocess_function, batched=True)
#test_dataset = test_dataset_text.map(preprocess_function, batched=True)

Map:   0%|          | 0/141345 [00:00<?, ? examples/s]

Map:   0%|          | 0/15705 [00:00<?, ? examples/s]

In [80]:
# ver datasets
print(train_dataset)
print(val_dataset)
#print(test_dataset)

Dataset({
    features: ['text', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 141345
})
Dataset({
    features: ['text', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 15705
})


# Cargar el Bert

In [81]:
from transformers import BertForSequenceClassification

# cargar el modelo bert, se sabe el numero de labels que se tiene
bert_model = BertForSequenceClassification.from_pretrained(nombre_modelo_bert,
                                                           num_labels= num_clases,
                                                           label2id = encode_subj_2_idx,
                                                           id2label = decode_idx_2_subj)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [82]:
'''
POR SI SE QUIERE USAR LORA
from peft import LoraConfig, PeftModelForSequenceClassification

# hacer la configuracion del Lora para el fine tuning
peft_config = LoraConfig(
    task_type = 'SEQ_CLS',
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="all",
    modules_to_save= ["classifier"],  # entrenar el clasificador
    use_rslora = True,
)

# hacer el modelo considerando lora
model = PeftModelForSequenceClassification(bert_model, peft_config)

# ver
model.print_trainable_parameters()
'''

'\nPOR SI SE QUIERE USAR LORA\nfrom peft import LoraConfig, PeftModelForSequenceClassification\n\n# hacer la configuracion del Lora para el fine tuning\npeft_config = LoraConfig(\n    task_type = \'SEQ_CLS\',\n    r=16,\n    lora_alpha=16,\n    lora_dropout=0.1,\n    bias="all",\n    modules_to_save= ["classifier"],  # entrenar el clasificador\n    use_rslora = True,\n)\n\n# hacer el modelo considerando lora\nmodel = PeftModelForSequenceClassification(bert_model, peft_config)\n\n# ver\nmodel.print_trainable_parameters()\n'

In [83]:
# usar el bert model para el fine tuning
model = bert_model

In [84]:
# mandar el modelo para correr en device
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

# Definir los parametros del entenamiento

In [85]:
from sklearn.metrics import f1_score

# hacer la funcion que calcula metricas
def compute_metrics(eval_pred):

    # separa los logits y el label
    logits, labels = eval_pred

    # usando los logits toma la prediccion
    predictions = np.argmax(logits, axis=-1)

    # calcula f1 macro weighted y micro
    f1_macro = f1_score(labels, predictions, average='macro')
    f1_weighted = f1_score(labels, predictions, average='weighted')
    f1_micro = f1_score(labels, predictions, average='micro')

    # poner las estadisticas en un diccionario
    dict_results = {
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted,
        'f1_micro': f1_micro,
    }

    return dict_results

In [86]:
from transformers import TrainingArguments

# delimitar hyperparametros para el entrenamiento
training_args = TrainingArguments(
    output_dir= model_output_dir,                    # output directory
    save_steps = 500,                                 # save checkpoints every x steps
    save_total_limit = 4,                            # save at most x checkpoints (keep the best)
    load_best_model_at_end = True,                   # load the best model found during training at the end of training
    metric_for_best_model = 'f1_macro',              # the best model according to this metric
    greater_is_better = True,                        # greater f1 is better
    num_train_epochs= NUM_EPOCHS,                    # total number of training epochs
    per_device_train_batch_size= 16,                 # batch size per device during training
    per_device_eval_batch_size= 64,                  # batch size for evaluation
    #logging_dir = carpeta_info +  'logs\\',         # logging directory
    logging_strategy = 'steps',                      # log every some number of steps
    logging_first_step = True,                       # log in the first step
    logging_steps = 500,                              # log every x steps, if smaller than 1, ratio of total training steps
    eval_strategy='steps',                           # evaluate every some number of steps
    # eval_steps= 10,                                # evaluate every x steps, Will default to the same value as logging_steps
    learning_rate= LEARNING_RATE,                    # learning rate
    weight_decay= WEIGHT_DECAY,                      # weight decay
    adam_epsilon = ADAM_EPSILON,                     # epsilon hyperparameter for the AdamW optimizer
    #use_cpu  = True,                                 # por desgracia, tengo que usar cpu (a veces)
)

In [87]:
import sklearn
from torch import nn

# Definir la funcion de perdida con pesos

# tomar todas las etiquetas para entrenamiento en un array
etiquetas_y = df_train['indice_subject'].values

# obtnener le peso de cada clase
# si se pasa None en class_weight entonces los pesos son uniformes
pesos_clases = sklearn.utils.class_weight.compute_class_weight(class_weight= CLASS_WEIGHTS,
                                                               classes= np.arange(bert_model.num_labels), # el numero de labels
                                                               y= etiquetas_y)

# convertir a tensor y mover al device
pesos_clases = torch.tensor(pesos_clases, dtype=torch.float).to(device)

# hacer la funcion de perdida con las pesos
weighted_criterion = nn.CrossEntropyLoss(weight=pesos_clases)

In [88]:
from transformers import Trainer

# hacer un trainer propio para personalizar la funcion de perdida

# Clase de trainer personalizado, heredar el Trainer
class CustomTrainer(Trainer):

  # constructor
  def __init__(self, *args, **kwargs):
    # poner los mismor argumentos que tendria un trainer normal
    super().__init__(*args, **kwargs)

  # personalizar la funcion de perdida
  def compute_loss(self, model, inputs, return_outputs = False):

    # mandar los inputs al device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # hacer forward en el modelo
    outputs = model(**inputs)

    # sacar los logits del output
    logits = outputs.logits

    # tomar las etiquietas correctas
    labels = inputs.pop("labels")

    # calcular el loss
    loss = weighted_criterion(logits, labels)

    # devolver outputs solo si se quiere
    return (loss, outputs) if return_outputs else loss

In [89]:
# crear el trainer, usar mi trainer personalizado
trainer = CustomTrainer(
    model= model,
    args= training_args,
    train_dataset= train_dataset,
    eval_dataset= val_dataset,
    compute_metrics= compute_metrics, # funcion especificada por mi
    data_collator= data_collator
)

# Entrenar el modelo

In [None]:
# hacer el entrenamiento
trainer.train()

Step,Training Loss,Validation Loss,F1 Macro,F1 Weighted,F1 Micro
500,2.6437,2.150465,0.126965,0.395648,0.457498


# Evaluar el modelo

In [None]:
# evaluar en los datos de entrenamiento
result_train = trainer.evaluate(eval_dataset= train_dataset)

# imrpimir lo importante
print(f"F1 macro:    {result_train['eval_f1_macro']}")
print(f"F1 weighted: {result_train['eval_f1_weighted']}")
print(f"F1 micro:    {result_train['eval_f1_micro']}")

In [None]:
# evaluar en los datos de validacion
result_val = trainer.evaluate(eval_dataset= val_dataset)

# imrpimir lo importante
print(f"F1 macro:    {result_val['eval_f1_macro']}")
print(f"F1 weighted: {result_val['eval_f1_weighted']}")
print(f"F1 micro:    {result_val['eval_f1_micro']}")

In [None]:
# funcion que toma textos y devuelve predicciones


def predecir_de_textos(lista_textos):

    # tokenizar los textos
    inputs = tokenizer(list(lista_textos), truncation=True, padding=True, return_tensors="pt")

    # mandarlos al gpu
    inputs.to(device)

    # evaluar el modelo
    with torch.no_grad():
        outputs = model(**inputs)

    # tomar las predicciones
    predictions_indices = np.argmax(outputs.logits.cpu(), axis=-1)

    # convertir a numpy
    predictions_indices = predictions_indices.numpy()

    # ver que subjects son estas predicciones
    predictions = [model.config.id2label[idx] for idx in predictions_indices]

    return predictions


In [None]:
# probar el modelo en textos arbitrarios
textos_ejemplos = np.array(["An Alternative Definition of Stable Models Via Lukasiewicz Logic",
                            "Energy of a graph and Randić index of subgraphs",
                            "Analyzing Mexico City's Airbnb Reviews During COVID-19: A Computational Framework Utilizing BERT Embeddings",
                            "A Probabilistic Model for Node Classification in Directed Graphs"])

predictions = predecir_de_textos(textos_ejemplos)
print(predictions)

# Salvar el modelo

In [18]:
# Set the path to your local folder with the model
model_path = "./Models/BERT"

# Load the tokenizer and model from the local directory
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

In [None]:
from transformers import pipeline

# cargar modelo y tokenizer
loaded_model =  BertForSequenceClassification.from_pretrained(model_path)
loaded_tokenizer = BertTokenizer.from_pretrained(model_path)

# hacer pipeline
pipeline_inferencia  = pipeline("text-classification", model = loaded_model, tokenizer = loaded_tokenizer)

# ejemlplos
print(pipeline_inferencia("An Alternative Definition of Stable Models Via Lukasiewicz Logic"))
print(pipeline_inferencia("Energy of a graph and Randić index of subgraphs"))
print(pipeline_inferencia("Analyzing Mexico City's Airbnb Reviews During COVID-19: A Computational Framework Utilizing BERT Embeddings"))
print(pipeline_inferencia("A Probabilistic Model for Node Classification in Directed Graphs"))

## Hacer predicciones para todos los datos

In [None]:
# cargar el df con todas las tesis
df_full = pd.read_csv("full_df.csv", encoding='utf-8-sig', index_col=0, na_values=None, keep_default_na=False)
print(f"Hacer predicciones en {df_full.shape[0]} datos")

In [None]:
# procesar
full_dataset_text = Dataset.from_pandas(df_full)

# renombrar las columnas
full_dataset_text = full_dataset_text.rename_columns({"thesis": "text"})

# tokenizar el datasets
full_dataset = full_dataset_text.map(preprocess_function, batched=True)

In [None]:
# hacer predicciones con el mejor modelo
y_full = trainer.predict(full_dataset)

In [None]:
# tomar los labels predichos, no toma la probabilidad
y_full_labels = np.argmax(y_full.predictions , axis=1)

# ver que subjects son estas predicciones
y_full_subjects =  [model.config.id2label[idx] for idx in y_full_labels]

# agregar las predicciones al df
df_full['prediction'] = y_full_subjects

# ver
df_full.head()

In [None]:
# poner las predicciones en un diccionario
# con formato correcto
predicciones_finales = {v: info_row['prediction']
                       for v, info_row in df_full.iterrows()}

# guardar estas predicciones
with open(results_path + 'BERT.pkl', 'wb') as f:
    pickle.dump(predicciones_finales, f)

## Actualizar reporte de hyperparametros

In [None]:
def update_hyperparameters(metric):
    '''
    Al finalizar el entrenamiento
    Guardar las metricas obtenidas para los hyperparametros usados
    '''

    # poner los hyperparametros junto con la metrica final obtenida
    dict_info = {
        "LEARNING_RATE" : LEARNING_RATE,
        "WEIGHT_DECAY" : WEIGHT_DECAY,
        "ADAM_EPSILON" : ADAM_EPSILON,
        "CLASS_WEIGHTS" : str(CLASS_WEIGHTS),
        "NUM_EPOCHS" : NUM_EPOCHS,
        "Val score" : metric
    }


    # hacer un df con esta info
    new_df = pd.DataFrame([dict_info])

    # intenta cargar un archivo existente
    try:

        # leer
        df = pd.read_csv(hyperparameters_file, na_values=None, keep_default_na=False)

        # agregar nueva fila, con la info de esta inferencia
        df = pd.concat([df, new_df], ignore_index=True)

        # quitar repetidos
        df = df.drop_duplicates()

        # ordenar segun la metrica
        df = df.sort_values(by=['Val score'], ascending=False)

        # guardar el archivo modificado
        df.to_csv(hyperparameters_file, index=False)

    # si no se puede abrir, no existe
    except FileNotFoundError:

        # guardar solo la info de esta iteracion
        new_df.to_csv(hyperparameters_file, index=False)

In [None]:
# actualizar el df, con el valor f1 macro en los datos de val
update_hyperparameters(result_val['eval_f1_macro'])