In [1]:
import transformers
print(transformers.__version__)

4.57.1


In [2]:
import os
import sys
PATH = os.getcwd()
DIR_DATA = PATH + '{0}data{0}'.format(os.sep)
sys.path.append(PATH) if PATH not in list(sys.path) else None

In [3]:
os.makedirs("./logs", exist_ok=True)
os.makedirs("./results", exist_ok=True)

# Paso 1: Importación de librerías

In [4]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments 
from datasets import Dataset
import numpy as np




# Paso 2: Carga y limpieza del dataset

In [5]:
df = pd.read_csv(DIR_DATA + "dataset_sentimientos_500.csv")

In [6]:
df.columns = df.columns.str.strip()

In [7]:
df = df[['Reseña', 'Sentimiento']].dropna()
df['Sentimiento'] = df['Sentimiento'].map({'Positiva': 1, 'Negativa': 0})

In [8]:
df

Unnamed: 0,Reseña,Sentimiento
0,"Estoy feliz con mi compra, funciona perfecto.",1
1,"Estoy feliz con mi compra, funciona perfecto.",1
2,Recomiendo este servicio sin dudarlo.,1
3,"Muy recomendable, volveré a comprar.",1
4,"Muy recomendable, volveré a comprar.",1
...,...,...
495,"Muy recomendable, volveré a comprar.",1
496,No funciona como se esperaba.,0
497,Buen precio y envío rápido.,1
498,Buen precio y envío rápido.,1


# Paso 3: Separación en entrenamiento y prueba

In [9]:
train_texts, test_texts, train_labels, test_labels = train_test_split(
df['Reseña'].tolist(), df['Sentimiento'].tolist(), test_size=0.2, random_state=42)

# Paso 4: Tokenización

In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128) 

test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128) 

# Paso 5: Creación de los datasets formales 

In [11]:
train_dataset = Dataset.from_dict({ 

    'input_ids': train_encodings['input_ids'], 

    'attention_mask': train_encodings['attention_mask'], 

    'labels': train_labels 

}) 

test_dataset = Dataset.from_dict({ 

    'input_ids': test_encodings['input_ids'], 

    'attention_mask': test_encodings['attention_mask'], 

    'labels': test_labels 

})

# Paso 6: Definición de métricas 

In [12]:
def compute_metrics(eval_pred): 

    logits, labels = eval_pred 

    preds = np.argmax(logits, axis=-1) 

    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary') 

    acc = accuracy_score(labels, preds) 

    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall} 

# Paso 7: Carga del modelo preentrenado

In [13]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2) 

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Paso 8: Configuración del entrenamiento 

In [15]:
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    save_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True
)

ValueError: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: IntervalStrategy.NO
- Save strategy: SaveStrategy.STEPS

# Paso 9: Entrenamiento del modelo 

In [None]:
trainer = Trainer( 

    model=model, 

    args=training_args, 

    train_dataset=train_dataset, 

    eval_dataset=test_dataset, 

    compute_metrics=compute_metrics 

) 

trainer.train() 

# Paso 10: Evaluación final del modelo

In [None]:
results = trainer.evaluate() 

print("Resultados:", results) 