In [1]:
# =============================
# 1. Importar librerías
# =============================
import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder

In [3]:
# =============================
# 2. Preparar dataset Kaggle
# =============================
ratings = pd.read_csv("./Dataset/rating.csv")
anime = pd.read_csv("./Dataset/anime.csv")

# Unir datasets
merged = ratings.merge(anime, on="anime_id")

# Mapear rating a sentimiento
def map_sentiment(score):
    if score <= 4:
        return "negative"
    elif score <= 6:
        return "neutral"
    else:
        return "positive"

merged["sentiment"] = merged["rating_x"].apply(map_sentiment)

# Crear dataset reducido (20K para pruebas)
df = merged[["name","sentiment"]].dropna().sample(20000, random_state=42)
df = df.rename(columns={"name": "text"})

# Convertir labels de texto a números
le = LabelEncoder()
df["labels"] = le.fit_transform(df["sentiment"])

# Guardar dataset procesado
df.to_csv("anime_sentiment.csv", index=False)
print("✅ Dataset procesado y guardado: anime_sentiment.csv")


✅ Dataset procesado y guardado: anime_sentiment.csv


In [4]:
# =============================
# 3. Convertir a Hugging Face Dataset
# =============================
dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.2)

In [5]:
# =============================
# 4. Tokenización
# =============================
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize(batch):
    tokens = tokenizer(batch["text"], padding=True, truncation=True)
    tokens["labels"] = batch["labels"]
    return tokens

dataset = dataset.map(tokenize, batched=True)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

In [12]:
# =============================
# 5. Preparar modelo
# =============================
num_labels = len(le.classes_)

label2id = {"negative": 0, "neutral": 1, "positive": 2}
id2label = {0: "negative", 1: "neutral", 2: "positive"}

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
    id2label = id2label,
    label2id = label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# =============================
# 6. Configuración de entrenamiento
# =============================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    learning_rate=1e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    push_to_hub=False
)

def compute_metrics(eval_pred):
    labels = eval_pred.label_ids
    preds = eval_pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted")
    }

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [18]:
# =============================
# 7. Entrenamiento
# =============================
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7687,0.852002,0.6705,0.563699
2,0.7822,0.855821,0.67275,0.562061
3,0.7636,0.863809,0.67175,0.56586
4,0.7316,0.879166,0.67075,0.568795
5,0.7475,0.886821,0.65225,0.576146


TrainOutput(global_step=5000, training_loss=0.7706249877929687, metrics={'train_runtime': 496.148, 'train_samples_per_second': 161.242, 'train_steps_per_second': 10.078, 'total_flos': 827397847995840.0, 'train_loss': 0.7706249877929687, 'epoch': 5.0})

In [19]:
# =============================
# 8. Guardar modelo entrenado
# =============================
trainer.save_model("anime-sentiment-model")
tokenizer.save_pretrained("anime-sentiment-model")
print("✅ Modelo guardado en ./anime-sentiment-model")

✅ Modelo guardado en ./anime-sentiment-model


In [None]:
# =============================
# 9. Probar el modelo
# =============================
from transformers import pipeline

classifier = pipeline("sentiment-analysis", model="anime-sentiment-model", tokenizer="anime-sentiment-model")

examples = [
    "Attack on Titan is a masterpiece!",
    "Boruto feels too slow and boring...",
    "Naruto Shippuden has good and bad arcs"
]

for text in examples:
    result = classifier(text)[0]
    print(f"Texto: {text}\n → Predicción: {result['label']} (confianzaa {result['score']:.2f})\n")

Device set to use cuda:0


Texto: Attack on Titan es una basura!
 → Predicción: positive (confianzaa 0.78)

Texto: Boruto feels too slow and boring...
 → Predicción: positive (confianzaa 0.51)

Texto: Naruto Shippuden has good and bad arcs
 → Predicción: positive (confianzaa 0.52)

