In [None]:
!pip install -q transformers datasets accelerate evaluate scikit-learn

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

import evaluate
import numpy as np

In [None]:
# Configure the root logger to emit records at INFO level and above.
import logging
logging.basicConfig(level=logging.INFO)

In [None]:
# Path of the CSV holding the posts and their sentiment label;
# change it if the file lives somewhere else.
csv_path = "reddit_artist_posts_sentiment.csv"
df = pd.read_csv(csv_path)

# Show the first rows, then how many posts fall in each label.
preview = df.head()
label_counts = df["label"].value_counts()
print(preview)
print(label_counts)

                                                text     label
0  pitchfork track review: taylor swift’s “actual...  negative
1  taylor swift has regained the masters of her f...  positive
2  pitchfork review: taylor swift - the life of a...   neutral
3                  taylor swift announced engagement   neutral
4  taylor swift - the fate of ophelia (official m...   neutral
label
neutral     19728
positive     8825
negative     3395
Name: count, dtype: int64


In [None]:
# Sort the distinct labels so the name -> id assignment is deterministic
# (alphabetical order, e.g. ['negative', 'neutral', 'positive']).
class_names = sorted(df["label"].unique())

# Position in the sorted list is the integer id; build both directions.
id2label = dict(enumerate(class_names))
label2id = {name: idx for idx, name in id2label.items()}

print("Label2id:", label2id)


Label2id: {'negative': 0, 'neutral': 1, 'positive': 2}


In [None]:
# Add the integer class id next to the string label.
df = df.assign(label_id=df["label"].map(label2id))

In [None]:
# Keep only the model inputs and expose the integer id under the column
# name "label".
model_columns = df[["text", "label_id"]]
df_model = model_columns.rename(columns={"label_id": "label"})

In [None]:
# Hold out 20% of the rows for validation; stratify on the label column and
# fix the random state so the split is reproducible.
split_options = {
    "test_size": 0.2,
    "stratify": df_model["label"],
    "random_state": 42,
}
train_df, valid_df = train_test_split(df_model, **split_options)

print("Train size:", len(train_df), "Valid size:", len(valid_df))

Train size: 25558 Valid size: 6390


In [None]:
# Convert both pandas splits to `Dataset` objects. The index is reset first
# so the shuffled pandas index from the split is not carried along.
train_dataset, valid_dataset = (
    Dataset.from_pandas(frame.reset_index(drop=True))
    for frame in (train_df, valid_df)
)

# Bundle the two splits under the names used by the rest of the notebook.
raw_datasets = DatasetDict({"train": train_dataset, "validation": valid_dataset})

print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25558
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 6390
    })
})


In [None]:
model_name = "distilbert-base-uncased"
max_length = 128  # enough for short posts (<= 280 chars)

tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(batch):
    """Tokenize the "text" field of a batch.

    Every text is truncated and padded to exactly ``max_length`` tokens.

    Args:
        batch: mapping with a "text" entry, as passed by ``Dataset.map``
            when ``batched=True``.

    Returns:
        The tokenizer output for the batch.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_length,
    }
    return tokenizer(batch["text"], **encode_options)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

Map:   0%|          | 0/25558 [00:00<?, ? examples/s]

Map:   0%|          | 0/6390 [00:00<?, ? examples/s]

In [None]:
# Drop the raw text now that it has been tokenized. The guard keeps this cell
# re-runnable: without it a second run fails because "text" is already gone.
if "text" in tokenized_datasets["train"].column_names:
    tokenized_datasets = tokenized_datasets.remove_columns(["text"])

In [None]:
# Have the datasets return PyTorch tensors when rows are accessed.
tokenized_datasets.set_format(type="torch")

print(tokenized_datasets)

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 25558
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 6390
    })
})


In [None]:
# Work on a shuffled subset to keep the run short. The requested sizes are
# capped at the number of available rows so the cell also works on a smaller
# CSV instead of failing inside `select`.
n_train = min(5000, len(tokenized_datasets["train"]))
n_val = min(1000, len(tokenized_datasets["validation"]))

small_train = tokenized_datasets["train"].shuffle(seed=42).select(range(n_train))
small_val = tokenized_datasets["validation"].shuffle(seed=42).select(range(n_val))

small_train, small_val



(Dataset({
     features: ['label', 'input_ids', 'attention_mask'],
     num_rows: 5000
 }),
 Dataset({
     features: ['label', 'input_ids', 'attention_mask'],
     num_rows: 1000
 }))

In [None]:
# NOTE(review): `model` and `compute_metrics` were passed to `Trainer` below
# without being defined anywhere in the notebook, so a fresh
# "Restart Kernel -> Run All" failed with NameError. The definitions in this
# cell are a reconstruction -- confirm they match the originally intended setup.

# Sequence-classification head on top of the same checkpoint as the tokenizer.
# Passing the label maps stores the class names in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(class_names),
    id2label=id2label,
    label2id=label2id,
)

accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Macro-F1 is reported next to accuracy because the label counts printed
    when the CSV is loaded are imbalanced.

    Args:
        eval_pred: pair ``(logits, labels)`` as handed over by ``Trainer``.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1_macro"``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    f1_macro = f1_metric.compute(
        predictions=predictions, references=labels, average="macro"
    )
    return {"accuracy": accuracy["accuracy"], "f1_macro": f1_macro["f1"]}


training_args = TrainingArguments(
    output_dir="pop-sentiment-fast",
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=50,
    save_steps=2000,
    logging_dir="./logs"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train,
    eval_dataset=small_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


In [None]:
train_output = trainer.train()

# No evaluation strategy is configured in the training arguments, so
# validation metrics are only produced by an explicit evaluate() call on the
# eval dataset given to the Trainer.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Keep the training summary as the cell's displayed value.
train_output

Step,Training Loss
50,0.7127
100,0.5141
150,0.5327
200,0.5499
250,0.4724
300,0.4807
350,0.4305
400,0.4176
450,0.4817
500,0.4429


TrainOutput(global_step=625, training_loss=0.4871043212890625, metrics={'train_runtime': 3555.397, 'train_samples_per_second': 1.406, 'train_steps_per_second': 0.176, 'total_flos': 165587201280000.0, 'train_loss': 0.4871043212890625, 'epoch': 1.0})

In [None]:
import torch


In [None]:
text = "Taylor Swift absolutely deserved that Grammy!"

# The model was last used for training; switch to eval mode so dropout is
# disabled and the prediction is deterministic.
model.eval()

# Tokenize with the same length limit as training, and place the tensors on
# the model's device so this also works when the model sits on a GPU.
inputs = tokenizer(
    text,
    return_tensors="pt",
    truncation=True,
    max_length=max_length,
).to(model.device)

with torch.no_grad():
    logits = model(**inputs).logits

# Index of the highest logit is the predicted class id.
pred = logits.argmax(dim=1).item()
print("Predicción:", id2label[pred])

Predicción: positive
