In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer
import torch

# Load the reviews from the CSV file. The file has no header row, so the three
# columns are named explicitly here.
# NOTE(review): this is an absolute, machine-specific path — point it at your own
# copy of the data before running.
TRAIN_CSV_PATH = 'C:/Users/cesco/Desktop/Personal/UPY/9/NLP/proyecto/train.csv'
df = pd.read_csv(TRAIN_CSV_PATH, header=None, names=['polarity', 'summary', 'reviewText'])

# A missing review is read as a float NaN, which has no string methods and cannot
# be tokenized later, so drop those rows before building the targets.
df = df.dropna(subset=['reviewText'])

# Build the "inverted" target text by swapping a few sentiment keywords.
# Rows with polarity == 1 get good -> bad and excellent -> terrible; every other
# row gets bad -> good and terrible -> excellent. The replacements are literal
# substring swaps (regex=False), applied in that order.
# NOTE(review): if this CSV uses 1 = negative / 2 = positive, the mapping below makes
# negative reviews more negative instead of inverting them — confirm the labels.
review_text = df['reviewText']
made_negative = (
    review_text
    .str.replace("good", "bad", regex=False)
    .str.replace("excellent", "terrible", regex=False)
)
made_positive = (
    review_text
    .str.replace("bad", "good", regex=False)
    .str.replace("terrible", "excellent", regex=False)
)
df['invertedReview'] = made_negative.where(df['polarity'] == 1, made_positive)

# Split into training and validation sets (90% train, 10% validation); the fixed
# random_state keeps the split reproducible.
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Initialize the T5 tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Tokenization helpers for the original and the inverted reviews

def tokenize_function(examples):
    """Tokenize the original review text held in ``examples['reviewText']``.

    Uses the module-level ``tokenizer`` with truncation enabled and padding to
    ``max_length=512``, and returns the tokenizer's output unchanged.
    """
    tokenizer_options = {"padding": "max_length", "truncation": True, "max_length": 512}
    original_text = examples['reviewText']
    return tokenizer(original_text, **tokenizer_options)

def tokenize_inverted_function(examples):
    """Tokenize the inverted review text held in ``examples['invertedReview']``.

    Uses the module-level ``tokenizer`` with truncation enabled and padding to
    ``max_length=512``, and returns the tokenizer's output unchanged.
    """
    tokenizer_options = {"padding": "max_length", "truncation": True, "max_length": 512}
    inverted_text = examples['invertedReview']
    return tokenizer(inverted_text, **tokenizer_options)

# Tokenize the training data: the original review is the model input, the
# inverted review is the target. Each row is tokenized on its own.
train_encodings = train_df['reviewText'].apply(lambda x: tokenize_function({'reviewText': x}))
train_decodings = train_df['invertedReview'].apply(lambda x: tokenize_inverted_function({'invertedReview': x}))

# Tokenize the validation data the same way
val_encodings = val_df['reviewText'].apply(lambda x: tokenize_function({'reviewText': x}))
val_decodings = val_df['invertedReview'].apply(lambda x: tokenize_inverted_function({'invertedReview': x}))


class ReviewPairDataset(torch.utils.data.Dataset):
    """Map-style dataset of (original review -> inverted review) training examples.

    Each item is a dict with ``input_ids``, ``attention_mask`` and ``labels``.
    ``Trainer``'s default collator needs mapping-like examples whose keys match
    the model's forward arguments; the tuples produced by ``TensorDataset`` are
    rejected by it.

    Args:
        encodings: iterable of tokenizer outputs for the source texts.
        decodings: iterable of tokenizer outputs for the target texts, in the
            same order as ``encodings``.
        pad_token_id: id of the padding token; those positions in the labels are
            replaced by -100 so the loss ignores them.
    """

    def __init__(self, encodings, decodings, pad_token_id):
        # Materialize as lists so items are addressed by position, not by the
        # (shuffled) DataFrame index labels carried by the Series.
        self.encodings = list(encodings)
        self.decodings = list(decodings)
        self.pad_token_id = pad_token_id

    def __len__(self):
        return len(self.encodings)

    def __getitem__(self, idx):
        source = self.encodings[idx]
        labels = torch.tensor(self.decodings[idx]['input_ids'])
        # Mask padding in the targets so it does not contribute to the loss
        labels[labels == self.pad_token_id] = -100
        return {
            'input_ids': torch.tensor(source['input_ids']),
            # Tells the encoder which positions are real tokens and which are padding
            'attention_mask': torch.tensor(source['attention_mask']),
            'labels': labels,
        }


# Create the PyTorch datasets consumed by the Trainer
train_dataset = ReviewPairDataset(train_encodings, train_decodings, tokenizer.pad_token_id)

val_dataset = ReviewPairDataset(val_encodings, val_decodings, tokenizer.pad_token_id)


  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

# Load the pretrained T5 checkpoint
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Training hyperparameters, gathered in one mapping before building TrainingArguments.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_options = dict(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)
training_args = TrainingArguments(**training_options)

# Trainer wiring: model, arguments, and the train/validation datasets
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=val_dataset)

# Run fine-tuning
trainer.train()


In [None]:
# Evaluate the model on the evaluation dataset given to the Trainer
eval_results = trainer.evaluate()

# Perplexity is the exponential of the evaluation loss
import math

try:
    perplexity = math.exp(eval_results['eval_loss'])
except OverflowError:
    # math.exp raises for very large losses; report an infinite perplexity
    # instead of aborting the cell.
    perplexity = float("inf")
print(f"Perplexity: {perplexity}")


In [None]:
# Generate inverted reviews
def generate_inverted_review(review):
    """Generate the model's rewrite of ``review`` and return it as plain text.

    Args:
        review: the original review text.

    Returns:
        The decoded text of the best beam, with special tokens removed.
    """
    input_ids = tokenizer.encode(review, return_tensors='pt', truncation=True, max_length=512)
    # The tokenizer returns CPU tensors, but the model may have been moved to a GPU
    # during training; generate() needs the inputs on the model's device.
    input_ids = input_ids.to(model.device)
    # Beam search with 5 beams, returning a single sequence of at most 512 tokens
    generated_ids = model.generate(input_ids, max_length=512, num_return_sequences=1, num_beams=5, early_stopping=True)
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Smoke-test the generator on a single review and show both texts
review = "This product is amazing, I love it!"
inverted_review = generate_inverted_review(review)
print(f"Original: {review}", f"Inverted: {inverted_review}", sep="\n")
