# **Importacion de librerias**

In [24]:
import torch
import pandas as pd
import tensorflow as tf
from datasets import Dataset
from transformers import pipeline
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments, AutoModelForSequenceClassification

# **Analisis de sentimiento**

In [25]:
# Carga tus datos en un dataframe de pandas
url = "https://raw.githubusercontent.com/Duque-Campeon/Datathon/main/Datasets%20limpios/df_booking.csv"
df = pd.read_csv(url)

# Dividir el conjunto de datos en entrenamiento y validación
train, val = train_test_split(df, test_size=0.2, random_state=42)

# Inicializa el tokenizador
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-multilingual-cased")

# Tokenizar el texto
train_encodings = tokenizer(list(train['Comments']), truncation=True, padding=True)
val_encodings = tokenizer(list(val['Comments']), truncation=True, padding=True)


In [26]:
# Cargar el pipeline de análisis de sentimiento
sentiment_analysis = pipeline(
    "sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
    tokenizer="nlptown/bert-base-multilingual-uncased-sentiment",
)

# Función para predecir el sentimiento de un comentario
def predict_sentiment(comment):
    # Truncar el comentario si excede 512 tokens
    truncated_comment = comment[:512]
    
    result = sentiment_analysis(truncated_comment)
    sentiment = result[0]['label'].split('_')[-1].lower()
    return sentiment

# Clasifica los comentarios en la columna 'Comments'
df['Sentiment'] = df['Comments'].apply(predict_sentiment)


In [27]:
class FeedbackDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Crear los conjuntos de datos de entrenamiento y validación
train_labels = train['Comments'].map({'positive': 0, 'negative': 1, 'neutral': 2}).tolist()
val_labels = val['Comments'].map({'positive': 0, 'negative': 1, 'neutral': 2}).tolist()
train_dataset = FeedbackDataset(train_encodings, train_labels)
val_dataset = FeedbackDataset(val_encodings, val_labels)


In [28]:
import pandas as pd
from datasets import Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score

# 1. Cargar y preparar los datos
# Suponiendo que ya tienes un dataframe 'df' con las columnas 'Comments' y 'Sentiment'

# Renombrar las columnas
df = df.rename(columns={'Comments': 'text', 'Sentiment': 'label'})

# Crear un diccionario para convertir las etiquetas a números
label_to_id = {
    '1 star': 0, 
    '2 stars': 1, 
    '3 stars': 2,
    '4 stars': 3,
    '5 stars': 4
}
df['label'] = df['label'].apply(lambda x: label_to_id[x])

# Dividir los datos en conjuntos de entrenamiento y validación (ajusta la proporción según lo necesario)
train_df = df.sample(frac=0.8, random_state=42)
val_df = df.drop(train_df.index)

# Convertir los dataframes de pandas a datasets de Hugging Face
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# 2. Preparar el tokenizador y el modelo
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-multilingual-cased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-multilingual-cased', num_labels=5)

# 3. Tokenizar y codificar los conjuntos de datos
max_length = 128

def encode_examples(examples):
    encoded = tokenizer(
        examples['text'], 
        truncation=True, 
        padding='max_length', 
        max_length=max_length
    )
    
    labels = examples['label']
    encoded.update({'labels': labels})
    
    return encoded

train_dataset = train_dataset.map(encode_examples, batched=True)
val_dataset = val_dataset.map(encode_examples, batched=True)

# 4. Configurar y entrenar el Trainer
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc}

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    save_strategy="steps",
    save_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    seed=42,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)


trainer.train()


# Guardar el modelo entrenado y el tokenizador
trainer.save_model("sentiment_analysis_multilingual")
tokenizer.save_pretrained("sentiment_analysis_multilingual")



Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'pre_classif

{'loss': 1.5357, 'learning_rate': 4.9931665983326504e-05, 'epoch': 0.0}



  0%|          | 10/7317 [11:54<6:52:04,  3.38s/it]

{'eval_loss': 1.3980334997177124, 'eval_accuracy': 0.4529984623270118, 'eval_runtime': 680.8021, 'eval_samples_per_second': 14.329, 'eval_steps_per_second': 0.896, 'epoch': 0.0}


  0%|          | 20/7317 [12:26<23:09:45, 11.43s/it]  

{'loss': 1.2468, 'learning_rate': 4.9863331966653006e-05, 'epoch': 0.01}



  0%|          | 20/7317 [22:29<23:09:45, 11.43s/it]

{'eval_loss': 1.3729135990142822, 'eval_accuracy': 0.4529984623270118, 'eval_runtime': 602.9313, 'eval_samples_per_second': 16.179, 'eval_steps_per_second': 1.012, 'epoch': 0.01}


  0%|          | 30/7317 [22:59<21:12:11, 10.48s/it]  

{'loss': 1.2965, 'learning_rate': 4.97949979499795e-05, 'epoch': 0.01}



  0%|          | 30/7317 [34:57<21:12:11, 10.48s/it]

{'eval_loss': 1.2379891872406006, 'eval_accuracy': 0.500871348026653, 'eval_runtime': 717.4378, 'eval_samples_per_second': 13.597, 'eval_steps_per_second': 0.85, 'epoch': 0.01}


  1%|          | 40/7317 [35:33<25:02:49, 12.39s/it]  

{'loss': 1.3806, 'learning_rate': 4.9726663933306e-05, 'epoch': 0.02}




KeyboardInterrupt: 

# **Evaluacion del modelo en el conjunto de datos de entrenamiento:**

In [None]:
# Evaluar el modelo en el conjunto de datos de entrenamiento
train_eval_results = trainer.evaluate(train_dataset)

print("Resultados de la evaluación en el conjunto de entrenamiento:")
print(train_eval_results)


100%|██████████| 206/206 [03:36<00:00,  1.05s/it]

Resultados de la evaluación en el conjunto de entrenamiento:
{'eval_loss': 0.23503988981246948, 'eval_accuracy': 0.9352189781021898, 'eval_runtime': 217.5767, 'eval_samples_per_second': 15.112, 'eval_steps_per_second': 0.947, 'epoch': 3.0}





# **Evaluacion del modelo en el conjunto de datos de validacion:**

In [None]:
# Evaluar el modelo en el conjunto de datos de validación
eval_results = trainer.evaluate()

print("Resultados de la evaluación en el conjunto de validación:")
print(eval_results)


100%|██████████| 52/52 [00:51<00:00,  1.00it/s]

Resultados de la evaluación en el conjunto de validación:
{'eval_loss': 0.5476061701774597, 'eval_accuracy': 0.8369829683698297, 'eval_runtime': 52.7696, 'eval_samples_per_second': 15.577, 'eval_steps_per_second': 0.985, 'epoch': 3.0}





# **Predicciones:**

In [None]:
def custom_sentiment_pipeline(text, model, tokenizer, negative_words):
    # Verificar si alguna palabra negativa está presente en el texto
    if any(word.lower() in text.lower() for word in negative_words):
        return "1 star"

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)
    predictions = outputs.logits.argmax(dim=-1).item()

    return id_to_label[predictions]

# Cargar el modelo afinado y el tokenizador
model = AutoModelForSequenceClassification.from_pretrained("sentiment_analysis_multilingual")
tokenizer = DistilBertTokenizerFast.from_pretrained("sentiment_analysis_multilingual")

# Diccionario de conversiones de ID a etiqueta
id_to_label = {0: "1 star", 1: "2 stars", 2: "3 stars", 3: "4 stars", 4: "5 stars"}

# Lista de palabras negativas
negative_words = ["malo", "terrible", "horrible", "pésimo", "asqueroso"]

# Comentario de ejemplo en español
comentario = "horrible"

# Probar el pipeline con el comentario de ejemplo
sentiment_label = custom_sentiment_pipeline(comentario, model, tokenizer, negative_words)

print(f"Comentario: {comentario}")
print(f"Sentimiento: {sentiment_label}")


Comentario: horrible
Sentimiento: 1 star


# **Modificacion para la matriz de confusion**

In [None]:
df['label'] += 1
# Crear una nueva columna que indique si Rating y label coinciden
df['Coincide'] = (df['Rating'] == df['label'])

# **Matriz de confusion y classification report**

In [None]:
# Extraer los valores de Rating y label como listas
y_true = df['Rating'].tolist()
y_pred = df['label'].tolist()

# Calcular la matriz de confusión
conf_matrix = confusion_matrix(y_true, y_pred)

# Imprimir la matriz de confusión
print("Matriz de confusión:")
print(conf_matrix)

# Generar el informe de clasificación
informe = classification_report(y_true, y_pred)

# Imprimir el informe de clasificación
print("Informe de clasificación de 5x5:")
print(informe)

Matriz de confusión:
[[   0 1915  138   62   13   65]
 [   0  828   95   33    9   28]
 [   0  230   35   42   16   25]
 [   0   71   25   54   56  129]
 [   0   41    6   23   32  139]
 [   0    0    0    0    0    0]]
Informe de clasificación de 5x5:
              precision    recall  f1-score   support

           1       0.00      0.00      0.00      2193
           2       0.27      0.83      0.41       993
           3       0.12      0.10      0.11       348
           4       0.25      0.16      0.20       335
           5       0.25      0.13      0.17       241
           6       0.00      0.00      0.00         0

    accuracy                           0.23      4110
   macro avg       0.15      0.20      0.15      4110
weighted avg       0.11      0.23      0.13      4110



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
