# Predecir la puntuación de un objeto en Amazon en base a la reseña

In [7]:
# Importación de librerías necesarias
import re
import spacy
import pandas as pd
import numpy as np
from spacy.lang.es.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score as balance_acuarcy_score
from sklearn.metrics import roc_auc_score as AUC
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
import shap
import matplotlib.pyplot as plt
from sklearn.ensemble import VotingClassifier


# Descargar y cargar el modelo de spaCy
tokenizer = spacy.load("es_core_news_sm")

# Cargar dataset desde archivo CSV
file_path = "data/reviews.csv"
df = pd.read_csv(file_path)

# Preprocesamiento de texto: Minúsculas, eliminación de números/símbolos, tokenización, lematización
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-záéíóúüñ\s]', '', text)
    doc = tokenizer(text)
    tokens = [token.lemma_ for token in doc if token.lemma_ not in STOP_WORDS and not token.is_punct]
    return ' '.join(tokens)

# Aplicar preprocesamiento a cada review
df['processed_review'] = df['review'].apply(preprocess_text)

# División de datos para entrenamiento y prueba con estratificación
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
distrib = []
for i in np.unique(y):
    distrib.append(len(y[y == i]))
priori = distrib / np.sum(distrib) #Normalizar prob
factor = 1
priori = [factor / np.log(d + 1) for d in distrib]
priori = np.array(priori) / np.sum(priori)


### Procesamiento y resultados

In [8]:
# Vectorización con TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['processed_review'])
y = df['stars']

# Verificar si hay suficientes datos para dividir
if len(df) > 2:
    # División de datos para entrenamiento y prueba con estratificación
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    
    # Aplicar SMOTE para balancear las clases en el conjunto de entrenamiento
    smote = SMOTE(random_state=42)
    X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
    
    # Definir clasificadores
    classifiers = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Naive Bayes": MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True),
        "Naive Bayes Balanceado": MultinomialNB(class_prior=priori, fit_prior=True),
        "OVO": OneVsOneClassifier(SVC(kernel='linear', C=1, random_state=42)),
        "OVR": OneVsRestClassifier(SVC(kernel='linear', C=1, random_state=42)) 
    }
    
    # Entrenar y evaluar cada clasificador
    for name, model in classifiers.items():
        model.fit(X_train_res, y_train_res)  # Entrenamos con los datos balanceados
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        matrix = confusion_matrix(y_test, y_pred)
        print(matrix)
        print(f'{name} Accuracy: {accuracy:.2f}')


[[16  1  1  4  1]
 [ 4  0  0  2  2]
 [ 1  0  1  3  6]
 [ 3  0  1  4 15]
 [ 3  0  3  8 67]]
Logistic Regression Accuracy: 0.60
[[14  1  1  7  0]
 [ 5  0  0  2  1]
 [ 3  1  0  2  5]
 [ 2  0  1  5 15]
 [ 5  0  4 19 53]]
Naive Bayes Accuracy: 0.49
[[13  4  1  5  0]
 [ 4  1  0  3  0]
 [ 0  4  3  2  2]
 [ 2  3  9  6  3]
 [ 8  9 15 18 31]]
Naive Bayes Balanceado Accuracy: 0.37
[[14  0  0  4  5]
 [ 4  0  0  1  3]
 [ 0  0  1  3  7]
 [ 2  0  1  4 16]
 [ 2  0  3  7 69]]
OVO Accuracy: 0.60
[[15  1  1  4  2]
 [ 4  0  0  1  3]
 [ 1  0  1  3  6]
 [ 4  0  1  3 15]
 [ 2  0  5  8 66]]
OVR Accuracy: 0.58


### Pruebas

In [12]:
# Clasificadores en un ensamblaje (Voting Classifier)
ovo_linear = OneVsRestClassifier(SVC(kernel='linear', probability=True))
logistic = LogisticRegression(max_iter=1000)
naive = MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

ensemble = VotingClassifier(
    estimators=[('ovo_linear', ovo_linear), ('naive', naive)],
    voting='soft',         
    weights=[0.8, 0.2]
)

ensemble.fit(X_train_res, y_train_res)
y_pred = ensemble.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Ensemble Accuracy with soft voting:", accuracy)

# GridSearchCV para optimizar Naive Bayes
nb = MultinomialNB()
param_grid = {'alpha': [0.01 ,0.1, 0.5, 1.0, 2.0, 3.0], 'fit_prior': [True, False], 'class_prior': [None, [0.1, 0.9], [0.2, 0.8]]}
grid_search = GridSearchCV(nb, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_res, y_train_res)
print("Mejores parámetros de Naive Bayes:", grid_search.best_params_)
print("Mejor puntuación de Naive Bayes:", grid_search.best_score_)


# Mostrar algunos ejemplos antes y después del preprocesamiento
num_examples = 5  # Número de ejemplos a imprimir
sample_reviews = df[['review', 'processed_review']].sample(num_examples, random_state=41)

for i, row in sample_reviews.iterrows():
    print(f"🔹 **Texto Original:** {row['review']}")
    print(f"🔹 **Texto Procesado:** {row['processed_review']}")
    print("-" * 80)


Ensemble Accuracy with soft voting: 0.6027397260273972
Mejores parámetros de Naive Bayes: {'alpha': 0.01, 'class_prior': None, 'fit_prior': False}
Mejor puntuación de Naive Bayes: 0.9226006191950464
🔹 **Texto Original:** Se ha comprado un producto nuevo y el vendedor envía uno usado. La caja está sin precintar y defectuosa por qué se ha utilizado muchas veces. El plástico del consola se ve que no es nuevo tiene muchos usos. Un engaño. Vendedor no es de confianza.
🔹 **Texto Procesado:** comprar producto vendedor enviar usado caja precintar defectuoso utilizar plástico consola engaño vendedor confianza
--------------------------------------------------------------------------------
🔹 **Texto Original:** Para las reuniones familiares no puede faltar
🔹 **Texto Procesado:** reunión familiar faltar
--------------------------------------------------------------------------------
🔹 **Texto Original:** Solo buenas palabras, fui usuario de IOS muchos años y por motivos económicos tuve que pasarm

120 fits failed out of a total of 180.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
120 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Carlos\anaconda3\envs\py311ml\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Carlos\anaconda3\envs\py311ml\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Carlos\anaconda3\envs\py311ml\Lib\site-packages\sklearn\naive_bayes.py", line 762, in fit
    self._update_class_log_prior(class_prior=class_prior)
  File "c:\Users\Carlos\anaco