# 3. Etapa de entrenamiento y testeo de un modelo de análisis de sentimiento

In [None]:
pip install imblearn

In [None]:
###############################################
# Paso 1: Representación con Bag of Words (BoW)
###############################################

from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import ast

# Para cargar el CSV
df = pd.read_csv("./Musical_Instruments_preprocessed.csv")

from imblearn.under_sampling import RandomUnderSampler

# Balancear resviews

# Definir variables X (texto) e y (clase)
X = df["review_clean"]  # Usaremos el texto limpio para el modelo
y = df["sentiment"]     # Etiquetas (0 = negativo, 1 = positivo)

# Aplicar undersampling para igualar el número de positivos y negativos
undersampler = RandomUnderSampler(sampling_strategy=1.0, random_state=42)
X_resampled, y_resampled = undersampler.fit_resample(X.values.reshape(-1, 1), y)

# Convertir X_resampled a DataFrame
df_balanced = pd.DataFrame({"review_clean": X_resampled.flatten(), "sentiment": y_resampled})




In [2]:
from sklearn.model_selection import train_test_split

# Variables de entrada y salida
X = df_balanced["review_clean"]  # Texto limpio
y = df_balanced["sentiment"]      # Etiquetas (0 = negativo, 1 = positivo)

# División en 80% entrenamiento y 20% prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Verificar distribución después del split
print("📌 Distribución en Train:\n", y_train.value_counts(normalize=True))
print("📌 Distribución en Test:\n", y_test.value_counts(normalize=True))


📌 Distribución en Train:
 sentiment
positive    0.500006
negative    0.499994
Name: proportion, dtype: float64
📌 Distribución en Test:
 sentiment
negative    0.500022
positive    0.499978
Name: proportion, dtype: float64


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Reemplazar valores NaN en la columna review_clean
df_balanced["review_clean"] = df_balanced["review_clean"].fillna("")

df_balanced.to_csv("./Musical_Instruments_balanced.csv", index=False)

# Crear vectorizador BoW con las 5000 palabras más frecuentes
vectorizer = CountVectorizer(max_features=5000)

# Aplicar transformación al texto
X_bow = vectorizer.fit_transform(df_balanced["review_clean"])

# Mostrar tamaño de la matriz resultante
print(f"📌 Matriz BoW generada con tamaño: {X_bow.shape}")



📌 Matriz BoW generada con tamaño: (113582, 5000)


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# División en 80% entrenamiento y 20% prueba
X_train, X_test, y_train, y_test = train_test_split(X_bow, df_balanced["sentiment"], test_size=0.2, random_state=42, stratify=df_balanced["sentiment"])

# Modelo 1: Regresión Logística
logistic_model = LogisticRegression(class_weight="balanced", max_iter=200)
logistic_model.fit(X_train, y_train)

# Modelo 2: Random Forest
rf_model = RandomForestClassifier(n_estimators=100, class_weight="balanced", random_state=42)
rf_model.fit(X_train, y_train)

print("📌 Modelos entrenados correctamente.")



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


📌 Modelos entrenados correctamente.


In [10]:
import numpy as np
import gensim
from gensim.models import Word2Vec

df_balanced["tokens"] = df["tokens"]

# Entrenar modelo Word2Vec con los tokens limpios
word2vec_model = Word2Vec(sentences=df_balanced["tokens"], vector_size=100, window=5, min_count=5, workers=4)

# Obtener el tamaño de los embeddings 
vector_size = word2vec_model.vector_size

# Función para convertir una review en un 
# vector promedio de sus palabras
def get_review_vector(review_tokens, model):
    vectors = [model.wv[word] for word in review_tokens if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(vector_size)

# Convertir todas las reviews en vectores
X_w2v = np.array([get_review_vector(tokens, word2vec_model) for tokens in df_balanced["tokens"]])

# Mostrar tamaño de la matriz Word2Vec
print(f"📌 Matriz Word2Vec generada con tamaño: {X_w2v.shape}")


📌 Matriz Word2Vec generada con tamaño: (113582, 100)


In [11]:
# División en 80% entrenamiento y 20% prueba
X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = train_test_split(X_w2v, df_balanced["sentiment"], test_size=0.2, random_state=42, stratify=df_balanced["sentiment"])

# Modelo con Word2Vec
logistic_w2v = LogisticRegression(class_weight="balanced", max_iter=200)
logistic_w2v.fit(X_train_w2v, y_train_w2v)

print("📌 Modelo con Word2Vec entrenado correctamente.")


📌 Modelo con Word2Vec entrenado correctamente.


In [12]:
from sklearn.metrics import classification_report

# Predicciones con Word2Vec
y_pred_w2v = logistic_w2v.predict(X_test_w2v)

# Evaluación de Word2Vec
print("📌 Resultados de Word2Vec:")
print(classification_report(y_test_w2v, y_pred_w2v))


📌 Resultados de Word2Vec:
              precision    recall  f1-score   support

    negative       0.57      0.61      0.59     11359
    positive       0.58      0.54      0.56     11358

    accuracy                           0.58     22717
   macro avg       0.58      0.58      0.58     22717
weighted avg       0.58      0.58      0.58     22717



In [None]:
# Predicciones en Test
y_pred_logistic = logistic_model.predict(X_test)
y_pred_rf = rf_model.predict(X_test)

# Evaluación de Regresión Logística
print("📌 Resultados de Regresión Logística:")
print(classification_report(y_test, y_pred_logistic))

# Evaluación de Random Forest
print("📌 Resultados de Random Forest:")
print(classification_report(y_test, y_pred_rf))


📌 Resultados de Regresión Logística:
              precision    recall  f1-score   support

    negative       0.81      0.77      0.79     11359
    positive       0.78      0.82      0.80     11358

    accuracy                           0.80     22717
   macro avg       0.80      0.80      0.80     22717
weighted avg       0.80      0.80      0.80     22717

📌 Resultados de Random Forest:
              precision    recall  f1-score   support

    negative       0.75      0.81      0.78     11359
    positive       0.79      0.73      0.76     11358

    accuracy                           0.77     22717
   macro avg       0.77      0.77      0.77     22717
weighted avg       0.77      0.77      0.77     22717



In [None]:
import os
import joblib

# Definir carpeta donde se guardarán los modelos
model_dir = "./models"
os.makedirs(model_dir, exist_ok=True)  # Crear la carpeta si no existe

# Definir nombres de archivos
logistic_file = os.path.join(model_dir, "vd_logistic_bow.pkl")
rf_file = os.path.join(model_dir, "vd_random_forest_bow.pkl")
w2v_file = os.path.join(model_dir, "vd_logistic_word2vec.pkl")

# Guardar modelos
joblib.dump(logistic_model, logistic_file)
joblib.dump(rf_model, rf_file)
joblib.dump(logistic_w2v, w2v_file)

print(f"📌 Modelos guardados en {model_dir}")

📌 Modelos guardados en ./models
