# 🧠 Deep Learning - Análisis de Sentimientos

Este notebook implementa un modelo de Deep Learning para análisis de sentimientos.

## Objetivos:
1. **Preparación de datos**: Tokenización numérica, padding, embeddings
2. **Arquitectura**: Red neuronal con capas Embedding, LSTM, Dense
3. **Entrenamiento**: Optimización y validación
4. **Evaluación**: Métricas y comparación con modelos tradicionales
5. **Mejoras**: Hiperparámetros, regularización, arquitecturas avanzadas

In [3]:
# Importar librerías
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# TensorFlow/Keras
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

# Fix the NumPy and TensorFlow global random seeds.
np.random.seed(42)
tf.random.set_seed(42)

# Report the TensorFlow version in use and confirm that the imports succeeded.
print(
    f"✅ TensorFlow versión: {tf.__version__}",
    "✅ Librerías de Deep Learning importadas correctamente",
    sep="\n",
)

ModuleNotFoundError: No module named 'tensorflow'

## 1. Cargar y Preparar Datos

In [None]:
# Función para crear dataset sintético
def create_synthetic_dataset(repeats=50):
    """Build a small, balanced synthetic sentiment dataset for smoke tests.

    Ten hand-written English movie-review sentences per class are each
    repeated ``repeats`` times. Because the rows are exact duplicates, any
    random train/validation/test split of this data shares identical texts
    across splits, so scores obtained on it only show that the pipeline runs.

    Args:
        repeats: How many times each base sentence is repeated. Must be a
            positive integer. The default of 50 yields 500 rows per class
            (1500 rows in total).

    Returns:
        pandas.DataFrame with the columns ``text``, ``sentiment``
        (4 for positive, 0 for negative, 2 for neutral) and
        ``sentiment_label`` ('Positivo', 'Negativo' or 'Neutral'). Rows are
        ordered positive, then negative, then neutral.

    Raises:
        ValueError: If ``repeats`` is not a positive integer.
    """
    if isinstance(repeats, bool) or not isinstance(repeats, int) or repeats < 1:
        raise ValueError(f"repeats must be a positive integer, got {repeats!r}")

    positive_texts = [
        "I love this movie, it is absolutely fantastic!",
        "Amazing performance by all the actors!",
        "Great film with excellent cinematography",
        "Wonderful story and brilliant directing",
        "Outstanding movie, highly recommended!",
        "Perfect entertainment for the whole family",
        "Incredible acting and beautiful soundtrack",
        "This film exceeded all my expectations",
        "Masterpiece of modern cinema",
        "Absolutely loved every minute of it"
    ]

    negative_texts = [
        "This movie is terrible, I hate it completely",
        "Worst film I have ever seen in my life",
        "Boring and predictable storyline",
        "Poor acting and bad direction",
        "Complete waste of time and money",
        "Disappointing and poorly executed",
        "Terrible script and awful performances",
        "Not worth watching at all",
        "Very bad movie with no redeeming qualities",
        "Horrible experience, would not recommend"
    ]

    neutral_texts = [
        "The movie was okay, nothing special",
        "Average film with some good moments",
        "Not bad but could have been better",
        "Decent movie for a casual watch",
        "It was fine, met my expectations",
        "Moderate entertainment value",
        "Acceptable but forgettable",
        "Standard movie with typical plot",
        "Neither good nor bad, just average",
        "Okay for a one-time watch"
    ]

    # (numeric code, label, base sentences) for each class, in output order.
    class_groups = [
        (4, 'Positivo', positive_texts),
        (0, 'Negativo', negative_texts),
        (2, 'Neutral', neutral_texts),
    ]

    texts, sentiments, labels = [], [], []
    for code, label, sentences in class_groups:
        repeated = sentences * repeats
        texts.extend(repeated)
        # Derive the label counts from the texts actually produced so the
        # three columns can never get out of sync.
        sentiments.extend([code] * len(repeated))
        labels.extend([label] * len(repeated))

    return pd.DataFrame({
        'text': texts,
        'sentiment': sentiments,
        'sentiment_label': labels
    })

# Load the processed dataset; fall back to the synthetic one when the CSV
# file does not exist.
try:
    df = pd.read_csv('../data/dataset_procesado.csv')
except FileNotFoundError:
    print("❌ No se encontró el dataset procesado")
    print("💡 Creando dataset sintético para pruebas...")
    df = create_synthetic_dataset()
    print(f"✅ Dataset sintético creado: {df.shape[0]} filas")
else:
    print(f"✅ Dataset cargado: {df.shape[0]} filas, {df.shape[1]} columnas")

# Basic overview: number of rows and how many rows each sentiment label has.
print("\n=== INFORMACIÓN DEL DATASET ===")
print(f"Total de textos: {len(df)}")
print("\n=== DISTRIBUCIÓN DE SENTIMIENTOS ===")
print(df['sentiment_label'].value_counts())

In [2]:
# Prepare texts and labels for the neural network
print("🔄 Preparando datos para Deep Learning...")

# By default pandas reads an empty CSV cell back as NaN (a float). The Keras
# Tokenizer expects strings, and LabelEncoder cannot sort NaN together with
# string labels, so rows missing either field are dropped here instead of
# failing later in the pipeline.
valid_rows = df.dropna(subset=['text', 'sentiment_label'])
dropped = len(df) - len(valid_rows)
if dropped:
    print(f"⚠️ Se descartaron {dropped} filas sin texto o etiqueta")

# astype(str) guards against non-string cells that are not NaN.
texts = valid_rows['text'].astype(str).tolist()
labels = valid_rows['sentiment_label'].tolist()

# Map each label to a consecutive integer id
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)
num_classes = len(label_encoder.classes_)

print(f"Clases: {label_encoder.classes_}")
print(f"Número de clases: {num_classes}")

# One-hot encode the integer ids (one column per class)
categorical_labels = to_categorical(encoded_labels, num_classes=num_classes)
print(f"Shape de labels categóricas: {categorical_labels.shape}")

🔄 Preparando datos para Deep Learning...


NameError: name 'df' is not defined

## 2. Tokenización y Preparación de Secuencias

In [None]:
# Tokenization hyper-parameters
MAX_FEATURES = 5000  # maximum vocabulary size kept by the tokenizer
MAX_LENGTH = 50      # every sequence is padded/truncated to this many tokens

# Fit the tokenizer on the texts, then encode each text as a list of word ids.
# NOTE(review): the tokenizer is fitted on all texts before the train/test
# split, so its vocabulary also includes words from the held-out samples.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=MAX_FEATURES)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Pad and truncate at the end so that X is a rectangular matrix
X = pad_sequences(sequences, truncating='post', padding='post', maxlen=MAX_LENGTH)

# Size later passed to the Embedding layer; the "+ 1" presumably accounts for
# index 0, which pad_sequences uses as the padding value.
vocab_size = min(MAX_FEATURES, len(tokenizer.word_index) + 1)
print(
    f"Vocabulario total: {len(tokenizer.word_index)}",
    f"Vocabulario usado: {vocab_size}",
    f"Shape de X: {X.shape}",
    f"Longitud máxima: {MAX_LENGTH}",
    sep="\n",
)

# Show the first text at each stage: raw, encoded, padded
print("\n=== EJEMPLO DE TOKENIZACIÓN ===")
print(
    f"Texto: {texts[0]}",
    f"Secuencia: {sequences[0]}",
    f"Padded: {X[0]}",
    sep="\n",
)

## 3. División de Datos

In [None]:
# Hold out 20% of the samples as the test set, stratified by class id
X_train, X_test, y_train, y_test = train_test_split(
    X,
    categorical_labels,
    stratify=encoded_labels,
    test_size=0.2,
    random_state=42,
)

# Split 20% of the remaining samples off as the validation set; the class ids
# used for stratification are recovered from the one-hot rows
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    stratify=y_train.argmax(axis=1),
    test_size=0.2,
    random_state=42,
)

print(
    f"Train: {X_train.shape[0]} muestras",
    f"Validation: {X_val.shape[0]} muestras",
    f"Test: {X_test.shape[0]} muestras",
    sep="\n",
)

# Number of samples per class in each split
print("\n=== DISTRIBUCIÓN DE CLASES ===")
for split_header, split_onehot in (("Train:", y_train), ("Validation:", y_val), ("Test:", y_test)):
    print(split_header, np.bincount(split_onehot.argmax(axis=1)))

## 4. Crear Modelo

In [None]:
# Build the model: embedding -> two stacked LSTMs -> dense classifier head.
# The input shape is declared with an explicit Input rather than Embedding's
# `input_length` argument. Recent Keras releases (Keras 3, bundled with
# TensorFlow 2.16+) deprecate `input_length`; without a declared input shape
# the model is not built until it first sees data, so `model.summary()` has
# no shapes to show and `model.count_params()` below cannot be computed.
model = Sequential([
    tf.keras.Input(shape=(MAX_LENGTH,), dtype='int32'),
    Embedding(vocab_size, 128),
    # return_sequences=True so the second LSTM receives the full sequence
    LSTM(64, return_sequences=True, dropout=0.2),
    LSTM(32, dropout=0.2),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

# Compile for multi-class classification with one-hot targets
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Show the architecture
print("=== ARQUITECTURA DEL MODELO ===")
model.summary()

total_params = model.count_params()
print(f"\nTotal de parámetros: {total_params:,}")

## 5. Entrenamiento

In [None]:
# Training callbacks, both monitoring the validation loss:
# - stop once it has not improved for 5 epochs, keeping the best weights
# - halve the learning rate after 3 epochs without improvement (floor 0.0001)
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
lr_on_plateau = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=0.0001)
callbacks = [early_stopping, lr_on_plateau]

# Fit on the training split, validating on the validation split
print("🔄 Iniciando entrenamiento...")
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=32,
    callbacks=callbacks,
    verbose=1,
)

print("\n✅ Entrenamiento completado")

## 6. Visualización del Entrenamiento

In [None]:
# Training curves: accuracy on the left panel, loss on the right panel
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# (axes, training key, validation key, title, y-axis label) for each panel
panels = (
    (ax1, 'accuracy', 'val_accuracy', 'Accuracy durante el entrenamiento', 'Accuracy'),
    (ax2, 'loss', 'val_loss', 'Loss durante el entrenamiento', 'Loss'),
)
for panel_ax, train_key, val_key, panel_title, y_label in panels:
    panel_ax.plot(history.history[train_key], label='Training', color='blue')
    panel_ax.plot(history.history[val_key], label='Validation', color='red')
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Época')
    panel_ax.set_ylabel(y_label)
    panel_ax.legend()
    panel_ax.grid(True)

plt.tight_layout()
plt.show()

# Accuracy values recorded for the last epoch that ran
final_train_acc = history.history['accuracy'][-1]
final_val_acc = history.history['val_accuracy'][-1]
print(f"\nTraining Accuracy: {final_train_acc:.4f}")
print(f"Validation Accuracy: {final_val_acc:.4f}")

## 7. Evaluación en Test Set

In [None]:
# Loss and accuracy on the held-out test split
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(
    f"✅ Test Accuracy: {test_accuracy:.4f}",
    f"✅ Test Loss: {test_loss:.4f}",
    sep="\n",
)

# Class ids: predicted = column with the highest model output,
# true = hot column of the one-hot targets
y_pred_proba = model.predict(X_test)
y_pred = y_pred_proba.argmax(axis=1)
y_true = y_test.argmax(axis=1)

# Per-class precision / recall / F1
print("\n=== REPORTE DE CLASIFICACIÓN ===")
report_text = classification_report(y_true, y_pred, target_names=label_encoder.classes_)
print(report_text)

## 8. Matriz de Confusión

In [None]:
# Confusion matrix of true vs. predicted class ids on the test split
cm = confusion_matrix(y_true, y_pred)

# Draw it as an annotated heatmap on an explicit axes object
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
    ax=cm_ax,
)
cm_ax.set_title(f'Matriz de Confusión - Deep Learning\nAccuracy: {test_accuracy:.4f}')
cm_ax.set_xlabel('Predicción')
cm_ax.set_ylabel('Verdadero')
plt.show()

## 9. Función de Predicción

In [None]:
def predict_sentiment(text):
    """Predict the sentiment class of a single text.

    The text is encoded with the notebook-level ``tokenizer``, padded and
    truncated to ``MAX_LENGTH`` at the end of the sequence, and passed to the
    notebook-level ``model``.

    Args:
        text: Raw input string to classify.

    Returns:
        Tuple ``(sentiment, confidence)``: the entry of
        ``label_encoder.classes_`` at the index of the largest model output,
        and that largest output value.
    """
    sequence = tokenizer.texts_to_sequences([text])
    # truncating='post' must be explicit: pad_sequences defaults to 'pre',
    # which would cut long texts at the opposite end from the one used when
    # the training matrix was built.
    padded = pad_sequences(sequence, maxlen=MAX_LENGTH, padding='post', truncating='post')
    # verbose=0 avoids printing a progress bar for every single-text call.
    prediction = model.predict(padded, verbose=0)
    predicted_class = np.argmax(prediction, axis=1)[0]
    confidence = np.max(prediction)
    sentiment = label_encoder.classes_[predicted_class]
    return sentiment, confidence

# Sample reviews used to exercise predict_sentiment
test_texts = [
    "I love this movie! It's fantastic!",
    "This film is terrible and boring",
    "The movie was okay, nothing special",
    "Amazing performance by the actors!",
    "Not worth watching at all"
]

# Print each sample, numbered from 1, with its predicted label and confidence
print("=== PREDICCIONES DE EJEMPLO ===")
for i, text in enumerate(test_texts, start=1):
    sentiment, confidence = predict_sentiment(text)
    print(
        f"\n{i}. {text}",
        f"   Predicción: {sentiment} (Confianza: {confidence:.4f})",
        sep="\n",
    )

## 10. Guardado del Modelo

In [None]:
# Persist the trained model together with its preprocessing objects
import os
import pickle

os.makedirs('../data/models', exist_ok=True)

# Keras model, written to an .h5 file
model.save('../data/models/deep_learning_model.h5')

# The tokenizer and the label encoder are pickled next to the model
pickled_artifacts = (
    ('../data/models/tokenizer.pkl', tokenizer),
    ('../data/models/label_encoder.pkl', label_encoder),
)
for artifact_path, artifact in pickled_artifacts:
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

print("\n✅ Modelo guardado exitosamente")

## 11. Resumen Final

In [None]:
# Final recap of the run: collect every line first, then print them in order
summary_lines = (
    "\n" + "="*50,
    "🎉 RESUMEN FINAL - DEEP LEARNING",
    "="*50,
    f"📊 Dataset: {len(texts)} textos procesados",
    f"🏗️  Arquitectura: LSTM + Dense layers",
    f"📈 Parámetros: {total_params:,}",
    f"🎯 Test Accuracy: {test_accuracy:.4f}",
    f"⚡ Épocas: {len(history.history['accuracy'])}",
    f"🔤 Vocabulario: {vocab_size} palabras",
    f"📏 Longitud máxima: {MAX_LENGTH} tokens",
    f"🏷️  Clases: {', '.join(label_encoder.classes_)}",
    "\n✅ Modelo completado y guardado!",
    "\n💡 Próximos pasos:",
    "   - Probar embeddings pre-entrenados",
    "   - Experimentar con CNN o Transformer",
    "   - Optimizar hiperparámetros",
    "   - Crear interfaz web",
)
for summary_line in summary_lines:
    print(summary_line)