In [8]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense

# * 1. Cargar datos
df = pd.read_csv("movies.csv")

# * 2. Preprocesar los datos a analizar
texts = df["synopsis"].astype(str).values
genres = df["genre"].astype(str).values

# * Codificar géneros de peliculas
unique_genres = sorted(set(genres))
genre_to_index = {genre: i for i, genre in enumerate(unique_genres)}
index_to_genre = {i: genre for genre, i in genre_to_index.items()}
y = np.array([genre_to_index[g] for g in genres])
y = to_categorical(y, num_classes=len(unique_genres))

print(index_to_genre)
print(y)

# * Tokenizar texto
# ? convierte a las palabras en numeros (especificamente las 5000 mas comunes, el resto se marcan como "<OOV>")
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")

# ? analiza las palabras y crea un diccionario basado en las frecuencias.
tokenizer.fit_on_texts(texts)

# ? convierte cada sinopsis en secuencias de numeros.
sequences = tokenizer.texts_to_sequences(texts)

# ? normaliza las sequencias con un maximo de 100 tokens (si es mas larga se trunca, sino se rellena de 0s)
X = pad_sequences(sequences, maxlen=100)

# ? Separar datos manualmente (80% entrenamiento /20% prueba)
split_idx = int(0.8 * len(X))
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

# * 3. Construir modelo
model = Sequential([
    # ? convierte los tokens en vectores densos de 64 dimensiones.
    Embedding(input_dim=5000, output_dim=64, input_length=100),
    # ? Reduce la dimensionalidad promediando los vectores.
    GlobalAveragePooling1D(),
    # ? Define una capa densa con 64 neuronas
    Dense(64, activation="relu"),
    # ? Define una capa densa con una neurona por cada genero
    Dense(len(unique_genres), activation="softmax")
])

# * Compilacion del modelo
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# * 4. Entrenar el modelo con 10 epocas (pasadas completas de TODOS los datos).
model.fit(X_train, y_train, epochs=50, validation_data=(X_test, y_test))

# 5. Predicción de ejemplo
def predecir_genero(sinopsis):
    # ? convierte la sinopsis recibida en una sequencia de tokens
    seq = tokenizer.texts_to_sequences([sinopsis])
    # ? normaliza los datos al igual que a los de entrenamiento
    padded = pad_sequences(seq, maxlen=100)
    # ? realiza la prediccion
    pred = model.predict(padded)
    # ? realiza la prediccion
    print("prediccion: ", pred)
    return index_to_genre[np.argmax(pred)]

print(predecir_genero("A family moves into an old house where terrifying paranormal phenomena begin to occur. Desperate, they turn to supernatural investigators Ed and Lorraine Warren, who discover that a dark, demonic presence is haunting the place. To save the family, they must confront a malevolent entity that threatens to possess them all."))


{0: 'action', 1: 'adventure', 2: 'crime', 3: 'family', 4: 'fantasy', 5: 'horror', 6: 'mystery', 7: 'romance', 8: 'scifi', 9: 'thriller'}
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Epoch 1/50




[1m1350/1350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - accuracy: 0.1122 - loss: 2.2969 - val_accuracy: 0.2107 - val_loss: 2.1439
Epoch 2/50
[1m1350/1350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 5ms/step - accuracy: 0.2545 - loss: 2.0436 - val_accuracy: 0.2685 - val_loss: 1.9567
Epoch 3/50
[1m1350/1350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 5ms/step - accuracy: 0.3230 - loss: 1.8670 - val_accuracy: 0.2886 - val_loss: 1.9569
Epoch 4/50
[1m1350/1350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 5ms/step - accuracy: 0.3531 - loss: 1.7910 - val_accuracy: 0.3306 - val_loss: 1.8611
Epoch 5/50
[1m1350/1350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 5ms/step - accuracy: 0.3751 - loss: 1.7424 - val_accuracy: 0.3269 - val_loss: 1.8751
Epoch 6/50
[1m1350/1350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 5ms/step - accuracy: 0.3930 - loss: 1.6987 - val_accuracy: 0.3384 - val_loss: 1.8637
Epoch 7/50
[1m1350/1350[0