# In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense

# 1. Load data
df = pd.read_csv("peliculas.csv")

# Drop rows with a missing synopsis or genre: `astype(str)` below would
# otherwise turn NaN into the literal text "nan" and create a bogus "nan" class.
# Then shuffle the rows with a fixed seed so the positional 80/20 split further
# down is not biased by the order of the CSV (e.g. a file sorted by genre).
df = (
    df.dropna(subset=["synopsis", "genre"])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# 2. Preprocess
texts = df["synopsis"].astype(str).values
genres = df["genre"].astype(str).values

# Encode genres by hand (no sklearn); sorting keeps the label -> index mapping
# deterministic across runs.
unique_genres = sorted(set(genres))
genre_to_index = {genre: i for i, genre in enumerate(unique_genres)}
index_to_genre = dict(enumerate(unique_genres))
y = np.array([genre_to_index[g] for g in genres])
y = to_categorical(y, num_classes=len(unique_genres))

# Manual 80/20 split point (first 80% of the shuffled rows are for training)
split_idx = int(0.8 * len(texts))

# Tokenize text. The vocabulary is learned from the training rows only so that
# words seen exclusively in the held-out rows do not leak into the tokenizer;
# at transform time they map to the "<OOV>" token.
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(texts[:split_idx])
sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(sequences, maxlen=100)

# Split features and labels at the same index so they stay aligned
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

# 3. Build the model
# Layers are stacked one at a time: token ids -> 64-dimensional embeddings ->
# average over the sequence axis -> 64-unit ReLU layer -> one softmax output
# per genre.
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=64, input_length=100))
model.add(GlobalAveragePooling1D())
model.add(Dense(64, activation="relu"))
model.add(Dense(len(unique_genres), activation="softmax"))

# Labels are one-hot encoded, hence categorical cross-entropy.
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# 4. Train
# The held-out split is passed as validation data, so its loss/accuracy are
# reported after every epoch.
model.fit(
    X_train,
    y_train,
    epochs=10,
    validation_data=(X_test, y_test),
)

# 5. Example prediction
def predecir_genero(sinopsis):
    """Return the genre label the model scores highest for a synopsis.

    The text is converted with the module-level ``tokenizer`` and padded to
    length 100 before being passed to ``model.predict``; the winning class
    index is mapped back to its label through ``index_to_genre``.

    Args:
        sinopsis: Synopsis text to classify.

    Returns:
        The entry of ``index_to_genre`` for the highest-scoring class.
    """
    # The tokenizer works on a list of texts, so wrap the single synopsis.
    token_ids = tokenizer.texts_to_sequences([sinopsis])
    batch = pad_sequences(token_ids, maxlen=100)
    scores = model.predict(batch)
    # argmax runs over the flattened scores; with a single-row batch that is
    # the class index.
    best_class = np.argmax(scores)
    return index_to_genre[best_class]

# Classify one sample synopsis and print the predicted genre
print(
    predecir_genero(
        sinopsis="A young man finds a mysterious object in the woods."
    )
)
