In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss


# 1. Cargar dataset
df = pd.read_csv("fiction_books_clean.csv")

# 2. Eliminar libros sin descripción (Reemplazar por la limpieza de Noe)
df = df.dropna(subset=["description"])
df = df[df["description"].str.strip() != ""]
print("Libros después del filtrado:", len(df))


# 3. Generar embeddings
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cpu")

# Convertimos todas las descripciones en embeddings
embeddings = model.encode(
    df["description"].tolist(),
    show_progress_bar=True,
    batch_size=32,
    normalize_embeddings=True  # normaliza a norma 1 → más fácil similitud coseno
)

embeddings = np.array(embeddings).astype("float32")
print("Embeddings shape:", embeddings.shape)  # (n_libros, 384)


# 4. Crear índice FAISS
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)  # Inner Product (como coseno si está normalizado)
index.add(embeddings)

print("Total libros indexados:", index.ntotal)

# 5. Guardar para usar luego
faiss.write_index(index, "libros_clean.index")
df.to_parquet("libros_clean_metadata.parquet", engine="fastparquet", index=False)

print("✅ Índice y metadata guardados")


  from .autonotebook import tqdm as notebook_tqdm


Libros después del filtrado: 43837


Batches: 100%|██████████| 1370/1370 [38:15<00:00,  1.68s/it] 


Embeddings shape: (43837, 384)
Total libros indexados: 43837
✅ Índice y metadata guardados


In [None]:
# Esto no es necesario si corriste celda anterior
######
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cpu")
######

# Cargar lo guardado
index = faiss.read_index("libros_clean.index")
df = pd.read_parquet("libros_clean_metadata.parquet", engine="fastparquet")


# Enriquecedor de query
def build_expanded_query(query: str) -> str:
    """
    If the query is short (<40 characters), ask for more details
    and build an enriched query in English.
    """
    if len(query.strip()) < 40:
        print("⚠️ Your query was too short, to offer you better recomendations please answer a few more questions...\n")
        
        genre = input("📚 Which genre are you looking for? (e.g., romance, fantasy, mystery): ")
        place = input("🌍 Where should the story take place? (e.g., small town, big city, magical kingdom, outer space, historical setting): ")
        characters = input("👤 What kind of characters? (e.g., teenagers, heroes, families): ")
        tone = input("🎭 What tone do you prefer? (e.g., dramatic, funny, dark, hopeful): ")
        
        query_expanded = (
            f"A {genre} story set in {place}, featuring {characters} characters, with a {tone} tone."
        )
        
        print(f"\n✅ Expanded query: {query_expanded}\n")
        return query_expanded
    
    else:
        print(f"\n👤 User query: {query}")

    return query.strip()

# Filtros de búsqueda
def ask_filters() -> dict:
    """
    Pregunta al usuario si quiere filtrar por autor o último libro leído.
    Devuelve un diccionario con filtros a aplicar.
    """
    filters = {}
    
    author = input("👩‍💻 Do you have an author in mind? (leave empty if not): ").strip()
    if author:
        filters["author"] = author.lower()
    
    last_book = input("📖 What was the last book you read? (leave empty if not relevant): ").strip()
    if last_book:
        filters["last_book"] = last_book.lower()
    
    return filters


def filter_results(df, filters):
    """
    Aplica filtros de autor o último libro sobre los resultados de FAISS.
    """
    results = df
    
    # Filtra por autor, si el usuario eligió alguno
    if "author" in filters:
        results = results[results["authors"].str.lower().str.contains(filters["author"])]
    
    # Filtra por último libro, si el usuario eligió alguno
    if "last_book" in filters:
        mask_to_exclude = results["title"].str.lower().str.contains(filters["last_book"])
        results = results[~mask_to_exclude]
    
    return results


# Consulta del usuario
query_usuario = input("🔎 Enter your request: ")
query_final = build_expanded_query(query_usuario)
query_embedding = model.encode([query_final], normalize_embeddings=True)

# Buscar los 100 más similares (Por si los filtros son restrictivos, en especial el de autor)
k = 100
distances, indices = index.search(query_embedding.astype("float32"), k)

# Genero un dataframe con los 100 libros más parecidos
results_df = df.iloc[indices[0]].copy()
results_df["similarity"] = distances[0]

# Preguntar filtros adicionales
apply_filters = input("⚙️ Would you like to add filters (author, last book)? (y/n): ").strip().lower()

if apply_filters == "y":
    filters = ask_filters()
else:
    filters = {}

# Aplicar filtros sobre candidatos
filtered_df = filter_results(results_df, filters)

# Preguntar si el usuario valora el rating
consider_rating = input("⭐ Do you care about the book's rating? (y/n): ").strip().lower()

if consider_rating == "y":
    # Normalizar ratings entre 0 y 1 (de forma robusta)
    if filtered_df["avg_rating"].notna().any():
        min_r, max_r = filtered_df["avg_rating"].min(), filtered_df["avg_rating"].max()
        if max_r > min_r:
            filtered_df["rating_norm"] = (filtered_df["avg_rating"] - min_r) / (max_r - min_r)
        else:
            filtered_df["rating_norm"] = 0.5  # todos iguales, neutral
    else:
        filtered_df["rating_norm"] = 0.2

    # Ajustar la similitud ponderando con el rating
    filtered_df["similarity"] = (
        0.9 * filtered_df["similarity"] + 0.1 * filtered_df["rating_norm"]
    )

    # Reordenar resultados según la similitud ajustada
    filtered_df = filtered_df.sort_values(by="similarity", ascending=False)

if len(filtered_df) == 0:
    print("😔 Sorry, we couldn’t find a book that matches your request.")
elif len(filtered_df) < 5:
    print(f"ℹ️ We only found {len(filtered_df)} book{'s' if len(filtered_df) != 1 else ''} that matched your request")
    

# Mostrar resultados (top 5 después de filtrar)
for i, (_, row) in enumerate(filtered_df.head(5).iterrows()):
    print(f"\nRank {i+1}: {row['title']} - {row['authors']}")
    print(f"Score (similarity): {row['similarity']:.3f}")
    print(f"Rating: {row.get('avg_rating', 'N/A'):.2f}")
    print(f"Descripción: {row['description'][:200]}...")

  from .autonotebook import tqdm as notebook_tqdm


⚠️ Your query was too short, to offer you better recomendations please answer a few more questions...


✅ Expanded query: A fantasy story set in small town, featuring adult characters, with a dark tone.


Rank 1: Wildwood Imperium (Wildwood Chronicles #3) - Colin Meloy
Score (similarity): 0.597
Rating: 5.00
Descripción: A young girl's midnight séance awakens a long-slumbering malevolent spirit.... A band of runaway orphans allies with an underground collective of saboteurs and plans a daring rescue of their friends, ...

Rank 2: Winesburg, Ohio - Sherwood Anderson
Score (similarity): 0.558
Rating: 3.29
Descripción: A unified collection of short stories about life in a small town in the American Midwest....

Rank 3: Sunshine sketches of a little town - Stephen Leacock
Score (similarity): 0.557
Rating: 5.00
Descripción: "Set in the fictional landscape of Mariposa on the shores of Lake Wissanotti in Missinaba County, Leacock's Sunshine Sketches of A Little Town is an affectionate satire o