# This notebook prepares book data, builds a TF-IDF-based content recommender system, and saves it for use in the Django app.


In [129]:
import pandas as pd
import numpy as np


In [130]:
from data_loader import load_cleaned_ratings, load_cleaned_users
books = pd.read_csv("books_enriched.csv")
ratings = load_cleaned_ratings()
users = load_cleaned_users()


## 1. Mark Popular Books (Top 5%)


In [131]:
books["Popularity"] = False
top_n = int(0.05 * len(books))
popular_isbns = ratings["ISBN"].value_counts().index[:top_n]
books.loc[books["ISBN"].isin(popular_isbns), "Popularity"] = True


## 2. Clean Columns (drop unused)


In [133]:
books.drop(columns=["description", "genre"], errors='ignore', inplace=True)

books = books.iloc[:,[0,1,9,10,2,3,4,8,5,6,7]]

IndexError: positional indexers are out-of-bounds

## 3. Save Cleaned Dataset


In [None]:
books.to_csv("../data/processed/books_cleaned.csv", index=False)


## 4. Build TF-IDF Recommender


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

books["Description"] = books["Description"].fillna("")
books["combined"] = books["Book-Title"] + ' ' + books["Book-Author"] + ' ' + books["Description"]

tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(books["combined"])

cos_sim = cosine_similarity(tfidf_matrix)

isbn_to_index = pd.Series(books.index, index=books["ISBN"]).drop_duplicates()


## 5. Save Model for Production


In [None]:
import pickle

with open('../recommender/tfidf_similarity.pkl', 'wb') as f:
    pickle.dump({
        "cos_sim": cos_sim,
        "isbn_to_index": isbn_to_index,
        "books": books[["ISBN", "Book-Title", "Book-Author"]],
    }, f)


## 6. Test: Recommend Similar Books

In [None]:
def recommend_similar_books(isbn, N=10):
    if isbn not in isbn_to_index:
        return []
    idx = isbn_to_index[isbn]
    scores = list(enumerate(cos_sim[idx]))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)[1:N+1]
    return books.iloc[[score[0] for score in scores]][["ISBN", "Book-Title"]]


In [None]:
recommend_similar_books("0452283868", 5)


# ✅ Ready! Now the model is saved and can be loaded in your `content_based_filtering.py`.
