In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import pickle
import os
from PIL import Image

# Directory that holds the cover images. Each file is expected to be named
# after its book title, e.g. "The Hunger Games.jpg" -> "images/The Hunger Games.jpg"
image_folder = "images"

# Read the book metadata table from disk
books = pd.read_csv("data.csv")

# Add image path column
def get_image_path(title):
    """Return the cover-image path for *title*, or None if there is none.

    The image is looked up at ``<image_folder>/<title>.jpg``, with every "/"
    in the title replaced by "_" so the title cannot introduce a
    sub-directory into the filename.

    Args:
        title: Book title. Non-string values (such as the float NaN that
            pandas uses for a missing cell) are treated as "no image".

    Returns:
        The image path as a string when that file exists, otherwise None.
    """
    # A missing title has no .replace(); bail out instead of raising.
    if not isinstance(title, str):
        return None
    filename = title.replace("/", "_") + ".jpg"
    path = os.path.join(image_folder, filename)
    # isfile (not exists) so a directory that happens to be called
    # "<title>.jpg" is not reported as an image.
    return path if os.path.isfile(path) else None

# Clean the titles first so that every later step (image lookup, text
# features, the saved table) works from the same value. Missing titles become
# "" -- the same way the text features below fill them -- instead of the
# literal string "nan" that astype(str) alone would produce.
books['title'] = books['title'].fillna('').astype(str).str.strip()
books['title_clean'] = books['title'].str.lower()

# Add image path column (None where no matching image file is found)
books['image_path'] = books['title'].apply(get_image_path)

# Combine text features into a single string per book; missing cells
# contribute an empty string.
books['combined_features'] = (
    books['title'] + " " +
    books['subtitle'].fillna('') + " " +
    books['authors'].fillna('') + " " +
    books['categories'].fillna('') + " " +
    books['description'].fillna('')
)

# TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf.fit_transform(books['combined_features'])

# KMeans clustering
# n_init is pinned explicitly: its default differs between scikit-learn
# releases, so leaving it implicit makes the clusters depend on the installed
# version even with a fixed random_state.
num_clusters = 15
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
books['cluster'] = kmeans.fit_predict(tfidf_matrix)

# Save the annotated table and the fitted models for later reuse
with open('books.pkl', 'wb') as f:
    pickle.dump(books, f)
with open('kmeans.pkl', 'wb') as f:
    pickle.dump(kmeans, f)
with open('tfidf.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

print("✅ Backend trained and images imported successfully.")


✅ Backend trained and images imported successfully.
