<a href="https://colab.research.google.com/github/Devika-Rudagi/AI-ML-Projects/blob/main/Book_Recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ----------------------------
# Step 0: Install required packages
# ----------------------------
# !pip install sentence-transformers tensorflow scikit-learn pandas numpy

# ----------------------------
# Step 1: Import libraries
# ----------------------------
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
import tensorflow as tf
from tensorflow.keras import layers, Model, Input
from sklearn.metrics import mean_squared_error

# ----------------------------
# Step 2: Load datasets
# ----------------------------
# Colab locations of the two input files (books.csv is the Goodbooks-10k book table).
books_csv_path = '/content/sample_data/books.csv'
ratings_csv_path = '/content/sample_data/ratings.csv'

# Read each file into its own DataFrame using pandas' default parsing options.
books = pd.read_csv(books_csv_path)
ratings = pd.read_csv(ratings_csv_path)

# ----------------------------
# Step 3: Preprocess book features
# ----------------------------

# --- 3a. Numerical columns (scaled) ---
numerical_cols = ['original_publication_year','average_rating','ratings_count',
                  'work_ratings_count','work_text_reviews_count',
                  'ratings_1','ratings_2','ratings_3','ratings_4','ratings_5','books_count']

# Impute missing values with each column's median instead of a literal 0.
# A 0 is not a plausible value for a column such as original_publication_year,
# and it would distort the mean/variance that the scaler estimates below.
numeric_raw = books[numerical_cols]
numeric_filled = numeric_raw.fillna(numeric_raw.median())
# A column that is missing everywhere has no median; fall back to 0 there so
# the scaler never receives NaN.
numeric_filled = numeric_filled.fillna(0)

# Standardise every numeric column (zero mean, unit variance).
scaler_num = StandardScaler()
scaled_numeric = scaler_num.fit_transform(numeric_filled)

# --- 3b. Categorical columns (authors, language) ---
# Authors multi-hot
# The authors column holds comma-separated names, e.g. "J.K. Rowling, Mary GrandPré".
# Strip each name so that the space following a comma does not create a second,
# distinct label for the same author, and drop empty fragments so that a missing
# or blank authors value yields no labels at all.
author_lists = [
    [name.strip() for name in raw.split(',') if name.strip()]
    for raw in books['authors'].fillna('').astype(str)
]
mlb_authors = MultiLabelBinarizer()
authors = mlb_authors.fit_transform(author_lists)

# Language one-hot: one indicator column per distinct language_code value,
# exported as a plain array for stacking with the other feature groups.
language_dummies = pd.get_dummies(books['language_code'])
languages = language_dummies.values

# --- 3c. Text features (title embeddings) ---
model_title = SentenceTransformer('all-MiniLM-L6-v2')
# Pass a plain list of strings rather than a pandas Series: a list is indexed
# by position, so the embeddings stay aligned with the rows of `books` even if
# its index is not the default 0..n-1 range. Missing titles are encoded as ''.
title_texts = books['title'].fillna('').tolist()
title_embeddings = model_title.encode(title_texts)

# --- 3d. Combine all book features ---
# Place the four feature groups side by side (one row per book), then hand the
# resulting matrix to TensorFlow as float32.
feature_groups = (scaled_numeric, authors, languages, title_embeddings)
book_features = np.concatenate(feature_groups, axis=1)
book_features = tf.convert_to_tensor(book_features, dtype=tf.float32)

# ----------------------------
# Step 4: Prepare ratings data
# ----------------------------

# Normalize ratings to [0,1]
# The scaler is kept in `scaler_rating` so predictions can later be mapped
# back to the original rating scale.
scaler_rating = MinMaxScaler()
rating_column = ratings[['rating']]  # 2-D selection, as the scaler expects
ratings['rating_scaled'] = scaler_rating.fit(rating_column).transform(rating_column)

# Train-test split: hold out 20% of the rating rows; the fixed seed keeps the
# split repeatable between runs.
train_ratings, test_ratings = train_test_split(
    ratings,
    test_size=0.2,
    random_state=42,
)

# Map book_id to feature index: each book_id value -> its row position in
# `books`, which is also its row in book_features.
book_id_to_index = dict(zip(books['book_id'], range(len(books))))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

KeyError: 3423

In [3]:
# Keep only the rating rows whose book_id also appears in `books`; any other
# book_id has no entry in book_id_to_index and therefore no feature row.
train_has_book = train_ratings['book_id'].isin(books['book_id'])
test_has_book = test_ratings['book_id'].isin(books['book_id'])
train_ratings = train_ratings.loc[train_has_book]
test_ratings = test_ratings.loc[test_has_book]

In [4]:
# Book features for train/test
# Convert the feature tensor to NumPy once and gather all rows with a single
# fancy-indexing operation, instead of slicing the tensor once per rating row
# inside a Python loop.
book_features_np = book_features.numpy()

# Translate each rating's book_id into its feature-row position. The dict
# lookup still raises KeyError for a book_id that has no feature row.
train_book_idx = np.fromiter(
    (book_id_to_index[b] for b in train_ratings['book_id']),
    dtype=np.int64,
    count=len(train_ratings),
)
test_book_idx = np.fromiter(
    (book_id_to_index[b] for b in test_ratings['book_id']),
    dtype=np.int64,
    count=len(test_ratings),
)

# One feature row per rating, in the same order as the rating rows.
train_book_features = book_features_np[train_book_idx]
test_book_features = book_features_np[test_book_idx]


In [5]:

# User IDs: raw user_id values, one per rating row, fed to the user tower.
train_user_ids = train_ratings['user_id'].to_numpy()
test_user_ids = test_ratings['user_id'].to_numpy()

# Ratings targets: the [0,1]-scaled rating for each row.
train_y = train_ratings['rating_scaled'].to_numpy()
test_y = test_ratings['rating_scaled'].to_numpy()

In [6]:
# ----------------------------
# Step 5: Define the neural network
# ----------------------------
embedding_dim = 32
# Number of distinct users in the ratings table (kept for reference).
num_users = ratings['user_id'].nunique()
# Raw user_id values are used directly as Embedding indices, so the table must
# span 0..max(user_id). Sizing it from the distinct-user count is only correct
# when the ids are contiguous; with gaps, ids above that count fall outside
# the table. When ids run 1..N this equals the previous num_users + 1.
user_vocab_size = int(ratings['user_id'].max()) + 1
book_input_dim = book_features.shape[1]

# Book tower: dense layers that project the content features down to
# embedding_dim so they can be compared with the user embedding.
book_input = Input(shape=(book_input_dim,))
x = layers.Dense(128, activation='relu')(book_input)
x = layers.Dense(64, activation='relu')(x)
book_embedding = layers.Dense(embedding_dim, activation=None)(x)

# User tower: one learned embedding_dim vector per user_id value.
user_input = Input(shape=(), dtype=tf.int32)
user_embedding = layers.Embedding(input_dim=user_vocab_size, output_dim=embedding_dim)(user_input)
user_embedding = layers.Flatten()(user_embedding)

# Dot product for interaction; the sigmoid maps the score into (0, 1) to match
# the [0,1]-scaled rating targets.
dot = layers.Dot(axes=1)([user_embedding, book_embedding])
output = layers.Activation('sigmoid')(dot)

model = Model(inputs=[user_input, book_input], outputs=output)
model.compile(optimizer='adam', loss='mse')

model.summary()



In [7]:
# ----------------------------
# Step 6: Train the model
# ----------------------------
# Inputs are ordered [user ids, book features], matching the model's input list.
train_inputs = [train_user_ids, train_book_features]
# The held-out test split is also used as the validation set during training.
val_inputs = [test_user_ids, test_book_features]

model.fit(
    x=train_inputs,
    y=train_y,
    batch_size=256,
    epochs=5,
    validation_data=(val_inputs, test_y),
)

Epoch 1/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 49ms/step - loss: 0.1013 - val_loss: 0.0755
Epoch 2/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 49ms/step - loss: 0.0483 - val_loss: 0.0726
Epoch 3/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 48ms/step - loss: 0.0348 - val_loss: 0.0715
Epoch 4/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 51ms/step - loss: 0.0299 - val_loss: 0.0731
Epoch 5/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 49ms/step - loss: 0.0239 - val_loss: 0.0736


<keras.src.callbacks.history.History at 0x7becba6aec50>

In [8]:
# ----------------------------
# Step 7: Evaluate the model
# ----------------------------
# Score every held-out (user, book) pair and flatten the single-column
# prediction array to 1-D before comparing with the targets.
test_inputs = [test_user_ids, test_book_features]
preds = model.predict(test_inputs).flatten()
mse = mean_squared_error(y_true=test_y, y_pred=preds)
print(f"Test MSE: {mse}")

# Optional: map predictions and targets back to the original rating scale
# (the scaler expects a 2-D column, hence the added axis).
preds_original = scaler_rating.inverse_transform(preds[:, np.newaxis])
test_y_original = scaler_rating.inverse_transform(test_y[:, np.newaxis])

[1m498/498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step
Test MSE: 0.07355337789721546


In [9]:
# ----------------------------
# Step 8: Recommendation function
# ----------------------------
def recommend_for_user(user_id, top_k=5):
    """Return the title and authors of the top_k highest-scoring books for a user.

    Every book in `books` is scored for `user_id` with the trained `model`;
    books the user has already rated are not excluded.

    Parameters
    ----------
    user_id : the user_id value to score books for.
    top_k : number of books to return (default 5).

    Returns
    -------
    DataFrame with the 'title' and 'authors' columns of the selected rows of
    `books`, ordered from highest to lowest predicted score.
    """
    n_books = len(books)
    # Repeat the user id once per book so it pairs with every feature row.
    repeated_user = np.array([user_id] * n_books)
    predicted = model.predict([repeated_user, book_features], batch_size=512).flatten()
    # Ascending argsort reversed gives the best-scoring row positions first.
    ranked_rows = np.argsort(predicted)[::-1]
    best_rows = ranked_rows[:top_k]
    return books[['title', 'authors']].iloc[best_rows]

# Example: show the five highest-scoring books for user 1.
recommend_for_user(user_id=1, top_k=5)

[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 43ms/step


Unnamed: 0,title,authors
0,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins
1,Harry Potter and the Sorcerer's Stone (Harry P...,"J.K. Rowling, Mary GrandPré"
3,To Kill a Mockingbird,Harper Lee
18,The Fellowship of the Ring (The Lord of the Ri...,J.R.R. Tolkien
16,"Catching Fire (The Hunger Games, #2)",Suzanne Collins
