<a href="https://colab.research.google.com/github/Aaryan-Agr/Book-Recommendation-System/blob/main/BookRecommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install nltk

import kagglehub
import nltk
import numpy as np
import pandas as pd
from google.colab import drive
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors

# Fetch NLTK data packages.
# NOTE(review): 'all' pulls every NLTK package, which is slow; none of the cells
# visible in this notebook call NLTK, so this can probably be narrowed or dropped
# -- confirm before changing.
nltk.download('all')

# Download the dataset with kagglehub; `path` is the local location of the files.
path = kagglehub.dataset_download("arashnic/book-recommendation-dataset")

print("Path to dataset files:", path)

# Alternative download route (disabled): Kaggle CLI with credentials kept on
# Google Drive. Kept as comments rather than a bare string literal so the cell
# does not echo the text as its output.
# drive.mount('/content/drive')
# ! mkdir ~/.kaggle
# ! cp /content/drive/MyDrive/kaggle.json ~/.kaggle/
# ! chmod 600 ~/.kaggle/kaggle.json
# ! kaggle datasets download arashnic/book-recommendation-dataset
# ! unzip book-recommendation-dataset.zip

In [None]:
# Read the CSVs from the directory kagglehub reported (`path`) rather than a
# hard-coded cache location, so the cell keeps working when the dataset version
# or the home directory differs.
books = pd.read_csv(f"{path}/Books.csv", low_memory=False)
ratings = pd.read_csv(f"{path}/Ratings.csv", low_memory=False)
users = pd.read_csv(f"{path}/Users.csv", low_memory=False)

# Keep only the metadata columns used downstream and drop rows with missing values.
# Chaining returns a fresh frame instead of calling dropna(inplace=True) on a
# column selection, which pandas may flag as modifying a copy.
books = books[['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher']].dropna()

# Function to clean data (lowercase and remove spaces)
def clean_data(x):
    """Return ``x`` with spaces removed and lowercased; any non-string yields ''."""
    if not isinstance(x, str):
        return ''
    return x.replace(" ", "").lower()

# Normalise each metadata column with clean_data (spaces removed, lowercased)
features = ['Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher']
for feature in features:
    books[feature] = books[feature].map(clean_data)

# Attach book metadata to every rating by joining on ISBN
merged = ratings.merge(books, on='ISBN')

# Keep users who have at least 100 ratings
x = merged.groupby('User-ID')['Book-Rating'].count() >= 100
importantUsers = x[x].index
filteredRatings = merged[merged['User-ID'].isin(importantUsers)]

# Keep titles that have at least 25 ratings from those users
y = filteredRatings.groupby('Book-Title')['Book-Rating'].count() >= 25
importantBooks = y[y].index
finalRatings = filteredRatings[filteredRatings['Book-Title'].isin(importantBooks)]

# Title x user rating matrix; pairs with no rating become 0
userItem = finalRatings.pivot_table(
    index='Book-Title', columns='User-ID', values='Book-Rating'
).fillna(0)

# Cosine similarity between every pair of rows of the matrix
simScore = cosine_similarity(userItem)

In [None]:
def contentFiltering(books_df, test_title, top_n=10):
    """Recommend books whose metadata is closest to ``test_title``.

    A bag-of-words vector is built per row from the title, author and
    publication year columns, and rows are ranked by cosine similarity to the
    query row.

    Args:
        books_df: frame with 'Book-Title', 'Book-Author' and
            'Year-Of-Publication' columns. It is not modified.
        test_title: title to look up. It is passed through ``clean_data``
            before matching, so it is compared in the same normalised form
            that ``clean_data`` produces.
        top_n: maximum number of recommendations to return.

    Returns:
        A list of ``(title, similarity)`` tuples, most similar first, or a
        message string when the title is not present.
    """
    title = clean_data(test_title)

    # Work on a positionally indexed copy so row positions in the count matrix
    # and in the frame agree, and so the caller's frame is left untouched.
    catalog = books_df.reset_index(drop=True)
    soup = (
        catalog['Book-Title'].fillna('') + ' ' +
        catalog['Book-Author'].fillna('') + ' ' +
        catalog['Year-Of-Publication'].fillna('').astype(str)
    )

    count_vectorizer = CountVectorizer(stop_words='english', max_features=10000)
    count_matrix = count_vectorizer.fit_transform(soup)

    # Several rows may share a title; use the first one as the query row so
    # idx is always a single integer position.
    matches = catalog.index[catalog['Book-Title'] == title]
    if matches.empty:
        return f"Book '{test_title}' not found in the dataset."
    idx = int(matches[0])

    # Ask for one extra neighbour (the query row itself), but never more than
    # the number of rows available.
    n_neighbors = min(top_n + 1, count_matrix.shape[0])
    nn_model = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine', algorithm='brute')
    nn_model.fit(count_matrix)
    distances, neighbors = nn_model.kneighbors(count_matrix[idx], n_neighbors=n_neighbors)

    recommendations = []
    for neighbor_idx, distance in zip(neighbors[0], distances[0]):
        # Drop the query row by identity; with ties at distance 0 it is not
        # guaranteed to come back in first position.
        if neighbor_idx == idx:
            continue
        book_title = catalog['Book-Title'].iloc[neighbor_idx]
        recommendations.append((book_title, 1 - distance))  # distance -> similarity
    return recommendations[:top_n]

In [None]:
# Ask the content-based recommender for books close to a sample title
test_title = "1984"
recommendations = contentFiltering(books, test_title, top_n=5)

# Anything other than a list is printed as-is; a list is shown as title/score lines
if not isinstance(recommendations, list):
    print(recommendations)
else:
    print(f"Books similar to '{test_title}':")
    for book, score in recommendations:
        print(f"{book} (Similarity: {score:.2f})")

Books similar to '1984':
1984 (Similarity: 0.82)
nineteeneighty-four:thefacsimileoftheextantmanuscript (Similarity: 0.82)
1984 (Similarity: 0.82)
1984(spanishlanguageedition) (Similarity: 0.71)
1984 (Similarity: 0.67)


In [None]:
def colabfiltering(book, top_n):
    """Recommend books whose rating vectors in ``userItem`` are closest to ``book``.

    Args:
        book: title to look up. It is normalised with ``clean_data`` and then
            matched against the row labels of ``userItem``.
        top_n: maximum number of recommendations to return.

    Returns:
        A list of ``(title, similarity)`` tuples, most similar first. The list
        is empty when the title is not a row of ``userItem``.
    """
    # Use the argument, not a notebook-level variable, to pick the query row.
    title = clean_data(book)
    if title not in userItem.index:
        return []

    # One extra neighbour covers the query row itself; cap at the row count.
    n_neighbors = min(top_n + 1, len(userItem))
    ann_model = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine', algorithm='brute')
    ann_model.fit(userItem)

    book_vector = userItem.loc[title].values.reshape(1, -1)
    distances, indices = ann_model.kneighbors(book_vector)

    recommendations = []
    for distance, position in zip(distances[0], indices[0]):
        similar_book = userItem.index[position]
        # Skip the query row by label rather than assuming it is returned first.
        if similar_book == title:
            continue
        recommendations.append((similar_book, 1 - distance))  # distance -> similarity
    return recommendations[:top_n]

In [None]:
# Ask the collaborative recommender for books rated similarly to a sample title
book_name = "1984"
recommendations = colabfiltering(book_name, 10)

# Anything other than a list is printed as-is; a list is shown as title/score lines
if not isinstance(recommendations, list):
    print(recommendations)
else:
    print(f"Books similar to '{book_name}':")
    for book, score in recommendations:
        print(f"{book} (Similarity: {score:.2f})")

Books similar to '1984':
animalfarm (Similarity: 0.23)
lyingawake (Similarity: 0.23)
waiting (Similarity: 0.21)
bravenewworld (Similarity: 0.21)
slaughterhousefiveorthechildren'scrusade:adutydancewithdeath (Similarity: 0.20)
therestaurantattheendoftheuniverse(hitchhiker'strilogy(paperback)) (Similarity: 0.19)
sarah'swindow (Similarity: 0.19)
awakening (Similarity: 0.18)
thehandmaid'stale (Similarity: 0.18)
rollofthunder,hearmycry (Similarity: 0.18)


In [None]:
def normalize(scores):
    """Min-max scale the values of a ``{item: score}`` mapping into [0, 1].

    Args:
        scores: mapping from item to numeric score.

    Returns:
        A new dict with the same keys. An empty input gives an empty dict, and
        when every score is equal each item is mapped to 1.
    """
    # min()/max() raise on an empty sequence, so handle "nothing to scale" first.
    if not scores:
        return {}

    min_score = min(scores.values())
    max_score = max(scores.values())
    spread = max_score - min_score
    if spread == 0:
        # All scores identical: avoid dividing by zero.
        return {item: 1 for item in scores}
    return {item: (score - min_score) / spread for item, score in scores.items()}

In [None]:
# Hybrid recommendation
def hybrid_recommendations(user_id, book_title, content_weight=0.5, collaborative_weight=0.5, num_recommendations=10):
    """Blend content-based and collaborative recommendations for one book.

    Each recommender's similarity scores are multiplied by its weight and
    summed per title; the totals are then rescaled with ``normalize``.

    Args:
        user_id: accepted for interface compatibility. Neither recommender
            takes a user, so it is currently unused.
        book_title: title to recommend from; passed to both recommenders.
        content_weight: multiplier for content-based similarity scores.
        collaborative_weight: multiplier for collaborative similarity scores.
        num_recommendations: size requested from each recommender and the
            maximum length of the result.

    Returns:
        A list of ``(title, score)`` tuples sorted by score, highest first.
        Empty when neither recommender produced anything.
    """
    content_recs = contentFiltering(books, book_title, num_recommendations)
    # colabfiltering is keyed by book title, so pass the title, not the user id.
    collaborative_recs = colabfiltering(book_title, num_recommendations)

    combined_scores = {}
    weighted_sources = (
        (content_recs, content_weight),
        (collaborative_recs, collaborative_weight),
    )
    for recs, weight in weighted_sources:
        # contentFiltering returns a message string when the title is unknown;
        # only lists carry (title, score) pairs.
        if not isinstance(recs, list):
            continue
        for book, score in recs:
            combined_scores[book] = combined_scores.get(book, 0) + score * weight

    if not combined_scores:
        return []

    combined_scores = normalize(combined_scores)
    ranked = sorted(combined_scores.items(), key=lambda item: item[1], reverse=True)
    return ranked[:num_recommendations]

In [None]:
# Sample query for the hybrid recommender
test_user_id = 277427
test_book_title = "1984"
test_content_weight = 0.1
test_collaborative_weight = 0.9
test_num_recommendations = 20

recommendations = hybrid_recommendations(
    user_id=test_user_id,
    book_title=test_book_title,
    content_weight=test_content_weight,
    collaborative_weight=test_collaborative_weight,
    num_recommendations=test_num_recommendations,
)

# One line per recommended title with its blended score
print("Hybrid Recommendations:")
for book, score in recommendations:
    print(f"{book} (Hybrid Score: {score:.2f})")

Hybrid Recommendations:
1984 (Hybrid Score: 1.00)
animalfarm (Hybrid Score: 0.56)
lyingawake (Hybrid Score: 0.30)
waiting (Hybrid Score: 0.26)
bravenewworld (Hybrid Score: 0.26)
slaughterhousefiveorthechildren'scrusade:adutydancewithdeath (Hybrid Score: 0.24)
therestaurantattheendoftheuniverse(hitchhiker'strilogy(paperback)) (Hybrid Score: 0.23)
sarah'swindow (Hybrid Score: 0.22)
awakening (Hybrid Score: 0.21)
thehandmaid'stale (Hybrid Score: 0.21)
rollofthunder,hearmycry (Hybrid Score: 0.21)
thecatcherintherye (Hybrid Score: 0.20)
lordoftheflies (Hybrid Score: 0.20)
biblioholism:theliteraryaddiction (Hybrid Score: 0.20)
perfume:thestoryofamurderer(vintageinternational) (Hybrid Score: 0.20)
thevampirelestat(vampirechronicles,bookii) (Hybrid Score: 0.19)
timeline (Hybrid Score: 0.19)
lookatme (Hybrid Score: 0.18)
orangesarenottheonlyfruit (Hybrid Score: 0.18)
wordfreak:heartbreak,triumph,genius,andobsessionintheworldofcompetitivescrabbleplayers (Hybrid Score: 0.18)


In [None]:
#3 Evaluation & Experiments
import numpy as np
from sklearn.metrics import mean_squared_error

def _rmse(actual, predicted):
    """Return the root-mean-squared error between two equal-length sequences."""
    errors = np.asarray(actual, dtype=float) - np.asarray(predicted, dtype=float)
    return float(np.sqrt(np.mean(errors ** 2)))


def compare_baseline(user_item_matrix, test_indices):
    """Score three mean-based baseline predictors on held-out matrix cells.

    Entries greater than 0 are treated as observed ratings; everything else is
    ignored when computing means. The "User" baseline is the mean of the
    cell's row and the "Item" baseline is the mean of its column, so the two
    labels swap meaning if the matrix is oriented the other way round.

    Args:
        user_item_matrix: DataFrame of ratings with 0 marking "no rating".
        test_indices: iterable of ``(row, col)`` integer positions to score.

    Returns:
        Dict with keys "Global Mean RMSE", "User Mean RMSE" and
        "Item Mean RMSE". Every value is NaN when ``test_indices`` is empty.
    """
    metric_names = ("Global Mean RMSE", "User Mean RMSE", "Item Mean RMSE")
    test_indices = list(test_indices)
    if not test_indices:
        return {name: float("nan") for name in metric_names}

    values = user_item_matrix.to_numpy()
    global_mean = values[values > 0].mean()

    # Mask the 0 placeholders so row/column means are taken over observed
    # ratings only, consistent with how the global mean is computed.
    rated = user_item_matrix.where(user_item_matrix > 0)
    user_means = rated.mean(axis=1)
    item_means = rated.mean(axis=0)

    actual_ratings = []
    global_predictions = []
    user_predictions = []
    item_predictions = []

    for row, col in test_indices:
        actual_ratings.append(user_item_matrix.iloc[row, col])
        global_predictions.append(global_mean)

        # Rows/columns with no observed rating have a NaN mean; NaN > 0 is
        # False, so those fall back to the global mean.
        user_mean = user_means.iloc[row]
        user_predictions.append(user_mean if user_mean > 0 else global_mean)

        item_mean = item_means.iloc[col]
        item_predictions.append(item_mean if item_mean > 0 else global_mean)

    # Compute the metrics once, after all predictions have been collected.
    predictions = (global_predictions, user_predictions, item_predictions)
    return {
        name: _rmse(actual_ratings, predicted)
        for name, predicted in zip(metric_names, predictions)
    }


In [None]:
# Ratings belonging to titles that have at least 50 ratings
filtered_books = filteredRatings.groupby('Book-Title').filter(lambda x: len(x) >= 50)

np.random.seed(42)

# userItem has titles as row labels and User-IDs as column labels, so a cell is
# eligible when its ROW label is one of the retained titles. (Comparing the column
# label -- a User-ID -- with titles never matches and leaves the list empty.)
eligible_rows = userItem.index.isin(filtered_books['Book-Title'].unique())
rated_mask = (userItem.to_numpy() > 0) & eligible_rows[:, None]

# (row, col) positions of every observed rating on an eligible title, in row-major
# order; built from the boolean mask instead of a cell-by-cell .iloc scan.
filtered_indices = [(int(row), int(col)) for row, col in np.argwhere(rated_mask)]

# Hold out 2% of those cells, sampled without replacement
test_sample_size = int(0.02 * len(filtered_indices))
test_indices = np.random.choice(len(filtered_indices), size=test_sample_size, replace=False)
test_indices = [filtered_indices[idx] for idx in test_indices]

# Training matrix: a copy of userItem with the held-out cells reset to 0
train_values = userItem.to_numpy(copy=True)
if test_indices:
    held_rows, held_cols = zip(*test_indices)
    train_values[list(held_rows), list(held_cols)] = 0
train_data = pd.DataFrame(train_values, index=userItem.index, columns=userItem.columns)


In [None]:
# Global mean over observed (> 0) ratings; also the fallback prediction below
rating_values = userItem.to_numpy()
global_mean = rating_values[rating_values > 0].mean()
actual_ratings = [userItem.iloc[row, col] for row, col in test_indices]

# NOTE(review): the baselines are computed from userItem, which still contains the
# held-out ratings; train_data is not used here. Confirm whether that is intended.
rmse_baseline = compare_baseline(userItem, test_indices)

for baseline, rmse in rmse_baseline.items():
    print(f"{baseline}: {rmse:.4f}")

# The original statement ended at `>=` with no right-hand side (a syntax error).
# NOTE(review): 50 is assumed from the variable name and the 50-rating filter used
# when building the test split; top_fifty is not read afterwards -- confirm or remove.
top_fifty = filteredRatings.groupby('Book-Title').count()['Book-Rating'] >= 50
hybrid_predictions = []

# NOTE(review): hybrid_recommendations is called once per held-out cell, so this
# loop can be slow on a large test set.
for row, col in test_indices:
    user_id = userItem.columns[col]
    book_title = userItem.index[row]

    # Generate hybrid recommendations
    hybrid_recommendation_scores = hybrid_recommendations(
        user_id=user_id,
        book_title=book_title,
        content_weight=0.5,
        collaborative_weight=0.5,
        num_recommendations=10
    )

    # Use the score listed for the same title if present, else the global mean.
    # NOTE(review): hybrid scores come out of normalize() in [0, 1], whereas
    # actual_ratings are raw ratings, so the RMSE below mixes two scales.
    predicted_score = next(
        (score for book, score in hybrid_recommendation_scores if book == book_title),
        global_mean,
    )
    hybrid_predictions.append(predicted_score)

# Compute RMSE for hybrid method (NaN when there is nothing to score)
if hybrid_predictions:
    hybrid_rmse = np.sqrt(mean_squared_error(actual_ratings, hybrid_predictions))
else:
    hybrid_rmse = float("nan")
print("Hybrid Method RMSE:", hybrid_rmse)
