In [None]:
# Step 1: Import Required Libraries
import pandas as pd   # Used to load and manipulate data
from sklearn.feature_extraction.text import TfidfVectorizer   # Converts text to numerical features
from sklearn.metrics.pairwise import cosine_similarity  # Computes similarity between vectors   

In [None]:
# Step 2: Load the dataset
# Build the working frame in one chain so that `books` is a fresh DataFrame.
# The previous form took a column subset and then mutated it with inplace=True,
# a pattern that can trigger pandas' SettingWithCopyWarning (either here or when
# a later cell assigns a new column to `books`).
books = (
    pd.read_csv('books.csv')
    .loc[:, ['title', 'authors', 'average_rating']]   # Use only relevant columns
    .drop_duplicates(subset='title')                  # Keep the first row for each exact title
    .reset_index(drop=True)                           # Row labels become 0..n-1 again
)
books.head()   # Display the first few rows of the dataset

Unnamed: 0,title,authors,average_rating
0,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57
1,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49
2,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42
3,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56
4,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78


In [None]:
# Step 3: Data Preprocessing
# Combine 'title' and 'authors' into a single lowercase text feature.
# Missing values are replaced with '' first: concatenating a string with NaN
# yields NaN, and a NaN document makes TfidfVectorizer raise a ValueError.
books['combined'] = (
    books['title'].fillna('') + ' ' + books['authors'].fillna('')
).str.lower()

In [None]:
# Step 4: Vectorize the Text using TF-IDF
# TF-IDF (Term Frequency–Inverse Document Frequency) turns each combined
# title/author string into a numeric vector; stop_words='english' tells
# scikit-learn to ignore its built-in list of common English words.
vectorizer = TfidfVectorizer(
    stop_words='english',
)
tfidf_matrix = vectorizer.fit_transform(raw_documents=books['combined'])

In [None]:
# Step 5: Compute Cosine Similarity
# Called with a single matrix, cosine_similarity compares every row with every
# row of that same matrix, so cosine_sim[i][j] is the similarity between
# book i and book j according to their TF-IDF vectors.
cosine_sim = cosine_similarity(tfidf_matrix)

In [None]:
# Step 6: Define the Book Recommendation Function
def recommend_books(title, cosine_sim=cosine_sim, top_n=5):
    """Return the books most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up. Matching against ``books['title']`` is
        case-insensitive and must otherwise be exact.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix; row ``i`` holds the scores of the book at
        position ``i`` in ``books`` against every book. The default is bound
        to the module-level ``cosine_sim`` at the moment this cell runs, so
        re-run this cell after recomputing the matrix.
    top_n : int, optional
        Maximum number of recommendations to return (default 5). Values below
        zero are treated as zero.

    Returns
    -------
    pandas.DataFrame or str
        The 'title', 'authors' and 'average_rating' columns of the recommended
        books, most similar first, or the string
        "Book not found in dataset." when no title matches.
    """
    query = title.lower()

    # Lowercase the title column once and locate matches by *position*, so the
    # lookup into cosine_sim stays correct even if the index labels of `books`
    # are not 0..n-1.
    titles_lower = books['title'].str.lower()
    match_positions = (titles_lower == query).to_numpy().nonzero()[0]
    if len(match_positions) == 0:
        return "Book not found in dataset."
    idx = int(match_positions[0])

    # Drop the queried book explicitly instead of assuming it sorts first:
    # when another book ties for the top score (or the query's vector is all
    # zeros), slicing off position 0 could discard a real neighbour and leave
    # the queried book in its own recommendations.
    candidates = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]
    # list.sort is stable, so equally scored books keep their dataset order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    book_indices = [position for position, _ in candidates[:max(top_n, 0)]]

    return books[['title', 'authors', 'average_rating']].iloc[book_indices]


In [15]:
# Step 7: Test the Recommendation Function
# Any title present in the dataset can be substituted here; the lookup is
# case-insensitive.
recommend_books(title="The Great Gatsby")

Unnamed: 0,title,authors,average_rating
1233,Cliffs Notes on Fitzgerald's the Great Gatsby,Kate Maurer/F. Scott Fitzgerald/CliffsNotes,3.7
3088,All the Sad Young Men (Works of F. Scott Fitzg...,F. Scott Fitzgerald/James L.W. West III,3.96
1230,The St. Paul Stories of F. Scott Fitzgerald,F. Scott Fitzgerald/Dave Page/Patricia Hampl,4.0
4344,The Love of the Last Tycoon,F. Scott Fitzgerald,3.65
1228,A Life in Letters,F. Scott Fitzgerald/Matthew J. Bruccoli,4.22
