# Mini Project
This recommendation system uses a content-based approach on the '7k-books-with-metadata' dataset. For each book it combines the title, authors, categories, and description into a single text, represents that text with TF-IDF, and ranks the other books by cosine similarity. The system aims to provide users with book recommendations closely aligned with their preferences, enhancing their reading experience.

In [139]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [140]:
# Read the books metadata CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='archive/books.csv')

In [141]:
# Define a TF-IDF vectorizer object. Remove all English stop words such as 'the', 'a'
vectorizer = TfidfVectorizer(stop_words='english')

# Replace NaN with an empty string, but only in the text columns that feed the
# vectorizer. Filling the whole frame would also write '' into numeric columns
# such as 'average_rating'; any missing value there would turn the column into
# a mix of strings and numbers that cannot be sorted reliably later on.
text_columns = ['title', 'authors', 'categories', 'description']
df = df.fillna(dict.fromkeys(text_columns, ''))

# Combine the title, authors, categories, and description into a single column
df['content'] = (
    df['title'] + ' ' + df['authors'] + ' ' + df['categories'] + ' ' + df['description']
)

# Drop duplicates based on 'title' (the first occurrence is kept) so that every
# title identifies exactly one row
df = df.drop_duplicates(subset='title')

# Reset the index after dropping duplicates so labels are 0..n-1 again and
# line up with the row positions of the TF-IDF matrix built below
df.reset_index(drop=True, inplace=True)

# Create the TF-IDF matrix: one row per book, one column per vocabulary term
tfidf_matrix = vectorizer.fit_transform(df['content'])

In [142]:
# Pairwise cosine similarity between every pair of books; with a single
# argument the matrix is compared against itself
cosine_sim = cosine_similarity(tfidf_matrix)

# Reverse lookup: book title -> row position in df.
# Note that Series.drop_duplicates() works on the values (the row positions),
# not on the title labels.
indices = pd.Series(data=df.index, index=df['title']).drop_duplicates()

In [143]:
# Function to get the most similar books to recommend
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the books whose content is most similar to the given title.

    Args:
        title: Exact title of the query book; it is looked up in ``indices``.
        cosine_sim: Pairwise similarity matrix in which row ``i`` holds the
            similarity of book ``i`` to every book. Defaults to the
            module-level ``cosine_sim``.
        top_n: Maximum number of recommendations to return (default 10).

    Returns:
        A DataFrame with the 'title' and 'average_rating' columns of the
        recommended books, ordered from most to least similar. The query
        book itself is never part of the result.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError('top_n must be non-negative, got {!r}'.format(top_n))

    # Get the index of the book matching the title
    try:
        idx = indices[title]
    except KeyError:
        raise KeyError('Book title not found: {!r}'.format(title)) from None

    # Pair every other book with its similarity to the query book. The query
    # is excluded by position rather than by assuming it sorts first: a tie
    # at the top score, or a query whose self-similarity is 0, would
    # otherwise leave it in the results and drop a real neighbour.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Sort the books based on the similarity scores, most similar first
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Get the row positions of the top_n most similar books
    book_indices = [position for position, _ in sim_scores[:top_n]]

    # Return the most similar books
    return df[['title', 'average_rating']].iloc[book_indices]

In [144]:
# Fetch the 10 books most similar to the query title (ordered by similarity)
similar_books = get_recommendations(title='The Great Gatsby')

# Re-rank those books from highest to lowest 'average_rating'
sorted_books = similar_books.sort_values(by='average_rating', ascending=False)

# Display the re-ranked recommendations
print(sorted_books)

                                             title average_rating
3884                             A Life in Letters           4.24
3886      The Short Stories of F. Scott Fitzgerald           4.23
5172   The St. Paul Stories of F. Scott Fitzgerald           3.97
3176             Fitzgerald: All The Sad Young Men           3.95
2222                      Like Water for Chocolate           3.94
4868                                  The Crack-up           3.92
4461  CliffsNotes on Fitzgerald's The Great Gatsby           3.73
93                     The Love of the Last Tycoon           3.65
4952                      Reading Lolita in Tehran            3.6
5800             F. Scott Fitzgerald on Authorship           3.57
