In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import sklearn
import pickleshare as ps

In [9]:
books_big = pd.read_csv('data/books2.csv', sep=',', header=0, low_memory=False)
ratings = pd.read_csv('data/ratings.csv', sep=',', header=0)
users = pd.read_csv('data/users.csv', sep=',', header=0)

In [10]:
books_big.head()

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher\t\t\t
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.\t\t\t
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.\t\t\t
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic\t\t\t
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.\t\t\t
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,0439682584,9780439682589,eng,2690,41428,164,9/13/2004,Scholastic\t\t\t


In [11]:
!ls -lh data/books2.csv

ls: data/books2.csv: No such file or directory


In [12]:
# dropping URLS columns as they are not needed

books_big.drop(['Image-URL-S', 'Image-URL-M', 'Image-URL-L'], axis=1, inplace=True)
books_big.head()

KeyError: "['Image-URL-S', 'Image-URL-M', 'Image-URL-L'] not found in axis"

In [None]:
#making headers lowercase and snakecase

books_big.columns = books_big.columns.str.lower()
books_big.columns = books_big.columns.str.replace('-', '_')
books_big.head()

In [None]:
# Define the specific ISBN numbers and the replacement publisher name
isbn_to_replace = ['193169656X', '1931696993']  # Replace with actual ISBN numbers
replacement_publisher = 'NovelBooks'

# Replace the publisher name
books_big.loc[books_big['isbn'].isin(isbn_to_replace), 'publisher'] = replacement_publisher

print(books_big[books_big['isbn'].isin(isbn_to_replace)])

In [None]:
books_big.book_title.nunique()

In [None]:
books_big.shape

In [None]:
#making headers lowercase and snakecase

ratings.columns = ratings.columns.str.lower()
ratings.columns = ratings.columns.str.replace('-', '_')
ratings.head()


In [None]:
# merge books with ratings df

books_ratings = pd.merge(books_big, ratings, on='isbn', how='left')

books_ratings.head()

In [None]:
books_ratings.shape

In [None]:
books_ratings.isna().sum()

In [None]:
books_ratings.dropna(subset=['user_id'], inplace=True)          # drop rows with no user_id
books_ratings.head()

In [None]:
books_ratings.info()

In [None]:
books_ratings['user_id'] = books_ratings['user_id'].astype(int)
books_ratings['book_rating'] = books_ratings['book_rating'].astype(int)

In [None]:
books_ratings.to_csv('data/books_ratings.csv', index=False) # saving the merged dataframe to a csv file

In [None]:
# Group by 'isbn' and count the number of ratings for each ISBN
isbn_rating_counts = books_ratings.groupby(['book_title', 'book_author', 'isbn']).size().reset_index(name='rating_count')

# Display the DataFrame to verify the result
ratings = isbn_rating_counts.sort_values('rating_count', ascending=False).head().plot(kind='bar', x='book_author', y='rating_count', color='skyblue')

In [None]:
isbn_rating_counts.head()

In [None]:
# including the average rating for each book
# Calculate the average rating for each book

to_be_rated = books_ratings[['isbn', 'book_rating']]
averageRating = to_be_rated.groupby('isbn')['book_rating'].mean().round(1).reset_index()
averageRating.rename(columns={'book_rating': 'average_rating'}, inplace=True)
average_rating = averageRating[['isbn','average_rating']]

# Merge the average ratings back with the original dataset
averageRatingdf = pd.merge(isbn_rating_counts, average_rating, on='isbn', how='left')

# Remove duplicate entries
#averageRatingdf = averageRatingdf[['isbn', 'average_rating']].drop_duplicates(subset=['isbn'])

averageRatingdf.head()  

In [None]:
averageRatingdf.shape

In [None]:
#saving the average rating dataframe to a csv file'

averageRatingdf.to_csv('data/averageRatingdf.csv', index=False)

### Creating a search engine ###

In [None]:
averageRatingdf["mod_titles"] = averageRatingdf['book_title'].str.replace("[^a-zA-Z0-9]", " ", regex=True)    #removing special characters from book titles   
averageRatingdf.head()

In [None]:
averageRatingdf.sort_values('rating_count', ascending=False).sample(15)

In [None]:
# Sort the DataFrame by 'average_rating' and 'rating_count' in descending order
sorted_df = averageRatingdf.sort_values(by=['average_rating', 'rating_count'], ascending=[False, False]).head(10)

# Display the top rows
sorted_df

In [None]:
# Plot the top rated books  with the highest rating count   

sns.barplot(x='mod_titles', y='rating_count', hue='average_rating', data=sorted_df, palette='viridis')
plt.xticks(rotation=90)

In [None]:
averageRatingdf['mod_titles'] = averageRatingdf['mod_titles'].str.lower()    #converting book titles to lowercase

In [None]:
averageRatingdf['mod_titles'] = averageRatingdf['mod_titles'].str.replace('\s+', ' ', regex=True)    #removing extra spaces from book titles

In [None]:
averageRatingdf = averageRatingdf[averageRatingdf['mod_titles'].str.len() > 0]    #removing rows with empty book titles

In [None]:
#books_ratings.drop('user_id', axis=1, inplace=True)   #dropping user_id column

In [None]:
averageRatingdf.duplicated().sum()

In [None]:
averageRatingdf.rating_count.describe()

In [None]:
sns.boxplot(averageRatingdf.rating_count)

In [None]:
averageRatingdf['mod_titles'].nunique()

#### Reducing shape of the dataframe ####

In [None]:
ratings = averageRatingdf[averageRatingdf['rating_count'] >= 15]    #filtering out books with less than 15 ratings
ratings.shape

#### Building a Term Frequency and then a Inverse Document Frequency matrix ####


In [None]:
# turning titles into TD-IDF matrix => Term Frequency-Inverse Document Frequency
from sklearn.feature_extraction.text import TfidfVectorizer

# vectorizer takes a list of strings as input and turns it into a fd-idf matrix
vectorizer = TfidfVectorizer()

tdidf = vectorizer.fit_transform(ratings['mod_titles'])

In [None]:
# to do comparison between books, we need to calculate the cosine similarity between the books

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re


# query function:

def search(query, vectorizer):
# setting up a search query

    #query = 'The Hobbit'
# preparing the string in the same way as the mod titles above
    processed = re.sub('[^a-zA-Z0-9]', ' ', query.lower())

# we need to turn the query into a vector using the vectorizer

    query_vector = vectorizer.transform([processed])

# to find the similarities we calculate the cosine similarity between the query vector and the tdidf matrix
    similarity = cosine_similarity(query_vector, tdidf).flatten() # flatten is used to turn the matrix into a 1D array

# to find the indices of the 10 largest similarities

    indices = np.argpartition(similarity, -10)[-10:]

# use indices to index the titles

    results = ratings.iloc[indices]   

# as there are many books with the title "The Hobbit", we only want those with the highest number of ratings

    results = results.sort_values(by='rating_count', ascending=False)

    return results.head(10)


In [None]:
search('lord of the rings', vectorizer)    




In [None]:
# creating my list of liked books
liked_books = [ '0451526341', '0553212419', '0140132708']
liked_books

### Making recommendations based on my liked books ###

#### step 1: find all the users that liked the same books as us ####

In [None]:
ratings.head()

In [None]:
# use original dataset to find users who liked the same books as me (including individual ratings)

#books_ratings.head()

books_ratings_short = books_ratings[['user_id', 'isbn', 'book_rating', 'book_title']]   

books_ratings_short.head()

filtered_ratings = books_ratings_short[books_ratings_short['isbn'].isin(liked_books) & (books_ratings_short['book_rating'] > 8)]  #filtering out books with ratings higher or equal  8   

filtered_ratings.shape

# create a set with users who liked the same books as me

overlap_users = set()

# # Create tuples (user_id, isbn, book_rating) for the filtered rows
overlap_users = set(filtered_ratings.apply(lambda row: (row['user_id'], row['isbn'],row['book_title'], row['book_rating']), axis=1))

# # Display the overlap_users set
overlap_users

In [None]:
# making set a dataframe containing any user who read the same book as us and rated the book higher than 8

overlap_users_df = pd.DataFrame(list(overlap_users), columns=['user_id', 'isbn', 'book_title', 'book_rating'])
overlap_users_df

In [None]:
len(overlap_users_df)

In [None]:
overlap_users_df['isbn'] = overlap_users_df['isbn'].astype(str)   #converting isbn to string

#### step 2: finding what those users liked ####

In [None]:
# finding books that the users in the overlap_users_df have read and rated highly

total_books = books_ratings_short[books_ratings_short['user_id'].isin(overlap_users_df['user_id'])]   #filtering out books read by users in the overlap_users_df
total_books = total_books[total_books['book_rating'] > 8]   #filtering out books with ratings higher than 8
recommended = total_books[~total_books['isbn'].isin(liked_books)]   #filtering out books that I have already liked
recommended_2 = recommended['isbn'].value_counts().head(10)   #finding the top 10 books that the users in the overlap_users_df have read and rated highly



In [None]:
len(recommended_2)

In [None]:
total_books.head()

In [None]:
total_books['isbn'].astype(str)

In [None]:
# Convert recommended_2 Series to DataFrame
recommended_2_df = recommended_2.reset_index()
recommended_2_df.columns = ['isbn', 'count']



In [None]:
recommended_2_df

In [None]:
# Merge with books DataFrame to get book titles
recommended_with_titles = pd.merge(recommended_2_df, total_books[['isbn', 'book_title']], on='isbn', how='left')

# Display the result
recommended_with_titles[['isbn', 'book_title']].value_counts().reset_index(name='count')

#### step 3: fine tuning the recommendations ####

In [None]:
total_books.head()

In [None]:
#creating a column book_count  

recommended['book_count'] = recommended.groupby('isbn')['isbn'].transform('count')
recommended

In [None]:
# creatig a popularity score for each book / we want to look for books that are popular among users like us

recommended['popularity'] = recommended['book_count'] * recommended['book_count'] / recommended['book_rating']
recommended.head()

In [None]:
tailored_recs = recommended.sort_values('popularity', ascending=False)
tailored_recs

In [None]:
tailored_recs[['isbn', 'book_title', 'popularity']].value_counts().reset_index(name='count').head(10)