## Filtering for genre ##

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import sklearn
# import pickleshare as ps

In [2]:
# Load the cleaned ratings table that was created in the notebooks
# Bias_authors_countries and Genre_susanne, then list its columns and dtypes.
clean_names_genre = pd.read_csv('../data/clean_names_genre.csv')
clean_names_genre.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183600 entries, 0 to 183599
Data columns (total 17 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   isbn                 183600 non-null  object 
 1   book_title           183600 non-null  object 
 2   book_author          183600 non-null  object 
 3   year_of_publication  183600 non-null  int64  
 4   publisher            183600 non-null  object 
 5   genre                183600 non-null  object 
 6   user_id              183600 non-null  float64
 7   book_rating          183600 non-null  float64
 8   location             183600 non-null  object 
 9   age                  183600 non-null  object 
 10  age_numeric          137577 non-null  float64
 11  age_bins             183600 non-null  object 
 12  mod_book_author      183600 non-null  object 
 13  mod_book_title       183600 non-null  object 
 14  mod_publisher        183600 non-null  object 
 15  country          

### Pre-processing ###

In [3]:
# book_rating is read from the CSV as float64; store it as an integer column instead.
# NOTE(review): astype(int) raises if the column contains NaN, so this relies on
# book_rating having no missing values (the .info() output above reports none).
clean_names_genre['book_rating'] = clean_names_genre['book_rating'].astype(int)

In [6]:
# Build the working frame by removing the columns that are not needed here:
# the user/demographic fields and the raw (uncleaned) title, author and genre.
data_kept = clean_names_genre.drop(
    columns=[
        'user_id',
        'genre',
        'book_title',
        'book_author',
        'location',
        'age',
        'age_numeric',
        'age_bins',
        'country',
    ]
)

In [7]:
# Check which columns and dtypes remain after the drop.
data_kept.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183600 entries, 0 to 183599
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   isbn                 183600 non-null  object
 1   year_of_publication  183600 non-null  int64 
 2   publisher            183600 non-null  object
 3   book_rating          183600 non-null  int64 
 4   mod_book_author      183600 non-null  object
 5   mod_book_title       183600 non-null  object
 6   mod_publisher        183600 non-null  object
 7   categorized_genre    183600 non-null  object
dtypes: int64(2), object(6)
memory usage: 11.2+ MB


In [8]:
# Number of rows per genre category. These are counts of rows in data_kept,
# not of distinct books (presumably one row per rating — verify against the source notebooks).
data_kept.categorized_genre.value_counts()

categorized_genre
Fiction                  108689
Other                     41134
Unknown                   31410
Poetry                      994
Guidebooks                  826
Romance                     357
Music                        72
Historical Fiction           63
Children's Literature        51
Suspense                      4
Name: count, dtype: int64

In [9]:
# Step 1: count how many ratings each book received.
# Rows are grouped by genre, title, author and ISBN together (not by ISBN alone),
# so the result holds one row per distinct combination of those four keys,
# with the size of each group stored in 'rating_count'.
isbn_rating_counts = (
    data_kept
    .groupby(['categorized_genre', 'mod_book_title', 'mod_book_author', 'isbn'])
    .size()
    .reset_index(name='rating_count')
)


In [10]:
# Step 2: mean rating per ISBN, rounded to one decimal place.
to_be_rated = data_kept[['isbn', 'book_rating']]
averageRating = (
    to_be_rated
    .groupby('isbn')['book_rating']
    .mean()
    .round(1)
    .reset_index()
    .rename(columns={'book_rating': 'average_rating'})
)
average_rating = averageRating[['isbn', 'average_rating']]

# Attach the per-ISBN average to the rating counts. average_rating holds one row
# per ISBN, so the left join keeps exactly the rows of isbn_rating_counts and no
# de-duplication step is needed afterwards.
averageRatingdf = isbn_rating_counts.merge(average_rating, on='isbn', how='left')

averageRatingdf.head()

Unnamed: 0,categorized_genre,mod_book_title,mod_book_author,isbn,rating_count,average_rating
0,Children's Literature,alone across the arctic one womans epic journe...,pam flowers,088240539X,1,9.0
1,Children's Literature,author talk,leonard s marcus,068981383X,1,5.0
2,Children's Literature,childrens book of virtues,william j bennett,068481353X,9,4.4
3,Children's Literature,childrens classics to read aloud,edward blishen,185697538X,2,3.5
4,Children's Literature,childrens literature,francelia butler,087722076X,1,0.0


In [11]:
# Replace every character that is not an ASCII letter or digit with a space,
# e.g. "Children's Literature" becomes "Children s Literature".
averageRatingdf["categorized_genre"] = averageRatingdf['categorized_genre'].str.replace("[^a-zA-Z0-9]", " ", regex=True)
averageRatingdf.head()

Unnamed: 0,categorized_genre,mod_book_title,mod_book_author,isbn,rating_count,average_rating
0,Children s Literature,alone across the arctic one womans epic journe...,pam flowers,088240539X,1,9.0
1,Children s Literature,author talk,leonard s marcus,068981383X,1,5.0
2,Children s Literature,childrens book of virtues,william j bennett,068481353X,9,4.4
3,Children s Literature,childrens classics to read aloud,edward blishen,185697538X,2,3.5
4,Children s Literature,childrens literature,francelia butler,087722076X,1,0.0


In [12]:
# Collapse runs of whitespace in the genre into a single space and trim both ends.
# The pattern is a raw string: '\s' in a normal string literal is an invalid escape
# sequence and triggers a warning on current Python versions.
# Trimming matters for the empty-genre filter that follows: a genre made up only of
# replaced special characters would otherwise be left as ' ' (length 1) and be kept.
averageRatingdf['categorized_genre'] = (
    averageRatingdf['categorized_genre']
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

In [13]:
# Keep only the rows whose genre string has at least one character.
averageRatingdf = averageRatingdf.loc[averageRatingdf['categorized_genre'].str.len().gt(0)]

In [14]:
# `ratings` is a second name for the same DataFrame object as averageRatingdf
# (plain assignment, no copy), so in-place changes to one are visible through the other.
ratings = averageRatingdf
ratings.shape

(59938, 6)

### Filtering for genre ###

In [15]:
# Define the function
def get_isbns_by_genre(genre, df):
    """Return the books in ``df`` whose genre matches ``genre``.

    Parameters
    ----------
    genre : str
        Pattern to look for in the ``categorized_genre`` column. It is treated
        as a regular expression and matched case-insensitively anywhere in the
        value, so ``'Guide'`` selects ``'Guidebooks'``.
    df : pandas.DataFrame
        Frame to filter. It must contain the columns ``categorized_genre``,
        ``isbn``, ``mod_book_title``, ``mod_book_author``, ``rating_count``
        and ``average_rating``.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df`` (original index preserved), restricted to
        the six columns listed above, with ``categorized_genre`` first.
    """
    # Filter the frame that was passed in rather than the notebook-global `ratings`,
    # so the function works for any frame and does not depend on hidden state.
    # na=False treats a missing genre as "no match" instead of producing a
    # non-boolean mask that cannot be used for indexing.
    genre_mask = df['categorized_genre'].str.contains(genre, case=False, regex=True, na=False)
    filtered_df = df[genre_mask]
    # Return the ISBNs of the filtered books together with their descriptive columns
    return filtered_df[['categorized_genre', 'isbn', 'mod_book_title', 'mod_book_author', 'rating_count', 'average_rating']]

# Example usage: 'Guide' is matched as a case-insensitive pattern inside the genre,
# so this selects the books categorized as 'Guidebooks'.
genre_to_search = 'Guide'
isbns = get_isbns_by_genre(genre_to_search, ratings)
isbns

Unnamed: 0,categorized_genre,isbn,mod_book_title,mod_book_author,rating_count,average_rating
23083,Guidebooks,1555660916,a field guide to rock art symbols of the great...,alex patterson,1,8.0
23084,Guidebooks,1885378017,a guide to biltmore estate,rachel carley,2,4.5
23085,Guidebooks,9631330702,a guide to birdwatching in hungary,gerard gorman,1,10.0
23086,Guidebooks,1885211260,a mothers world journeys of the heart traveler...,marybeth bond,1,0.0
23087,Guidebooks,1893121003,a natives guide to chicagos northwest suburbs ...,martin a bartels,1,0.0
...,...,...,...,...,...,...
23671,Guidebooks,1565543173,you can travel free,robert william kirk,2,0.0
23672,Guidebooks,1566910242,yucatan peninsula handbook the gulf of mexico ...,chicki mallan,1,9.0
23673,Guidebooks,1570063540,zagatsurvey 2002 san francisco bay area restau...,troy segal,1,0.0
23674,Guidebooks,1570066248,zagatsurvey us family travel guide,donna marino,1,6.0
