## Creating search engine for book authors ##

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import sklearn
# import pickleshare as ps

In [2]:
# df created in notebooks Bias_authors_countries and Genre_susanne 
clean_names_genre = pd.read_csv('data/clean_names_genre.csv')
clean_names_genre.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183600 entries, 0 to 183599
Data columns (total 19 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   isbn                 183600 non-null  object 
 1   book_title           183600 non-null  object 
 2   book_author          183600 non-null  object 
 3   year_of_publication  183600 non-null  object 
 4   publisher            183600 non-null  object 
 5   genre                183600 non-null  object 
 6   user_id              183600 non-null  float64
 7   book_rating          183600 non-null  float64
 8   location             183600 non-null  object 
 9   age                  183600 non-null  object 
 10  age_numeric          137577 non-null  float64
 11  age_bins             183600 non-null  object 
 12  mod_book_author      183600 non-null  object 
 13  mod_book_title       183600 non-null  object 
 14  mod_publisher        183600 non-null  object 
 15  country          

### Pre-processing ###

In [3]:

clean_names_genre['book_rating'] = clean_names_genre['book_rating'].astype(int)

In [4]:
data_kept = clean_names_genre.drop(['user_id', 'surname', 'name_surname', 'book_title', 'book_author', 'genre', 'location', 'age', 'age_numeric', 'age_bins', 'country', 'name_surname', 'surname'], axis=1)

In [5]:
data_kept.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183600 entries, 0 to 183599
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   isbn                 183600 non-null  object
 1   year_of_publication  183600 non-null  object
 2   publisher            183600 non-null  object
 3   book_rating          183600 non-null  int64 
 4   mod_book_author      183600 non-null  object
 5   mod_book_title       183600 non-null  object
 6   mod_publisher        183600 non-null  object
 7   categorized_genre    183600 non-null  object
dtypes: int64(1), object(7)
memory usage: 11.2+ MB


In [6]:
# calculate the rating count for each book and the average rating for each book
# 1. Group by 'isbn' and count the number of ratings for each ISBN
isbn_rating_counts = data_kept.groupby(['mod_book_title', 'mod_book_author', 'isbn']).size().reset_index(name='rating_count')


In [7]:
# 2. Calculate the average rating for each book

to_be_rated = data_kept[['isbn', 'book_rating']]
averageRating = to_be_rated.groupby('isbn')['book_rating'].mean().round(1).reset_index()
averageRating.rename(columns={'book_rating': 'average_rating'}, inplace=True)
average_rating = averageRating[['isbn','average_rating']]

# Merge the average ratings back with the original dataset
averageRatingdf = pd.merge(isbn_rating_counts, average_rating, on='isbn', how='left')

# Remove duplicate entries
#averageRatingdf = averageRatingdf[['isbn', 'average_rating']].drop_duplicates(subset=['isbn'])

averageRatingdf.head() 

Unnamed: 0,mod_book_title,mod_book_author,isbn,rating_count,average_rating
0,0815 heute,kirst,3442013453,1,7.0
1,1 2 3,tana hoban,068802579X,3,5.3
2,10 000 dreams interpreted,gustavus hindman miller,1862044082,1,0.0
3,100 chocolate,katherine khodorowsky,1577173074,1,0.0
4,1000 years 1000 people ranking the men and wom...,agnes hooper gottlieb,1568362536,4,6.8


In [8]:
averageRatingdf["mod_book_author"] = averageRatingdf['mod_book_author'].str.replace("[^a-zA-Z0-9]", " ", regex=True)    #removing special characters from authors 
averageRatingdf.head()

Unnamed: 0,mod_book_title,mod_book_author,isbn,rating_count,average_rating
0,0815 heute,kirst,3442013453,1,7.0
1,1 2 3,tana hoban,068802579X,3,5.3
2,10 000 dreams interpreted,gustavus hindman miller,1862044082,1,0.0
3,100 chocolate,katherine khodorowsky,1577173074,1,0.0
4,1000 years 1000 people ranking the men and wom...,agnes hooper gottlieb,1568362536,4,6.8


In [11]:
averageRatingdf['mod_book_author'] = averageRatingdf['mod_book_author'].str.replace('\s+', ' ', regex=True)    #removing extra spaces from book authors

In [12]:
averageRatingdf = averageRatingdf[averageRatingdf['mod_book_author'].str.len() > 0]    #removing rows with empty authors

In [13]:
ratings = averageRatingdf[averageRatingdf['rating_count'] >= 15]    #filtering out books with less than 15 ratings
ratings.shape

(1488, 5)

### Creating search engine ###

In [14]:
# turning authors into TD-IDF matrix => Term Frequency-Inverse Document Frequency
from sklearn.feature_extraction.text import TfidfVectorizer

# vectorizer takes a list of strings as input and turns it into a fd-idf matrix
vectorizer = TfidfVectorizer()

tdidf = vectorizer.fit_transform(ratings['mod_book_author'])

In [19]:
# to do comparison between authors, we need to calculate the cosine similarity between the authors

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re


# query function:

def search(query, vectorizer):
# setting up a search query

    #query = 'The Hobbit'
# preparing the string in the same way as the mod authors above
    processed = re.sub('[^a-zA-Z0-9]', ' ', query.lower())

# we need to turn the query into a vector using the vectorizer

    query_vector = vectorizer.transform([processed])

# to find the similarities we calculate the cosine similarity between the query vector and the tdidf matrix
    similarity = cosine_similarity(query_vector, tdidf).flatten() # flatten is used to turn the matrix into a 1D array

# to find the indices of the 10 largest similarities

    indices = np.argpartition(similarity, -10)[-10:]

# use indices to index the titles

    results = ratings.iloc[indices]   

#  we only want authors with books with the highest number of ratings

    # results = results.sort_values(by='rating_count', ascending=False)

    return results.head(10)


In [24]:
search('gottlieb', vectorizer)

Unnamed: 0,mod_book_title,mod_book_author,isbn,rating_count,average_rating
19142,free,paul vincent,1844262553,54,8.0
19076,frankenstein doesnt plant petunias adventures ...,debbie dadey,059047071X,15,0.9
19014,fowl prey bedandbreakfast mysteries paperback,mary daheim,038076296X,20,2.0
19016,fox river,emilie richards,1551668068,33,2.2
19002,four seasons,mary alice monroe,1551667894,39,2.4
19085,frankenstein wordsworth classics,mary wollstonecraft shelley,1853260231,30,4.3
19251,friedhof der kuscheltiere roman,stephen king,3453007867,34,4.3
19204,french for cats all the french your cat will e...,henri de la barbe,067940676X,15,4.7
19243,friday,robert heinlein,034530988X,51,3.6
59934,zwlf,nick mcdonell,3462032283,27,2.9
