## Creating search engine for book authors ##

In [1]:
%run import_data_.py

Continuing with existing version of data folder
Goodreads dataset loaded successfully as books_goodreads
Pandas dataframes (books_goodreads, books_big, book, users, ratings) loaded successfully
Columns in DataFrames 'users' and 'ratings' renamed
You can use the DataFrames 'books' or 'books_big' - they are exactly the same (big) dataset
loading books_ratings and books_users_ratings
Ready to go!


## Pre-processing Data For Search Engines

In [32]:
# The following steps will complete the columns 
# 'mod_author', 'mod_publisher', 'mod_title'
# in books_ratings, books_users_ratings, raters_15plus
# so that the data is ready for our search engines
# All of these changes have already been saved to the .csv's

# import sys
# import os

# # Add the scripts folder to the Python path
# sys.path.append(os.path.abspath('py_files_and_test_notebook'))

# # Now we can import the function to clean the string 
# # values of the columns we need for search engines
# from clean_string_columns import mod_col_values

# books_ratings = mod_col_values(df=books_ratings, col="book_author")
# books_ratings = books_ratings.rename(columns={'mod_book_author': 'mod_author'})
# books_ratings = mod_col_values(df=books_ratings, col="publisher")

# raters_15plus = mod_col_values(df=raters_15plus, col="book_author")
# raters_15plus = raters_15plus.rename(columns={'mod_book_author': 'mod_author'})
# raters_15plus = mod_col_values(df=raters_15plus, col="publisher")

# books_users_ratings = mod_col_values(df=books_users_ratings, col="publisher")
# books_users_ratings = mod_col_values(df=books_users_ratings, col="book_author")
# books_users_ratings = books_users_ratings.rename(columns={'mod_book_author': 'mod_author'})

# books_ratings.to_csv("data/books_rated.csv", sep=";", encoding="utf-8-sig", index=False)
# books_users_ratings.to_csv("data/books_users_ratings.csv", sep=";", encoding="utf-8-sig", index=False)
# raters_15plus.to_csv("data/raters_15plus.csv", sep=";", encoding="utf-8-sig", index=False)

### Creating search engine ###

In [2]:
# turning authors into TD-IDF matrix => Term Frequency-Inverse Document Frequency
from sklearn.feature_extraction.text import TfidfVectorizer

# vectorizer takes a list of strings as input and turns it into a fd-idf matrix
vectorizer = TfidfVectorizer()

tdidf = vectorizer.fit_transform(raters_15plus['mod_author'])
# to do comparison between authors, we need to calculate the cosine similarity between the authors

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re


# query function:

def search(query, vectorizer):
# setting up a search query

    #query = 'The Hobbit'
# preparing the string in the same way as the mod authors above
    processed = re.sub('[^a-zA-Z0-9]', ' ', query.lower())

# we need to turn the query into a vector using the vectorizer

    query_vector = vectorizer.transform([processed])

# to find the similarities we calculate the cosine similarity between the query vector and the tdidf matrix
    similarity = cosine_similarity(query_vector, tdidf).flatten() # flatten is used to turn the matrix into a 1D array

# to find the indices of the 10 largest similarities

    indices = np.argpartition(similarity, -10)[-10:]

# use indices to index the titles

    results = raters_15plus.iloc[indices]   

#  we only want authors with books with the highest number of ratings

    # results = results.sort_values(by='rating_count', ascending=False)

    return results.head(10)


In [3]:
search('gottlieb', vectorizer)

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,genre,avg_rating,nr_ratings,nr_readers,annotations,mod_title,mod_author,mod_publisher,image_url_s,image_url_m,image_url_l
1036,380001411,Christy,Catherine Marshall,1976,Avon,"['Fiction', 'Historical Fiction', 'Other', 'Ro...",8.5625,32,79,,christy,catherine marshall,avon,http://images.amazon.com/images/P/0380001411.0...,http://images.amazon.com/images/P/0380001411.0...,http://images.amazon.com/images/P/0380001411.0...
1033,451166582,The Eyes of the Dragon,Stephen King,2001,Signet Book,"['Science', ""Children's Literature"", 'Other', ...",8.421053,38,79,New York Times bestseller,the eyes of the dragon,stephen king,signet book,http://images.amazon.com/images/P/0451166582.0...,http://images.amazon.com/images/P/0451166582.0...,http://images.amazon.com/images/P/0451166582.0...
1032,451191153,The Fountainhead,Ayn Rand,1996,New American Library,"['Psychology', 'Fiction', 'Other']",8.307692,39,79,,the fountainhead,ayn rand,new american library,http://images.amazon.com/images/P/0451191153.0...,http://images.amazon.com/images/P/0451191153.0...,http://images.amazon.com/images/P/0451191153.0...
1034,767907817,Bookends : A Novel,Jane Green,2003,Broadway,"['Fiction', 'Other']",7.75,36,79,I met Josh right at the beginning,bookends a novel,jane green,broadway,http://images.amazon.com/images/P/0767907817.0...,http://images.amazon.com/images/P/0767907817.0...,http://images.amazon.com/images/P/0767907817.0...
1037,441783589,Starship Troopers,Robert A. Heinlein,1987,Ace Books,"['Fiction', 'Science', 'Other']",7.78125,32,79,I always get the shakes before a drop.,starship troopers,robert a heinlein,ace books,http://images.amazon.com/images/P/0441783589.0...,http://images.amazon.com/images/P/0441783589.0...,http://images.amazon.com/images/P/0441783589.0...
1028,446608653,The Alibi,Sandra Brown,2000,Warner Books,"['Fiction', 'Other', 'Thriller']",7.9,20,80,He noticed her the moment she stepped into the...,the alibi,sandra brown,warner books,http://images.amazon.com/images/P/0446608653.0...,http://images.amazon.com/images/P/0446608653.0...,http://images.amazon.com/images/P/0446608653.0...
1031,60930187,The Bell Jar : A Novel (Perennial Classics),Sylvia Plath,2000,Perennial,"[""Children's Literature"", 'Other', 'Poetry', '...",8.02381,42,79,the summer they electrocuted the Rosenbergs.an...,the bell jar a novel perennial classics,sylvia plath,perennial,http://images.amazon.com/images/P/0060930187.0...,http://images.amazon.com/images/P/0060930187.0...,http://images.amazon.com/images/P/0060930187.0...
1030,156528207,The Little Prince,Antoine de Saint-ExupÃ©ry,1968,Harcourt,"['Juvenile Fiction', ""Children's Literature"", ...",8.980392,51,79,Pilotes d'aéronef -- Romans.Pied Piper of Ham...,the little prince,antoine de saint exup ry,harcourt,http://images.amazon.com/images/P/0156528207.0...,http://images.amazon.com/images/P/0156528207.0...,http://images.amazon.com/images/P/0156528207.0...
1029,446353205,The Charm School,Nelson DeMille,1989,Warner Books,"['Fiction', 'Other', 'Thriller']",8.777778,18,80,,the charm school,nelson demille,warner books,http://images.amazon.com/images/P/0446353205.0...,http://images.amazon.com/images/P/0446353205.0...,http://images.amazon.com/images/P/0446353205.0...
3093,8478886451,Harry Potter y el cÃ¡liz de fuego,J. K. Rowling,2001,Lectorum Publications,"[""Children's Literature"", 'Fiction', 'Juvenile...",8.4,15,16,New York Times bestseller.Romans nouvelles etc...,harry potter y el c liz de fuego,j k rowling,lectorum publications,http://images.amazon.com/images/P/8478886451.0...,http://images.amazon.com/images/P/8478886451.0...,http://images.amazon.com/images/P/8478886451.0...
