In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from Utilities.similarity_matrix import compute_similarity_matrix
from Utilities.recommendation import get_recommendations

In [2]:
# Load the cleaned dataset from the pickled DataFrame produced earlier in the project.
# NOTE: read_pickle can execute arbitrary code while loading — only use it on files you trust.
df = pd.read_pickle('File_dump/df.pkl')
# Bare last expression so the notebook renders the frame as a rich table.
df

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,6549,said the shotgun to the head.,Saul Williams,4.22,0743470796,9780743470797,en-US,192,2762,214,9/1/2003,MTV Books
1,14490,$30 Film School: How to Write Direct Produce...,Michael W. Dean,3.49,1592000673,9781592000678,eng,528,30,4,5/13/2003,Cengage Learning
2,5413,'Salem's Lot,Stephen King/Jerry N. Uelsmann,4.25,0385516487,9780385516488,eng,594,84123,571,11/1/2005,Doubleday
3,11525,1 000 Places to See Before You Die,Patricia Schultz,3.85,0761104844,9780761104841,eng,992,36303,439,5/22/2003,Workman Publishing Company
4,8542,10 lb Penalty,Dick Francis,3.90,042519745X,9780425197455,eng,320,3490,177,8/3/2004,G.P. Putnam's Sons
...,...,...,...,...,...,...,...,...,...,...,...,...
10343,31993,鋼之鍊金術師 6,Hiromu Arakawa/荒川弘/方郁仁,4.58,9861146148,9789861146140,zho,191,5,0,6/2/2004,東立
10344,31999,鋼之鍊金術師 7,Hiromu Arakawa/荒川弘/方郁仁,4.57,9861146156,9789861146157,zho,191,5,0,6/12/2004,東立
10345,31996,鋼之鍊金術師 9,Hiromu Arakawa/荒川弘/方郁仁,4.57,9861156526,9789861156521,zho,184,4,0,12/16/2004,東立
10346,15318,魔戒二部曲：雙城奇謀,J.R.R. Tolkien/托爾金/Alan Lee/朱學恆,4.44,9570823372,9789570823370,zho,467,24,0,12/20/2001,聯經出版事業股份有限公司


In [3]:
# Count how many times each author occurs across the whole dataset.
# A row can list several authors separated by '/', so every entry is first
# split into individual names, then expanded to one name per row, and only
# then tallied.
author_counts = (
    df['authors']
    .str.split('/')
    .explode()
    .value_counts()
)
print(author_counts)


authors
Stephen King           80
William Shakespeare    49
Sandra Brown           47
P.G. Wodehouse         46
J.R.R. Tolkien         44
                       ..
Rosaleen Linehan        1
Suzanne Higgins         1
Aine Greaney            1
Catherine Foley         1
Christopher   Clark     1
Name: count, Length: 8904, dtype: int64


In [4]:
# List the authors whose number of occurrences is below a certain value (here: 5).
# The per-author tally is recomputed from the frame and then filtered in the same chain.
author_counts = (
    df['authors']
    .str.split('/')
    .explode()
    .value_counts()
    .loc[lambda counts: counts < 5]
)

print(author_counts)


authors
Josepha Sherman        4
Philip José Farmer     4
Robert M. Pirsig       4
Tanith Lee             4
Tyler Florence         4
                      ..
Rosaleen Linehan       1
Suzanne Higgins        1
Aine Greaney           1
Catherine Foley        1
Christopher   Clark    1
Name: count, Length: 8193, dtype: int64


In [5]:
# Build the similarity matrix from the loaded frame with the project helper.
# (The helper's internals are not shown in this notebook; the variable name
# suggests it holds cosine similarities — confirm in Utilities/similarity_matrix.)
cosine_sim = compute_similarity_matrix(df)

# Save the cosine similarity matrix
# Stored with pickle so it can be reused without recomputing; only load it back
# from a trusted location, since unpickling can execute arbitrary code.
with open('File_dump/cosine_sim.pkl', 'wb') as f:
    pickle.dump(cosine_sim, f)

# Example use of the get_recommendations function: look up books similar to a given title.
# NOTE(review): the frame displayed below carries a `content` column that the frame
# shown right after loading did not have — presumably added by one of the helpers; confirm.
result = get_recommendations('a short history of nearly everything', cosine_sim, df)
result

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,content
292,5089,A Short History of Nearly Everything (Illustra...,Bill Bryson,4.21,0385663552,9780385663557,eng,624,28,7,10/5/2010,Anchor Canada,A Short History of Nearly Everything (Illustra...
2058,42876,Down Under,Bill Bryson,4.07,055299703X,9780552997034,eng,398,4510,392,8/6/2001,Black Swan,Down Under Bill Bryson
963,42883,Bill Bryson: The Complete Notes,Bill Bryson,4.09,038560131X,9780385601313,eng,544,901,36,10/5/2000,Doubleday,Bill Bryson: The Complete Notes Bill Bryson
962,22,Bill Bryson's African Diary,Bill Bryson,3.44,0767915062,9780767915069,eng,55,7270,499,12/3/2002,Broadway Books,Bill Bryson's African Diary Bill Bryson
8846,12560,The Short History of a Prince,Jane Hamilton,3.65,055299801X,9780552998017,eng,432,1799,121,4/1/1999,Black Swan,The Short History of a Prince Jane Hamilton
1197,23,Bryson's Dictionary of Troublesome Words: A Wr...,Bill Bryson,3.87,0767910435,9780767910439,eng,256,2088,131,9/14/2004,Broadway Books,Bryson's Dictionary of Troublesome Words: A Wr...
3590,27306,Islam: A Short History,Karen Armstrong,4.02,081296618X,9780812966183,eng,230,7660,448,8/6/2002,Modern Library,Islam: A Short History Karen Armstrong
293,33541,A Short History of World War I,James L. Stokesbury,3.95,0688001297,9780688001292,eng,352,353,43,2/1/1981,William Morrow Paperbacks,A Short History of World War I James L. Stokes...
4839,27,Neither Here nor There: Travels in Europe,Bill Bryson,3.86,0380713802,9780380713806,eng,254,48701,2238,3/28/1993,William Morrow Paperbacks,Neither Here nor There: Travels in Europe Bill...
3731,42891,Journeys in English,Bill Bryson,3.72,0563496266,9780563496267,eng,3,510,60,2/2/2004,BBC Physical Audio,Journeys in English Bill Bryson


The function simulate_multiple_user_preferences is designed to simulate and evaluate how our book recommendation system performs when user preferences are skewed towards particular authors.

The function takes as input:

1. df: The DataFrame containing the book information.
2. authors: A list of authors that the simulated users have a preference for.
3. n_recommendations: The number of recommendations to generate for each book liked by the user.

The function works in the following way:

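1. It iterates over each author in the authors list. For each author, it simulates a user who likes all books whose `authors` field is exactly that author (titles that list co-authors, e.g. "Stephen King/Jerry N. Uelsmann", are not part of the liked set).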

2. It then generates n_recommendations for each of the books that the simulated user likes.

3. It counts how many of these recommended books are written by the same author.

4. It calculates the percentage of recommended books that are by the same author as a measure of how well the recommendation system is tailoring its recommendations to the user's preferences.

5. It appends the author's name and the calculated percentage to the results list.

6. Finally, it converts the results list into a DataFrame and returns it.

I chose this function to evaluate the model as it helps to gauge the performance of the recommendation system in terms of its responsiveness to a user's expressed preferences. In real-world scenarios, a user might express a strong preference for certain authors by consistently choosing their books. A good recommendation system should be able to pick up on these preferences and recommend more books by the preferred authors.

The output of the function is a DataFrame showing, for each author, the percentage of times the recommendation system suggested a book by the same author when a user expressed a preference for that author's books. The higher this percentage, the more responsive the system is to the user's preferences.

In [6]:
def simulate_multiple_user_preferences(df, authors, n_recommendations,
                                       similarity=None, recommend=None):
    """Measure how often recommendations stay with a user's preferred author.

    For every author in ``authors`` a user is simulated who likes each book
    whose ``authors`` field is exactly that author. Recommendations are
    requested for every liked title, and the share of recommended books whose
    ``authors`` field contains the author's name is reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Book table; must provide the ``title`` and ``authors`` columns.
    authors : iterable of str
        Authors the simulated users have a preference for.
    n_recommendations : int
        Maximum number of recommendations considered per liked book. Any
        extra rows returned by the recommender are ignored.
    similarity : optional
        Similarity matrix handed to the recommender. Defaults to the
        notebook-level ``cosine_sim``.
    recommend : callable, optional
        ``recommend(title, similarity, df) -> DataFrame`` with an ``authors``
        column. Defaults to the imported ``get_recommendations``.

    Returns
    -------
    pandas.DataFrame
        One row per requested author with the columns ``author`` and
        ``percentage``. ``percentage`` is NaN when no recommendation could be
        evaluated (for example, the author has no exactly-matching book).
    """
    # Fall back to the notebook-level objects so existing three-argument
    # calls keep working unchanged.
    if similarity is None:
        similarity = cosine_sim
    if recommend is None:
        recommend = get_recommendations

    results = []

    for author in authors:
        # User likes all books whose authors field is exactly this author
        user_likes = df[df['authors'] == author]

        matched = 0     # recommended books that are by the author
        considered = 0  # recommended books actually evaluated

        for title in user_likes['title']:
            recommendations = recommend(title, similarity, df).head(n_recommendations)
            if recommendations.empty:
                continue

            considered += len(recommendations)
            # regex=False: names such as "J.K. Rowling" contain '.', which would
            # otherwise act as a regex wildcard and match unrelated names.
            matched += int(
                recommendations['authors']
                .str.contains(author, regex=False, na=False)
                .sum()
            )

        # Divide by the number of recommendations really evaluated, so a
        # recommender returning fewer rows than requested does not skew the
        # percentage and an author without liked books does not raise
        # ZeroDivisionError.
        percentage = matched / considered * 100 if considered else float('nan')

        results.append({'author': author, 'percentage': percentage})

    # Explicit columns keep the schema stable even when `authors` is empty.
    return pd.DataFrame(results, columns=['author', 'percentage'])

# Authors picked at random, each with a different number of occurrences in the
# dataset, to get a comprehensive picture of how the model performs.
authors = [
    'J.K. Rowling',
    'Stephen King',
    'J.R.R. Tolkien',
    'P.G. Wodehouse',
    'Bill Bryson',
    'Hiromu Arakawa',
    'Piers Anthony',
]
print(simulate_multiple_user_preferences(df, authors, n_recommendations=10))

           author  percentage
0    J.K. Rowling  100.000000
1    Stephen King   94.137931
2  J.R.R. Tolkien   92.500000
3  P.G. Wodehouse   87.948718
4     Bill Bryson   77.058824
5  Hiromu Arakawa  100.000000
6   Piers Anthony   93.000000
