# Content Based Filtering

In [71]:
import pandas as pd
import numpy as np

import json
import gzip

import seaborn as sns
import matplotlib.pyplot as plt
import scipy as sp

import math

# Use seaborn's "darkgrid" style as the theme for plots.
sns.set_theme(style="darkgrid")

In [53]:
def load_data(file_name, head = 500):
    """Load a gzip-compressed JSON-lines file into a DataFrame.

    Parameters
    ----------
    file_name : str or path-like
        Path of a gzip file that holds one JSON document per line.
    head : int or None, default 500
        Maximum number of lines to read from the start of the file.
        Must be non-negative. ``None`` reads the whole file.

    Returns
    -------
    pandas.DataFrame
        One row per parsed line, in file order.
    """
    # Local import keeps this cell self-contained.
    from itertools import islice

    with gzip.open(file_name) as fin:
        # islice stops after exactly `head` lines (the former counter check
        # ran after the append and let one extra line through);
        # islice(fin, None) iterates over the whole file.
        records = [json.loads(line) for line in islice(fin, head)]
    return pd.DataFrame(records)

In [54]:
# Read the item feature table from CSV and index it by book_id.
itemitem = (
    pd.read_csv('data/sample/item_item_filter.csv')
    .set_index('book_id', drop=True)
)
itemitem.head()

Unnamed: 0_level_0,fiction,fantasy,romance,classic,mystery,kindle,sci-fi,literature,horror,contemporary,...,children,school,philosophy,novel,young,author,publisher,year,format,is_series
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5333265,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,604031,St. Martin's Press,0.40678,Paperback,0
1333909,0.068966,0.0,0.0,0.0,0.0,0.103448,0.0,0.172414,0.0,0.0,...,0.0,0.0,0.0,0.137931,0.0,626222,Simon & Schuster Audio,0.694915,Audiobook,0
7327624,0.166667,0.0,0.0,0.166667,0.0,0.0,0.166667,0.166667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10333,"Nelson Doubleday, Inc.",0.457627,Hardcover,1
6066819,0.18,0.0,0.23,0.0,0.0,0.18,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.18,0.0,9212,Atria Books,0.830508,Hardcover,0
287140,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,149918,,0.0,Undefined,0


In [55]:
# Encodes the categorical features relative to one reference book
def encodeLabels(book_id, df, columns=['author', 'publisher', 'format']):
    """Binary-encode label columns relative to one reference book.

    For each column in ``columns`` the value becomes 1.0 where the row holds
    the same value as the row labelled ``book_id`` and 0.0 everywhere else.
    Missing values never compare equal, so they always encode to 0.0.

    Parameters
    ----------
    book_id : hashable
        Index label of the reference row in ``df``.
    df : pandas.DataFrame
        Feature table indexed by book id. It is not modified.
    columns : list of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the selected columns replaced by 0.0/1.0 floats.

    Raises
    ------
    KeyError
        If ``book_id`` is not in the index or a column is missing.
    """
    # Work on a copy: overwriting the caller's frame would destroy the raw
    # labels, so encoding against a second book would compare 0/1 floats
    # instead of the original values.
    encoded = df.copy()
    for col in columns:
        encoded[col] = (df[col] == df.loc[book_id, col]).astype(float)
    return encoded

In [68]:
def getScore(array1, array2, method):
    """Compute the distance between two feature vectors.

    Parameters
    ----------
    array1, array2 : array-like
        1-D numeric vectors of equal length.
    method : {'cos', 'euc'}
        ``'cos'`` for cosine distance, ``'euc'`` for Euclidean distance.

    Returns
    -------
    float
        The distance; smaller means more similar.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    # Import the submodule explicitly: a bare `import scipy` is not
    # guaranteed to expose `scipy.spatial` on every SciPy version.
    from scipy.spatial import distance

    if method == 'cos':
        return distance.cosine(array1, array2)
    if method == 'euc':
        return distance.euclidean(array1, array2)
    # Fail loudly instead of returning None, which would only surface later
    # as an obscure comparison error in the caller.
    raise ValueError(f"Unknown method {method!r}; expected 'cos' or 'euc'")

In [73]:
def getBestRecommendations(book_id, data, top=10, method='cos'):
    """Return the ``top`` books whose features are closest to ``book_id``.

    Parameters
    ----------
    book_id : hashable
        Index label of the reference book in ``data``.
    data : pandas.DataFrame
        Feature table indexed by book id. It is not modified.
    top : int, default 10
        Maximum number of recommendations to return.
    method : {'cos', 'euc'}, default 'cos'
        Distance passed through to ``getScore``.

    Returns
    -------
    list of (book id, score) tuples
        Sorted by ascending distance; the reference book is excluded.
    """
    # Encode on a copy so the caller's frame keeps its raw labels and can be
    # reused for another reference book.
    encoded = encodeLabels(book_id, data.copy())
    refer = encoded.loc[book_id].tolist()

    # Score every other book. Keeping a book only when it beats the best
    # score seen so far (as this function used to) yields the chain of
    # successive record-breakers, not the true nearest neighbours.
    scored = []
    for index, item in zip(encoded.index.tolist(), encoded.to_numpy()):
        if index == book_id:
            continue
        score = getScore(refer, item, method)
        # A NaN distance (e.g. cosine distance involving an all-zero vector
        # may produce one) cannot be ranked, so it is left out.
        if math.isnan(score):
            continue
        scored.append((index, score))

    scored.sort(key=lambda kv: kv[1])
    return scored[:top]

In [74]:
# Reference book for which recommendations are computed (Euclidean distance).
initial_book = 27421523
reco = getBestRecommendations(
    book_id=initial_book,
    data=itemitem,
    method='euc',
)

In [75]:
# Read the leading lines of the raw books file into a DataFrame.
books = load_data(
    file_name='data/raw/goodreads_books.json.gz',
    head=10000,
)

In [76]:
# Print the top-ranked recommendation next to the reference book, without
# truncating rows or columns.
with pd.option_context(
    'display.max_columns', None,
    'display.max_rows', None,
):
    print(itemitem.loc[[reco[0][0], initial_book]])

           fiction  fantasy   romance   classic  mystery    kindle    sci-fi  \
book_id                                                                        
6015186   0.042470      0.0  0.051602  0.040296      0.0  0.052761  0.106827   
27421523  0.043652      0.0  0.000000  0.053861      0.0  0.063600  0.109951   

          literature  horror  contemporary  adventure  historical     adult  \
book_id                                                                       
6015186     0.041310     0.0           0.0   0.272358         0.0  0.061893   
27421523    0.054799     0.0           0.0   0.288547         0.0  0.057264   

          paranormal  thriller  history  dystopia     audio  children  school  \
book_id                                                                         
6015186     0.102334       0.0      0.0       0.0  0.065372  0.040441     0.0   
27421523    0.109012       0.0      0.0       0.0  0.043182  0.049988     0.0   

          philosophy     novel     yo

In [None]:
# Index the book table by book_id so rows can be looked up by id.
# Guarded so that re-running the cell is a no-op rather than a KeyError once
# 'book_id' has already been moved into the index.
if 'book_id' in books.columns:
    books = books.set_index('book_id', drop=True)

In [None]:
# Select the reference book followed by each recommended book, then export.
# Ids are cast to str before the lookup — presumably the index of `books`
# holds string ids; verify against the raw data.
result = books.loc[[str(initial_book), *(str(pair[0]) for pair in reco)]]
result.to_excel('data/excel/firstresults.xlsx')