# Content Based Filtering

In [1]:
import pandas as pd
import numpy as np

import json
import gzip

import seaborn as sns
import matplotlib.pyplot as plt
import scipy as sp

sns.set_theme(style="darkgrid")

In [2]:
def load_data(file_name, head=500):
    """Load a gzip-compressed JSON-lines file into a DataFrame.

    Parameters
    ----------
    file_name : str or path-like
        Path to a gzip file holding one JSON document per line.
    head : int or None, default 500
        Maximum number of records to read. ``None`` reads the whole file.

    Returns
    -------
    pandas.DataFrame
        One row per parsed JSON record, at most ``head`` rows.
    """
    records = []
    with gzip.open(file_name) as fin:
        for line in fin:
            # Check the limit *before* parsing so exactly `head` records are
            # kept (checking after the append yields head + 1 records).
            if head is not None and len(records) >= head:
                break
            # Blank lines (e.g. a trailing newline) are not JSON documents.
            if not line.strip():
                continue
            records.append(json.loads(line))
    return pd.DataFrame(records)

In [3]:
# Read the per-book feature table and key it by book_id for label-based lookups.
itemitem = (
    pd.read_csv('data/sample/item_item_filter.csv')
    .set_index('book_id', drop=True)
)
itemitem.head()

Unnamed: 0_level_0,fiction,fantasy,romance,classic,mystery,kindle,sci-fi,literature,horror,contemporary,...,children,school,philosophy,novel,young,author,publisher,year,format,is_series
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5333265,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,604031,St. Martin's Press,0.40678,Paperback,0
1333909,0.068966,0.0,0.0,0.0,0.0,0.103448,0.0,0.172414,0.0,0.0,...,0.0,0.0,0.0,0.137931,0.0,626222,Simon & Schuster Audio,0.694915,Audiobook,0
7327624,0.166667,0.0,0.0,0.166667,0.0,0.0,0.166667,0.166667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10333,"Nelson Doubleday, Inc.",0.457627,Hardcover,1
6066819,0.18,0.0,0.23,0.0,0.0,0.18,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.18,0.0,9212,Atria Books,0.830508,Hardcover,0
287140,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,149918,,0.0,Undefined,0


In [4]:
def encodeLabels(book_id, df, columns=['author', 'publisher', 'format']):
    """Encode categorical columns relative to one reference book.

    For each column in ``columns`` the value becomes 1.0 where the row shares
    the reference book's value and 0.0 otherwise.

    Parameters
    ----------
    book_id : hashable
        Index label of the reference book in ``df``.
    df : pandas.DataFrame
        Feature table indexed by book id. It is NOT modified.
    columns : sequence of str
        Columns to encode (the default list is only read, never mutated).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``columns`` replaced by 0.0/1.0 match flags.

    Notes
    -----
    A missing (NaN) reference value matches no row, because NaN != NaN.
    """
    # Encode into a copy: overwriting the caller's columns would destroy the
    # raw labels, so a second call for another book would compare 0/1 floats.
    encoded = df.copy()
    for col in columns:
        reference_value = df.loc[book_id, col]
        encoded[col] = (df[col] == reference_value).astype(float)
    return encoded

In [5]:
def getScore(array1, array2):
    """Return the cosine distance between two numeric feature vectors.

    Parameters
    ----------
    array1, array2 : array-like of numbers
        1-D vectors of equal length.

    Returns
    -------
    float
        ``1 - cos(angle)``: 0 for vectors pointing the same way, 1 for
        orthogonal vectors. NaN when either vector is all zeros, because the
        angle is then undefined.
    """
    # Import the submodule explicitly: a bare `import scipy` does not
    # guarantee that `scipy.spatial` is loaded on every SciPy version.
    from scipy.spatial import distance

    u = np.asarray(array1, dtype=float)
    v = np.asarray(array2, dtype=float)

    # A zero vector makes the formula divide 0 by 0; return NaN directly
    # instead of letting a RuntimeWarning surface from inside SciPy.
    if not u.any() or not v.any():
        return float('nan')

    return distance.cosine(u, v)

In [28]:
def getBestRecommendation(book_id, data):
    """Find the book whose feature vector is closest to ``book_id``'s.

    Parameters
    ----------
    book_id : hashable
        Index label of the query book in ``data``.
    data : pandas.DataFrame
        Feature table indexed by book id. It is NOT modified.

    Returns
    -------
    tuple
        ``(best_book_id, best_score)`` where the score is the value returned
        by ``getScore`` (lower means more similar).

    Raises
    ------
    ValueError
        If no other book yields a comparable (non-NaN) score.
    """
    # Encode a copy so the caller's categorical columns are never overwritten,
    # whatever encodeLabels does with the frame it receives.
    encoded = encodeLabels(book_id, data.copy())

    refer = encoded.loc[book_id].tolist()

    # Start from +inf rather than 1: cosine distance can reach 2, and a fixed
    # threshold of 1 left `bestbook` unbound when no candidate beat it.
    bestbook = None
    bestscore = float('inf')

    for index, item in zip(encoded.index.tolist(), encoded.to_numpy()):
        if index == book_id:
            continue  # never recommend the query book itself
        score = getScore(refer, item)
        # NaN scores compare False here and are therefore skipped.
        if score < bestscore:
            bestscore = score
            bestbook = index

    if bestbook is None:
        raise ValueError(
            f"No comparable book found for book_id {book_id!r}"
        )

    return bestbook, bestscore


In [29]:
# Look up the closest book to book 7327624 in the item feature table.
reco = getBestRecommendation(7327624, itemitem)
# The result is a (book_id, score) tuple.
print(reco)

  dist = 1.0 - uv / np.sqrt(uu * vv)


(6665450, 0.28815205769052554)


In [8]:
# Load records from the raw Goodreads books dump, capped via head=10000.
books = load_data('data/raw/goodreads_books.json.gz', head=10000)

In [34]:
# Temporarily lift pandas' row/column display limits so no column is elided.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    # Rows for the recommended book (6665450) and the query book (7327624).
    print(itemitem.loc[[6665450,7327624]])

          fiction  fantasy   romance   classic   mystery    kindle    sci-fi  \
book_id                                                                        
6665450  0.081871      0.0  0.099415  0.076023  0.070175  0.076023  0.000000   
7327624  0.166667      0.0  0.000000  0.166667  0.000000  0.000000  0.166667   

         literature  horror  contemporary  adventure  historical     adult  \
book_id                                                                      
6665450    0.070175     0.0           0.0        0.0    0.081871  0.099415   
7327624    0.166667     0.0           0.0        0.0    0.000000  0.000000   

         paranormal  thriller   history  dystopia     audio  children  school  \
book_id                                                                         
6665450    0.000000   0.05848  0.081871       0.0  0.064327       0.0     0.0   
7327624    0.166667   0.00000  0.000000       0.0  0.166667       0.0     0.0   

         philosophy     novel  young  aut

In [35]:
# Write the rows for books 6665450 and 7327624 to an Excel workbook.
itemitem.loc[[6665450,7327624]].to_excel('data/excel/firstresults.xlsx')