In [204]:
import pandas as pd
import numpy as np
import math

In [205]:
# Raw-download links (note the ?raw=true suffix) for the cleaned CSVs on GitHub
patron_url = 'https://github.com/BenB1116/hrp-machine-learning/blob/master/Data/Clean_Data/Clean_Patron.csv?raw=true'
inventory_url = 'https://github.com/BenB1116/hrp-machine-learning/blob/master/Data/Clean_Data/Clean_Inventory.csv?raw=true'

# Fetch and parse both CSV files, patron data first and inventory second
raw_patron_df, raw_inventory_df = (
    pd.read_csv(csv_url) for csv_url in (patron_url, inventory_url)
)

In [206]:
# Build the working frames without touching the raw downloads: DataFrame.drop
# returns a new frame, so the raw frames keep their 'Unnamed: 0' column
# (presumably an index column written out when the CSVs were saved - TODO confirm)
patron_df = raw_patron_df.drop(columns = 'Unnamed: 0')
inventory_df = raw_inventory_df.drop(columns = 'Unnamed: 0')

In [207]:
def get_can_list(item_id):
    """Return the candidate items that share at least one patron with ``item_id``.

    Reads the module-level ``patron_df`` and uses its ``Patron_ID`` and
    ``Item_ID`` columns.

    Args:
        item_id: Value to look up in the ``Item_ID`` column.

    Returns:
        A list of the distinct ``Item_ID`` values recorded for any patron who
        has a row for ``item_id``. The list includes ``item_id`` itself
        whenever it has at least one patron, is empty when it has none, and
        comes from a set, so its order carries no meaning.
    """
    # Every patron with at least one row for the requested item
    patrons = set(patron_df.loc[patron_df['Item_ID'] == item_id, 'Patron_ID'].values)

    # Pool the items of each of those patrons; the set removes duplicates
    candidates = set()
    for patron in patrons:
        candidates.update(patron_df.loc[patron_df['Patron_ID'] == patron, 'Item_ID'].values)
    return list(candidates)

def get_user_per_item(item_id):
    """Map every candidate item for ``item_id`` to the set of its patrons.

    The candidates come from ``get_can_list(item_id)``; patrons are read from
    the module-level ``patron_df`` (``Item_ID`` and ``Patron_ID`` columns).

    Args:
        item_id: Item whose candidate list should be expanded.

    Returns:
        A dict keyed by candidate item id, in the order ``get_can_list``
        returned them, whose values are sets of ``Patron_ID`` values. The
        dict is empty when ``item_id`` has no patrons.
    """
    patrons_by_candidate = {}
    for candidate in get_can_list(item_id):
        patron_ids = patron_df.loc[patron_df['Item_ID'] == candidate, 'Patron_ID']
        patrons_by_candidate[candidate] = set(patron_ids.values)
    return patrons_by_candidate


In [208]:
# Set Operations

# Union
# Takes the union of the patron sets of any two items, given their item ids
def union(item1, item2):
    """Return the patrons recorded for either of two items.

    Both arguments are item ids that are looked up in the module-level
    mapping ``patrons_by_item``. That mapping is not created in the notebook
    cells visible here and must exist before this function is called;
    presumably it is a dict like the one ``get_user_per_item`` returns
    (TODO confirm).

    Args:
        item1: Item id of the first item.
        item2: Item id of the second item.

    Returns:
        A new set holding every patron found in either item's patron set.

    Raises:
        KeyError: If either item id is missing from ``patrons_by_item``.
    """
    # The first parameter was previously named ``list1`` while the body read
    # ``item1``, so any call failed with NameError (or silently picked up an
    # unrelated global). It is now named consistently with intersect().
    return set.union(patrons_by_item[item1], patrons_by_item[item2])

# Intersection
# Takes the intersection of the patron sets of any two items, given their item ids
def intersect(item1, item2):
    """Return the patrons recorded for both of two items.

    Both arguments are item ids that are looked up in the module-level
    mapping ``patrons_by_item``, which must be defined before the call.

    Args:
        item1: Item id of the first item.
        item2: Item id of the second item.

    Returns:
        A new set with the patrons present in both items' patron sets.

    Raises:
        KeyError: If either item id is missing from ``patrons_by_item``.
    """
    first_patrons = patrons_by_item[item1]
    second_patrons = patrons_by_item[item2]
    return set.intersection(first_patrons, second_patrons)

In [209]:
# Similarity Functions

#Naive Similarity
def naive_similarity(item1_list, item2_list):
    """Score two items by the number of patrons they have in common.

    Args:
        item1_list: Patron ids of the first item (a set, or any iterable of
            hashable values).
        item2_list: Patron ids of the second item (same kinds accepted).

    Returns:
        The count of distinct patron ids present in both inputs, as an int.
    """
    # Intersect the patron collections themselves. The previous version
    # handed them to intersect(), which expects item ids and uses them as
    # dictionary keys, so it failed when called with sets (as sim_scores does).
    return len(set(item1_list) & set(item2_list))

# Jaccard Similarity
def jaccard_similarity(item1_list, item2_list):
    """Compute the Jaccard similarity |A & B| / |A | B| of two patron collections.

    Args:
        item1_list: Patron ids of the first item (a set, or any iterable of
            hashable values).
        item2_list: Patron ids of the second item (same kinds accepted).

    Returns:
        A float between 0 and 1. When both inputs are empty the ratio is
        undefined; 0.0 is returned instead of raising ZeroDivisionError.
    """
    first, second = set(item1_list), set(item2_list)
    combined = first | second

    # No patrons on either side: no evidence of similarity, and no division by zero
    if not combined:
        return 0.0
    return len(first & second) / len(combined)

# Cosine Similarity
def cosine_similarity(item1_list, item2_list):
    """Compute the set-based cosine similarity |A & B| / sqrt(|A| * |B|).

    Args:
        item1_list: Patron ids of the first item (a set, or any iterable of
            hashable values).
        item2_list: Patron ids of the second item (same kinds accepted).

    Returns:
        A float between 0 and 1. When either input is empty the ratio is
        undefined; 0.0 is returned instead of raising ZeroDivisionError.
    """
    first, second = set(item1_list), set(item2_list)

    # Intersect the patron collections directly. The previous version passed
    # them to intersect(), which expects item ids and uses them as dictionary
    # keys, so it failed when called with sets (as sim_scores does).
    num = len(first & second)
    dem = math.sqrt(len(first) * len(second))

    # An empty side makes the denominator zero; treat that as no similarity
    if dem == 0:
        return 0.0
    return num / dem

In [210]:
def sim_scores(target_item_id, func, dic):
    """Rank every item in ``dic`` by its similarity to ``target_item_id``.

    Args:
        target_item_id: Key of ``dic`` identifying the reference item.
        func: Callable taking two values of ``dic`` (reference first) and
            returning a comparable score.
        dic: Mapping from item id to the value ``func`` compares.

    Returns:
        A list of all keys of ``dic``, highest score first. Keys with equal
        scores keep the relative order they have in ``dic``. An empty ``dic``
        yields an empty list.

    Raises:
        KeyError: If ``dic`` is non-empty and lacks ``target_item_id``.
    """
    def score_against_target(item):
        return func(dic[target_item_id], dic[item])

    # sorted() is stable, including with reverse=True, so ties stay in dic order
    return sorted(dic, key = score_against_target, reverse = True)

In [236]:
# Number of recommendations to show and the item to recommend for
num = 5
target_item = 3

ranked = sim_scores(target_item, jaccard_similarity, get_user_per_item(target_item))

# Pin the chosen item to the first slot explicitly. Relying on it being ranked
# first is unsafe: another item with an identical patron set also scores 1.0
# and can sort ahead of it, which would mislabel the "Chosen book".
top_n = [target_item] + [item for item in ranked if item != target_item][:num]

for i, item in enumerate(top_n):
    if i == 0:
        print('Chosen book:')
    elif i == 1:
        print(f"The top {num} closest results are:")

    # NOTE(review): iloc is positional, so this assumes an item id equals its
    # row position in inventory_df - confirm against the inventory data.
    row = inventory_df.iloc[item]
    print(f"Title: {row['Title']}\nAuthor: {row['Author_Last']}, {row['Author_First']}\n")

Chosen book:
Title: beastars
Author: itagaki,  paru

The top 5 closest results are:
Title: tokyo ghoul  re
Author: ishida,  sui

Title: tokyo ghoulre
Author: ishida,  sui

Title: assassination classroom vol 3
Author: matsui,  yusei

Title: assassination classroom vol 2
Author: matsui,  yusei

Title: tokyo ghoul re
Author: ishida,  sui

