In [147]:
import pandas as pd
import numpy as np
import math
import random

In [148]:
#Github URLs
patron_url = 'https://github.com/BenB1116/hrp-machine-learning/blob/master/Data/Clean_Data/Clean_Patron.csv?raw=true'
inventory_url = 'https://github.com/BenB1116/hrp-machine-learning/blob/master/Data/Clean_Data/Clean_Inventory.csv?raw=true'

#Import and convert csv files
raw_patron_df = pd.read_csv(patron_url)
raw_inventory_df = pd.read_csv(inventory_url)

In [149]:
# Create copies of dfs
patron_df = raw_patron_df.copy()
inventory_df = raw_inventory_df.copy()

# Drop random columns
patron_df.drop(columns = 'Unnamed: 0', inplace = True)
inventory_df.drop(columns = 'Unnamed: 0', inplace = True)

In [150]:
# Return a list of canidate items for comparision
def get_can_list(item_id):
    users = set(patron_df[patron_df['Item_ID'] == item_id]['Patron_ID'].values)

    common_items = []
    for user in users:
        common_items += list(patron_df[patron_df['Patron_ID'] == user]['Item_ID'].values)
    return list(set(common_items))   

# Using the canidate list return a dictoinary of all items in it paired with all users who have purchased that item 
def get_user_per_item(item_id):
    can_list = get_can_list(item_id)

    return {item: set(patron_df[patron_df['Item_ID'] == item]['Patron_ID'].values) for item in can_list}


In [151]:
# Similarity Functions

#Naive Similarity
def naive_similarity(item1_list, item2_list):
    return len(set.intersection(item1_list, item2_list))

# Jaccard Similarity
def jaccard_similarity(item1_list, item2_list):
    return len(set.intersection(item1_list, item2_list)) / len(set.union(item1_list, item2_list))

# Cosine Similarity
def cosine_similarity(item1_list, item2_list):
    num = len(set.intersection(item1_list, item2_list))
    dem = math.sqrt(len(item1_list) * len(item2_list))
    
    return  num / dem

In [152]:
# Calculate the similarity between any item and every item in its canidate list
def sim_scores(target_item_id, func, dic):
    scores = {item: func(dic[target_item_id], dic[item]) for item in list(dic.keys())}
    sorted_scores = {item: sim for item, sim in sorted(scores.items(), key = lambda item: item[1], reverse = True)}

    return list(sorted_scores.keys())

In [153]:
# num = 5
# top_n = sim_scores(3, jaccard_similarity, get_user_per_item(3))[:num + 1]

# for i, item in enumerate(top_n):
#     if i == 0:
#         print('Chosen book:')
#     elif i== 1:
#         print(f"The top {num} closest results are:")
#     print(f"Title: {inventory_df.iloc[item]['Title']}\nAuthor: {inventory_df.iloc[item]['Author_Last']}, {inventory_df.iloc[item]['Author_First']}\n")

In [154]:
users = set(patron_df['Patron_ID'].values)
items = set(inventory_df.index)

# Create a users by item list and an item by users list
users_by_item = {item: set(patron_df[patron_df['Item_ID'] == item]['Patron_ID'].values) for item in items}
items_by_user = {user: list(set(patron_df[patron_df['Patron_ID'] == user]['Item_ID'].values)) for user in users}

In [155]:

sample_items_by_user = {}

# Select two elements within each patrons items list and one not in
for user_id in users:
    if len(items_by_user[user_id]) > 1:
        ran_sample = random.sample(items_by_user[user_id], k=2)
        can_list = get_can_list(ran_sample[0])
        sample_items_by_user[user_id] = ran_sample + random.sample(list(set(can_list) - users_by_item[ran_sample[0]]), k = 1)

In [156]:
# Return True if the similarity between i and j is greater than i and k
def is_greater(item_list, fuc):
    sim1 = fuc(users_by_item[item_list[0]], users_by_item[item_list[1]])
    sim2 = fuc(users_by_item[item_list[0]], users_by_item[item_list[2]])
    if sim1 >= sim2:
        return True
    return False

#Returns the generated score
def get_score(fuc):
    count = 0

    # Count how many times sim(i, j) is greater than sim(i, k)
    for user in sample_items_by_user.keys():
        if is_greater(sample_items_by_user[user], fuc):
            count += 1
    
    return count / len(sample_items_by_user)

In [157]:
functions = [naive_similarity, jaccard_similarity, cosine_similarity]

for fuc in functions:
    print(f'{fuc.__name__}: {"{:.2f}".format(round(get_score(fuc) * 100, 2))} %')

naive_similarity: 95.08 %
jaccard_similarity: 63.57 %
cosine_similarity: 63.46 %
