In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
# Checkout records: read below into patron_df (later cells use its
# 'Item_ID' and 'Patron_ID' columns)
patron_url = 'https://github.com/BenB1116/hrp-machine-learning/blob/master/Data/Clean_Data/Clean_Patron.csv?raw=true'
patron_df = pd.read_csv(filepath_or_buffer=patron_url)

# Library inventory: read below into inventory_df (later cells use its index
# as the item ids and its 'Title', 'Author_First', 'Author_Last' columns)
inventory_url = 'https://github.com/BenB1116/hrp-machine-learning/blob/master/Data/Clean_Data/Clean_Inventory.csv?raw=true'
inventory_df = pd.read_csv(filepath_or_buffer=inventory_url)

In [3]:
# De-duplication is currently switched off; the disabled step is kept for reference:
#patron_df = patron_df.drop('Unnamed: 0', axis = 1).drop_duplicates()

# Number of rows in the checkout table
patron_df.shape[0]

74729

In [4]:
# Item ids are the index labels of the inventory table
item_ids = inventory_df.index
# Not referenced by any of the visible cells; kept defined in case another cell uses it
patrons = []

# Bucket the checkout table by item in a single pass instead of re-filtering
# the whole patron table once per inventory item.
_checkouts_by_item = {
    item_id: patron_ids.values
    for item_id, patron_ids in patron_df.groupby('Item_ID')['Patron_ID']
}

# Items with no checkout rows map to an empty array with the column's dtype,
# which is what filtering the table for that item would produce.
_no_patrons = patron_df['Patron_ID'].values[:0]

# Given an item id, look up the array of patrons that have checked out that item
patrons_per_item = {
    item_id: _checkouts_by_item.get(item_id, _no_patrons)
    for item_id in item_ids
}

In [5]:
# Takes the intersection of the patron arrays of any two items, given their item ids
def intersect(item1, item2):
    """Return the patrons found in the patron arrays of both items.

    Args:
        item1: Key into ``patrons_per_item`` for the first item.
        item2: Key into ``patrons_per_item`` for the second item.

    Returns:
        A NumPy array holding the sorted, unique patron ids that appear in
        both items' arrays.
    """
    first_patrons = patrons_per_item[item1]
    second_patrons = patrons_per_item[item2]
    shared = np.intersect1d(first_patrons, second_patrons)
    return np.array(shared)
   
# Takes the union of the patron arrays of any two items, given their item ids
def union(item1, item2):
    """Return the patrons found in the patron array of either item.

    Uses ``np.union1d`` directly on the stored arrays. Converting them to
    Python lists first made an empty patron array turn into an empty float
    array, which changed the dtype of the result.

    Args:
        item1: Key into ``patrons_per_item`` for the first item.
        item2: Key into ``patrons_per_item`` for the second item.

    Returns:
        A NumPy array holding the sorted, unique patron ids that appear in
        at least one of the two items' arrays.
    """
    return np.union1d(patrons_per_item[item1], patrons_per_item[item2])

# Sorts a dictionary by values in descending order
def sort_dict_values(sim_dict):
    """Return a new dict with the entries of ``sim_dict`` ordered by value, largest first.

    Entries with equal values come out in the reverse of their original
    insertion order, because the ascending (stable) sort is flipped as a whole.

    Args:
        sim_dict: Mapping whose values can be compared with each other.

    Returns:
        A dict with the same key/value pairs in descending value order.
    """
    ascending = sorted(sim_dict.items(), key=lambda pair: pair[1])
    ascending.reverse()
    return dict(ascending)

In [6]:
# Takes the Jaccard Similarity between any two item ids
def jacc_sim(item1, item2):
    """Return the Jaccard similarity of the patron sets of two items.

    The similarity is the size of the intersection of the two patron sets
    divided by the size of their union.

    Args:
        item1: Item id of the first item.
        item2: Item id of the second item.

    Returns:
        The ratio as a float. When neither item has any patrons the union is
        empty and the ratio is undefined; 0.0 is returned instead of raising
        ZeroDivisionError, so that items without checkouts can still be
        compared against the inventory.
    """
    shared_count = len(intersect(item1, item2))
    combined_count = len(union(item1, item2))

    # No patrons on either side: nothing in common to measure
    if combined_count == 0:
        return 0.0

    # Divide the cardinality of the intersection by that of the union
    return shared_count / combined_count

# Calculates the similarity of an item with every other item in the inventory
def get_sims(item_id):
    """Return the Jaccard similarity of ``item_id`` to every id in ``item_ids``.

    Args:
        item_id: Item id to compare against the inventory.

    Returns:
        A dict mapping each id in ``item_ids`` (including ``item_id`` itself,
        when present) to ``jacc_sim(item_id, that_id)``, in ``item_ids`` order.
    """
    return {other_id: jacc_sim(item_id, other_id) for other_id in item_ids}

# Returns the top n items that have the highest Jaccard Similarity to an item
def get_topn(item_id, top_n = 1):
    """Return the ``top_n`` items most similar to ``item_id``.

    Args:
        item_id: Item id to find neighbours for.
        top_n: Number of entries to keep from the front of the ranking.

    Returns:
        A dict mapping item id to similarity, ordered from most to least
        similar, with ``item_id`` itself excluded.

    Raises:
        KeyError: If ``item_id`` is not among the ids scored by ``get_sims``.
    """
    # Score every item, then drop the query item so it cannot recommend itself
    candidates = get_sims(item_id)
    candidates.pop(item_id)

    # Rank by similarity and keep only the leading top_n entries
    ranked = sort_dict_values(candidates)
    return dict(list(ranked.items())[:top_n])

In [7]:
# Returns the entry from the inventory given an item id
def get_item_info(item_id):
    """Return the inventory row for ``item_id``.

    Item ids elsewhere in this notebook are the index labels of
    ``inventory_df`` (``item_ids = inventory_df.index``), so the row is looked
    up by label rather than by position. With the default integer index the
    two lookups give the same row for non-negative ids.

    Args:
        item_id: Index label of the item in ``inventory_df``.

    Returns:
        The matching row of ``inventory_df``.

    Raises:
        KeyError: If ``item_id`` is not a label in the inventory index.
    """
    return inventory_df.loc[item_id]

# Returns a string version of an item containing the title and author name
def get_item_string(item_id):
    """Return a two-line description of an item: its title, then its author.

    The author is written as "last, first". A name part that is missing
    (NaN/None) is left out instead of being printed as the text "nan"; if both
    parts are missing the author is shown as "unknown".

    Args:
        item_id: Item id passed on to ``get_item_info``.

    Returns:
        A string of the form ``'<title>\\n\\tby: <last>, <first>'``.
    """
    item_ser = get_item_info(item_id)
    item_title = item_ser.loc['Title']

    # Keep only the name parts that are actually present, last name first
    author_parts = [
        str(item_ser.loc[field])
        for field in ('Author_Last', 'Author_First')
        if not pd.isna(item_ser.loc[field])
    ]
    author = ', '.join(author_parts) if author_parts else 'unknown'

    return str(item_title) + '\n\tby: ' + author

# Returns the number of spaces needed to space a string correctly
def num_space(i, width=5):
    """Return how many spaces to put in front of the number ``i``.

    The padding shrinks by one for every digit after the first, so the last
    digit of each number lands in the same column. Digits are counted on the
    integer's text form; a floating-point logarithm rounds just below the true
    value at some powers of ten (for example 1000) and is undefined for 0.

    Args:
        i: The number that will be printed.
        width: Padding used for a single-digit number.

    Returns:
        ``width`` minus the number of digits beyond the first. The value can
        be negative for very long numbers; multiplying a string by it then
        gives an empty string.
    """
    # Number of digits beyond the first
    extra_digits = len(str(abs(int(i)))) - 1
    return width - extra_digits

# Returns a string of the top n items closest to an item
def get_topn_string(item_id, top_n):
    """Build a printable, numbered list of the ``top_n`` items closest to ``item_id``.

    Args:
        item_id: Item id to find neighbours for.
        top_n: Number of neighbours requested; also quoted in the heading.

    Returns:
        A heading naming the item, followed by one numbered entry per
        neighbour returned by ``get_topn``, in ranking order.
    """
    # Rank the neighbours first, then describe the query item in the heading
    neighbours = get_topn(item_id, top_n)
    heading = ''.join([
        'The top ', str(top_n), ' closest items to: \n',
        '\t', get_item_string(item_id),
        '\nAre:',
    ])

    # One left-padded, numbered entry per neighbour, counting from 1
    entries = [
        '\n' + ' ' * num_space(rank) + str(rank) + '. ' + get_item_string(neighbour) + '\n'
        for rank, neighbour in enumerate(neighbours, start=1)
    ]

    return heading + ''.join(entries)

In [10]:
# Ask for the item id to look up and the number of similar items wanted
item_id = int(input('Please enter an item id: '))
topn = int(input('Please enter the number of like items you would like: '))

# Show the numbered list of closest items
print(get_topn_string(item_id=item_id, top_n=topn))

The top 5 closest items to: 
	the lorax
	by: seuss, nan
Are:
     1. lookalikes christmas
	by: steiner,  joan

     2. happy halloween biscuit
	by: capucilli,  alyssa satin

     3. watching you
	by: jewell,  lisa

     4. puppies and kittens
	by: corse,  nicole

     5. molly pitcher
	by: gleiter,  jan



Some item IDs worth trying as input are 3, 45, 1056, and 6724.