# Memory Management Comparision

The purpose of this program is to measure the similarity of the various memory management recommendation functions.

### Prelimary Work

In [1]:
#Import packages
import pandas as pd
import numpy as np
import math
import collections
import random

In [2]:
#Github URLs
patron_url = 'https://github.com/BenB1116/hrp-machine-learning/blob/master/Data/Clean_Data/Clean_Patron.csv?raw=true'
inventory_url = 'https://github.com/BenB1116/hrp-machine-learning/blob/master/Data/Clean_Data/Clean_Inventory.csv?raw=true'

#Import and convert csv files
raw_patron_df = pd.read_csv(patron_url)
raw_inventory_df = pd.read_csv(inventory_url)

In [3]:
#Create a copy of DataFrames
patron_df = raw_patron_df.copy()
inventory_df = raw_inventory_df.copy()

print(f'The length of patron_df is: {len(patron_df)}')
print(f'The length of inventory_df is: {len(inventory_df)}')

The length of patron_df is: 74729
The length of inventory_df is: 11438


In [4]:
# Drop random columns
patron_df.drop(columns = 'Unnamed: 0', inplace = True)
inventory_df.drop(columns = 'Unnamed: 0', inplace = True)

print(patron_df.head(3))
print(inventory_df.head(3))

   Patron_ID  Item_ID
0      10010     4240
1      13341     4240
2      13341     4240
                         Title  Author_First Author_Last  Num_Checkouts
0           sonic the hedgehog           ian       flynn            235
1    babysitters little sister          katy      farina            163
2  if you give a pig a pancake   laura joffe    numeroff            119


### Data Manipulation

In [5]:
# Create a patrons by items dictionary
item_ids = inventory_df.index
patrons_by_item = {item_id: set(patron_df[patron_df['Item_ID'] == item_id]['Patron_ID'].values) for item_id in item_ids}

len(patrons_by_item)

11438

In [6]:
# Set Operations

# Union
#Takes the union of any two patron arrays given the item ids
def union(item1, item2):
    return set.union(patrons_by_item[item1], patrons_by_item[item2])

# Intersection
#Takes the intersection of any two patron arrays give the item id
def intersect(item1, item2):
    return set.intersection(patrons_by_item[item1], patrons_by_item[item2])

In [7]:
# Similarity Functions

#Naive Similarity
def naive_similarity(item1, item2):
    return len(intersect(item1, item2))

# Jaccard Similarity
def jaccard_similarity(item1, item2):
    return len(intersect(item1, item2)) / len(union(item1, item2))

# Cosine Similarity
def cosine_similarity(item1, item2):
    num = len(intersect(item1, item2))
    dem = math.sqrt(len(patrons_by_item[item1]) * len(patrons_by_item[item2]))
    
    return  num / dem

In [8]:
# Most Similar Items List Generation

# Calculaltes similarity between an item and all items
def get_sims(item, fun):
    return {item_id : fun(item, item_id) for item_id in item_ids}

# Returns the top n similar items to item using any function
def get_top_n(item, n, fun):
    all_sims = get_sims(item, fun)
    #Sort the dictionary by values and creates a list of keys
    sorted_sims = list({item_id: sim for item_id, sim in sorted(all_sims.items(), key = lambda item: item[1], reverse = True)}.keys())

    return set(sorted_sims[1:n + 1])

In [9]:
# Sim to Sim Comparision

# Returns the how similar any two similarity functions are
def inv_sim_to_sim(item, n, fun1, fun2):
    fun1_top = get_top_n(item, n, fun1)
    fun2_top = get_top_n(item, n, fun2)

    num = len(set.intersection(fun1_top, fun2_top))

    return num / n

# Randomly samples a list of item ids to do a sim to sim comparision and takes the average
def samp_avg_sim_to_sim(n, approx, fun1, fun2):
    sum = 0
    ran_ids = random.sample(range(0, len(item_ids)), approx)

    for item in ran_ids:
        sum += inv_sim_to_sim(item, n, fun1, fun2)

    return sum / approx

# This takes about 3.5 minutes to run
# Takes the avg sim to sim similarity across the entire database
def avg_sim_to_sim(n, fun1, fun2):
    sum = 0

    for item in item_ids:
        sum += inv_sim_to_sim(item, n, fun1, fun2)

    return sum / len(item_ids)

In [15]:
# Avg Number of Valid Comparisions

# Returns the number of valid entries given a sim function and item
# A valid entry is any entry with a similarity score greater than 0
def get_num_valid(item, fun):
    sim_scores = list(get_sims(item, fun).values())
    scores = np.array(sim_scores)

    scores = scores[scores > 0]
    
    return len(scores)

#Takes about 2.5 minutes to run
#Takes the avg number of valid entryies for an entire function
def avg_valid(fun):
    sum = 0

    for item_id in item_ids:
        sum += get_num_valid(item_id, fun)

    return sum / len(item_ids)

3302


### Output

In [11]:
# This take 10 minutes to run
# naive_to_jacc = avg_sim_to_sim(5, naive_similarity, jaccard_similarity)
# jacc_to_cos = avg_sim_to_sim(5, jaccard_similarity, cosine_similarity)
# cos_to_naive = avg_sim_to_sim(5, cosine_similarity, naive_similarity)

# I went ahead and ran the code above
naive_to_jacc = 0.1169610071690865
jacc_to_cos = 0.9331876202133261
cos_to_naive = 0.10526315789473722

# Round to 2 decimals
nj_percent = "{:.2f}".format(round(naive_to_jacc * 100, 2))
jc_percent = "{:.2f}".format(round(jacc_to_cos * 100, 2))
cn_percent = "{:.2f}".format(round(cos_to_naive * 100, 2))

In [12]:
# Only need to run for 1 since all others are the same
# avg_valid(jaccard_similarity)

# I went ahead and ran the code above
avg_num_of_valid = int(416.35093547823044)

In [13]:
print(f'On average the Naive method and Jaccard method share {nj_percent}% elements')
print(f'On average the Jaccard method and Cosine method share {jc_percent}% elements')
print(f'On average the Cosine method and Naive method share {cn_percent}% elements')
print()
print(f'On average the simlarities give {avg_num_of_valid} valid entries')

On average the Naive method and Jaccard method share 11.70% elements
On average the Jaccard method and Cosine method share 93.32% elements
On average the Cosine method and Naive method share 10.53% elements

On average the simlarities give 416 valid entries
