In [1]:
import pandas as pd
import numpy as np
import random
from tkinter import Tk
from tkinter.filedialog import askopenfilename
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from ObjectToObjectRecommender import ObjectToObjectRecommender as OTO

In [2]:
Tk().withdraw()

# def getPatronPath():
#     filename = askopenfilename()
#     return(filename)

# def getInventoryPath():
#     filename = askopenfilename()
#     return(filename)

# Remove to select folder or paste path below.
# patron_path = getPatronPath()
# inventory_pat = getInventoryPath()

patron_path = 'C:/Users/Ben/Desktop/HRP/Data/Clean_Data/Clean_Patron.csv'
inventory_path = 'C:/Users/Ben/Desktop/HRP/Data/Clean_Data/Clean_Inventory.csv'

patron_df = pd.read_csv(patron_path)
inv_df = pd.read_csv(inventory_path)

# Drop random columns
patron_df.drop(columns = 'Unnamed: 0', inplace = True)
inv_df.drop(columns = 'Unnamed: 0', inplace = True)

In [3]:
# Create Instances of an item to item and user to user comparators
iti = OTO(patron_df, 'Patron_ID', 'Item_ID')
ptp = OTO(patron_df, 'Item_ID', 'Patron_ID')

In [4]:
def get_patron_pairs(patron_id):
    '''
    Creates a list of positive patron and item pairs and an equally as large list of negative pairs
    '''
    pair_list = []

    # Get a list of pairs of patron, item that the patron has read.
    has_list = iti.objects_by_actor[patron_id]
    for item in has_list:
        pair_list.append([patron_id, item, 1])

    has_len = len(pair_list)

    # Get a list of pairs of patron, item that the patron hasn't read.
    hasnt_list = random.sample([x for x in iti.object_ids if x not in has_list], has_len)

    for item in hasnt_list:
        pair_list.append([patron_id, item, 0])

    return pair_list

In [5]:
def get_feat_vec(pair):
    '''
    Creates a list of selected features for any patron and item pair.
    '''
    feat_vec = []

    # User-Item pair (u, i)
    patron = pair[0]
    item = pair[1]

    # Popularity of i
    feat_vec.append(len(patron_df[patron_df['Item_ID'] == item]))

    # The similarity between i and the most similar book the user u has read
    max = 0
    items_read = iti.get_objects(patron)

    for item_id in items_read:
        sim_score = iti.jaccard(item, item_id)
        if sim_score > max:
            max = sim_score

    feat_vec.append(max)

    # The similarity between the user u and the most similar user who has read i
    max = 0
    patron_read = ptp.get_objects(item)

    for patron_id in patron_read:
        sim_score = ptp.jaccard(patron, patron_id)
        if sim_score > max:
            max = sim_score

    feat_vec.append(max)

    return feat_vec

In [6]:
%%time
# Create a user and item pairs for all users in the test DataFrame
pairs_lists = [get_patron_pairs(patron_id) for patron_id in set(iti.actor_ids)]

# unpack user and item pair list into the pair list and label list
pairs  = []
label_list = []

for pair_list in pairs_lists:
    for pair in pair_list:
        pairs.append(pair[:2])
        label_list.append(pair[2])

CPU times: total: 3.16 s
Wall time: 3.15 s


In [7]:
%%time
# Generate the feature vector for each pair
pair_feats = [get_feat_vec(pair) for pair in pairs]

CPU times: total: 40.7 s
Wall time: 40.7 s


In [8]:
# Split the feature vectors and labels into 75% for training an 25% for testing.
X_train, X_test, y_train, y_test = train_test_split(pair_feats, label_list, test_size=0.25, random_state=16)

In [9]:
# Initailize the model and fit to the training data.
logreg = LogisticRegression()

logreg.fit(X_train, y_train)

LogisticRegression()

In [10]:
# Make predictions on the test set.
y_pred = logreg.predict(X_test)

In [11]:
len(y_pred)

17710

In [12]:
len(inv_df)

11438

In [13]:
# from sklearn import metrics

# cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
# print(cnf_matrix)

# metrics.precision_score(y_test, y_pred)

# from sklearn.metrics import classification_report

# target_names = ['shouldn\'t checkout', 'should checkout']
# print(classification_report(y_test, y_pred, target_names=target_names))

In [14]:
inv_df.head()

Unnamed: 0,Title,Author_First,Author_Last,Num_Checkouts
0,sonic the hedgehog,ian,flynn,235
1,babysitters little sister,katy,farina,163
2,if you give a pig a pancake,laura joffe,numeroff,119
3,beastars,paru,itagaki,114
4,if you give a dog a donut,laura joffe,numeroff,112


In [15]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

import re

In [16]:
vectorizer = TfidfVectorizer()

tfidf = vectorizer.fit_transform(inv_df['Title'])

In [17]:
def search(query,vectorizer):
    processed = re.sub("[^a-zA-Z0-9 ]", "", query.lower())
    query_vec = vectorizer.transform([query])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -10)[-10:]
    results = inv_df.iloc[indices]
    results = results.sort_values("Num_Checkouts", ascending=False)
    
    return results.head(10)

In [18]:
search('wuthering heights', vectorizer)

Unnamed: 0,Title,Author_First,Author_Last,Num_Checkouts
48,dog man mothering heights,dav,pilkey,51
3814,the boy of the threeyear nap,dianne,snyder,6
3815,when turtle grew feathers a folktale from the...,tim,tingle,6
3817,celebrate your feeling the positive mindset p...,lauren,rivers,6
3816,the midwife murders,james,patterson,6
3810,bartholomew and the oobleck,dr,seuss,6
3811,l is for lone star a texas alphabet,carol,crane,6
3813,mary howitts the spider and the fly,mary botham,howitt,6
3812,and the mountains echoed,khaled,hosseini,6
11437,north or be eaten,andrew,peterson,1


In [19]:
harry_potter_user = [101, 155, 674]
magic_school_bus_user = [1772, 2242, 3952, 7935]
classics_user = [1448, 4240, 10433, 2510]

In [20]:
def get_reccs(item_list):
    id = iti.add_actor(item_list)
    pairs = get_patron_pairs(id)
    feats = [get_feat_vec(pair) for pair in pairs]
    
    return feats

In [21]:
get_reccs(harry_potter_user)

KeyError: 13742