# Case Study Wesfarmers: Instacart Product Recommendation

### Instacart at a glance

Instacart, a grocery ordering and delivery app, aims to make it easy to fill your refrigerator and pantry with your personal favourites and staples when you need them. After selecting products through the Instacart app, personal shoppers review your order and do the in-store shopping and delivery for you.

### Project Aim

The goal of the project is to build a product recommender system model/solution to predict top 10 best products according to the user’s purchase history to improve the shopping experience of customer & impacts the increment in the sales & engagement to Instacart website.

### Methodology 1 TF-IDF Approach:

<img src="../input/abstract.PNG" width="800" height="400">

### Importing libraries

In [37]:
### Imports

from implicit.nearest_neighbours import tfidf_weight
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime
from pathlib import Path
from numpy import bincount, log, sqrt

import scipy.sparse as sparse
import implicit
import pandas as pd
import numpy as np
import pickle
import time
import heapq

In [38]:
### Helper Functions

def sparsity(matrix):
    """
    Given a matrix, returns its sparsity
    """
    total_size = matrix.shape[0] * matrix.shape[1]
    actual_size = matrix.size
    sparsity = (1 - (actual_size / total_size)) * 100
    return(sparsity)


def get_k_popular(k, df_merged_order_products_prior):
    """
    Returns the `k` most popular products based on purchase count in the dataset
    """
    popular_products = list(df_merged_order_products_prior["product_id"].value_counts().head(k).index)
    return popular_products


def make_prior_data():
    """
    Generates the prior dataset including prior_user_products and product_frequency
    """
    # Read prior order csv
    df_order_products_prior = pd.read_csv("../input/order_products__prior.csv")
    current_order_user_df = df_orders.loc[(df_orders.eval_set == "prior")].reset_index()
    current_order_user_df = current_order_user_df[["order_id", "user_id"]]

    assert len(current_order_user_df["order_id"].unique()) == len(df_order_products_prior["order_id"].unique())

    # Group product_id for each order into products
    df_order_products_prior = df_order_products_prior[["order_id", "product_id"]]
    df_product_frequency = df_order_products_prior['product_id'].value_counts()
    df_order_products_prior = df_order_products_prior.groupby("order_id")["product_id"].apply(list).reset_index().rename(columns={"product_id": "products"})
    
    
    assert current_order_user_df.size == df_order_products_prior.size

    df_prior_user_products = pd.merge(current_order_user_df, df_order_products_prior, on="order_id")
    df_prior_user_products = df_prior_user_products[["user_id", "products"]]
    df_prior_user_products = df_prior_user_products.groupby("user_id")["products"].agg(sum).reset_index()

    return df_prior_user_products, df_product_frequency


def make_test_data(test_data_path, df_orders, df_order_products_train):
    """
    Generates the test dataset and saves it to disk at the given path
    """
    start = time.time()
    print("Creating test data ...")

    # Read train csv
    df_order_user_current = df_orders.loc[(df_orders.eval_set == "train")].reset_index()
    df_order_user_current = df_order_user_current[["order_id", "user_id"]]
    
    # Sanity check #1: `current_order_user_df` and `df_order_products_train` should have the same number of 
    # unique order ids
    assert len(df_order_user_current["order_id"].unique()) == len(df_order_products_train["order_id"].unique())

    # Convert train dataframe to a similar format
    df_order_products_test = df_order_products_train[["order_id", "product_id"]]
    df_order_products_test = df_order_products_test.groupby("order_id")["product_id"].apply(list).reset_index().rename(columns={"product_id": "products"})

    # Sanity check #2: `df_order_products_test` and `df_order_user_current` should have the same number of 
    # records before attempting to merge them
    assert df_order_products_test.size == df_order_user_current.size

    # Merge on order id
    df_user_products_test = pd.merge(df_order_user_current, df_order_products_test, on="order_id")
    df_user_products_test = df_user_products_test[["user_id", "products"]]

    # Write to disk
    df_user_products_test.to_csv(test_data_path, index_label=False)
    
    print("Completed in {:.2f}s".format(time.time() - start))


def save_data_to_disk(dataframe, df_name):
    """
    Save the data to disk
    """
    filepath = "../input/df_{}.pkl".format(df_name)
    dataframe.to_pickle(filepath)

# Load

sets

In [39]:
# Order datasets
df_order_products_prior = pd.read_csv("../input/order_products__prior.csv")
df_order_products_train = pd.read_csv("../input/order_products__train.csv")
df_orders = pd.read_csv("../input/orders.csv")

# Products
df_products = pd.read_csv("../input/products.csv")

# Merge prior orders and products
df_merged_order_products_prior = pd.merge(df_order_products_prior, df_products, on="product_id", how="left")

In [40]:
df_products.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [41]:
df_orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [42]:
df_order_products_prior.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [43]:
# Skip this block if you already have the df_user_products.pkl and df_product_frequency.pkl in the disk
# Make prior data
# Running time: 3 min
df_prior_user_products, df_product_frequency = make_prior_data()

# save data to disk, running time : 2 mi
save_data_to_disk(df_prior_user_products, "user_products")
save_data_to_disk(df_product_frequency, "product_frequency")

# understanding make prior data

In [44]:
df_order_products_prior = pd.read_csv("../input/order_products__prior.csv")
current_order_user_df = df_orders.loc[(df_orders.eval_set == "prior")].reset_index()
current_order_user_df = current_order_user_df[["order_id", "user_id"]]

In [45]:
df_order_products_prior

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0
...,...,...,...,...
32434484,3421083,39678,6,1
32434485,3421083,11352,7,0
32434486,3421083,4600,8,0
32434487,3421083,24852,9,1


In [46]:
current_order_user_df

Unnamed: 0,order_id,user_id
0,2539329,1
1,2398795,1
2,473747,1
3,2254736,1
4,431534,1
...,...,...
3214869,2558525,206209
3214870,2266710,206209
3214871,1854736,206209
3214872,626363,206209


In [47]:
df_order_products_prior = df_order_products_prior[["order_id", "product_id"]]

In [48]:
df_order_products_prior

Unnamed: 0,order_id,product_id
0,2,33120
1,2,28985
2,2,9327
3,2,45918
4,2,30035
...,...,...
32434484,3421083,39678
32434485,3421083,11352
32434486,3421083,4600
32434487,3421083,24852


In [49]:
df_product_frequency = df_order_products_prior['product_id'].value_counts()
df_product_frequency

24852    472565
13176    379450
21137    264683
21903    241921
47209    213584
          ...  
14756         1
20264         1
31254         1
13397         1
23624         1
Name: product_id, Length: 49677, dtype: int64

# understanding ends

In [50]:
# Read user_products and product_frequency from the disk
df_prior_user_products = pd.read_pickle("../input/df_user_products.pkl")
df_product_frequency = pd.read_pickle("../input/df_product_frequency.pkl")
df_product_frequency = pd.DataFrame(df_product_frequency).rename(columns={"product_id": "frequency"})

In [51]:
# Make test data
REBUILD_TEST_DATA = False
test_data_path = "../input/user_products__test.csv"
if REBUILD_TEST_DATA or not Path(test_data_path).is_file():
    make_test_data(test_data_path, df_orders, df_order_products_train)
df_user_products_test = pd.read_csv(test_data_path)

In [52]:
df_user_products_test.head()

Unnamed: 0,user_id,products
0,1,"[196, 25133, 38928, 26405, 39657, 10258, 13032..."
1,2,"[22963, 7963, 16589, 32792, 41787, 22825, 1364..."
2,5,"[15349, 19057, 16185, 21413, 20843, 20114, 482..."
3,7,"[12053, 47272, 37999, 13198, 43967, 40852, 176..."
4,8,"[15937, 5539, 10960, 23165, 22247, 4853, 27104..."


In [53]:
df_prior_user_products.head()

Unnamed: 0,user_id,products
0,1,"[196, 14084, 12427, 26088, 26405, 196, 10258, ..."
1,2,"[32792, 47766, 20574, 12000, 48110, 22474, 165..."
2,3,"[9387, 17668, 15143, 16797, 39190, 47766, 2190..."
3,4,"[36606, 7350, 35469, 2707, 42329, 7160, 1200, ..."
4,5,"[15349, 21413, 48775, 28289, 8518, 11777, 3171..."


# Load Product Item Matrix

In [54]:
def get_user_product_prior_df(filepath, df_orders, df_order_products_prior):
    """
    Generates a dataframe of users and their prior products purchases, and writes it to disk at the given path
    """
    start = time.time()
    print("Creating prior user product data frame ...")
    
    # Consider ony "prior" orders and remove all columns except `user_id` from `df_orders`
    df_order_user_prior = df_orders.loc[df_orders.eval_set == "prior"]
    df_order_user_prior = df_order_user_prior[["order_id", "user_id"]]
    
    # Remove all columns except order_id and user_id from df_orders and 
    # merge the above on `order_id` and remove `order_id`
    df_merged = pd.merge(df_order_user_prior, df_order_products_prior[["order_id", "product_id"]], on="order_id")
    df_user_product_prior = df_merged[["user_id", "product_id"]]
    df_user_product_prior = df_user_product_prior.groupby(["user_id", "product_id"]).size().reset_index().rename(columns={0:"quantity"})
    
    # Write to disk
    df_user_product_prior.to_csv(filepath, index_label=False)

    print("Completed in {:.2f}s".format(time.time() - start))


# Build dataframe of users, products and quantity bought using prior datasets
REBUILD_MATRIX_DF = False
matrix_df_path = "../input/user_products__prior.csv"
if REBUILD_MATRIX_DF or not Path(matrix_df_path).is_file():
    get_user_product_prior_df(matrix_df_path, df_orders, df_order_products_prior)
df_user_product_prior = pd.read_csv(matrix_df_path)
df_user_product_prior["user_id"] = df_user_product_prior["user_id"].astype("category")
df_user_product_prior["product_id"] = df_user_product_prior["product_id"].astype("category")

In [55]:
df_user_product_prior.head(30)

Unnamed: 0,user_id,product_id,quantity
0,1,196,10
1,1,10258,9
2,1,10326,1
3,1,12427,10
4,1,13032,3
5,1,13176,2
6,1,14084,1
7,1,17122,1
8,1,25133,8
9,1,26088,2


In [87]:
def build_product_user_matrix(matrix_path, df_user_product_prior):
    """
    Generates a utility matrix representing purchase history of users, and writes it to disk.
    Rows and Columns represent products and users respectively.
    """
    start = time.time()
    print("Creating product user matrix ...")
    
    # Make the dataframe a sparse matrix
    df_user_product_prior["user_id"] = df_user_product_prior["user_id"].astype("category")
    df_user_product_prior["product_id"] = df_user_product_prior["product_id"].astype("category")
    product_user_matrix = sparse.coo_matrix((df_user_product_prior["quantity"],
                                            (df_user_product_prior["product_id"].cat.codes.copy(),
                                             df_user_product_prior["user_id"].cat.codes.copy())))
    
    sparse.save_npz(matrix_path, product_user_matrix)
    
    print("Completed in {:.2f}s".format(time.time() - start))


# Get the `product x user` matrix
REBUILD_MATRIX = False
matrix_path = "../input/product_user_matrix.npz"
if REBUILD_MATRIX or not Path(matrix_path).is_file():
    build_product_user_matrix(matrix_path, df_user_product_prior)
product_user_matrix = sparse.load_npz(matrix_path).tocsr()

In [77]:
df_user_product_prior

Unnamed: 0,user_id,product_id,quantity
0,1,196,10
1,1,10258,9
2,1,10326,1
3,1,12427,10
4,1,13032,3
...,...,...,...
13307948,206209,43961,3
13307949,206209,44325,1
13307950,206209,48370,1
13307951,206209,48697,1


In [93]:
def build_product_user_matrix1(matrix_path, df_user_product_prior):
    """
    Generates a utility matrix representing purchase history of users, and writes it to disk.
    Rows and Columns represent products and users respectively.
    """
    start = time.time()
    print("Creating product user matrix ...")



    # Make the dataframe a sparse matrix
    df_user_product_prior["user_id"] = df_user_product_prior["user_id"].astype("category")
    df_user_product_prior["product_id"] = df_user_product_prior["product_id"].astype("category")
    product_user_matrix = sparse.coo_matrix((df_user_product_prior["user_id"].cat.codes.copy(),
                                            (df_user_product_prior["quantity"],
                                             df_user_product_prior["product_id"].cat.codes.copy())))

    sparse.save_npz(matrix_path, product_user_matrix)

    print("Completed in {:.2f}s".format(time.time() - start))


# Get the `product x user` matrix
REBUILD_MATRIX = False
matrix_path = "../input/product_user_matrix2.npz"
if REBUILD_MATRIX or not Path(matrix_path).is_file():
    build_product_user_matrix1(matrix_path, df_user_product_prior)
product_user_matrix1 = sparse.load_npz(matrix_path).tocsr()

Creating product user matrix ...
Completed in 7.71s


In [84]:
print(product_user_matrix)

  (0, 137)	2
  (0, 708)	1
  (0, 763)	2
  (0, 776)	1
  (0, 824)	1
  (0, 909)	1
  (0, 1051)	2
  (0, 1378)	1
  (0, 1479)	3
  (0, 1493)	3
  (0, 1539)	17
  (0, 1597)	3
  (0, 1646)	5
  (0, 2832)	1
  (0, 2849)	1
  (0, 2856)	1
  (0, 3009)	1
  (0, 3028)	1
  (0, 3394)	1
  (0, 3903)	1
  (0, 4005)	5
  (0, 4121)	1
  (0, 4140)	1
  (0, 4342)	1
  (0, 4552)	1
  :	:
  (49676, 151976)	1
  (49676, 153611)	1
  (49676, 157882)	1
  (49676, 159098)	1
  (49676, 159486)	4
  (49676, 159637)	1
  (49676, 163631)	2
  (49676, 164452)	1
  (49676, 166037)	2
  (49676, 166212)	1
  (49676, 171742)	1
  (49676, 172261)	1
  (49676, 176941)	1
  (49676, 178692)	1
  (49676, 178751)	2
  (49676, 182947)	2
  (49676, 183547)	1
  (49676, 184080)	1
  (49676, 187522)	1
  (49676, 188072)	2
  (49676, 197370)	1
  (49676, 200214)	1
  (49676, 200376)	1
  (49676, 200872)	2
  (49676, 205925)	1


In [95]:
print(product_user_matrix1)

  (1, 0)	44275104
  (1, 1)	7013446
  (1, 2)	4143077
  (1, 3)	12041393
  (1, 4)	336784
  (1, 5)	347357
  (1, 6)	772480
  (1, 7)	5159646
  (1, 8)	4626395
  (1, 9)	88175506
  (1, 10)	3143183
  (1, 11)	8514039
  (1, 12)	324252
  (1, 13)	1771808
  (1, 14)	176434
  (1, 15)	1653657
  (1, 16)	1395719
  (1, 17)	3069025
  (1, 18)	441546
  (1, 19)	559986
  (1, 20)	521063
  (1, 21)	1336858
  (1, 22)	39743060
  (1, 23)	501821
  (1, 24)	45618143
  :	:
  (94, 24950)	75123
  (94, 31975)	74314
  (94, 40563)	75123
  (94, 45181)	84477
  (95, 1159)	123745
  (95, 4917)	103592
  (95, 11780)	36334
  (95, 18922)	140439
  (95, 19656)	76677
  (95, 24848)	178106
  (95, 38681)	99752
  (96, 14943)	141735
  (96, 24848)	69918
  (96, 27839)	99752
  (97, 195)	98084
  (97, 24848)	99706
  (97, 31975)	84477
  (98, 12009)	120896
  (98, 28198)	103592
  (98, 29665)	41355
  (99, 4207)	17996
  (99, 6580)	41355
  (99, 14362)	41355
  (99, 25129)	141735
  (99, 38644)	41355


In [58]:
# User=1 bought product=196 10 times
assert product_user_matrix[195, 0] == 10

In [59]:
sparsity(product_user_matrix)

99.8700882953749

# Term Frequency-Inverse Document Frequency

In [60]:
# Fetch Term Frequency matrix
user_product_matrix = product_user_matrix.T

In [61]:
print(user_product_matrix)

  (137, 0)	2
  (708, 0)	1
  (763, 0)	2
  (776, 0)	1
  (824, 0)	1
  (909, 0)	1
  (1051, 0)	2
  (1378, 0)	1
  (1479, 0)	3
  (1493, 0)	3
  (1539, 0)	17
  (1597, 0)	3
  (1646, 0)	5
  (2832, 0)	1
  (2849, 0)	1
  (2856, 0)	1
  (3009, 0)	1
  (3028, 0)	1
  (3394, 0)	1
  (3903, 0)	1
  (4005, 0)	5
  (4121, 0)	1
  (4140, 0)	1
  (4342, 0)	1
  (4552, 0)	1
  :	:
  (151976, 49676)	1
  (153611, 49676)	1
  (157882, 49676)	1
  (159098, 49676)	1
  (159486, 49676)	4
  (159637, 49676)	1
  (163631, 49676)	2
  (164452, 49676)	1
  (166037, 49676)	2
  (166212, 49676)	1
  (171742, 49676)	1
  (172261, 49676)	1
  (176941, 49676)	1
  (178692, 49676)	1
  (178751, 49676)	2
  (182947, 49676)	2
  (183547, 49676)	1
  (184080, 49676)	1
  (187522, 49676)	1
  (188072, 49676)	2
  (197370, 49676)	1
  (200214, 49676)	1
  (200376, 49676)	1
  (200872, 49676)	2
  (205925, 49676)	1


In [62]:
def tfidf_weight(tf):
    """
    Given a Term Frequency matrix
    Returns a TF-IDF weight matrix
    """
    
    tf_idf = coo_matrix(tf)

    # calculate IDF
    N = float(tf_idf.shape[0])
    idf = log(N / (1 + bincount(tf_idf.col)))

    # apply TF-IDF adjustment
    tf_idf.data = sqrt(tf_idf.data) * idf[tf_idf.col]
    return tf_idf

tf_idf = tfidf_weight(user_product_matrix)

# convert to Compressed Sparse Row format
tf_idf = tf_idf.tocsr()

## Example Recommendation

In [63]:
def generateRecommendations(target_user, cos_vec, K, N):
    """
    Given a target_user (a row), a cosine similarity vector, the number of similar users K, 
          the number of products to be recommended.
    Returns product set by target user and N recommendations
    """
    
    # Select top K similar users
    top_K_similar_users = heapq.nlargest(K+1, range(len(cos_vec)), cos_vec.take)

    # Initialize the result for recommendations
    recommendations = []
    
    # Exclude the user with same purchase history (1.00000) as the target user and implement set-minus
    products_target_user = df_prior_user_products.loc[df_prior_user_products['user_id'] == target_user_id].products

    # Products of Target User
    productset_target_user = set(products_target_user.tolist()[0])

    # Fetch the preliminary recommendations
    for similar_user_id in top_K_similar_users:
        
        products_similar_user = df_prior_user_products.loc[df_prior_user_products['user_id'] == similar_user_id + 1].products

        # Recommend the products bought by the user who firstly differs in the purchase history from A.
        candidate_recommendation = set(products_similar_user.tolist()[0]) - productset_target_user

        # If similar_user_id equals to target_user_id or the candidate_recommendation is empty,
        # skip current user
        if similar_user_id == target_user_id or not candidate_recommendation: continue

        # One candidate_recommendation found, and extend it to the result
        recommendations.extend(candidate_recommendation)

        # If length of recommendations exceed N, break
        # Needed because this will ensure the recommentations are the products bought by most similar users
        if len(recommendations) > N: break
        
    # Pick the top N popularity (overall sales) to recommend
    h = []
    for rec in recommendations:
        heapq.heappush(h, (df_product_frequency.loc[rec]['frequency'], rec))
        if len(h) > N:
            heapq.heappop(h)
            
    return productset_target_user, [item[1] for item in h]

In [64]:
# Selecting one user to test
target_user_id = 1

# Fetch row of target user
target_user = tf_idf[target_user_id - 1]

# Calculate Cosine Similarity Vector of target user
similarities = cosine_similarity(tf_idf, target_user, False)

productset_target_user, recommendations = generateRecommendations(target_user, similarities.toarray(), 20, 10)

In [76]:
print(product_user_matrix)

  (0, 137)	2
  (0, 708)	1
  (0, 763)	2
  (0, 776)	1
  (0, 824)	1
  (0, 909)	1
  (0, 1051)	2
  (0, 1378)	1
  (0, 1479)	3
  (0, 1493)	3
  (0, 1539)	17
  (0, 1597)	3
  (0, 1646)	5
  (0, 2832)	1
  (0, 2849)	1
  (0, 2856)	1
  (0, 3009)	1
  (0, 3028)	1
  (0, 3394)	1
  (0, 3903)	1
  (0, 4005)	5
  (0, 4121)	1
  (0, 4140)	1
  (0, 4342)	1
  (0, 4552)	1
  :	:
  (49676, 151976)	1
  (49676, 153611)	1
  (49676, 157882)	1
  (49676, 159098)	1
  (49676, 159486)	4
  (49676, 159637)	1
  (49676, 163631)	2
  (49676, 164452)	1
  (49676, 166037)	2
  (49676, 166212)	1
  (49676, 171742)	1
  (49676, 172261)	1
  (49676, 176941)	1
  (49676, 178692)	1
  (49676, 178751)	2
  (49676, 182947)	2
  (49676, 183547)	1
  (49676, 184080)	1
  (49676, 187522)	1
  (49676, 188072)	2
  (49676, 197370)	1
  (49676, 200214)	1
  (49676, 200376)	1
  (49676, 200872)	2
  (49676, 205925)	1


In [73]:
tf_idf1 = tfidf_weight(product_user_matrix)

# convert to Compressed Sparse Row format
tf_idf1 = tf_idf.tocsr()

In [74]:
# Selecting one user to test
target_product_id = 137

# Fetch row of target user
target_product = tf_idf1[target_product_id - 1]

# Calculate Cosine Similarity Vector of target user
similarities = cosine_similarity(tf_idf1, target_product, False)

In [75]:
print(similarities)

  (13, 0)	0.007298916194918597
  (18, 0)	0.015505516589637457
  (20, 0)	0.02058792075315681
  (21, 0)	0.047950826058247985
  (26, 0)	0.0019862789875225474
  (30, 0)	0.006758907825177951
  (32, 0)	0.013577021272401443
  (36, 0)	0.0028246953524359536
  (41, 0)	0.00895494912306569
  (47, 0)	0.004919411827097022
  (50, 0)	0.006196724109281364
  (54, 0)	0.0048956294599833555
  (60, 0)	0.011110904128843675
  (62, 0)	0.009929504527880459
  (63, 0)	0.011152479352418006
  (69, 0)	0.04035016717837186
  (71, 0)	0.015499857588297944
  (82, 0)	0.008457332268607603
  (86, 0)	0.0026126149414244597
  (89, 0)	0.017083250666007247
  (90, 0)	0.00464004244548469
  (93, 0)	0.013940630785078092
  (96, 0)	0.0033087778625540748
  (108, 0)	0.007575862329255163
  (111, 0)	0.01170637871242454
  :	:
  (206054, 0)	0.010762418582906445
  (206061, 0)	0.008970373146730859
  (206079, 0)	0.022524799277570624
  (206080, 0)	0.00512917901912987
  (206083, 0)	0.046773913833690076
  (206092, 0)	0.015121471049373324
  (20609

In [72]:
print(target_user)

  (0, 195)	10.275263695642945
  (0, 10254)	17.736859602527698
  (0, 10322)	4.674483865331559
  (0, 12423)	15.210860465376676
  (0, 13028)	8.792888060980058
  (0, 13172)	1.6648858373116806
  (0, 14080)	4.225953957426909
  (0, 17118)	3.823924326737684
  (0, 25129)	13.723627724905615
  (0, 26083)	7.321423024085006
  (0, 26400)	8.083679185492812
  (0, 30444)	3.0974789440355526
  (0, 35945)	2.688334011836079
  (0, 38920)	4.504276274272822
  (0, 39649)	5.1684734961691685
  (0, 41779)	2.8112743933773685
  (0, 46139)	8.409355396794442
  (0, 49224)	3.555531448298375


In [66]:
# Output the product_name of Target User's products as well as Recommendations
print('Actual products bought by User {}:'.format(target_user_id))
print(productset_target_user)
print()
print('Recommended products for User {}:'.format(target_user_id))
print(recommendations)

Actual products bought by User 1:
{17122, 196, 26405, 14084, 46149, 26088, 13032, 39657, 12427, 25133, 35951, 38928, 10258, 30450, 49235, 10326, 13176, 41787}

Recommended products for User 1:
[500, 26104, 4149, 41400, 22802, 12916, 37710, 9755, 16797, 5258]


# Evaluation

In [67]:
# Get the 10 most popular products
popular_products = get_k_popular(10, df_merged_order_products_prior)

In [68]:
def recall_score(actual, pred):
    """
    Given two lists representing actual and predicted values
    Returns the recall of the prediction
    """
    if len(actual) == 0:
        return 0
    actual, pred = set(actual), set(pred)
    return len(actual.intersection(pred)) / len(actual)

def new_products(row):
    """
    Given a row in the test dataset
    Returns the list of new products purchased
    """
    actual = row["products"][1:-1]
    actual = set([int(p.strip()) for p in actual.strip().split(",")])
    products_target_user = df_prior_user_products.loc[df_prior_user_products['user_id'] == row["user_id"]].products
    liked = set(products_target_user.tolist()[0])
    return actual - liked

def popular_recommend(row):
    """
    Given a row in the test dataset
    Returns the recall score when popular products are recommended
    """
    actual = new_products(row)
    return recall_score(actual, popular_products)

def tfidf_recommend(row):
    """
    Given a row in the test dataset
    Returns the recall score when our model recommends products
    """
    actual = row["products"][1:-1]
    actual = [int(p.strip()) for p in actual.strip().split(",")]
    target_user = tf_idf[row["user_id"] - 1]
    similarities = cosine_similarity(tf_idf, target_user, False)
    cos_vec = similarities.toarray()
    productset_target_user, recommended = generateRecommendations(target_user, cos_vec, 20, 10)

    cur_recall_score = recall_score(actual, recommended)
    
    global count, progress, recall_sum
    count += 1; recall_sum += cur_recall_score
    if level[progress] and int(count / total * 10) - 1 == progress:
        level[progress] = False
        progress += 1
        print("{:.1f}% completed, current mean of recall = {}".format(progress * 10, recall_sum / count))    
    
    return cur_recall_score

def build_eval_df( df_user_products_test, subset=None):
    """
    Builds a dataframe of recall values of the baseline and our model for all the users
    in the test data, and saves its to disk at `filepath`
    """
    start = time.time()
    print("Building dataframe with recall values ...")
    
    df_eval = df_user_products_test.copy()
    if subset:
        df_eval = df_eval.sample(n=int(len(df_eval) * subset), random_state=7)
    df_eval["popular_score"] = df_eval.apply(popular_recommend, axis=1)
    df_eval["tfidf_score"] = df_eval.apply(tfidf_recommend, axis=1)

    #df_eval.to_csv(filepath) #, index_label=False)
    
    print("Completed in {:.2f}s".format(time.time() - start))    

    return df_eval

# Get the dataframe with recall values of the baseline and the model
REBUILD_EVAL_DF = False
subset = 0.1

# How many users in the test?
total = len(df_user_products_test) * subset

# Counter
count = 0
progress = 0
recall_sum = 0
level = [True] * 10

In [69]:
# Estimated 3 hours to run 20% of the test dataset
eval_path = "../input/eval/eval_tfidf_{}.csv".format(subset if subset is not None else "full")
if REBUILD_EVAL_DF or not Path(eval_path).exists():
    df_eval = build_eval_df( df_user_products_test, subset=subset)

In [70]:
df_eval.to_csv(eval_path)

NameError: name 'df_eval' is not defined

In [None]:
df_eval = pd.read_csv(eval_path)

In [None]:
# Mean recall scores
model_mean_recall, baseline_mean_recall = np.mean(df_eval["tfidf_score"]), np.mean(df_eval["popular_score"])
print("Model: {:.2f}%".format(model_mean_recall * 100))
print("Baseline: {:.2f}%".format(baseline_mean_recall * 100))

Recommendations through TF-IDF are almost a factor of 8 times better than the baseline model.

In [None]:
df_eval.head(5)