#  Data Prep & Filtering

In [1]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from pandas import Series, DataFrame
import time
import turicreate as tc
from sklearn.model_selection import train_test_split
from functools import reduce
import random

In [3]:

file1 = "Data1.csv"
file2 = "Data2.csv"


In [7]:
def pre_process_df(filename):
    if filename:
        df = pd.read_csv(filename)
        df = df.drop_duplicates()
        df = df[df.brand.notnull()]
        return df[df.price > 0]
    return None
  
def gen_purchase_count(files):
    df = pd.concat([pre_process_df(f) for f in files])
    df = df.loc[:, ['event_type', 'product_id', 'user_id']]
    df['purchase_count'] = [1 if e == 'purchase' else 0 for e in df['event_type']]
    df = df.drop(['event_type'], axis = 1)
    df = df.groupby(['product_id', 'user_id']).sum()
    df = df[df.purchase_count != 0]
    return df.reset_index()

In [8]:
purchase_count = gen_purchase_count([file1,file2])

In [9]:
purchase_count

Unnamed: 0,product_id,user_id,purchase_count
0,3762,254751820,1
1,3762,258117654,1
2,3762,288191779,1
3,3762,293416727,1
4,3762,296682241,1
...,...,...,...
318443,5909237,571392419,1
318444,5909237,579752599,1
318445,5909238,579137281,1
318446,5909240,579137281,1


In [10]:
user_id  = np.unique(purchase_count['user_id'])
num_products_purchased = np.zeros(len(user_id))
for loc, user in enumerate(user_id):
    num_products_purchased[loc] = np.sum(purchase_count['user_id'] == user)

In [7]:
num_products_purchased

array([2., 2., 3., ..., 2., 1., 1.])

In [8]:
np.sum(purchase_count['user_id'] == 579681799)

3

In [9]:
purchase_count[purchase_count['user_id'] == 579681799]

Unnamed: 0,brand,user_id,purchase_count
72519,irisk,579681799,1
89748,lovely,579681799,2
151239,zinger,579681799,10


In [10]:
user_id

array([  9794320,  10079204,  10280338, ..., 579924768, 579925377,
       579944216])

In [11]:
idx = np.where(num_products_purchased>=10)

In [13]:
filitered_user_id = user_id[idx]

In [14]:
filtered_data = purchase_count.loc[purchase_count['user_id'].isin(filitered_user_id)]

In [15]:
filtered_data

Unnamed: 0,product_id,user_id,purchase_count
1,3762,258117654,1
2,3762,288191779,1
12,3762,419505462,1
19,3762,445330308,1
23,3762,469218581,1
...,...,...,...
318442,5909233,502873541,1
318443,5909237,571392419,1
318445,5909238,579137281,1
318446,5909240,579137281,1


In [16]:
filtered_data=filtered_data[['user_id','product_id','purchase_count']]

In [17]:
filtered_data.columns=['customerId','productId','purchase_count']

In [18]:
data=filtered_data

In [19]:
def create_data_dummy(data):
    data_dummy = data.copy()
    data_dummy['purchase_dummy'] = 1
    return data_dummy
data_dummy = create_data_dummy(data)

In [20]:
df_matrix = pd.pivot_table(data, values='purchase_count', index='customerId', columns='productId')

In [21]:
df_matrix

productId,3762,3763,3774,3776,3806,3928,3929,3936,3945,3959,...,5908835,5909042,5909057,5909070,5909231,5909233,5909237,5909238,5909240,5909245
customerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10280338,,,,,,,,,,,...,,,,,,,,,,
29025780,,,,,,,,,,,...,,,,,,,,,,
36180886,,,,,,,,,,,...,,,,,,,,,,
40821287,,,,,,,,,,,...,,,,,,,,,,
43713532,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
579811230,,,,,,,,,,,...,,,,,,,,,,
579811378,,,,,,,,,,,...,,,,,,,,,,
579849574,,,,,,,,,,,...,,,,,,,,,,
579857098,,,,,,,,,,,...,,,,,,,,,,


In [22]:
df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())

In [23]:
d = df_matrix_norm.reset_index() 
d.index.names = ['scaled_purchase_freq'] 
data_norm = pd.melt(d, id_vars=['customerId'], value_name='scaled_purchase_freq').dropna()


In [24]:
print(data_norm.shape)
data_norm.head()

(111468, 3)


Unnamed: 0,customerId,productId,scaled_purchase_freq
357,258117654,3762,0.0
483,288191779,3762,0.0
1745,419505462,3762,0.0
2362,445330308,3762,0.0
2978,469218581,3762,0.0


In [25]:
def split_data(data):
    '''
    Splits dataset into training and test set.
    
    Args:
        data (pandas.DataFrame)
        
    Returns
        train_data (tc.SFrame)
        test_data (tc.SFrame)
    '''
    train, test = train_test_split(data, test_size = .2)
    train_data = tc.SFrame(train)
    test_data = tc.SFrame(test)
    return train_data, test_data

In [26]:
train_data, test_data = split_data(data)
train_data_dummy, test_data_dummy = split_data(data_dummy)
train_data_norm, test_data_norm = split_data(data_norm)

In [27]:
user_id = 'customerId'
item_id = 'productId'
users_to_recommend = list(filtered_data[user_id])
n_rec = 5 # number of items to recommend
n_display = 30 # to display the first few rows in an output dataset

In [28]:
def model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display):
    if name == 'popularity':
        model = tc.popularity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target)
    elif name == 'cosine':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='cosine')
    elif name == 'pearson':
            model = tc.item_similarity_recommender.create(train_data, 
                                                        user_id=user_id, 
                                                        item_id=item_id, 
                                                        target=target, 
                                                    similarity_type='pearson')
    elif name == 'jaccard':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='jaccard')

    recom = model.recommend(users=users_to_recommend, k=n_rec)
    recom.print_rows(n_display)
    return model


In [29]:
name = 'popularity'
target = 'purchase_count'
popularity = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+-------+------+
| customerId | productId | score | rank |
+------------+-----------+-------+------+
| 258117654  |  5797165  |  3.0  |  1   |
| 258117654  |   20921   |  3.0  |  2   |
| 258117654  |  5888538  |  3.0  |  3   |
| 258117654  |  5769287  |  3.0  |  4   |
| 258117654  |  5805502  |  3.0  |  5   |
| 288191779  |  5797165  |  3.0  |  1   |
| 288191779  |   20921   |  3.0  |  2   |
| 288191779  |  5888538  |  3.0  |  3   |
| 288191779  |  5769287  |  3.0  |  4   |
| 288191779  |  5805502  |  3.0  |  5   |
| 419505462  |  5797165  |  3.0  |  1   |
| 419505462  |   20921   |  3.0  |  2   |
| 419505462  |  5888538  |  3.0  |  3   |
| 419505462  |  5769287  |  3.0  |  4   |
| 419505462  |  5805502  |  3.0  |  5   |
| 445330308  |  5797165  |  3.0  |  1   |
| 445330308  |   20921   |  3.0  |  2   |
| 445330308  |  5888538  |  3.0  |  3   |
| 445330308  |  5769287  |  3.0  |  4   |
| 445330308  |  5805502  |  3.0  |  5   |
| 469218581  |  5797165  |  3.0  |

In [30]:
name = 'popularity'
target = 'purchase_dummy'
pop_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+-------+------+
| customerId | productId | score | rank |
+------------+-----------+-------+------+
| 258117654  |  5622676  |  1.0  |  1   |
| 258117654  |  5890938  |  1.0  |  2   |
| 258117654  |  5674915  |  1.0  |  3   |
| 258117654  |  5754292  |  1.0  |  4   |
| 258117654  |  5692280  |  1.0  |  5   |
| 288191779  |  5622676  |  1.0  |  1   |
| 288191779  |  5890938  |  1.0  |  2   |
| 288191779  |  5674915  |  1.0  |  3   |
| 288191779  |  5754292  |  1.0  |  4   |
| 288191779  |  5692280  |  1.0  |  5   |
| 419505462  |  5622676  |  1.0  |  1   |
| 419505462  |  5890938  |  1.0  |  2   |
| 419505462  |  5674915  |  1.0  |  3   |
| 419505462  |  5754292  |  1.0  |  4   |
| 419505462  |  5692280  |  1.0  |  5   |
| 445330308  |  5622676  |  1.0  |  1   |
| 445330308  |  5890938  |  1.0  |  2   |
| 445330308  |  5674915  |  1.0  |  3   |
| 445330308  |  5754292  |  1.0  |  4   |
| 445330308  |  5692280  |  1.0  |  5   |
| 469218581  |  5622676  |  1.0  |

In [31]:
name = 'popularity'
target = 'scaled_purchase_freq'
pop_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+-------+------+
| customerId | productId | score | rank |
+------------+-----------+-------+------+
| 258117654  |  5881602  |  1.0  |  1   |
| 258117654  |  5887685  |  1.0  |  2   |
| 258117654  |  5812505  |  1.0  |  3   |
| 258117654  |  5888344  |  1.0  |  4   |
| 258117654  |  5798771  |  1.0  |  5   |
| 288191779  |  5881602  |  1.0  |  1   |
| 288191779  |  5887685  |  1.0  |  2   |
| 288191779  |  5812505  |  1.0  |  3   |
| 288191779  |  5888344  |  1.0  |  4   |
| 288191779  |  5798771  |  1.0  |  5   |
| 419505462  |  5881602  |  1.0  |  1   |
| 419505462  |  5887685  |  1.0  |  2   |
| 419505462  |  5812505  |  1.0  |  3   |
| 419505462  |  5888344  |  1.0  |  4   |
| 419505462  |  5798771  |  1.0  |  5   |
| 445330308  |  5881602  |  1.0  |  1   |
| 445330308  |  5887685  |  1.0  |  2   |
| 445330308  |  5812505  |  1.0  |  3   |
| 445330308  |  5888344  |  1.0  |  4   |
| 445330308  |  5798771  |  1.0  |  5   |
| 469218581  |  5881602  |  1.0  |

In [42]:
name = 'cosine'
target = 'purchase_count'
cos = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+----------------------+------+
| customerId | productId |        score         | rank |
+------------+-----------+----------------------+------+
| 258117654  |  5804264  | 0.06981423497200012  |  1   |
| 258117654  |    5256   | 0.06981423497200012  |  2   |
| 258117654  |  5809146  | 0.057925575971603395 |  3   |
| 258117654  |  5804296  | 0.055694013833999634 |  4   |
| 258117654  |  5788729  | 0.055694013833999634 |  5   |
| 288191779  |  5645615  | 0.08520509799321492  |  1   |
| 288191779  |    5067   |  0.0832304060459137  |  2   |
| 288191779  |  5804264  | 0.08302372694015503  |  3   |
| 288191779  |    5256   | 0.08302372694015503  |  4   |
| 288191779  |  5645620  | 0.07210172712802887  |  5   |
| 419505462  |    4768   | 0.043328512992177694 |  1   |
| 419505462  |    4554   | 0.03367892120565687  |  2   |
| 419505462  |    4765   | 0.031920403242111206 |  3   |
| 419505462  |    4649   | 0.028358167835644314 |  4   |
| 419505462  |    4766   | 0.02

In [32]:
name = 'cosine'
target = 'purchase_dummy'
cos_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+----------------------+------+
| customerId | productId |        score         | rank |
+------------+-----------+----------------------+------+
| 258117654  |  5769986  | 0.07162437174055311  |  1   |
| 258117654  |  5805750  | 0.06902786758210924  |  2   |
| 258117654  |  5835177  | 0.06353014707565308  |  3   |
| 258117654  |  5712788  | 0.05823798312081231  |  4   |
| 258117654  |  5769981  | 0.05731268061531915  |  5   |
| 288191779  |  5663277  | 0.11693533108784603  |  1   |
| 288191779  |    4092   | 0.11693533108784603  |  2   |
| 288191779  |    4131   | 0.08268576860427856  |  3   |
| 288191779  |  5764550  | 0.08020166250375602  |  4   |
| 288191779  |    3959   |  0.0732483772131113  |  5   |
| 419505462  |  5798778  | 0.03443503996421551  |  1   |
| 419505462  |  5700696  | 0.03443503996421551  |  2   |
| 419505462  |  5870817  | 0.031920126799879406 |  3   |
| 419505462  |  5770320  | 0.031920126799879406 |  4   |
| 419505462  |  5848403  | 0.03

In [33]:
name = 'cosine' 
target = 'scaled_purchase_freq' 
cos_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+---------------------+------+
| customerId | productId |        score        | rank |
+------------+-----------+---------------------+------+
| 258117654  |  5838738  |         0.0         |  1   |
| 258117654  |    4958   |         0.0         |  2   |
| 258117654  |  5841842  |         0.0         |  3   |
| 258117654  |  5857684  |         0.0         |  4   |
| 258117654  |  5724658  |         0.0         |  5   |
| 288191779  |  5877502  | 0.08247860840388707 |  1   |
| 288191779  |  5852479  | 0.08247860840388707 |  2   |
| 288191779  |  5862299  | 0.08247860840388707 |  3   |
| 288191779  |  5877029  | 0.08247860840388707 |  4   |
| 288191779  |  5862276  | 0.08247860840388707 |  5   |
| 419505462  |  5838738  |         0.0         |  1   |
| 419505462  |    4958   |         0.0         |  2   |
| 419505462  |  5841842  |         0.0         |  3   |
| 419505462  |  5857684  |         0.0         |  4   |
| 419505462  |  5724658  |         0.0         |

In [35]:
name = 'pearson'
target = 'purchase_count'
pear = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+-------+------+
| customerId | productId | score | rank |
+------------+-----------+-------+------+
| 258117654  |  5797165  |  3.0  |  1   |
| 258117654  |   20921   |  3.0  |  2   |
| 258117654  |  5888538  |  3.0  |  3   |
| 258117654  |  5805502  |  3.0  |  4   |
| 258117654  |  5769287  |  3.0  |  5   |
| 288191779  |  5797165  |  3.0  |  1   |
| 288191779  |   20921   |  3.0  |  2   |
| 288191779  |  5888538  |  3.0  |  3   |
| 288191779  |  5805502  |  3.0  |  4   |
| 288191779  |  5769287  |  3.0  |  5   |
| 419505462  |  5797165  |  3.0  |  1   |
| 419505462  |   20921   |  3.0  |  2   |
| 419505462  |  5888538  |  3.0  |  3   |
| 419505462  |  5805502  |  3.0  |  4   |
| 419505462  |  5769287  |  3.0  |  5   |
| 445330308  |  5797165  |  3.0  |  1   |
| 445330308  |   20921   |  3.0  |  2   |
| 445330308  |  5888538  |  3.0  |  3   |
| 445330308  |  5805502  |  3.0  |  4   |
| 445330308  |  5769287  |  3.0  |  5   |
| 469218581  |  5797165  |  3.0  |

In [36]:
name = 'pearson'
target = 'purchase_dummy'
pear_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+-------+------+
| customerId | productId | score | rank |
+------------+-----------+-------+------+
| 258117654  |  5622676  |  0.0  |  1   |
| 258117654  |  5890938  |  0.0  |  2   |
| 258117654  |  5674915  |  0.0  |  3   |
| 258117654  |  5754292  |  0.0  |  4   |
| 258117654  |  5692280  |  0.0  |  5   |
| 288191779  |  5622676  |  0.0  |  1   |
| 288191779  |  5890938  |  0.0  |  2   |
| 288191779  |  5674915  |  0.0  |  3   |
| 288191779  |  5754292  |  0.0  |  4   |
| 288191779  |  5692280  |  0.0  |  5   |
| 419505462  |  5622676  |  0.0  |  1   |
| 419505462  |  5890938  |  0.0  |  2   |
| 419505462  |  5674915  |  0.0  |  3   |
| 419505462  |  5754292  |  0.0  |  4   |
| 419505462  |  5692280  |  0.0  |  5   |
| 445330308  |  5622676  |  0.0  |  1   |
| 445330308  |  5890938  |  0.0  |  2   |
| 445330308  |  5674915  |  0.0  |  3   |
| 445330308  |  5754292  |  0.0  |  4   |
| 445330308  |  5692280  |  0.0  |  5   |
| 469218581  |  5622676  |  0.0  |

In [37]:
name = 'pearson'
target = 'scaled_purchase_freq'
pear_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+-------+------+
| customerId | productId | score | rank |
+------------+-----------+-------+------+
| 258117654  |  5887685  |  1.0  |  1   |
| 258117654  |  5812505  |  1.0  |  2   |
| 258117654  |  5888344  |  1.0  |  3   |
| 258117654  |  5798771  |  1.0  |  4   |
| 258117654  |  5648651  |  1.0  |  5   |
| 288191779  |  5887685  |  1.0  |  1   |
| 288191779  |  5812505  |  1.0  |  2   |
| 288191779  |  5888344  |  1.0  |  3   |
| 288191779  |  5798771  |  1.0  |  4   |
| 288191779  |  5648651  |  1.0  |  5   |
| 419505462  |  5887685  |  1.0  |  1   |
| 419505462  |  5812505  |  1.0  |  2   |
| 419505462  |  5888344  |  1.0  |  3   |
| 419505462  |  5798771  |  1.0  |  4   |
| 419505462  |  5648651  |  1.0  |  5   |
| 445330308  |  5887685  |  1.0  |  1   |
| 445330308  |  5812505  |  1.0  |  2   |
| 445330308  |  5888344  |  1.0  |  3   |
| 445330308  |  5798771  |  1.0  |  4   |
| 445330308  |  5648651  |  1.0  |  5   |
| 469218581  |  5887685  |  1.0  |

In [38]:
name = 'jaccard'
target = 'purchase_count'
jacc = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+----------------------+------+
| customerId | productId |        score         | rank |
+------------+-----------+----------------------+------+
| 258117654  |  5614842  |       0.034375       |  1   |
| 258117654  |  5749227  | 0.023593074083328246 |  2   |
| 258117654  |  5769986  | 0.02254902124404907  |  3   |
| 258117654  |  5597372  | 0.022134387493133546 |  4   |
| 258117654  |  5805750  | 0.021896547079086302 |  5   |
| 288191779  |  5803428  | 0.041666666666666664 |  1   |
| 288191779  |  5828847  | 0.041666666666666664 |  2   |
| 288191779  |  5862931  | 0.041666666666666664 |  3   |
| 288191779  |  5645615  | 0.04069548348585764  |  4   |
| 288191779  |  5848331  | 0.03333333134651184  |  5   |
| 419505462  |    4768   | 0.024493466530527388 |  1   |
| 419505462  |    4554   | 0.015869357756205967 |  2   |
| 419505462  |  5772461  | 0.014919191598892212 |  3   |
| 419505462  |    4765   | 0.012645419154848372 |  4   |
| 419505462  |    4571   | 0.01

In [39]:
name = 'jaccard'
target = 'purchase_dummy'
jacc_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+----------------------+------+
| customerId | productId |        score         | rank |
+------------+-----------+----------------------+------+
| 258117654  |  5769986  | 0.03901234600279066  |  1   |
| 258117654  |  5614842  | 0.036150978671179876 |  2   |
| 258117654  |  5769981  | 0.028730160660213895 |  3   |
| 258117654  |  5805750  | 0.02525252103805542  |  4   |
| 258117654  |    3945   | 0.023073878553178575 |  5   |
| 288191779  |  5663277  | 0.06837606888550979  |  1   |
| 288191779  |    4092   | 0.06837606888550979  |  2   |
| 288191779  |    4131   |  0.0489203746502216  |  3   |
| 288191779  |  5764550  |  0.0415384631890517  |  4   |
| 288191779  |  5812621  | 0.038461538461538464 |  5   |
| 419505462  |  5772461  | 0.017718588483744656 |  1   |
| 419505462  |  5772471  | 0.01722345886559322  |  2   |
| 419505462  |  5809875  | 0.016608731500033676 |  3   |
| 419505462  |    4600   | 0.016348507897607212 |  4   |
| 419505462  |  5772307  | 0.01

In [40]:
name = 'jaccard'
target = 'scaled_purchase_freq'
jacc_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+--------------------+------+
| customerId | productId |       score        | rank |
+------------+-----------+--------------------+------+
| 258117654  |  5838738  |        0.0         |  1   |
| 258117654  |    4958   |        0.0         |  2   |
| 258117654  |  5841842  |        0.0         |  3   |
| 258117654  |  5857684  |        0.0         |  4   |
| 258117654  |  5724658  |        0.0         |  5   |
| 288191779  |  5877502  | 0.0476190447807312 |  1   |
| 288191779  |  5852479  | 0.0476190447807312 |  2   |
| 288191779  |  5862299  | 0.0476190447807312 |  3   |
| 288191779  |  5877029  | 0.0476190447807312 |  4   |
| 288191779  |  5862276  | 0.0476190447807312 |  5   |
| 419505462  |  5838738  |        0.0         |  1   |
| 419505462  |    4958   |        0.0         |  2   |
| 419505462  |  5841842  |        0.0         |  3   |
| 419505462  |  5857684  |        0.0         |  4   |
| 419505462  |  5724658  |        0.0         |  5   |
| 44533030

In [43]:
models_w_counts = [popularity, cos, pear,jacc]
models_w_dummy = [pop_dummy, cos_dummy, pear_dummy,jacc_dummy]
models_w_norm = [pop_norm, cos_norm, pear_norm,jacc_norm]
names_w_counts = ['Popularity Model on Purchase Counts', 'Cosine Similarity on Purchase Counts', 'Pearson Similarity on Purchase Counts','Jaccard Similarity on Purchase Counts']
names_w_dummy = ['Popularity Model on Purchase Dummy', 'Cosine Similarity on Purchase Dummy', 'Pearson Similarity on Purchase Dummy','Jaccard Similarity on Purchase Dummy']
names_w_norm = ['Popularity Model on Scaled Purchase Counts', 'Cosine Similarity on Scaled Purchase Counts', 'Pearson Similarity on Scaled Purchase Counts','Jaccard Similarity on Scaled Purchase Counts']

In [44]:
eval_counts = tc.recommender.util.compare_models(test_data, models_w_counts, model_names=names_w_counts)
eval_dummy = tc.recommender.util.compare_models(test_data_dummy, models_w_dummy, model_names=names_w_dummy)
eval_norm = tc.recommender.util.compare_models(test_data_norm, models_w_norm, model_names=names_w_norm)

PROGRESS: Evaluate model Popularity Model on Purchase Counts



Precision and recall summary statistics by cutoff
+--------+------------------------+------------------------+
| cutoff |     mean_precision     |      mean_recall       |
+--------+------------------------+------------------------+
|   1    |          0.0           |          0.0           |
|   2    |          0.0           |          0.0           |
|   3    | 7.223606746848703e-05  |  5.85112146494745e-05  |
|   4    | 5.4177050601365274e-05 |  5.85112146494745e-05  |
|   5    | 4.3341640481092164e-05 | 5.851121464947455e-05  |
|   6    | 3.611803373424351e-05  | 5.851121464947449e-05  |
|   7    | 4.6437471944027345e-05 | 0.0001126882652508398  |
|   8    | 4.0632787951023954e-05 | 0.00011268826525083977 |
|   9    | 4.8157378312324664e-05 | 0.00013435908549138604 |
|   10   | 4.334164048109216e-05  | 0.00013435908549138596 |
+--------+------------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 0.22948995081623383

Per User RMSE (best)
+------------+-


Precision and recall summary statistics by cutoff
+--------+----------------------+----------------------+
| cutoff |    mean_precision    |     mean_recall      |
+--------+----------------------+----------------------+
|   1    |  0.0649041066204355  | 0.018885202460464722 |
|   2    | 0.054339581753169344 | 0.03153610766987308  |
|   3    | 0.04948170621591344  | 0.04213383852408288  |
|   4    | 0.04472315527142702  | 0.049868735155520455 |
|   5    | 0.041824683064253576 | 0.057695338475607524 |
|   6    | 0.03922418463538851  | 0.06543467598732898  |
|   7    | 0.03677847777966989  | 0.07088456672402334  |
|   8    | 0.03453786975837034  |  0.075894927759948   |
|   9    | 0.032903528731895844 |  0.080601409328828   |
|   10   | 0.03161772673095673  | 0.08615752262545989  |
+--------+----------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 1.0504021390134042

Per User RMSE (best)
+------------+--------------------+-------+
| customerId |        rmse  


Precision and recall summary statistics by cutoff
+--------+------------------------+------------------------+
| cutoff |     mean_precision     |      mean_recall       |
+--------+------------------------+------------------------+
|   1    |          0.0           |          0.0           |
|   2    |          0.0           |          0.0           |
|   3    |          0.0           |          0.0           |
|   4    | 5.4177050601365274e-05 | 5.851121464947446e-05  |
|   5    | 4.334164048109214e-05  | 5.851121464947451e-05  |
|   6    |  3.61180337342435e-05  | 5.851121464947451e-05  |
|   7    | 4.643747194402736e-05  | 0.00011268826525083974 |
|   8    | 4.063278795102396e-05  | 0.00011268826525083977 |
|   9    | 3.6118033734243496e-05 | 0.00011268826525083977 |
|   10   | 3.2506230360819155e-05 | 0.00011268826525083974 |
+--------+------------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 0.27928885515744084

Per User RMSE (best)
+------------+-


Precision and recall summary statistics by cutoff
+--------+----------------------+----------------------+
| cutoff |    mean_precision    |     mean_recall      |
+--------+----------------------+----------------------+
|   1    |  0.0777982446635605  | 0.023146754523798028 |
|   2    | 0.06880485426373399  | 0.039555546557359114 |
|   3    | 0.06132842128074546  |  0.0525707228746132  |
|   4    | 0.05585653917000753  | 0.06283447497356359  |
|   5    | 0.05227001842019689  |  0.0726855593831542  |
|   6    | 0.04926499801350823  | 0.08157652902419343  |
|   7    | 0.04674705509032112  | 0.08996277249540764  |
|   8    | 0.044330371654567144 | 0.09685380086755238  |
|   9    | 0.042113627334127816 | 0.10283832025346842  |
|   10   | 0.04035106728789688  | 0.10977152864686332  |
+--------+----------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 1.0568340185621625

Per User RMSE (best)
+------------+--------------------+-------+
| customerId |        rmse  


Precision and recall summary statistics by cutoff
+--------+------------------------+------------------------+
| cutoff |     mean_precision     |      mean_recall       |
+--------+------------------------+------------------------+
|   1    |          0.0           |          0.0           |
|   2    | 0.00010835410120273055 | 4.445296459599202e-05  |
|   3    | 7.223606746848695e-05  | 4.4452964595991976e-05 |
|   4    | 8.126557590204792e-05  | 5.430333743260389e-05  |
|   5    | 6.501246072163838e-05  | 5.4303337432603816e-05 |
|   6    | 5.417705060136523e-05  | 5.4303337432603816e-05 |
|   7    | 4.6437471944027386e-05 | 5.430333743260395e-05  |
|   8    | 5.417705060136529e-05  | 8.139186273328646e-05  |
|   9    | 6.019672289040585e-05  | 9.493612538362774e-05  |
|   10   | 7.584787084191134e-05  | 0.00015633678273184156 |
+--------+------------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 0.0

Per User RMSE (best)
+------------+------+-------+
|


Precision and recall summary statistics by cutoff
+--------+----------------------+----------------------+
| cutoff |    mean_precision    |     mean_recall      |
+--------+----------------------+----------------------+
|   1    | 0.07129699859139675  | 0.02099742603910313  |
|   2    | 0.05926969335789361  | 0.034339999694725766 |
|   3    |  0.053274099758009   | 0.04591810508016743  |
|   4    | 0.049328204572543055 | 0.05619319157907916  |
|   5    | 0.04531368512298159  |  0.0639293355714339  |
|   6    | 0.041933037165456724 | 0.07025978193543582  |
|   7    | 0.03948733030973809  | 0.07709799908099804  |
|   8    | 0.037517607541445445 | 0.08353692941152978  |
|   9    | 0.035901325531838106 | 0.09042042512507185  |
|   10   | 0.034586629103911554 | 0.09669094243924545  |
+--------+----------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.9861722820950886

Per User RMSE (best)
+------------+-------------------+-------+
| customerId |        rmse   


Precision and recall summary statistics by cutoff
+--------+------------------------+------------------------+
| cutoff |     mean_precision     |      mean_recall       |
+--------+------------------------+------------------------+
|   1    | 0.00010835410120273055 | 2.167082024054611e-05  |
|   2    | 5.4177050601365274e-05 |  2.16708202405461e-05  |
|   3    | 3.611803373424349e-05  | 2.167082024054611e-05  |
|   4    | 8.126557590204783e-05  | 5.4177050601365274e-05 |
|   5    | 0.00010835410120273047 | 0.00013002492144327654 |
|   6    |  9.02950843356087e-05  | 0.00013002492144327657 |
|   7    | 7.739578657337894e-05  | 0.00013002492144327657 |
|   8    | 8.126557590204794e-05  | 0.00016614295517752037 |
|   9    | 0.00010835410120273052 | 0.0002594478756576494  |
|   10   | 9.751869108245742e-05  | 0.00025944787565764914 |
+--------+------------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 1.0

Per User RMSE (best)
+------------+------+-------+
|


Precision and recall summary statistics by cutoff
+--------+----------------------+----------------------+
| cutoff |    mean_precision    |     mean_recall      |
+--------+----------------------+----------------------+
|   1    | 0.08245747101527787  | 0.024175350159405123 |
|   2    | 0.06967168707335575  | 0.03967452219857751  |
|   3    | 0.06313432296745776  | 0.05346059891866649  |
|   4    | 0.057590204789251274 | 0.06500016958707812  |
|   5    | 0.05348358435366748  | 0.07542752984094912  |
|   6    | 0.05002347672192741  | 0.08475076331419756  |
|   7    | 0.04736622138290815  | 0.09232511102227445  |
|   8    | 0.04515657167623792  | 0.10006801465968376  |
|   9    | 0.043269404413623695 | 0.10749523939760094  |
|   10   | 0.041358760429082264 | 0.11426303981436343  |
+--------+----------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.9918507068917665

Per User RMSE (best)
+------------+--------------------+-------+
| customerId |        rmse  


Precision and recall summary statistics by cutoff
+--------+------------------------+------------------------+
| cutoff |     mean_precision     |      mean_recall       |
+--------+------------------------+------------------------+
|   1    | 0.00024431956999755686 | 4.7991344106662945e-05 |
|   2    | 0.00018323967749816773 | 0.00010907123660605204 |
|   3    | 0.00012215978499877816 |  0.000109071236606052  |
|   4    | 0.0001221597849987785  | 0.00017015112910544138 |
|   5    | 0.00012215978499877838 | 0.00029231091410422016 |
|   6    | 0.00012215978499877832 | 0.00032285086035391454 |
|   7    | 0.00012215978499877843 | 0.0003309948460204998  |
|   8    | 0.00013742975812362562 | 0.00043279466685281463 |
|   9    | 0.0001357330944430869  |  0.000442974648936046  |
|   10   | 0.0001221597849987785  | 0.00044297464893604667 |
+--------+------------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 0.22466460690680395

Per User RMSE (best)
+------------+-


Precision and recall summary statistics by cutoff
+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    | 0.0037869533349621305 | 0.0009301149779513123 |
|   2    |  0.002931834839970681 | 0.0015963705672462371 |
|   3    | 0.0028096750549719014 | 0.0024966203161400115 |
|   4    |  0.002473735646225263 |  0.00315395630208582  |
|   5    | 0.0023454678719765404 | 0.0036711522749377904 |
|   6    |  0.002198876129978011 |  0.004133517365937138 |
|   7    | 0.0021639733342640735 |  0.004846964443603614 |
|   8    |  0.00216833618372832  |  0.005411984297653523 |
|   9    |  0.00206314303553492  |  0.005860283860294641 |
|   10   |  0.002040068409479597 |  0.00665853988393547  |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.2167513959888032

Per User RMSE (best)
+------------+------+-------+
| customerId |


Precision and recall summary statistics by cutoff
+--------+------------------------+------------------------+
| cutoff |     mean_precision     |      mean_recall       |
+--------+------------------------+------------------------+
|   1    | 0.00024431956999755686 | 0.00018323967749816765 |
|   2    | 0.00018323967749816765 | 0.0001913836631647527  |
|   3    | 0.00012215978499877813 | 0.00019138366316475276 |
|   4    | 0.00018323967749816762 | 0.0002924004084522043  |
|   5    |  0.000146591741998534  | 0.00029240040845220425 |
|   6    | 0.0001221597849987782  | 0.0002924004084522041  |
|   7    | 0.00010470838714181007 | 0.0002924004084522038  |
|   8    | 0.00013742975812362575 |  0.000475640085950372  |
|   9    | 0.0001357330944430872  | 0.0005367199784497616  |
|   10   | 0.00017102369899828947 | 0.0008115794946970128  |
+--------+------------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 0.2236984181647702

Per User RMSE (best)
+------------+--


Precision and recall summary statistics by cutoff
+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    | 0.0054971903249450265 | 0.0016655049510642269 |
|   2    |  0.004275592474957246 | 0.0026604814163558627 |
|   3    |  0.003664793549963349 | 0.0033569376191643755 |
|   4    | 0.0031150745174688492 |  0.004002348483241255 |
|   5    | 0.0028585389689714104 |  0.004597930318134109 |
|   6    |  0.002646795341640203 |  0.005375002283820787 |
|   7    |  0.00246064709783253  |  0.005752739501356224 |
|   8    |  0.002336305888101638 |  0.006064118304977886 |
|   9    | 0.0022938892960881687 |  0.006589771154720638 |
|   10   | 0.0022843879794771566 |  0.007532582923943355 |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.21710168707975092

Per User RMSE (best)
+------------+------+-------+
| customerId 

In [45]:
eval_counts

[{'precision_recall_by_user': Columns:
  	customerId	int
  	cutoff	int
  	precision	float
  	recall	float
  	count	int
  
  Rows: 166122
  
  Data:
  +------------+--------+-----------+--------+-------+
  | customerId | cutoff | precision | recall | count |
  +------------+--------+-----------+--------+-------+
  | 497361708  |   1    |    0.0    |  0.0   |   5   |
  | 497361708  |   2    |    0.0    |  0.0   |   5   |
  | 497361708  |   3    |    0.0    |  0.0   |   5   |
  | 497361708  |   4    |    0.0    |  0.0   |   5   |
  | 497361708  |   5    |    0.0    |  0.0   |   5   |
  | 497361708  |   6    |    0.0    |  0.0   |   5   |
  | 497361708  |   7    |    0.0    |  0.0   |   5   |
  | 497361708  |   8    |    0.0    |  0.0   |   5   |
  | 497361708  |   9    |    0.0    |  0.0   |   5   |
  | 497361708  |   10   |    0.0    |  0.0   |   5   |
  +------------+--------+-----------+--------+-------+
  [166122 rows x 5 columns]
  Note: Only the head of the SFrame is printed.
  You 

## Jaccrd Similarity here is the best Similarity

In [46]:
final_model = tc.item_similarity_recommender.create(tc.SFrame(data_dummy), 
                                            user_id=user_id, 
                                            item_id=item_id, 
                                            target='purchase_dummy', similarity_type='jaccard')
recom = final_model.recommend(users=users_to_recommend, k=n_rec)
recom.print_rows(n_display)

+------------+-----------+----------------------+------+
| customerId | productId |        score         | rank |
+------------+-----------+----------------------+------+
| 258117654  |  5769986  | 0.037832189689983024 |  1   |
| 258117654  |  5614842  | 0.03466154770417647  |  2   |
| 258117654  |  5769981  | 0.027372631159695713 |  3   |
| 258117654  |  5805750  | 0.024969187649813564 |  4   |
| 258117654  |    3928   | 0.024153075434944847 |  5   |
| 288191779  |    4092   | 0.04340721579159007  |  1   |
| 288191779  |  5788728  | 0.04075630272136015  |  2   |
| 288191779  |  5614841  | 0.04020581525914809  |  3   |
| 288191779  |    4131   | 0.038680925088770246 |  4   |
| 288191779  |  5769982  |  0.0372549014932969  |  5   |
| 419505462  |    4768   | 0.02761567792584819  |  1   |
| 419505462  |    4554   | 0.021009076026178176 |  2   |
| 419505462  |  5772461  | 0.02028870198034471  |  3   |
| 419505462  |  5772471  | 0.01865997814363049  |  4   |
| 419505462  |  5772307  | 0.01

In [47]:
df_rec = recom.to_dataframe()
print(df_rec.shape)
df_rec.head()

(905240, 4)


Unnamed: 0,customerId,productId,score,rank
0,258117654,5769986,0.037832,1
1,258117654,5614842,0.034662,2
2,258117654,5769981,0.027373,3
3,258117654,5805750,0.024969,4
4,258117654,3928,0.024153,5


In [None]:
def create_output(model, users_to_recommend, n_rec, print_csv=True):
    recomendation = model.recommend(users=users_to_recommend, k=5)
    df_rec = recomendation.to_dataframe()
    df_rec['recommendedProducts'] = df_rec.groupby([user_id])[item_id] \
        .transform(lambda x: ' , '.join(x.astype(str)))
    df_output = df_rec[['customerId', 'recommendedProducts']].drop_duplicates() \
        .sort_values('customerId').set_index('customerId')

In [48]:
recomendation = final_model.recommend(users=users_to_recommend, k=5)
df_rec = recomendation.to_dataframe()
df_rec['recommendedProducts'] = df_rec.groupby([user_id])[item_id] \
    .transform(lambda x: ' , '.join(x.astype(str)))
df_output = df_rec[['customerId', 'recommendedProducts']].drop_duplicates() \
    .sort_values('customerId').set_index('customerId')

In [49]:
df_output=df_output.reset_index()

In [50]:
df_output.head()

Unnamed: 0,customerId,recommendedProducts
0,10280338,"5816170 , 5816178 , 5816172 , 5816169 , 585062..."
1,29025780,"5815567 , 5815569 , 5818266 , 5818264 , 581826..."
2,36180886,"5819234 , 5819244 , 5819223 , 5819222 , 580783..."
3,40821287,"5304 , 5700046 , 5810157 , 4958 , 5848387 , 53..."
4,43713532,"5833323 , 5848909 , 5772308 , 5833330 , 580929..."


In [57]:
df_output.shape

(9672, 2)

## This is for streamlit 

In [51]:
# new=recomendation.to_dataframe()

# new.columns

# new.columns=['user_id','product_id','score', 'rank']

# new1=new[["user_id","product_id","rank"]]

# new1.to_csv('new_product.csv',index=False)

In [61]:
def create_output(model, users_to_recommend, n_rec, print_csv=True):
    recomendation = model.recommend(users=users_to_recommend, k=n_rec)
    df_rec = recomendation.to_dataframe()
    df_rec['recommendedProducts'] = df_rec.groupby([user_id])[item_id] \
        .transform(lambda x: '|'.join(x.astype(str)))
    df_output = df_rec[['customerId', 'recommendedProducts']].drop_duplicates() \
        .sort_values('customerId').set_index('customerId')
    if print_csv:
        df_output.to_csv('option1_recommendation.csv')
        print("An output file can be found in 'output' folder with name 'option1_recommendation.csv'")
    return df_output

In [62]:
df_output = create_output(pear_norm, users_to_recommend, 5, print_csv=True)
print(df_output.shape)
df_output.head()

An output file can be found in 'output' folder with name 'option1_recommendation.csv'
(9672, 1)


Unnamed: 0_level_0,recommendedProducts
customerId,Unnamed: 1_level_1
10280338,5887685|5812505|5888344|5798771|5648651|588768...
29025780,5887685|5812505|5888344|5798771|5648651|588768...
36180886,5887685|5812505|5888344|5798771|5648651|588768...
40821287,5887685|5812505|5888344|5798771|5648651|588768...
43713532,5887685|5812505|5888344|5798771|5648651|588768...


In [63]:
def customer_recomendation(customer_id):
    if customer_id not in df_output.index:
        print('Customer not found.')
        return customer_id
    return df_output.loc[customer_id]

In [64]:
customer_recomendation(40821287)

recommendedProducts    5887685|5812505|5888344|5798771|5648651|588768...
Name: 40821287, dtype: object