In [73]:
# Thanks for Moorissa Tjokro I get some of my codes from her explanation to deal with purchase data


##https://medium.datadriveninvestor.com/how-to-build-a-recommendation-system-for-purchase-data-step-by-step-d6d7a78800b6
    

#  Data Prep & Filtering

In [72]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from pandas import Series, DataFrame
import time
import turicreate as tc
from sklearn.model_selection import train_test_split
from functools import reduce
import random

In [2]:

file1 = "Data1.csv"
file2 = "Data2.csv"


In [3]:
def pre_process_df(filename):
    if filename:
        df = pd.read_csv(filename)
        df = df.drop_duplicates()
        df = df[df.brand.notnull()]
        return df[df.price > 0]
    return None
  
def gen_purchase_count(files):
    df = pd.concat([pre_process_df(f) for f in files])
    df = df.loc[:, ['event_type', 'brand', 'user_id']]
    df['purchase_count'] = [1 if e == 'purchase' else 0 for e in df['event_type']]
    df = df.drop(['event_type'], axis = 1)
    df = df.groupby(['brand', 'user_id']).sum()
    df = df[df.purchase_count != 0]
    return df.reset_index()

In [4]:
purchase_count = gen_purchase_count([file1,file2])

In [5]:
purchase_count

Unnamed: 0,brand,user_id,purchase_count
0,airnails,10280338,1
1,airnails,105697630,1
2,airnails,126051562,2
3,airnails,149936015,1
4,airnails,160905182,2
...,...,...,...
151238,zinger,579632103,1
151239,zinger,579681799,10
151240,zinger,579750441,1
151241,zinger,579849574,1


In [6]:
user_id  = np.unique(purchase_count['user_id'])
num_brand_purchased = np.zeros(len(user_id))
for loc, user in enumerate(user_id):
    num_brand_purchased[loc] = np.sum(purchase_count['user_id'] == user)

In [7]:
num_brand_purchased

array([2., 2., 3., ..., 2., 1., 1.])

In [8]:
np.sum(purchase_count['user_id'] == 579681799)

3

In [9]:
purchase_count[purchase_count['user_id'] == 579681799]

Unnamed: 0,brand,user_id,purchase_count
72519,irisk,579681799,1
89748,lovely,579681799,2
151239,zinger,579681799,10


In [10]:
user_id

array([  9794320,  10079204,  10280338, ..., 579924768, 579925377,
       579944216])

In [11]:
idx = np.where(num_brand_purchased>=5)

In [12]:
np.where(num_brand_purchased>=5)

(array([    6,     9,    11, ..., 46949, 46951, 46953]),)

In [13]:
filitered_user_id = user_id[idx]

In [14]:
filtered_data = purchase_count.loc[purchase_count['user_id'].isin(filitered_user_id)]

In [15]:
filtered_data

Unnamed: 0,brand,user_id,purchase_count
1,airnails,105697630,1
2,airnails,126051562,2
3,airnails,149936015,1
5,airnails,169938179,2
6,airnails,170386254,4
...,...,...,...
151231,zinger,579242176,2
151234,zinger,579501173,1
151236,zinger,579523833,1
151237,zinger,579608040,1


In [16]:
filtered_data=filtered_data[['user_id','brand','purchase_count']]

In [17]:
filtered_data.columns=['customerId','productId','purchase_count']

In [18]:
data=filtered_data

In [19]:
def create_data_dummy(data):
    data_dummy = data.copy()
    data_dummy['purchase_dummy'] = 1
    return data_dummy
data_dummy = create_data_dummy(data)

In [20]:
df_matrix = pd.pivot_table(data, values='purchase_count', index='customerId', columns='productId')

In [21]:
df_matrix

productId,airnails,almea,andrea,ardell,art-visage,artex,aura,balbcare,barbie,batiste,...,uskusi,veraclara,vilenta,vosev,weaver,yoko,ypsed,yu-r,zeitun,zinger
customerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
29025780,,,,,,,,,,,...,,,,,,,,,,
33609704,,,,,,,,,,,...,,,,,,,,,,
34236465,,,,,,,,,,,...,,,,,,,,,,
36180886,,,,,,,,,,,...,,,,,,,,,,
40821287,,,,,,,,,,,...,,,,,,,,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
579811378,,,,,,,,,,,...,,,,,,,,,,
579841776,,,,,,,,,,,...,,,,,,,,,,
579847583,,,,,,,,,,,...,,,,,,,,,,
579849574,,,,,,,,,,,...,,,,,,,,,,1.0


In [22]:
df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())

In [23]:
d = df_matrix_norm.reset_index() 
d.index.names = ['scaled_purchase_freq'] 
data_norm = pd.melt(d, id_vars=['customerId'], value_name='scaled_purchase_freq').dropna()


In [24]:
print(data_norm.shape)
data_norm.head()

(75607, 3)


Unnamed: 0,customerId,productId,scaled_purchase_freq
27,105697630,airnails,0.0
36,126051562,airnails,0.021277
49,149936015,airnails,0.0
65,169938179,airnails,0.021277
67,170386254,airnails,0.06383


In [25]:
def split_data(data):
    '''
    Splits dataset into training and test set.
    
    Args:
        data (pandas.DataFrame)
        
    Returns
        train_data (tc.SFrame)
        test_data (tc.SFrame)
    '''
    train, test = train_test_split(data, test_size = .2)
    train_data = tc.SFrame(train)
    test_data = tc.SFrame(test)
    return train_data, test_data

In [26]:
train_data, test_data = split_data(data)
train_data_dummy, test_data_dummy = split_data(data_dummy)
train_data_norm, test_data_norm = split_data(data_norm)

In [27]:
user_id = 'customerId'
item_id = 'productId'
users_to_recommend = list(filtered_data[user_id])
n_rec = 10 # number of items to recommend
n_display = 30 # to display the first few rows in an output dataset

In [30]:
def model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display):
    if name == 'popularity':
        model = tc.popularity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target)
    elif name == 'cosine':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='cosine')
    elif name == 'pearson':
            model = tc.item_similarity_recommender.create(train_data, 
                                                        user_id=user_id, 
                                                        item_id=item_id, 
                                                        target=target, 
                                                    similarity_type='pearson')
    elif name == 'jaccard':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='jaccard')

    recom = model.recommend(users=users_to_recommend, k=n_rec)
    recom.print_rows(n_display)
    return model


In [31]:
name = 'popularity'
target = 'purchase_count'
popularity = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+--------------------+------+
| customerId | productId |       score        | rank |
+------------+-----------+--------------------+------+
| 105697630  | bpw.style | 3.8805925030229744 |  1   |
| 105697630  |   keune   |        3.5         |  2   |
| 105697630  |  airnails |       3.485        |  3   |
| 105697630  |   runail  | 3.4467480354455775 |  4   |
| 105697630  |  kinetics | 3.0516129032258066 |  5   |
| 105697630  | freedecor | 2.9850402761795167 |  6   |
| 105697630  |  grattol  | 2.8441116405718176 |  7   |
| 105697630  |   thuya   |        2.8         |  8   |
| 105697630  |   dermal  | 2.742857142857143  |  9   |
| 105697630  | levissime | 2.7222222222222223 |  10  |
| 126051562  | bpw.style | 3.8805925030229744 |  1   |
| 126051562  |   masura  | 3.667487684729064  |  2   |
| 126051562  |   keune   |        3.5         |  3   |
| 126051562  |  airnails |       3.485        |  4   |
| 126051562  |  kinetics | 3.0516129032258066 |  5   |
| 12605156

In [32]:
name = 'popularity'
target = 'purchase_dummy'
pop_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+-------+------+
| customerId | productId | score | rank |
+------------+-----------+-------+------+
| 105697630  |   domix   |  1.0  |  1   |
| 105697630  |  italwax  |  1.0  |  2   |
| 105697630  |   ardell  |  1.0  |  3   |
| 105697630  |    oniq   |  1.0  |  4   |
| 105697630  |  bluesky  |  1.0  |  5   |
| 105697630  |  severina |  1.0  |  6   |
| 105697630  |  ingarden |  1.0  |  7   |
| 105697630  |    milv   |  1.0  |  8   |
| 105697630  |    pole   |  1.0  |  9   |
| 105697630  |   dizao   |  1.0  |  10  |
| 126051562  |   domix   |  1.0  |  1   |
| 126051562  |  italwax  |  1.0  |  2   |
| 126051562  |   ardell  |  1.0  |  3   |
| 126051562  |    oniq   |  1.0  |  4   |
| 126051562  |  severina |  1.0  |  5   |
| 126051562  |    milv   |  1.0  |  6   |
| 126051562  |    pole   |  1.0  |  7   |
| 126051562  |   dizao   |  1.0  |  8   |
| 126051562  | bpw.style |  1.0  |  9   |
| 126051562  |   masura  |  1.0  |  10  |
| 149936015  |  italwax  |  1.0  |

In [33]:
name = 'popularity'
target = 'scaled_purchase_freq'
pop_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+--------------------+------+
| customerId | productId |       score        | rank |
+------------+-----------+--------------------+------+
| 105697630  |   labay   |        1.0         |  1   |
| 105697630  |    osmo   | 0.6666666666666666 |  2   |
| 105697630  |   dewal   | 0.6666666666666666 |  3   |
| 105697630  |   coxir   |        0.5         |  4   |
| 105697630  |   mielle  |        0.5         |  5   |
| 105697630  |   sanoto  |        0.5         |  6   |
| 105697630  |  skinity  |        0.5         |  7   |
| 105697630  |  ecocraft |        0.4         |  8   |
| 105697630  |   nirvel  |        0.35        |  9   |
| 105697630  |   keune   | 0.3333333333333333 |  10  |
| 126051562  |   labay   |        1.0         |  1   |
| 126051562  |    osmo   | 0.6666666666666666 |  2   |
| 126051562  |   dewal   | 0.6666666666666666 |  3   |
| 126051562  |   coxir   |        0.5         |  4   |
| 126051562  |   mielle  |        0.5         |  5   |
| 12605156

In [34]:
name = 'cosine'
target = 'purchase_count'
cos = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+---------------------+------+
| customerId | productId |        score        | rank |
+------------+-----------+---------------------+------+
| 105697630  |   runail  |  0.3300863703091939 |  1   |
| 105697630  | bpw.style | 0.22618568936983743 |  2   |
| 105697630  |   domix   |  0.2021221121152242 |  3   |
| 105697630  |  severina |  0.1905081868171692 |  4   |
| 105697630  |   zinger  | 0.16219988465309143 |  5   |
| 105697630  | freedecor | 0.15928182999293009 |  6   |
| 105697630  |  grattol  | 0.15431002775828043 |  7   |
| 105697630  |  ingarden |  0.1542424956957499 |  8   |
| 105697630  |  nagaraku | 0.15061996380488077 |  9   |
| 105697630  |  inoface  | 0.14454394578933716 |  10  |
| 126051562  | bpw.style |  0.3505216572019789 |  1   |
| 126051562  |   domix   | 0.29984940422905815 |  2   |
| 126051562  | freedecor |  0.2324874003728231 |  3   |
| 126051562  |    milv   | 0.22222508986790976 |  4   |
| 126051562  |    yoko   |  0.2183974583943685 |

In [35]:
name = 'cosine'
target = 'purchase_dummy'
cos_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+---------------------+------+
| customerId | productId |        score        | rank |
+------------+-----------+---------------------+------+
| 105697630  |  ingarden | 0.21753829717636108 |  1   |
| 105697630  |  grattol  | 0.21155695617198944 |  2   |
| 105697630  | freedecor | 0.18618328869342804 |  3   |
| 105697630  |   domix   | 0.18014919012784958 |  4   |
| 105697630  |    uno    | 0.16740696132183075 |  5   |
| 105697630  |    milv   | 0.16437480598688126 |  6   |
| 105697630  |  severina | 0.15084143728017807 |  7   |
| 105697630  |   de.lux  | 0.14920006692409515 |  8   |
| 105697630  |   zinger  | 0.13844066113233566 |  9   |
| 105697630  |  jessnail |  0.1348465010523796 |  10  |
| 126051562  | bpw.style | 0.30916649103164673 |  1   |
| 126051562  |   masura  | 0.23814347386360168 |  2   |
| 126051562  | freedecor | 0.22777479141950607 |  3   |
| 126051562  |   domix   |  0.2090669423341751 |  4   |
| 126051562  |    milv   | 0.20381104201078415 |

In [36]:
name = 'cosine' 
target = 'scaled_purchase_freq' 
cos_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-------------+-----------------------+------+
| customerId |  productId  |         score         | rank |
+------------+-------------+-----------------------+------+
| 105697630  |    irisk    |  0.006274844918932233 |  1   |
| 105697630  |    domix    | 0.0035774452345711844 |  2   |
| 105697630  |    de.lux   | 0.0033263819558279856 |  3   |
| 105697630  |  freedecor  | 0.0031950814383370535 |  4   |
| 105697630  |   ingarden  | 0.0029682346752711703 |  5   |
| 105697630  |   grattol   |  0.002919239657265799 |  6   |
| 105697630  |   severina  | 0.0025632636887686594 |  7   |
| 105697630  |     milv    | 0.0024867398398263113 |  8   |
| 105697630  |    zinger   | 0.0024011731147766113 |  9   |
| 105697630  |     uno     |  0.002041935920715332 |  10  |
| 126051562  |   farmona   | 0.0032359063625335693 |  1   |
| 126051562  |  igrobeauty |  0.002030588686466217 |  2   |
| 126051562  |  art-visage | 0.0017753541469573975 |  3   |
| 126051562  |    irisk    | 0.001729115

In [37]:
name = 'pearson'
target = 'purchase_count'
pear = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+--------------------+------+
| customerId | productId |       score        | rank |
+------------+-----------+--------------------+------+
| 105697630  | bpw.style | 3.855887132755738  |  1   |
| 105697630  |   keune   |        3.5         |  2   |
| 105697630  |  airnails | 3.484505927165349  |  3   |
| 105697630  |   runail  | 3.4330264998423674 |  4   |
| 105697630  |  kinetics | 3.051851590025808  |  5   |
| 105697630  | freedecor | 2.9547833193638007 |  6   |
| 105697630  |  grattol  | 2.840546319205824  |  7   |
| 105697630  |   thuya   | 2.8000000000000003 |  8   |
| 105697630  |   dermal  | 2.743655864965349  |  9   |
| 105697630  | levissime | 2.718699597650105  |  10  |
| 126051562  | bpw.style | 3.875610508394064  |  1   |
| 126051562  |   masura  | 3.6694870629334093 |  2   |
| 126051562  |   keune   | 3.4986272388034396 |  3   |
| 126051562  |  airnails | 3.4677936291694635 |  4   |
| 126051562  |  kinetics | 3.0483289588309512 |  5   |
| 12605156

In [38]:
name = 'pearson'
target = 'purchase_dummy'
pear_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+-------+------+
| customerId | productId | score | rank |
+------------+-----------+-------+------+
| 105697630  |   domix   |  0.0  |  1   |
| 105697630  |  italwax  |  0.0  |  2   |
| 105697630  |   ardell  |  0.0  |  3   |
| 105697630  |    oniq   |  0.0  |  4   |
| 105697630  |  bluesky  |  0.0  |  5   |
| 105697630  |  severina |  0.0  |  6   |
| 105697630  |  ingarden |  0.0  |  7   |
| 105697630  |    milv   |  0.0  |  8   |
| 105697630  |    pole   |  0.0  |  9   |
| 105697630  |   dizao   |  0.0  |  10  |
| 126051562  |   domix   |  0.0  |  1   |
| 126051562  |  italwax  |  0.0  |  2   |
| 126051562  |   ardell  |  0.0  |  3   |
| 126051562  |    oniq   |  0.0  |  4   |
| 126051562  |  severina |  0.0  |  5   |
| 126051562  |    milv   |  0.0  |  6   |
| 126051562  |    pole   |  0.0  |  7   |
| 126051562  |   dizao   |  0.0  |  8   |
| 126051562  | bpw.style |  0.0  |  9   |
| 126051562  |   masura  |  0.0  |  10  |
| 149936015  |  italwax  |  0.0  |

In [39]:
name = 'pearson'
target = 'scaled_purchase_freq'
pear_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+------------+---------------------+------+
| customerId | productId  |        score        | rank |
+------------+------------+---------------------+------+
| 105697630  |   labay    |         1.0         |  1   |
| 105697630  |    osmo    |  0.6667477374985106 |  2   |
| 105697630  |   dewal    |  0.6666666666666667 |  3   |
| 105697630  |   mielle   |         0.5         |  4   |
| 105697630  |   coxir    |         0.5         |  5   |
| 105697630  |   sanoto   |         0.5         |  6   |
| 105697630  |  skinity   |         0.5         |  7   |
| 105697630  |  ecocraft  |         0.4         |  8   |
| 105697630  |   nirvel   |         0.35        |  9   |
| 105697630  |   keune    |  0.3333573625201271 |  10  |
| 126051562  |   labay    |         1.0         |  1   |
| 126051562  |    osmo    |  0.6667516032854717 |  2   |
| 126051562  |   dewal    |  0.6666666666666667 |  3   |
| 126051562  |   coxir    |         0.5         |  4   |
| 126051562  |   sanoto   |    

In [40]:
name = 'jaccard'
target = 'purchase_count'
jacc = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+----------------------+------+
| customerId | productId |        score         | rank |
+------------+-----------+----------------------+------+
| 105697630  |   runail  | 0.13963550329208374  |  1   |
| 105697630  | bpw.style |  0.1085998813311259  |  2   |
| 105697630  |  ingarden | 0.09747594594955444  |  3   |
| 105697630  |  grattol  | 0.09008134404818217  |  4   |
| 105697630  | freedecor | 0.07864546775817871  |  5   |
| 105697630  |   domix   | 0.07530614733695984  |  6   |
| 105697630  |    milv   | 0.07007154822349548  |  7   |
| 105697630  |    uno    | 0.06881490349769592  |  8   |
| 105697630  |  severina | 0.061802953481674194 |  9   |
| 105697630  |   zinger  | 0.053297460079193115 |  10  |
| 126051562  | bpw.style |  0.1552469465467665  |  1   |
| 126051562  |   masura  | 0.11825984054141575  |  2   |
| 126051562  |   domix   | 0.10620878802405463  |  3   |
| 126051562  | freedecor | 0.10473116901185778  |  4   |
| 126051562  |    milv   | 0.09

In [41]:
name = 'jaccard'
target = 'purchase_dummy'
jacc_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+----------------------+------+
| customerId | productId |        score         | rank |
+------------+-----------+----------------------+------+
| 105697630  |  ingarden | 0.11788840591907501  |  1   |
| 105697630  |  grattol  | 0.11352728307247162  |  2   |
| 105697630  | freedecor | 0.09824187308549881  |  3   |
| 105697630  |   domix   | 0.08815513551235199  |  4   |
| 105697630  |    milv   | 0.08245334029197693  |  5   |
| 105697630  |    uno    | 0.08202414959669113  |  6   |
| 105697630  |  severina | 0.07113657891750336  |  7   |
| 105697630  |   de.lux  | 0.06914214044809341  |  8   |
| 105697630  |   zinger  | 0.06118902564048767  |  9   |
| 105697630  |  jessnail | 0.05903148651123047  |  10  |
| 126051562  | bpw.style | 0.17345990985631943  |  1   |
| 126051562  |   masura  | 0.12848469614982605  |  2   |
| 126051562  | freedecor |  0.1187070980668068  |  3   |
| 126051562  |   domix   | 0.10959205031394958  |  4   |
| 126051562  |    milv   | 0.10

In [42]:
name = 'jaccard'
target = 'scaled_purchase_freq'
jacc_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+----------------------+------+
| customerId | productId |        score         | rank |
+------------+-----------+----------------------+------+
| 105697630  |   irisk   |  0.0793764761516026  |  1   |
| 105697630  | freedecor | 0.05246158157076154  |  2   |
| 105697630  |  grattol  | 0.04689739431653704  |  3   |
| 105697630  |  ingarden | 0.04470562083380563  |  4   |
| 105697630  |    milv   | 0.03124966791697911  |  5   |
| 105697630  |   de.lux  | 0.028992925371442522 |  6   |
| 105697630  |    uno    | 0.026084601879119873 |  7   |
| 105697630  |   domix   | 0.021358072757720947 |  8   |
| 105697630  |  haruyama | 0.01683098077774048  |  9   |
| 105697630  |    pole   | 0.015866202967507497 |  10  |
| 126051562  |   irisk   | 0.03949251025915146  |  1   |
| 126051562  | bpw.style | 0.028025485575199127 |  2   |
| 126051562  |   masura  | 0.021750353276729584 |  3   |
| 126051562  | freedecor | 0.017033152282238007 |  4   |
| 126051562  |    milv   | 0.01

In [43]:
models_w_counts = [popularity, cos, pear,jacc]
models_w_dummy = [pop_dummy, cos_dummy, pear_dummy,jacc_dummy]
models_w_norm = [pop_norm, cos_norm, pear_norm,jacc_norm]
names_w_counts = ['Popularity Model on Purchase Counts', 'Cosine Similarity on Purchase Counts', 'Pearson Similarity on Purchase Counts','Jaccard Similarity on Purchase Counts']
names_w_dummy = ['Popularity Model on Purchase Dummy', 'Cosine Similarity on Purchase Dummy', 'Pearson Similarity on Purchase Dummy','Jaccard Similarity on Purchase Dummy']
names_w_norm = ['Popularity Model on Scaled Purchase Counts', 'Cosine Similarity on Scaled Purchase Counts', 'Pearson Similarity on Scaled Purchase Counts','Jaccard Similarity on Scaled Purchase Counts']

In [45]:
eval_counts = tc.recommender.util.compare_models(test_data, models_w_counts, model_names=names_w_counts)
eval_dummy = tc.recommender.util.compare_models(test_data_dummy, models_w_dummy, model_names=names_w_dummy)
eval_norm = tc.recommender.util.compare_models(test_data_norm, models_w_norm, model_names=names_w_norm)

PROGRESS: Evaluate model Popularity Model on Purchase Counts



Precision and recall summary statistics by cutoff
+--------+---------------------+---------------------+
| cutoff |    mean_precision   |     mean_recall     |
+--------+---------------------+---------------------+
|   1    | 0.12757653372362476 | 0.06928545292979987 |
|   2    | 0.09062080741553852 | 0.09869409161396009 |
|   3    | 0.06866691059885352 | 0.11248894077287815 |
|   4    |  0.0740029271862422 |  0.1625937735341322 |
|   5    |  0.0837907061836811 |  0.2300591729150086 |
|   6    | 0.07818026588608366 |  0.2579880047315094 |
|   7    | 0.07237816458453139 |  0.2773402486172335 |
|   8    | 0.06564824978655948 |  0.286557981325636  |
|   9    | 0.05899093385372197 |  0.2895412138917438 |
|   10   | 0.05433589462129532 | 0.29609398745878807 |
+--------+---------------------+---------------------+
[10 rows x 3 columns]


Overall RMSE: 3.0698958071907074

Per User RMSE (best)
+------------+------+-------+
| customerId | rmse | count |
+------------+------+-------+
| 57544981


Precision and recall summary statistics by cutoff
+--------+---------------------+---------------------+
| cutoff |    mean_precision   |     mean_recall     |
+--------+---------------------+---------------------+
|   1    | 0.31613611416026344 |  0.181210300520582  |
|   2    | 0.24570069520673246 |  0.2746871472905133 |
|   3    |  0.2048217262267755 | 0.34124578684586016 |
|   4    | 0.17864983534577383 | 0.39440775781792975 |
|   5    | 0.15909257226491053 |  0.438360562747683  |
|   6    | 0.14450949302760485 |  0.4771027051305137 |
|   7    | 0.13297788928963453 |  0.5106642892958311 |
|   8    |  0.1230637882668618 |  0.5391409037493935 |
|   9    | 0.11486495643100143 |  0.5663852250661625 |
|   10   | 0.10780582997926587 |  0.591596100569756  |
+--------+---------------------+---------------------+
[10 rows x 3 columns]


Overall RMSE: 3.7263409001617167

Per User RMSE (best)
+------------+----------------------+-------+
| customerId |         rmse         | count |
+-------


Precision and recall summary statistics by cutoff
+--------+---------------------+---------------------+
| cutoff |    mean_precision   |     mean_recall     |
+--------+---------------------+---------------------+
|   1    | 0.13013782168557145 |  0.070549834764983  |
|   2    | 0.09976826442249058 | 0.10854401523995406 |
|   3    | 0.07850550880188643 | 0.12780178380251594 |
|   4    |  0.079277960726918  | 0.17428828912546424 |
|   5    |  0.0846932552750335 | 0.23185817279304263 |
|   6    | 0.07769240151237955 | 0.25581700826852605 |
|   7    | 0.07246528322269268 |  0.2777833920900146 |
|   8    | 0.06656299548725447 | 0.29037958558631827 |
|   9    | 0.06021059478798221 | 0.29507150504223356 |
|   10   | 0.05553116233687033 | 0.30257271018339404 |
+--------+---------------------+---------------------+
[10 rows x 3 columns]


Overall RMSE: 3.0569750892269023

Per User RMSE (best)
+------------+------+-------+
| customerId | rmse | count |
+------------+------+-------+
| 57544981


Precision and recall summary statistics by cutoff
+--------+---------------------+---------------------+
| cutoff |    mean_precision   |     mean_recall     |
+--------+---------------------+---------------------+
|   1    | 0.34150506159287763 | 0.19564731084123646 |
|   2    |  0.2588120502500305 | 0.28994994550245184 |
|   3    |  0.215798674635118  | 0.35816587195108945 |
|   4    |  0.188529088913282  | 0.41690967926789296 |
|   5    | 0.16845956824002936 |  0.4637556941709893 |
|   6    | 0.15119730048379884 |  0.4991344279306231 |
|   7    |  0.1371421601937517 |  0.5260976464415908 |
|   8    |  0.1265093304061472 |  0.5548426346999346 |
|   9    | 0.11767017657980024 |  0.5794344838801477 |
|   10   | 0.11015977558238813 |  0.6027297657283024 |
+--------+---------------------+---------------------+
[10 rows x 3 columns]


Overall RMSE: 3.859371859057911

Per User RMSE (best)
+------------+--------------------+-------+
| customerId |        rmse        | count |
+------------


Precision and recall summary statistics by cutoff
+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    | 0.0008581586367537087 | 0.0004903763638592623 |
|   2    | 0.0011033468186833397 | 0.0013383188263659017 |
|   3    | 0.0009807527277185233 | 0.0017408360917003799 |
|   4    | 0.0009194556822361159 | 0.0021261318061612296 |
|   5    |  0.001299497364227043 | 0.0038976164206028048 |
|   6    | 0.0012872379551305616 |  0.004714910360368254 |
|   7    | 0.0015411828578433932 |  0.006308633542910853 |
|   8    |  0.001455804830207184 |  0.006758145209781841 |
|   9    | 0.0014438859602522688 |  0.007555006801053148 |
|   10   |  0.001373053818805931 |  0.008106680210394799 |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.0

Per User RMSE (best)
+------------+------+-------+
| customerId | rmse | count |


Precision and recall summary statistics by cutoff
+--------+---------------------+---------------------+
| cutoff |    mean_precision   |     mean_recall     |
+--------+---------------------+---------------------+
|   1    | 0.34436680152016674 | 0.19931191634023523 |
|   2    | 0.26130930489150417 | 0.29554010480821813 |
|   3    | 0.21829921131134808 |  0.3683510705577637 |
|   4    | 0.18983694985901683 |  0.4237768807003826 |
|   5    | 0.16962118425891864 | 0.47067495830827943 |
|   6    |  0.1518327816599239 |   0.50266749174436  |
|   7    | 0.13751554317939008 |  0.5294331385449458 |
|   8    | 0.12622594090964814 |  0.5543566631834387 |
|   9    |  0.1172408156593521 |  0.5784763208540346 |
|   10   | 0.10972171141350995 |  0.6016249068382205 |
+--------+---------------------+---------------------+
[10 rows x 3 columns]


Overall RMSE: 0.8344548393333262

Per User RMSE (best)
+------------+--------------------+-------+
| customerId |        rmse        | count |
+-----------


Precision and recall summary statistics by cutoff
+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    | 0.0008581586367537093 | 0.0004903763638592618 |
|   2    | 0.0011033468186833404 |  0.001338318826365903 |
|   3    | 0.0009807527277185253 |  0.00174083609170038  |
|   4    | 0.0009194556822361166 | 0.0021261318061612353 |
|   5    | 0.0012994973642270447 | 0.0038976164206028117 |
|   6    | 0.0012872379551305618 |  0.004714910360368259 |
|   7    | 0.0015411828578433934 |  0.006308633542910851 |
|   8    | 0.0014558048302071845 |  0.006758145209781845 |
|   9    |  0.001443885960252269 |  0.00755500680105314  |
|   10   | 0.0013730538188059301 |  0.008106680210394812 |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 1.0

Per User RMSE (best)
+------------+------+-------+
| customerId | rmse | count |


Precision and recall summary statistics by cutoff
+--------+---------------------+---------------------+
| cutoff |    mean_precision   |     mean_recall     |
+--------+---------------------+---------------------+
|   1    | 0.34044379060929264 |  0.1982955529479981 |
|   2    |  0.2559764619345349 |  0.2901941267700739 |
|   3    |  0.2146622532793918 |  0.3618855457674879 |
|   4    | 0.18775285031261504 | 0.41887224138971146 |
|   5    | 0.16795390462179732 |  0.4659375723645673 |
|   6    | 0.15034122021985202 |  0.4978996032232513 |
|   7    |  0.1362720888281753 |  0.5249523245201799 |
|   8    | 0.12516856687507655 |  0.5500965185224104 |
|   9    |  0.1163145491942847 |  0.5734090984274868 |
|   10   | 0.10866740223121249 |  0.5961942804991719 |
+--------+---------------------+---------------------+
[10 rows x 3 columns]


Overall RMSE: 0.9149688683866506

Per User RMSE (best)
+------------+--------------------+-------+
| customerId |        rmse        | count |
+-----------


Precision and recall summary statistics by cutoff
+--------+------------------------+------------------------+
| cutoff |     mean_precision     |      mean_recall       |
+--------+------------------------+------------------------+
|   1    | 0.00012238404112103785 | 0.00012238404112103785 |
|   2    | 0.00012238404112103785 | 0.0002447680822420757  |
|   3    | 0.0002039734018683964  | 0.00046913882429731185 |
|   4    | 0.00015298005140129722 | 0.0004691388242973115  |
|   5    | 0.0001223840411210378  | 0.0004691388242973118  |
|   6    | 0.00010198670093419812 | 0.0004691388242973116  |
|   7    | 0.00010490060667517525 | 0.0005303308448578312  |
|   8    | 0.00010708603598090803 | 0.0006527148859788682  |
|   9    | 0.00012238404112103774 | 0.0007445029168196464  |
|   10   | 0.00011014563700893396 | 0.0007445029168196466  |
+--------+------------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 0.09443703136005296

Per User RMSE (best)
+------------+-


Precision and recall summary statistics by cutoff
+--------+---------------------+---------------------+
| cutoff |    mean_precision   |     mean_recall     |
+--------+---------------------+---------------------+
|   1    | 0.20597234120670666 |  0.1182391073346897 |
|   2    |  0.1655856076367641 |  0.1854557737099652 |
|   3    | 0.14490270468730865 | 0.24376861257292026 |
|   4    | 0.13394933300697584 | 0.29895847295798333 |
|   5    | 0.12328968302533339 |  0.3417378630969375 |
|   6    | 0.11371517154163102 | 0.37825075324463425 |
|   7    | 0.10507544101963379 |  0.4077161195322993 |
|   8    | 0.09760127279402767 |  0.4316667735098771 |
|   9    | 0.09159765566570127 |  0.4547653528836987 |
|   10   |  0.0864520866479012 | 0.47525535527310153 |
+--------+---------------------+---------------------+
[10 rows x 3 columns]


Overall RMSE: 0.10223464451249951

Per User RMSE (best)
+------------+------+-------+
| customerId | rmse | count |
+------------+------+-------+
| 4784213


Precision and recall summary statistics by cutoff
+--------+------------------------+------------------------+
| cutoff |     mean_precision     |      mean_recall       |
+--------+------------------------+------------------------+
|   1    | 0.00012238404112103785 | 0.00012238404112103785 |
|   2    | 0.00018357606168155677 |  0.000285562762615755  |
|   3    | 0.0002039734018683963  | 0.00046913882429731196 |
|   4    | 0.0001529800514012972  | 0.00046913882429731147 |
|   5    | 0.00012238404112103788 | 0.00046913882429731207 |
|   6    | 0.00010198670093419823 | 0.00046913882429731196 |
|   7    | 0.00010490060667517536 | 0.0005303308448578307  |
|   8    | 0.0001070860359809078  | 0.0006527148859788683  |
|   9    | 0.0001223840411210378  | 0.0007445029168196465  |
|   10   | 0.00012238404112103788 | 0.0007750989270999064  |
+--------+------------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 0.0942172711743491

Per User RMSE (best)
+------------+--


Precision and recall summary statistics by cutoff
+--------+---------------------+---------------------+
| cutoff |    mean_precision   |     mean_recall     |
+--------+---------------------+---------------------+
|   1    |  0.2746297882756089 | 0.15847426924100533 |
|   2    | 0.21539591237302655 | 0.24380076266626236 |
|   3    | 0.18153632766287275 | 0.30470979441423685 |
|   4    |  0.1595275976012729 |  0.3547669555318552 |
|   5    | 0.14022763431648524 | 0.38680530058880375 |
|   6    |  0.1274017868070004 | 0.41864001219955155 |
|   7    | 0.11619490532720231 | 0.44418729226280346 |
|   8    | 0.10691775792436667 | 0.46704285189782635 |
|   9    | 0.09963420769931608 | 0.48943087535671087 |
|   10   | 0.09336678497123987 |  0.5087666796821121 |
+--------+---------------------+---------------------+
[10 rows x 3 columns]


Overall RMSE: 0.10190272227802054

Per User RMSE (best)
+------------+------+-------+
| customerId | rmse | count |
+------------+------+-------+
| 4784213

In [None]:
eval_counts

In [46]:
final_model = tc.item_similarity_recommender.create(tc.SFrame(data_dummy), 
                                            user_id=user_id, 
                                            item_id=item_id, 
                                            target='purchase_dummy', similarity_type='cosine')
recom = final_model.recommend(users=users_to_recommend, k=n_rec)
recom.print_rows(n_display)

+------------+-----------+---------------------+------+
| customerId | productId |        score        | rank |
+------------+-----------+---------------------+------+
| 105697630  |  ingarden |  0.2608681619167328 |  1   |
| 105697630  |  grattol  |  0.2507947444915771 |  2   |
| 105697630  | freedecor |  0.2367006242275238 |  3   |
| 105697630  |   domix   | 0.20659009218215943 |  4   |
| 105697630  |    milv   |  0.2047964811325073 |  5   |
| 105697630  |    uno    |  0.1964915633201599 |  6   |
| 105697630  |  severina | 0.17321430444717406 |  7   |
| 105697630  |   zinger  | 0.15881983041763306 |  8   |
| 105697630  |  jessnail | 0.14715051651000977 |  9   |
| 105697630  |    yoko   | 0.14709696769714356 |  10  |
| 126051562  | bpw.style | 0.32805266976356506 |  1   |
| 126051562  |   masura  | 0.24917551279067993 |  2   |
| 126051562  | freedecor |  0.2325199544429779 |  3   |
| 126051562  |   domix   |  0.2261669874191284 |  4   |
| 126051562  |    milv   | 0.22544825673103333 |

In [47]:
df_rec = recom.to_dataframe()
print(df_rec.shape)
df_rec.head()

(757920, 4)


Unnamed: 0,customerId,productId,score,rank
0,105697630,ingarden,0.260868,1
1,105697630,grattol,0.250795,2
2,105697630,freedecor,0.236701,3
3,105697630,domix,0.20659,4
4,105697630,milv,0.204796,5


In [None]:
def create_output(model, users_to_recommend, n_rec, print_csv=True):
    recomendation = model.recommend(users=users_to_recommend, k=5)
    df_rec = recomendation.to_dataframe()
    df_rec['recommendedProducts'] = df_rec.groupby([user_id])[item_id] \
        .transform(lambda x: ' , '.join(x.astype(str)))
    df_output = df_rec[['customerId', 'recommendedProducts']].drop_duplicates() \
        .sort_values('customerId').set_index('customerId')

In [64]:
recomendation = final_model.recommend(users=users_to_recommend, k=5)
df_rec = recomendation.to_dataframe()
df_rec['recommendedProducts'] = df_rec.groupby([user_id])[item_id] \
    .transform(lambda x: ' , '.join(x.astype(str)))
df_output = df_rec[['customerId', 'recommendedProducts']].drop_duplicates() \
    .sort_values('customerId').set_index('customerId')

In [65]:
df_output=df_output.reset_index()

In [66]:
df_output.head()

Unnamed: 0,customerId,recommendedProducts
0,29025780,"bpw.style , ingarden , grattol , masura , uno ..."
1,33609704,"irisk , bpw.style , ingarden , grattol , uno ,..."
2,34236465,"bpw.style , grattol , masura , domix , freedec..."
3,36180886,"irisk , bpw.style , grattol , masura , freedec..."
4,40821287,"bpw.style , ingarden , masura , domix , freede..."


## This is for Streamlit

In [67]:
# new=recomendation.to_dataframe()

# new.columns=["user_id","brand","score","rank"]
# new.to_csv('new_brand.csv',index=False)

In [48]:
def create_output(model, users_to_recommend, n_rec, print_csv=True):
    recomendation = model.recommend(users=users_to_recommend, k=n_rec)
    df_rec = recomendation.to_dataframe()
    df_rec['recommendedProducts'] = df_rec.groupby([user_id])[item_id] \
        .transform(lambda x: '|'.join(x.astype(str)))
    df_output = df_rec[['customerId', 'recommendedProducts']].drop_duplicates() \
        .sort_values('customerId').set_index('customerId')
    if print_csv:
        df_output.to_csv('option1_recommendation.csv')
        print("An output file can be found in 'output' folder with name 'option1_recommendation.csv'")
    return df_output

In [49]:
df_output = create_output(pear_norm, users_to_recommend, 5, print_csv=True)
print(df_output.shape)
df_output.head()

An output file can be found in 'output' folder with name 'option1_recommendation.csv'
(10609, 1)


Unnamed: 0_level_0,recommendedProducts
customerId,Unnamed: 1_level_1
29025780,labay|dewal|osmo|coxir|sanoto|labay|dewal|osmo...
33609704,labay|dewal|osmo|mielle|coxir|labay|dewal|osmo...
34236465,labay|dewal|osmo|coxir|sanoto|labay|dewal|osmo...
36180886,labay|osmo|dewal|mielle|coxir|labay|osmo|dewal...
40821287,labay|dewal|osmo|coxir|sanoto|labay|dewal|osmo...


In [50]:
def customer_recomendation(customer_id):
    if customer_id not in df_output.index:
        print('Customer not found.')
        return customer_id
    return df_output.loc[customer_id]

In [51]:
customer_recomendation(40821287)

recommendedProducts    labay|dewal|osmo|coxir|sanoto|labay|dewal|osmo...
Name: 40821287, dtype: object