# Import Libaray

In [1]:
import seaborn as sns
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df_event = pd.read_csv("dataset/events.csv")
df_cat = pd.read_csv("dataset/category_tree.csv")
df_prop1 = pd.read_csv("dataset/item_properties_part1.csv")
df_prop2 = pd.read_csv("dataset/item_properties_part1.csv")
df_prop = pd.concat([df_prop1, df_prop2])

In [3]:
df_event.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,


In [4]:
df_cat.head()

Unnamed: 0,categoryid,parentid
0,1016,213.0
1,809,169.0
2,570,9.0
3,1691,885.0
4,536,1691.0


In [5]:
df_prop.head()

Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513


In [6]:
df_event.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2756101 entries, 0 to 2756100
Data columns (total 5 columns):
 #   Column         Dtype  
---  ------         -----  
 0   timestamp      int64  
 1   visitorid      int64  
 2   event          object 
 3   itemid         int64  
 4   transactionid  float64
dtypes: float64(1), int64(3), object(1)
memory usage: 105.1+ MB


In [7]:
df_event.isnull().sum()

timestamp              0
visitorid              0
event                  0
itemid                 0
transactionid    2733644
dtype: int64

Only "transactionid" have null value
-> Check do we need to fix it

In [8]:
print(df_event["event"].unique())
print(((df_event["event"] == "transaction") & (df_event["transactionid"]).isna()).sum())

['view' 'addtocart' 'transaction']
0


# Drop duplication

In [9]:
print("Before dropping duplication: ", df_event.shape[0])
df_event.drop_duplicates(inplace=True)
df_event.reset_index(drop=True, inplace=True)
print("After dropping duplication: ", df_event.shape[0])

Before dropping duplication:  2756101
After dropping duplication:  2755641


In [10]:
df_event.groupby("event")["itemid"].count()

event
addtocart        68966
transaction      22457
view           2664218
Name: itemid, dtype: int64

Transform the unix timestamp to readable time

In [11]:
df_event["time"] = pd.to_datetime(df_event["timestamp"], unit = "ms")
df_event["time"].head()

0   2015-06-02 05:02:12.117
1   2015-06-02 05:50:14.164
2   2015-06-02 05:13:19.827
3   2015-06-02 05:12:35.914
4   2015-06-02 05:02:17.106
Name: time, dtype: datetime64[ns]

In [12]:
print("Start Date: ", df_event["time"].min())
print("Start Date: ", df_event["time"].max())

Start Date:  2015-05-03 03:00:04.384000
Start Date:  2015-09-18 02:59:47.788000


In [13]:
print(f"""
    There are {df_event['visitorid'].nunique()} distinct users.
    There are {df_event['transactionid'].nunique()} distinct transactions.
    There are {df_event['itemid'].nunique()} distinct items.
    """)


    There are 1407580 distinct users.
    There are 17672 distinct transactions.
    There are 235061 distinct items.
    


In [14]:
df_event['visitorid'].value_counts().describe().round(2)

count    1407580.00
mean           1.96
std           12.58
min            1.00
25%            1.00
50%            1.00
75%            2.00
max         7757.00
Name: count, dtype: float64

In [15]:
# identify abnormal users, windows shoppers and outliers

print(df_event['visitorid'].value_counts().head(10))
top20 = df_event['visitorid'].value_counts().nlargest(20).index.tolist()

for id in top20:
    view_time = df_event[(df_event['visitorid'] == id) & (df_event['event'] == "view")].shape[0]
    add_time = df_event[(df_event['visitorid'] == id) & (df_event['event'] == "addtocart")].shape[0]
    tran_time = df_event[(df_event['visitorid'] == id) & (df_event['event'] == "transaction")].shape[0]
    print("visitor with id", id, "have", view_time, "times of view, and","have", add_time, "times of add to cart, and", tran_time, "times of transactions.")

visitorid
1150086    7757
530559     4328
152963     3024
895999     2474
163561     2410
371606     2345
286616     2252
684514     2246
892013     2024
861299     1991
Name: count, dtype: int64
visitor with id 1150086 have 6479 times of view, and have 719 times of add to cart, and 559 times of transactions.
visitor with id 530559 have 3623 times of view, and have 419 times of add to cart, and 286 times of transactions.
visitor with id 152963 have 2304 times of view, and have 371 times of add to cart, and 349 times of transactions.
visitor with id 895999 have 2368 times of view, and have 56 times of add to cart, and 50 times of transactions.
visitor with id 163561 have 2194 times of view, and have 124 times of add to cart, and 92 times of transactions.
visitor with id 371606 have 2141 times of view, and have 110 times of add to cart, and 94 times of transactions.
visitor with id 286616 have 2057 times of view, and have 120 times of add to cart, and 75 times of transactions.
visitor wi

In [16]:
event_counts = df_event.groupby(['visitorid', 'event']).size().unstack(fill_value=0)
event_counts['total_events'] = event_counts.sum(axis=1)

active_users = event_counts[event_counts['total_events'] > 100]

abnormal_list = []

for id in active_users.index:
    view_time = event_counts.loc[id, 'view']
    add_time = event_counts.loc[id, 'addtocart']
    tran_time = event_counts.loc[id, 'transaction']

    if add_time == 0 or (tran_time != 0 and view_time / add_time >= 100 and view_time / tran_time >= 500):
        print(f"Visitor with ID {id} has {view_time} view(s), {add_time} add(s) to cart, and {tran_time} transaction(s).")
        abnormal_list.append(id)

print(f"Identified {len(abnormal_list)} abnormal users/windows shoppers/outliers.")

Visitor with ID 1722 has 105 view(s), 0 add(s) to cart, and 0 transaction(s).
Visitor with ID 1879 has 195 view(s), 0 add(s) to cart, and 0 transaction(s).
Visitor with ID 2194 has 115 view(s), 0 add(s) to cart, and 0 transaction(s).
Visitor with ID 9535 has 159 view(s), 0 add(s) to cart, and 0 transaction(s).
Visitor with ID 21395 has 145 view(s), 0 add(s) to cart, and 0 transaction(s).
Visitor with ID 24197 has 105 view(s), 0 add(s) to cart, and 0 transaction(s).
Visitor with ID 32252 has 170 view(s), 0 add(s) to cart, and 0 transaction(s).
Visitor with ID 54791 has 368 view(s), 0 add(s) to cart, and 0 transaction(s).
Visitor with ID 77177 has 118 view(s), 0 add(s) to cart, and 0 transaction(s).
Visitor with ID 78724 has 198 view(s), 0 add(s) to cart, and 0 transaction(s).
Visitor with ID 93504 has 133 view(s), 0 add(s) to cart, and 0 transaction(s).
Visitor with ID 95128 has 114 view(s), 0 add(s) to cart, and 0 transaction(s).
Visitor with ID 97112 has 108 view(s), 0 add(s) to cart,

In [17]:
# drop id in abnormal list
df_event_cleaned = df_event[~df_event['visitorid'].isin(abnormal_list)]

In [18]:
print(df_event_cleaned.groupby("event")["itemid"].count())
print(f"""
    There are {df_event_cleaned['visitorid'].nunique()} distinct users.
    There are {df_event_cleaned['transactionid'].nunique()} distinct transactions.
    There are {df_event_cleaned['itemid'].nunique()} distinct items.
    """)
df_event_cleaned['visitorid'].value_counts().describe().round(2)

event
addtocart        68962
transaction      22454
view           2636027
Name: itemid, dtype: int64

    There are 1407448 distinct users.
    There are 17669 distinct transactions.
    There are 233262 distinct items.
    


count    1407448.00
mean           1.94
std           12.29
min            1.00
25%            1.00
50%            1.00
75%            2.00
max         7757.00
Name: count, dtype: float64

In [19]:
print(f"""
    There are {df_cat['categoryid'].nunique()} distinct category id.
    There are {df_cat['parentid'].nunique()} distinct parent id.
    """)


    There are 1669 distinct category id.
    There are 362 distinct parent id.
    


In [20]:
df_event_tra_only = df_event_cleaned[df_event_cleaned['event'] == 'transaction']

print(f"""
Most Viewed Items: \n{df_event_cleaned['itemid'].value_counts().head()}
\nMost Active Users: \n{df_event_cleaned['visitorid'].value_counts().head()}
""")

print(f"""
Best Sold Items: \n{df_event_tra_only['itemid'].value_counts().head()}
\nMost Active Buyers: \n{df_event_tra_only['visitorid'].value_counts().head()}
""")


Most Viewed Items: 
itemid
187946    3411
461686    2971
5411      2330
370653    1854
219512    1798
Name: count, dtype: int64

Most Active Users: 
visitorid
1150086    7757
530559     4328
152963     3024
895999     2474
163561     2410
Name: count, dtype: int64


Best Sold Items: 
itemid
461686    133
119736     97
213834     92
312728     46
7943       46
Name: count, dtype: int64

Most Active Buyers: 
visitorid
1150086    559
152963     349
530559     286
684514     189
861299     188
Name: count, dtype: int64



In [21]:
df_event_cleaned["date"] = df_event_cleaned["time"].dt.date
df_event_cleaned["weekday"] = df_event_cleaned["time"].dt.day_name()
df_event_cleaned["hrs"] = df_event_cleaned["time"].dt.time
print(f"""
Date of activities most: \n{df_event_cleaned['date'].value_counts().head()}
\Activities of weekday: \n{df_event_cleaned['weekday'].value_counts()}
""")

df_event_tra_only["date"] = df_event_tra_only["time"].dt.date
df_event_tra_only["weekday"] = df_event_tra_only["time"].dt.day_name()
df_event_tra_only["hrs"] = df_event_tra_only["time"].dt.time
print(f"""
Date of sell most: \n{df_event_tra_only['date'].value_counts().head()}
\nSale of weekday: \n{df_event_tra_only['weekday'].value_counts()}
""")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event_cleaned["date"] = df_event_cleaned["time"].dt.date
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event_cleaned["weekday"] = df_event_cleaned["time"].dt.day_name()



Date of activities most: 
date
2015-07-26    32613
2015-07-27    28336
2015-07-25    27037
2015-05-18    26736
2015-05-19    26566
Name: count, dtype: int64
\Activities of weekday: 
weekday
Tuesday      441322
Monday       435451
Wednesday    426172
Thursday     413586
Friday       375076
Sunday       332526
Saturday     303310
Name: count, dtype: int64


Date of sell most: 
date
2015-06-16    276
2015-07-28    273
2015-06-17    269
2015-07-27    267
2015-07-22    266
Name: count, dtype: int64

Sale of weekday: 
weekday
Wednesday    4150
Tuesday      3973
Monday       3848
Thursday     3749
Friday       2928
Sunday       1995
Saturday     1811
Name: count, dtype: int64



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event_cleaned["hrs"] = df_event_cleaned["time"].dt.time
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event_tra_only["date"] = df_event_tra_only["time"].dt.date
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_event_tra_only["weekday"] = df_event_tra_only["time"].dt.day_name()
A value is tr

In [22]:
item_tra = df_event_cleaned[['visitorid', 'itemid', 'time']][df_event_cleaned['event'] == 'transaction']
item_add = df_event_cleaned[['visitorid', 'itemid', 'time']][df_event_cleaned['event'] == 'addtocart']
item_view = df_event_cleaned[['visitorid', 'itemid', 'time']][df_event_cleaned['event'] == 'view']

In [23]:
df_purchase = item_tra.merge(item_add, how='inner', on=['visitorid', 'itemid'], suffixes=['(transaction)', '(add to cart)'])
df_purchase = df_purchase.merge(item_view, how='inner', on=['visitorid', 'itemid'])
df_purchase = df_purchase.rename(columns={'time':'time(view)'})
df_purchase.head()

Unnamed: 0,visitorid,itemid,time(transaction),time(add to cart),time(view)
0,599528,356475,2015-06-02 05:17:56.276,2015-06-02 05:12:21.632,2015-06-02 05:11:36.102
1,599528,356475,2015-06-02 05:17:56.276,2015-06-02 05:12:21.632,2015-06-03 02:12:25.235
2,599528,356475,2015-06-02 05:17:56.276,2015-06-02 05:12:21.632,2015-06-06 21:10:10.602
3,599528,356475,2015-06-02 05:17:56.276,2015-06-02 05:12:21.632,2015-06-08 02:44:57.480
4,599528,356475,2015-06-02 05:17:56.276,2015-06-02 05:12:21.632,2015-06-10 00:34:37.794


In [24]:
# print("Before: ", df_purchase.shape[0])
# purchase_record = (df_purchase['time(transaction)'] - df_purchase['time(add to cart)']) > np.timedelta64(0, 's')
# tmp=df_purchase[purchase_record]
# print("After: ", tmp.shape[0])
# purchase_record = (tmp['time(add to cart)'] - tmp['time(view)']) > np.timedelta64(0, 's')
# tmp=tmp[purchase_record]
# print("Final: ", tmp.shape[0])
# tmp

print("Before: ", df_purchase.shape[0])
purchase_record = (df_purchase['time(transaction)'] - df_purchase['time(view)']) > np.timedelta64(0, 's')
tmp = df_purchase[purchase_record]
purchase_record2 = (tmp['time(transaction)'] - tmp['time(add to cart)']) > np.timedelta64(0, 's')
tmp = tmp[purchase_record2]
print("Final: ", tmp.shape[0])
tmp

Before:  75932
Final:  49424


Unnamed: 0,visitorid,itemid,time(transaction),time(add to cart),time(view)
0,599528,356475,2015-06-02 05:17:56.276,2015-06-02 05:12:21.632,2015-06-02 05:11:36.102
14,552148,81345,2015-06-01 21:25:15.008,2015-06-01 21:24:40.872,2015-06-01 21:22:28.663
16,189384,310791,2015-06-01 16:01:58.180,2015-06-01 15:59:01.347,2015-06-01 15:52:45.771
17,189384,310791,2015-06-01 16:01:58.180,2015-06-01 15:59:01.347,2015-06-01 15:53:23.671
18,350566,54058,2015-06-01 18:44:21.340,2015-06-01 18:41:49.097,2015-06-01 18:41:01.334
...,...,...,...,...,...
75926,1050575,31640,2015-07-31 21:12:56.570,2015-07-31 21:11:12.772,2015-07-31 21:08:45.248
75927,1050575,31640,2015-07-31 21:12:56.570,2015-07-31 21:11:12.772,2015-07-31 21:10:18.947
75929,855941,235771,2015-07-31 15:48:50.123,2015-07-31 15:47:51.106,2015-07-31 15:46:54.290
75930,548772,29167,2015-07-31 15:12:40.300,2015-07-31 15:10:26.076,2015-07-31 15:09:48.292


In [25]:
avg_view = tmp.groupby(['visitorid', 'itemid']).count()['time(view)'].mean()
print("Average number of view before purchase: {0:.2f}".format(avg_view))
tmp.groupby(['visitorid', 'itemid']).count()['time(view)'].describe()

Average number of view before purchase: 2.89


count    17118.000000
mean         2.887253
std          9.771779
min          1.000000
25%          1.000000
50%          1.000000
75%          3.000000
max        617.000000
Name: time(view), dtype: float64

In [26]:
tmp.groupby(['visitorid', 'itemid']).count()['time(view)'].value_counts()

time(view)
1      8753
2      3861
3      1562
4       962
5       446
       ... 
56        1
48        1
45        1
59        1
190       1
Name: count, Length: 85, dtype: int64

In [27]:
# Display the time(view)

tmp_count = tmp.groupby(['visitorid', 'itemid']).count()['time(view)'].value_counts()
tmp_count_df = tmp_count.reset_index()
tmp_count.columns = ['time(view)', 'count']
tmp_count_df = tmp_count_df.sort_values(by=['count', 'time(view)'], ascending=[False, True])
print(tmp_count_df.to_string(index=False))



 time(view)  count
          1   8753
          2   3861
          3   1562
          4    962
          5    446
          6    419
          8    227
          7    169
         10    114
          9     92
         12     78
         14     55
         11     48
         16     31
         13     28
         15     25
         20     23
         17     19
         26     17
         22     16
         18     14
         24     12
         21      9
         30      9
         19      8
         23      7
         38      7
         27      6
         29      6
         32      6
         36      6
         28      5
         33      4
         25      3
         31      3
         39      3
         42      3
         44      3
         50      3
         75      3
         37      2
         41      2
         51      2
         54      2
         55      2
         62      2
         63      2
         72      2
         34      1
         35      1
         40      1
         43 

In [28]:
tmp_count_df.to_csv("dataset/view before transaction.csv", index=False)

In [29]:
# Devide to single view and multi view

multi_view = tmp.duplicated(subset=['visitorid', 'itemid'], keep=False)
single_view = tmp[~multi_view]
multi_view = tmp[multi_view]

# Not count last
# not_last = multi_view.duplicated(subset=['visitorid', 'itemid'], keep='last')
# avg_view = multi_view[not_last].groupby(['visitorid', 'itemid']).count()['time(view)'].mean()
avg_view = multi_view.groupby(['visitorid', 'itemid']).count()['time(view)'].mean()
print("Average number of view before purchase(>1): {0:.2f}".format(avg_view))
multi_view.groupby(['visitorid', 'itemid']).count()['time(view)'].describe()

Average number of view before purchase(>1): 4.86


count    8365.000000
mean        4.862044
std        13.703602
min         2.000000
25%         2.000000
50%         3.000000
75%         4.000000
max       617.000000
Name: time(view), dtype: float64

# Time before Purchase

In [30]:
def printTime(df, word1, word2):
    time_diff = df['time(transaction)'] - df['time(view)']
    avg_time = time_diff.mean()
    days = avg_time.days
    totalsec = avg_time.seconds
    hrs, tmp = divmod(totalsec, 3600)
    mins, sec = divmod(tmp, 60)
    print('For %s, Average Time from %s to transaction: %s days %s hours %s minutes %s seconds.' % (word1, word2, days, hrs, mins, sec))

# Time for instant purchase
printTime(single_view, "Instant Puchase", "Last View")

# Time for non-instant purchase

# Sort the dataframe
multi_view = multi_view.sort_values(['visitorid', 'itemid', 'time(view)'])
multi_view = multi_view.reset_index(drop=True)
first_view = multi_view[~(multi_view.duplicated(subset=['visitorid', 'itemid'], keep='first'))]

last = multi_view[~(multi_view.duplicated(subset=['visitorid', 'itemid'], keep='last'))]
printTime(first_view, "Non-Instant Puchase", "First View")
printTime(last, "Non-Instant Puchase", "Last View")


For Instant Puchase, Average Time from Last View to transaction: 0 days 2 hours 0 minutes 3 seconds.
For Non-Instant Puchase, Average Time from First View to transaction: 2 days 3 hours 9 minutes 32 seconds.
For Non-Instant Puchase, Average Time from Last View to transaction: 0 days 3 hours 32 minutes 4 seconds.


# Content Based Recommendation

In [31]:
# Data Precession
# Drop timestamp and 'available'


df_prop_CBR = df_prop.drop(columns=['timestamp'])
df_prop_CBR = df_prop_CBR[df_prop_CBR['property'] != 'available']
df_prop_CBR

Unnamed: 0,itemid,property,value
0,460429,categoryid,1338
1,206783,888,1116713 960601 n277.200
2,395014,400,n552.000 639502 n720.000 424566
3,59481,790,n15360.000
4,156781,917,828513
...,...,...,...
10999994,86599,categoryid,618
10999995,153032,1066,n1020.000 424566
10999996,421788,888,35975 856003 37346
10999997,159792,400,n552.000 639502 n720.000 424566


In [32]:
df_prop_CBR.drop_duplicates(inplace=True)
df_prop_CBR


Unnamed: 0,itemid,property,value
0,460429,categoryid,1338
1,206783,888,1116713 960601 n277.200
2,395014,400,n552.000 639502 n720.000 424566
3,59481,790,n15360.000
4,156781,917,828513
...,...,...,...
10999990,385426,846,769062
10999993,70776,283,160795 283258 981281 829153 787461 333798 2832...
10999994,86599,categoryid,618
10999995,153032,1066,n1020.000 424566


In [33]:
print(df_prop_CBR['itemid'].nunique())
print((df_prop_CBR['property'] == 'categoryid').value_counts())

417053
property
False    6519575
True      244728
Name: count, dtype: int64


Although about 40% items do not have category id, but not all items are appeard in event, so we need further checking...

In [34]:
categoryid_items = df_prop_CBR[df_prop_CBR['property'] == 'categoryid']['itemid'].unique()
items_event = df_event_cleaned['itemid'].drop_duplicates()
matching_items = items_event[items_event.isin(categoryid_items)]

print("No. of items in event that have category id:", len(matching_items))
print("No. of items in event that have not category id:", (len(items_event) - len(matching_items)))

No. of items in event that have category id: 101623
No. of items in event that have not category id: 131639


Ok... More than 50% items do not have category id, so I will drop this feature

In [35]:
df_prop_CBR = df_prop_CBR[df_prop_CBR['property'] != 'categoryid']
df_prop_CBR


Unnamed: 0,itemid,property,value
1,206783,888,1116713 960601 n277.200
2,395014,400,n552.000 639502 n720.000 424566
3,59481,790,n15360.000
4,156781,917,828513
6,89534,213,1121373
...,...,...,...
10999989,56123,348,452320 853231
10999990,385426,846,769062
10999993,70776,283,160795 283258 981281 829153 787461 333798 2832...
10999995,153032,1066,n1020.000 424566


In [36]:
# Feature engineering: Aggre propert and value

df_prop_CBR2 = df_prop_CBR.groupby(['itemid']).agg({'property': ' '.join, 'value': ' '.join}).reset_index()
df_prop_CBR2



Unnamed: 0,itemid,property,value
0,0,112 283 227 6 1056 225 189 6 776 917 888 364 1...,679677 66094 372274 478989 1152934 1238769 115...
1,1,296 59 813 33 790 790 790 790 185 764 839 284 ...,866110 769062 814966 1128577 1000087 421694 n5...
2,2,282 332 159 283 443 790 641 877,n192.000 145688 n72.000 519769 822092 325894 5...
3,3,159 678 1080 283 250 227 689 562 283 459 888 9...,519769 327918 769062 138228 150169 1182824 327...
4,4,115 897 28 839 202 698 689 764 678 776 888,n24.000 324209 150169 176547 508476 371058 714...
...,...,...,...
417048,466862,413 159 614 764 227 839 917 888 810,769062 519769 1102430 1285872 1102430 150169 7...
417049,466863,790 96 1014 917 364 227 928 624 685 283 112 40...,n43320.000 n204.000 1075463 1159 209123 111599...
417050,466864,790 230 790 790 764 790 790 152 790 790 813 6 ...,n81000.000 1298303 n105000.000 n129600.000 128...
417051,466865,839 928 364 689 888 994 6 28 112 888 159 642 2...,1014333 1154859 575774 150169 780351 150169 78...


In [37]:
df_prop_CBR2.isnull().value_counts()

itemid  property  value
False   False     False    417053
Name: count, dtype: int64

In [38]:
# Sample too large....So use only 1/10
sample_size = max(1, len(df_prop_CBR2) // 10)
df_prop_CBR2_sampled = df_prop_CBR2.sample(n = sample_size, random_state = 37)

# Limited input
tfidf = TfidfVectorizer(min_df = 500, max_df = 0.7)
value_tfidf = tfidf.fit_transform(df_prop_CBR2_sampled['value'])

# Since property vary over time and we have no ram...so I decide not to use property
final_df_CBR = pd.concat([df_prop_CBR2_sampled[['itemid']], pd.DataFrame(value_tfidf.toarray(), columns=tfidf.get_feature_names_out())], axis=1)


In [39]:
tfidf_df = pd.DataFrame(value_tfidf.toarray(), columns=tfidf.get_feature_names_out())
tfidf_df.index = df_prop_CBR2_sampled.index
tfidf_df

Unnamed: 0,1000087,1002405,1020770,1030323,10317,1037891,1051562,1055803,1065616,108347,...,n3840,n48,n60,n600,n6000,n72,n720,n84,n9216,n96
266104,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
293385,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.380836,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
365409,0.0,0.0,0.0,0.0,0.147520,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
285967,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
45361,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230558,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
163944,0.0,0.0,0.0,0.0,0.425565,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.544513,0.0,0.0,0.0
208619,0.0,0.0,0.0,0.0,0.276869,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
126401,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0


In [40]:
CBR_result = cosine_similarity(tfidf_df.iloc[:, 1:])
CBR_result

array([[1.        , 0.26479574, 0.06290316, ..., 0.0108911 , 0.29338723,
        0.33007943],
       [0.26479574, 1.        , 0.18072076, ..., 0.11674878, 0.29691512,
        0.32257883],
       [0.06290316, 0.18072076, 1.        , ..., 0.04084371, 0.06896945,
        0.22644518],
       ...,
       [0.0108911 , 0.11674878, 0.04084371, ..., 1.        , 0.0143297 ,
        0.02665029],
       [0.29338723, 0.29691512, 0.06896945, ..., 0.0143297 , 1.        ,
        0.36980864],
       [0.33007943, 0.32257883, 0.22644518, ..., 0.02665029, 0.36980864,
        1.        ]])

In [41]:
parent_counts = df_cat.groupby('parentid')['categoryid'].count()
valid_parents = parent_counts[parent_counts >= 8].index
category_lists = [df_cat[df_cat['parentid'] == parentid]['categoryid'].tolist() for parentid in valid_parents]


similar_items_with_scores = []

for category_list in category_lists:
    found_items = df_prop_CBR2_sampled[df_prop_CBR2_sampled['itemid'].isin(category_list)]['itemid'].tolist()
    
    if len(found_items) >= 2: 
        indices = [df_prop_CBR2_sampled[df_prop_CBR2_sampled['itemid'] == item].index[0] for item in found_items]
        
        extracted_sim = CBR_result[indices, :][:, indices]
        
        similar_items_with_scores.append((found_items, extracted_sim))

print("Similar Items Found (with 2 or more itemids) and their Cosine Similarity:")
for items, scores in similar_items_with_scores:
    print(f"Items: {items}")
    print(f"Cosine Similarity Matrix:\n{scores}\n")

Similar Items Found (with 2 or more itemids) and their Cosine Similarity:
Items: [1261, 1282]
Cosine Similarity Matrix:
[[1.         0.08357705]
 [0.08357705 1.        ]]

Items: [563, 1642, 1411]
Cosine Similarity Matrix:
[[1.        0.        0.       ]
 [0.        1.        0.0949806]
 [0.        0.0949806 1.       ]]

Items: [1420, 678]
Cosine Similarity Matrix:
[[1.         0.08145747]
 [0.08145747 1.        ]]

Items: [743, 1658]
Cosine Similarity Matrix:
[[1.         0.01194025]
 [0.01194025 1.        ]]

Items: [0, 710]
Cosine Similarity Matrix:
[[1.         0.06688857]
 [0.06688857 1.        ]]

Items: [1596, 92]
Cosine Similarity Matrix:
[[1.         0.01530301]
 [0.01530301 1.        ]]

Items: [488, 898, 850, 775]
Cosine Similarity Matrix:
[[1.         0.13557414 0.         0.07960708]
 [0.13557414 1.         0.         0.14749958]
 [0.         0.         1.         0.01041475]
 [0.07960708 0.14749958 0.01041475 1.        ]]

Items: [451, 1001, 303]
Cosine Similarity Matrix

# Result for Content Based Recommendation
Seem result is not good, maybe same parentid ndo not mean they are similar

In [42]:
df_event_cleaned.to_csv('dataset/Event Cleaned.csv', index=False)

# Collaborative Filtering

In [43]:
df_event_cleaned

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,time,date,weekday,hrs
0,1433221332117,257597,view,355908,,2015-06-02 05:02:12.117,2015-06-02,Tuesday,05:02:12.117000
1,1433224214164,992329,view,248676,,2015-06-02 05:50:14.164,2015-06-02,Tuesday,05:50:14.164000
2,1433221999827,111016,view,318965,,2015-06-02 05:13:19.827,2015-06-02,Tuesday,05:13:19.827000
3,1433221955914,483717,view,253185,,2015-06-02 05:12:35.914,2015-06-02,Tuesday,05:12:35.914000
4,1433221337106,951259,view,367447,,2015-06-02 05:02:17.106,2015-06-02,Tuesday,05:02:17.106000
...,...,...,...,...,...,...,...,...,...
2755636,1438398785939,591435,view,261427,,2015-08-01 03:13:05.939,2015-08-01,Saturday,03:13:05.939000
2755637,1438399813142,762376,view,115946,,2015-08-01 03:30:13.142,2015-08-01,Saturday,03:30:13.142000
2755638,1438397820527,1251746,view,78144,,2015-08-01 02:57:00.527,2015-08-01,Saturday,02:57:00.527000
2755639,1438398530703,1184451,view,283392,,2015-08-01 03:08:50.703,2015-08-01,Saturday,03:08:50.703000


# Data Preprocessing
There are many features I can choose. Due to limitation of resource, I will use ratio of frequency of action as he only features.

In [44]:
# Drop useless columns
df_event_Col = df_event_cleaned.drop(columns=['timestamp', 'event', 'itemid', 'transactionid', 'time', 'date', 'hrs'])
df_event_Col


Unnamed: 0,visitorid,weekday
0,257597,Tuesday
1,992329,Tuesday
2,111016,Tuesday
3,483717,Tuesday
4,951259,Tuesday
...,...,...
2755636,591435,Saturday
2755637,762376,Saturday
2755638,1251746,Saturday
2755639,1184451,Saturday


In [45]:
# Feature Engineering

total_counts = df_event_Col.groupby('visitorid')['weekday'].count()
weekday_counts = df_event_Col.groupby(['visitorid', 'weekday']).size().unstack(fill_value=0)
weekday_proportions = weekday_counts.div(total_counts, axis=0)
df_event_Col = df_event_Col.merge(weekday_proportions, on='visitorid', how='left', suffixes=('', '_proportion'))
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
proportion_columns = [day for day in weekday_order if day in weekday_proportions.columns]
df_event_Col2 = df_event_Col[['visitorid'] + proportion_columns].drop_duplicates()

df_event_Col2

Unnamed: 0,visitorid,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
0,257597,0.500000,0.500000,0.000000,0.0,0.000000,0.000000,0.000000
1,992329,0.266667,0.133333,0.066667,0.3,0.166667,0.033333,0.033333
2,111016,0.000000,0.500000,0.000000,0.0,0.000000,0.000000,0.500000
3,483717,0.000000,1.000000,0.000000,0.0,0.000000,0.000000,0.000000
4,951259,0.000000,1.000000,0.000000,0.0,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...
2727425,1392454,0.000000,0.000000,0.000000,0.0,0.000000,1.000000,0.000000
2727435,226214,0.000000,0.000000,0.000000,0.0,0.000000,1.000000,0.000000
2727438,591435,0.000000,0.000000,0.000000,0.0,0.000000,1.000000,0.000000
2727439,762376,0.000000,0.000000,0.000000,0.0,0.000000,1.000000,0.000000


In [46]:
total_counts2 = total_counts.reset_index(name='Activity Count')
df_event_Col2 = df_event_Col2.merge(total_counts2, on='visitorid', how='left') 
df_event_Col2

Unnamed: 0,visitorid,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,Activity Count
0,257597,0.500000,0.500000,0.000000,0.0,0.000000,0.000000,0.000000,2
1,992329,0.266667,0.133333,0.066667,0.3,0.166667,0.033333,0.033333,30
2,111016,0.000000,0.500000,0.000000,0.0,0.000000,0.000000,0.500000,2
3,483717,0.000000,1.000000,0.000000,0.0,0.000000,0.000000,0.000000,3
4,951259,0.000000,1.000000,0.000000,0.0,0.000000,0.000000,0.000000,1
...,...,...,...,...,...,...,...,...,...
1407443,1392454,0.000000,0.000000,0.000000,0.0,0.000000,1.000000,0.000000,1
1407444,226214,0.000000,0.000000,0.000000,0.0,0.000000,1.000000,0.000000,1
1407445,591435,0.000000,0.000000,0.000000,0.0,0.000000,1.000000,0.000000,1
1407446,762376,0.000000,0.000000,0.000000,0.0,0.000000,1.000000,0.000000,1


In [47]:
# No Ram...
sample_size2 = max(1, len(df_event_Col2) // 100)
df_event_Col2_sampled = df_event_Col2.sample(n = sample_size2, random_state = 37)

user_features = df_event_Col2_sampled.drop(columns=['visitorid'])

scaler = MinMaxScaler()
user_features_scaled = scaler.fit_transform(user_features)

similarity_matrix = cosine_similarity(user_features_scaled)
df_similarity = pd.DataFrame(similarity_matrix, index=df_event_Col2_sampled['visitorid'], columns=df_event_Col2_sampled['visitorid'])

df_similarity

visitorid,64865,857996,979037,467967,1033321,1220775,1119204,30129,16074,310227,...,635437,949037,1247105,676663,169835,1264138,587756,1316221,690564,643891
visitorid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
64865,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.000000,0.0,...,0.000000,0.000000,0.707102,0.0,0.000000,0.0,1.0,0.0,0.0,0.0
857996,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000019,0.0,...,0.999997,0.999997,0.000009,0.0,0.413710,0.0,0.0,0.0,0.0,0.0
979037,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,1.0,0.901993,0.0,0.0,0.0,0.0,0.0
467967,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.000000,0.0,...,0.000000,0.000000,0.707102,0.0,0.000000,0.0,1.0,0.0,0.0,0.0
1033321,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.000000,0.0,...,0.000000,0.000000,0.707102,0.0,0.000000,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1264138,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.999972,0.0,...,0.000000,0.000000,0.707102,0.0,0.000000,1.0,0.0,1.0,0.0,1.0
587756,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.000000,0.0,...,0.000000,0.000000,0.707102,0.0,0.000000,0.0,1.0,0.0,0.0,0.0
1316221,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.999972,0.0,...,0.000000,0.000000,0.707102,0.0,0.000000,1.0,0.0,1.0,0.0,1.0
690564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,1.0,...,0.000000,0.000000,0.000000,0.0,0.037583,0.0,0.0,0.0,1.0,0.0


In [48]:
user_features2 = df_event_Col2_sampled.drop(columns=['visitorid', 'Activity Count'])
similarity_matrix2 = cosine_similarity(user_features2)
df_similarity2 = pd.DataFrame(similarity_matrix2, index=df_event_Col2_sampled['visitorid'], columns=df_event_Col2_sampled['visitorid'])
df_similarity2

visitorid,64865,857996,979037,467967,1033321,1220775,1119204,30129,16074,310227,...,635437,949037,1247105,676663,169835,1264138,587756,1316221,690564,643891
visitorid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
64865,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.707107,0.0,0.000000,0.0,1.0,0.0,0.0,0.0
857996,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.000000,0.0,0.416356,0.0,0.0,0.0,0.0,0.0
979037,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,1.0,0.908413,0.0,0.0,0.0,0.0,0.0
467967,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.707107,0.0,0.000000,0.0,1.0,0.0,0.0,0.0
1033321,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.707107,0.0,0.000000,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1264138,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.707107,0.0,0.000000,1.0,0.0,1.0,0.0,1.0
587756,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.707107,0.0,0.000000,0.0,1.0,0.0,0.0,0.0
1316221,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.707107,0.0,0.000000,1.0,0.0,1.0,0.0,1.0
690564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.000000,0.0,0.037851,0.0,0.0,0.0,1.0,0.0


In [49]:
# Seems too many same users (avtivity count = 1), so we only need active users

df_event_Col3 = df_event_Col2[df_event_Col2['Activity Count'] >= 30]
df_event_Col3

Unnamed: 0,visitorid,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,Activity Count
1,992329,0.266667,0.133333,0.066667,0.300000,0.166667,0.033333,0.033333,30
102,712443,0.280000,0.380000,0.040000,0.000000,0.000000,0.000000,0.300000,50
106,492414,0.391304,0.260870,0.043478,0.021739,0.065217,0.065217,0.152174,46
174,85734,0.201916,0.252567,0.234086,0.151951,0.158111,0.001369,0.000000,1461
176,820159,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,159
...,...,...,...,...,...,...,...,...,...
1378386,272883,0.000000,0.000000,0.951220,0.048780,0.000000,0.000000,0.000000,41
1378580,1296849,0.000000,0.000000,0.333333,0.633333,0.033333,0.000000,0.000000,30
1379191,1186695,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,59
1398732,1271070,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,34


In [50]:
# Only 3038 row, no need sampling...

user_features3 = df_event_Col3.drop(columns=['visitorid', 'Activity Count'])
similarity_matrix3 = cosine_similarity(user_features3)
df_similarity3 = pd.DataFrame(similarity_matrix3, index=df_event_Col3['visitorid'], columns=df_event_Col3['visitorid'])
df_similarity3

visitorid,992329,712443,492414,85734,820159,1185234,121688,73449,286616,1141573,...,1080047,321766,958164,977340,354959,272883,1296849,1186695,1271070,533590
visitorid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
992329,1.000000,0.532854,0.714299,0.832385,0.577350,0.804775,0.577350,0.885796,0.753800,0.871018,...,0.218002,0.288675,0.660473,0.355810,0.144338,0.177413,0.658089,0.649519,0.072169,0.360844
712443,0.532854,1.000000,0.904081,0.633850,0.499363,0.683658,0.499363,0.457995,0.849922,0.189645,...,0.124758,0.677708,0.000000,0.067121,0.071338,0.071244,0.033189,0.000000,0.000000,0.000000
492414,0.714299,0.904081,1.000000,0.733547,0.774597,0.755925,0.774597,0.557257,0.897879,0.327183,...,0.129991,0.516398,0.047045,0.095555,0.086066,0.088157,0.084087,0.043033,0.129099,0.129099
85734,0.832385,0.633850,0.733547,1.000000,0.443321,0.970222,0.443321,0.921680,0.763006,0.588785,...,0.581295,0.554527,0.344298,0.596574,0.513952,0.530364,0.550168,0.333618,0.003006,0.347143
820159,0.577350,0.499363,0.774597,0.443321,1.000000,0.477827,1.000000,0.337770,0.528020,0.292425,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272883,0.177413,0.071244,0.088157,0.530364,0.000000,0.525991,0.000000,0.549237,0.133747,0.153849,...,0.996428,0.000000,0.051190,0.957000,0.998688,1.000000,0.509903,0.051215,0.000000,0.000000
1296849,0.658089,0.033189,0.084087,0.550168,0.000000,0.568824,0.000000,0.826627,0.307071,0.880604,...,0.532561,0.000000,0.884982,0.737155,0.465242,0.509903,1.000000,0.883960,0.000000,0.046524
1186695,0.649519,0.000000,0.043033,0.333618,0.000000,0.370316,0.000000,0.650703,0.282079,0.930443,...,0.079493,0.000000,0.999512,0.338719,0.000000,0.051215,0.883960,1.000000,0.000000,0.000000
1271070,0.072169,0.000000,0.129099,0.003006,0.000000,0.000000,0.000000,0.000000,0.376440,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000


In [51]:
# Pick some to check

# Highly similar
df_event_Col3[df_event_Col3['visitorid'].isin([992329, 492414, 85734])]

Unnamed: 0,visitorid,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,Activity Count
1,992329,0.266667,0.133333,0.066667,0.3,0.166667,0.033333,0.033333,30
106,492414,0.391304,0.26087,0.043478,0.021739,0.065217,0.065217,0.152174,46
174,85734,0.201916,0.252567,0.234086,0.151951,0.158111,0.001369,0.0,1461


In [52]:
# Not similar
df_event_Col3[df_event_Col3['visitorid'].isin([992329, 1080047, 321766])]

Unnamed: 0,visitorid,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,Activity Count
1,992329,0.266667,0.133333,0.066667,0.3,0.166667,0.033333,0.033333,30
1367805,1080047,0.0,0.068966,0.862069,0.068966,0.0,0.0,0.0,58
1367854,321766,0.0,1.0,0.0,0.0,0.0,0.0,0.0,33


In [67]:
# Simple Recommendation System

target_user_id = 992329
similarity_scores = df_similarity3[target_user_id]
similar_users = similarity_scores[similarity_scores > 0.5]
sorted_similar_users = similar_users.sort_values(ascending=False)

# drop first row since it will be user himself
sorted_similar_users = sorted_similar_users.iloc[1:]
sorted_similar_users.head()

visitorid
311205     0.977726
1284779    0.975541
1080579    0.974428
1175225    0.974158
895999     0.966107
Name: 992329, dtype: float64

In [66]:
recommendation_list = []

target_user_items = set(tmp[tmp['visitorid'] == target_user_id]['itemid'])

for user_id in sorted_similar_users.index:
    # Check is this user buy something
    if user_id in tmp['visitorid'].values:
        user_items = tmp[tmp['visitorid'] == user_id]['itemid'].tolist()
        
        for item in user_items:
            # Not recommend this item if target user bought this item
            if item not in target_user_items:
                recommendation_list.append(item)
                break  
    
    # recommend 10 items only
    if len(recommendation_list) >= 10:
        break

print("Recommended items:", recommendation_list)

Recommended items: [437490, 40484, 227311, 8786, 349318, 420238, 396732, 372188, 352788, 294242]
