## This exercise is about using similarity for user-based and item-based collaborative filtering (CF)

- The dataset is a small example recording whether each customer is interested in each item: 1 for yes and 0 for no.
- Jaccard similarity is used for both user-based and item-based CF.

In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import jaccard_score
from sklearn.metrics.pairwise import pairwise_distances

In [2]:
# Row labels (customers) and column labels (products) of the interest matrix.
users = [f'User{number}' for number in range(1, 6)]
items = [f'Item {letter}' for letter in 'ABCDE']

# Binary interest matrix: one row per user, one column per item
# (Item A .. Item E); 1 = interested, 0 = not interested.
dataset = [
    [1, 0, 1, 1, 0],  # User1
    [1, 0, 0, 1, 1],  # User2
    [1, 0, 1, 0, 0],  # User3
    [0, 1, 0, 1, 1],  # User4
    [1, 1, 1, 0, 1],  # User5
]

In [3]:
# Wrap the raw 0/1 matrix in a labelled frame: one row per user, one column per item.
df = pd.DataFrame(
    data=dataset,
    columns=items,
    index=users,
)

In [4]:
# Display the user x item interest matrix (1 = interested, 0 = not).
df

Unnamed: 0,Item A,Item B,Item C,Item D,Item E
User1,1,0,1,1,0
User2,1,0,0,1,1
User3,1,0,1,0,0
User4,0,1,0,1,1
User5,1,1,1,0,1


#### Calculate the similarity score of two users with `jaccard_score`

In [5]:
# Jaccard similarity of two users' 0/1 interest vectors:
# (items both are interested in) / (items at least one is interested in).
user1_flags = df.loc['User1']
user2_flags = df.loc['User2']
jaccard_score(user1_flags, user2_flags)

0.5

#### User-based CF

In [6]:
# User-user Jaccard similarity (1 - Jaccard distance) between the rows of df.
# The 0/1 matrix is cast to bool explicitly: scikit-learn's 'jaccard' metric is
# a boolean metric and otherwise converts integer input itself, emitting a
# DataConversionWarning.
user_flags = df.to_numpy(dtype=bool)
user_similar = 1 - pairwise_distances(user_flags, metric='jaccard')
# Label both axes from df.index so the labels always match the rows compared.
user_similar = pd.DataFrame(user_similar, columns=df.index, index=df.index)
user_similar



Unnamed: 0,User1,User2,User3,User4,User5
User1,1.0,0.5,0.666667,0.2,0.4
User2,0.5,1.0,0.25,0.5,0.4
User3,0.666667,0.25,1.0,0.0,0.5
User4,0.2,0.5,0.0,1.0,0.4
User5,0.4,0.4,0.5,0.4,1.0


In [7]:
# For every user, rank the other users by similarity and keep the two most similar.
topN_users = {
    user: list(
        user_similar.loc[user]
        .drop([user])                  # a user is never their own neighbour
        .sort_values(ascending=False)  # most similar first
        .index[:2]
    )
    for user in user_similar.index
}

In [8]:
# Show the two most similar users found for each user.
topN_users

{'User1': ['User3', 'User2'],
 'User2': ['User4', 'User1'],
 'User3': ['User1', 'User5'],
 'User4': ['User2', 'User5'],
 'User5': ['User3', 'User4']}

In [9]:
# Use the top 2 similar users to build each user's recommendation list:
# everything the neighbours are interested in, minus what the user already has.

# Resolve every user's items once from a boolean mask instead of re-deriving
# them through replace/dropna on each neighbour lookup. Zero and NaN cells are
# both excluded, matching the previous replace(0, np.nan).dropna() behaviour.
interest_mask = df.ne(0) & df.notna()
liked_items = {
    user: set(interest_mask.columns[flags.to_numpy()])
    for user, flags in interest_mask.iterrows()
}

rs_results = {}
for user, sim_users in topN_users.items():
    # Pool the items of all neighbours (an empty set if there are none).
    candidates = set().union(*(liked_items[sim_user] for sim_user in sim_users))
    # Remove the items the user already has.
    rs_results[user] = candidates - liked_items[user]

In [10]:
# Show the recommended items per user from the user-based approach.
rs_results

{'User1': {'Item E'},
 'User2': {'Item B', 'Item C'},
 'User3': {'Item B', 'Item D', 'Item E'},
 'User4': {'Item A', 'Item C'},
 'User5': {'Item D'}}

#### Item-based CF

In [11]:
# Item-item Jaccard similarity (1 - Jaccard distance) between the columns of df
# (df is transposed so that each item becomes a row).
# The 0/1 matrix is cast to bool explicitly: scikit-learn's 'jaccard' metric is
# a boolean metric and otherwise converts integer input itself, emitting a
# DataConversionWarning.
item_flags = df.T.to_numpy(dtype=bool)
item_similar = 1 - pairwise_distances(item_flags, metric='jaccard')
# Label both axes from df.columns so the labels always match the columns compared.
item_similar = pd.DataFrame(item_similar, columns=df.columns, index=df.columns)
item_similar



Unnamed: 0,Item A,Item B,Item C,Item D,Item E
Item A,1.0,0.2,0.75,0.4,0.4
Item B,0.2,1.0,0.25,0.25,0.666667
Item C,0.75,0.25,1.0,0.2,0.2
Item D,0.4,0.25,0.2,1.0,0.5
Item E,0.4,0.666667,0.2,0.5,1.0


In [12]:
# For every item, rank the other items by similarity and keep the two most similar.
topN_items = {
    item: list(
        item_similar.loc[item]
        .drop([item])                  # an item is never its own neighbour
        .sort_values(ascending=False)  # most similar first
        .index[:2]
    )
    for item in item_similar.index
}

In [13]:
# Show the two most similar items found for each item.
topN_items

{'Item A': ['Item C', 'Item E'],
 'Item B': ['Item E', 'Item D'],
 'Item C': ['Item A', 'Item B'],
 'Item D': ['Item E', 'Item A'],
 'Item E': ['Item B', 'Item D']}

In [18]:
# Build the item-based CF: for each user, recommend the items most similar to
# the ones they already have, excluding those they already have.
# NOTE: this reassigns rs_results, replacing the user-based results computed earlier.
rs_results = {}

for user, row in df.iterrows():  # iterate user by user
    # Non-zero, non-missing entries are the items this user already has.
    # Computed once and reused for both the neighbour lookup and the final
    # filter (previously derived twice via replace(0, np.nan).dropna()).
    owned = set(row[row.ne(0) & row.notna()].index)

    # Every owned item contributes its top-2 most similar items.
    candidates = set()
    for item in owned:
        candidates.update(topN_items[item])

    # Remove the items already bought.
    rs_results[user] = candidates - owned

In [19]:
# Show the recommended items per user from the item-based approach.
rs_results

{'User1': {'Item B', 'Item E'},
 'User2': {'Item B', 'Item C'},
 'User3': {'Item B', 'Item E'},
 'User4': {'Item A'},
 'User5': {'Item D'}}