# Recommendations based on Jaccard index
In this Notebook you will write the function that can be implemented in the Streamlit app. The actual code is already implemented in the streamlit app if you want to be inspired.

### 1. Loading the data
Load a subset from the ratings dataset.

In [2]:
import pandas as pd
df = pd.read_csv('./data/BX-Book-Ratings-Subset.csv', sep=';', encoding='latin-1')

In [3]:
users = df.groupby('User-ID')['ISBN'].apply(list)
users

User-ID
243       [0060915544, 0060977493, 0316601950, 031677696...
254       [0064471047, 0142001740, 0380789035, 038097365...
638       [0316603287, 0316666343, 0316693006, 031669320...
805       [0060256672, 0060928336, 0140077022, 015602732...
882       [0064401847, 0373218400, 0373484224, 038001817...
                                ...                        
278194    [0060926317, 0140280553, 0156011042, 034542313...
278535    [0061094129, 0425178765, 0425182878, 044022159...
278582    [0312980140, 0312983298, 0312983867, 031299045...
278633    [0060987561, 0140244824, 014028009X, 014029629...
278843    [0060173289, 0060517794, 014028009X, 014200020...
Name: ISBN, Length: 1826, dtype: object

In [4]:
users.items
users[98783]

['0345313860',
 '0345334531',
 '0345337662',
 '0345339703',
 '0345351525',
 '034538475X',
 '0345397819',
 '0553262505',
 '0553583468',
 '0553584375']

In [8]:
from scipy.spatial import distance
# id = 98783
# for user, value in users.items():
#     intersection = len(user(set(value).intersection(users[98783])))
#     union = (len(user.value) + len(users[98783].value) - intersection)
#     distance =  float(intersection) / union
#     print(distance)

### 2. Generating the recommendations

In [9]:
import itertools
from scipy.spatial import distance

# user we want to check
id = 98783


def get_jaccard_recommendations(id):
  # create lists per user
  users = df.groupby('User-ID')['ISBN'].apply(list)
  
  new_content = []
  similar_users = []

  for user, value in users.items():
    a = set(users[id])
    b = set(users[user])
    new = b.difference(a)

    distances = float(len(a.intersection(b))) / len(a.union(b))

    # distances = distance.jaccard(users[user], users[íd])
    # intersection = len(user(set(value).intersection(users['íd'])))
    # union = (len(user.value) + len(users['íd'].value) - intersection)
    # distance =  float(intersection) / union
    
    # tweak this parameter. Closer to 0.0 is more the same. 0.0 is the user.
    if distances < 0.8 and distances != 0.0:
      #get the differences in sets (ISBN) from the  selected user and user in the for-loop
      # add these differences to new_content
      new_content.append(new)
      similar_users.append(user)

      # add the user to similiar_users
      similar_users.append(user)

  # flatten the list with the sets
  new_content = list(itertools.chain(*new_content))

  df_recommendations = df[df['User-ID'].isin(similar_users) & df['ISBN'].isin(new_content)]

  df_recommendations.sort_values('Book-Rating', ascending=False)
  print(df_recommendations)

  return df_recommendations

get_jaccard_recommendations(id)

       User-ID        ISBN  Book-Rating
290       2033  0060248025           10
291       2033  0060256737           10
292       2033  0140386645            8
293       2033  0142000663           10
295       2033  0439064864            9
...        ...         ...          ...
41054   276050  0553279912            7
41055   276050  0553377868            7
41056   276050  0671021001            9
41057   276050  067102423X            8
41058   276050  0679746048            7

[8300 rows x 3 columns]


Unnamed: 0,User-ID,ISBN,Book-Rating
290,2033,0060248025,10
291,2033,0060256737,10
292,2033,0140386645,8
293,2033,0142000663,10
295,2033,0439064864,9
...,...,...,...
41054,276050,0553279912,7
41055,276050,0553377868,7
41056,276050,0671021001,9
41057,276050,067102423X,8


In [10]:
get_jaccard_recommendations(98783)

       User-ID        ISBN  Book-Rating
290       2033  0060248025           10
291       2033  0060256737           10
292       2033  0140386645            8
293       2033  0142000663           10
295       2033  0439064864            9
...        ...         ...          ...
41054   276050  0553279912            7
41055   276050  0553377868            7
41056   276050  0671021001            9
41057   276050  067102423X            8
41058   276050  0679746048            7

[8300 rows x 3 columns]


Unnamed: 0,User-ID,ISBN,Book-Rating
290,2033,0060248025,10
291,2033,0060256737,10
292,2033,0140386645,8
293,2033,0142000663,10
295,2033,0439064864,9
...,...,...,...
41054,276050,0553279912,7
41055,276050,0553377868,7
41056,276050,0671021001,9
41057,276050,067102423X,8
