In [168]:
import numpy as np
from sklearn.metrics import jaccard_score
from collections import defaultdict
import json
import random
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import sys
import random
from operator import itemgetter

In [169]:
from sklearn.preprocessing import LabelEncoder

In [170]:
from scipy.sparse import csr_matrix

In [171]:
# Relative path to the cleaned comments file.
data_path = '../data/cleaned/RC_2023-01_2.json'
# JSON text is UTF-8; pin the encoding so decoding does not depend on the
# platform's default locale.
with open(data_path, 'r', encoding='utf-8') as fh:
    # `comments` is iterated as a sequence of per-comment dicts by later cells.
    comments = json.load(fh)

### Simple Data Stats

In [172]:
# Headline counts: total comments, distinct subreddit ids, distinct author names.
print(f'No. of comments in the data - {len(comments)}')
n_subreddit = len({comment['subreddit_id'] for comment in comments})
print(f'No. of unique subreddits : {n_subreddit}')
n_users = len({comment['author'] for comment in comments})
print(f'No. of unique users : {n_users}')

No. of comments in the data - 314599
No. of unique subreddits : 18645
No. of unique users : 77996


### Jaccard Similarity Model

In [173]:
### Jaccard Similarity Model

def Jaccard(s1: np.ndarray, s2: np.ndarray):
    """
    Jaccard index of two array-likes: shared unique values over all unique values.

    Both inputs are reduced to their unique values first, so repeated
    elements do not affect the result.

    Parameters:
        s1: first collection of values
        s2: second collection of values

    Returns:
        Size of the intersection divided by size of the union,
        or 0 when the union is empty.
    """
    shared = np.intersect1d(s1, s2).size
    total = np.union1d(s1, s2).size

    # Two empty inputs: avoid dividing by zero.
    return shared / total if total != 0 else 0

In [174]:
# Sanity check: unique values are {1,2,3} and {2,3,4,5}; they share {2,3}
# out of {1,2,3,4,5}, so the expected result is 2/5.
s1 = [1,2,2,3]
s2 = [2,3,4, 5]

Jaccard(s1, s2)

0.4

### Sparse Representation of the data

In [175]:
# Adjacency maps built in one pass over the comments:
#   usersperitem: subreddit id -> set of author ids that commented there
#   itemsperuser: author id    -> set of subreddit ids they commented in
usersperitem = defaultdict(set)
itemsperuser = defaultdict(set)
# subreddit id -> subreddit display name. A plain dict: a defaultdict without
# a factory gives no default behaviour and only suggests that it does.
item_name = {}
for comment in comments:
    user = comment['author_fullname']
    item = comment['subreddit_id']
    item_name[item] = comment['subreddit']
    usersperitem[item].add(user)
    itemsperuser[user].add(item)

In [176]:
# Fit one label encoder per axis so raw user / subreddit ids can be turned into
# integer codes (and back again with inverse_transform).
user_encoder = LabelEncoder().fit(list(itemsperuser.keys()))
item_encoder = LabelEncoder().fit(list(usersperitem.keys()))

In [177]:
# Collect one (item, user) pair per observed interaction, in coordinate form:
#   row_idx : raw item (subreddit) id of the pair  -> becomes the matrix row
#   col_idx : raw user id of the pair              -> becomes the matrix column
# The ids are still raw labels here; they are encoded to integers in bulk in a
# later cell, and the matrix built from them has one row per item and one column per user.

row_idx = []
col_idx = []
for user in tqdm(itemsperuser):
    # (encoding per element was abandoned here in favour of one bulk transform)
    for item in itemsperuser[user]:
        # each pair appears once because itemsperuser[user] is a set
        row_idx.append(item)
        col_idx.append(user)

100%|██████████| 77996/77996 [00:00<00:00, 842314.47it/s]


In [178]:
# Replace the raw labels with their integer codes in one vectorised call each.
# NOTE: this rebinds col_idx / row_idx from lists of labels to integer arrays,
# so re-running this cell without re-running the previous one will not work.
col_idx = user_encoder.transform(np.array(col_idx))
row_idx = item_encoder.transform(np.array(row_idx))

In [179]:
col_idx.shape

(149206,)

In [180]:
row_idx.shape

(149206,)

In [181]:
# One stored value of 1 per (item, user) pair: a binary "has interacted" indicator.
data = np.ones_like(col_idx)

In [182]:
data.dtype

dtype('int64')

In [183]:
# creating a user-item interaction matrix from the coordinate triplets
# each row is an item
# each column is a user
# (no explicit shape is given, so it is inferred from the largest row / column index)
user_item_interaction = csr_matrix((data, (row_idx, col_idx)))

In [184]:
user_item_interaction.shape

(18645, 77996)

In [30]:
%%timeit
# get the users of a specific item (slice the matrix along one row)
item_idx = np.random.randint(0,len(usersperitem))
user_item_interaction[item_idx].nonzero()

98.6 µs ± 245 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [31]:
%%timeit
# get the items of a specific user (slice the matrix along one column)
user_idx = np.random.randint(0,len(itemsperuser))
user_item_interaction[:,user_idx].nonzero()

924 µs ± 5.23 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [32]:
# Rows are items and columns are users, so shape unpacks as (items, users).
num_items, num_users = user_item_interaction.shape

Slicing a column is roughly ten times more costly than slicing a row in CSR format:
- 98.4 µs ± 397 ns average for slicing rows
- 929 µs ± 3.05 µs per loop for slicing columns

In [33]:
## finding most similar items based on user-item interaction using Jaccard Similarity metric
%time
random_item = np.random.randint(0, num_items)
target_user_idx = user_item_interaction[random_item].nonzero()[1]
sim = []
for item_idx in range(num_items):
    if item_idx == random_item:
        continue
    user_idx = user_item_interaction[item_idx].nonzero()[1]
    idx_sim = Jaccard(user_idx, target_user_idx)
    sim.append(idx_sim)  

target_item_name = item_encoder.inverse_transform([random_item])
print(item_name[target_item_name[0]])
print('---------------------------------')
top_items = np.argsort(sim)[::-1]
top_item_names = item_encoder.inverse_transform(top_items[:10])
for item in top_item_names:
    print(item_name[item])
    

CPU times: user 6 µs, sys: 0 ns, total: 6 µs
Wall time: 12.4 µs
SelfAwarewolves
---------------------------------
HealthAnxiety
cricutcrafting
MaddenUltimateTeam
NewToReddit
EmilyInParis
iamverybadass
IRLgirls
fashionhunters
2022wrx
Rateme


In [123]:
def Jaccard(s1: set, s2: set):
    """
    Jaccard similarity between two collections, treated as sets.

    Sets are used as given; any other iterable (list, tuple, NumPy array)
    is converted to a set first. Without the conversion, `&` / `|` raise on
    lists and perform element-wise bitwise operations on NumPy arrays.

    Parameters:
        s1: input set 1 (or any iterable of hashable values)
        s2: input set 2 (or any iterable of hashable values)

    Returns:
        Size of the intersection divided by size of the union,
        or 0 when both inputs are empty.
    """
    if not isinstance(s1, (set, frozenset)):
        s1 = set(s1)
    if not isinstance(s2, (set, frozenset)):
        s2 = set(s2)

    numer = len(s1 & s2)
    denom = len(s1 | s2)

    # Both inputs empty: avoid dividing by zero.
    if denom == 0:
        return 0
    return numer/denom

In [185]:
# Pick one user (a column index) and collect the row indices, i.e. the items,
# that have an entry in that column.
u = 12
user_history = set(user_item_interaction[:, u].nonzero()[0])
user_history

{4676, 7261, 8672, 8929}

In [186]:
# Candidate items come from users whose histories intersect user_history:
#   user_search_space: columns (users) with an entry on any row in user_history
#   item_search_space: rows (items) with an entry in any of those columns
user_search_space = set(user_item_interaction[list(user_history), :].nonzero()[1])
item_search_space = set(user_item_interaction[:, list(user_search_space)].nonzero()[0])
print(f'No. of items = {len(item_search_space)}')
# NOTE(review): this displays the whole set, which is thousands of entries long.
item_search_space

No. of items = 3007


{16386,
 8196,
 8,
 8201,
 16396,
 18,
 8210,
 21,
 8216,
 16408,
 8218,
 34,
 16418,
 16419,
 38,
 8231,
 16424,
 47,
 8243,
 16435,
 16439,
 57,
 8249,
 8251,
 16441,
 63,
 68,
 8263,
 72,
 8264,
 74,
 75,
 76,
 16457,
 80,
 85,
 8278,
 8279,
 8280,
 89,
 92,
 97,
 8289,
 8295,
 105,
 16490,
 8300,
 110,
 8302,
 16494,
 8307,
 16501,
 118,
 8310,
 16503,
 8324,
 16517,
 8326,
 135,
 8327,
 16525,
 8336,
 145,
 151,
 8345,
 156,
 16542,
 165,
 8358,
 16556,
 175,
 176,
 8369,
 8371,
 16563,
 8373,
 184,
 8378,
 8379,
 190,
 8383,
 192,
 193,
 16574,
 201,
 17951,
 8397,
 206,
 8400,
 8402,
 211,
 8405,
 8408,
 219,
 8413,
 16609,
 229,
 16614,
 8423,
 8427,
 16621,
 16627,
 8441,
 8446,
 255,
 8449,
 259,
 8451,
 8452,
 262,
 16646,
 268,
 16654,
 271,
 8464,
 274,
 277,
 8470,
 16662,
 281,
 282,
 8473,
 8474,
 285,
 286,
 8479,
 16678,
 8489,
 300,
 16686,
 16687,
 8497,
 8499,
 16694,
 16695,
 16696,
 314,
 319,
 8515,
 328,
 329,
 16714,
 8524,
 335,
 338,
 339,
 8532,
 8534,
 343

In [167]:
item_search_space

{16386,
 8196,
 8,
 8201,
 16396,
 18,
 8210,
 21,
 8216,
 16408,
 8218,
 34,
 16418,
 16419,
 38,
 8231,
 16424,
 47,
 8243,
 16435,
 16439,
 57,
 8249,
 8251,
 16441,
 63,
 68,
 8263,
 72,
 8264,
 74,
 75,
 76,
 16457,
 80,
 85,
 8278,
 8279,
 8280,
 89,
 92,
 97,
 8289,
 8295,
 105,
 16490,
 8300,
 110,
 8302,
 16494,
 8307,
 16501,
 118,
 8310,
 16503,
 8324,
 16517,
 8326,
 135,
 8327,
 16525,
 8336,
 145,
 151,
 8345,
 156,
 16542,
 165,
 8358,
 16556,
 175,
 176,
 8369,
 8371,
 16563,
 8373,
 184,
 8378,
 8379,
 190,
 8383,
 192,
 193,
 16574,
 201,
 17951,
 8397,
 206,
 8400,
 8402,
 211,
 8405,
 8408,
 219,
 8413,
 16609,
 229,
 16614,
 8423,
 8427,
 16621,
 16627,
 8441,
 8446,
 255,
 8449,
 259,
 8451,
 8452,
 262,
 16646,
 268,
 16654,
 271,
 8464,
 274,
 277,
 8470,
 16662,
 281,
 8474,
 282,
 8473,
 285,
 286,
 8479,
 16678,
 8489,
 300,
 16686,
 16687,
 8497,
 8499,
 16694,
 16695,
 16696,
 314,
 319,
 8515,
 328,
 329,
 16714,
 8524,
 335,
 338,
 339,
 8532,
 8534,
 343

In [187]:
# For each item in the item search space, find the users who have used it.
# Earlier attempt, kept for reference:
# users_with_items = set(user_item_interaction[:,list(item_search_space)].nonzero()[0])
# item_history =  user_item_interaction[list(users_with_items)].nonzero()
# item_history

# nonzero() on the row selection returns a pair of arrays: positions within the
# selected rows, and the column (user) index of each entry.
# NOTE(review): this rebinds user_history, which an earlier cell defined as a
# set of item indices, to that pair of arrays.
user_history = user_item_interaction[list(item_search_space)].nonzero()
user_history

(array([   0,    1,    1, ..., 3005, 3006, 3006], dtype=int32),
 array([24560, 14198, 24560, ..., 24560, 24560, 45711], dtype=int32))

In [153]:
item_history[1].max()

3006

65053

In [144]:
# find the user_v_history for each user v in item_history
# NOTE(review): no earlier cell (in file order) assigns item_history at top
# level, so this relies on state left in the kernel by a cell further down.
user_v_history = user_item_interaction[list(item_history)].nonzero()
user_v_history

(array([   0,    0,    0, ..., 1464, 1465, 1466], dtype=int32),
 array([   97,  3517, 11915, ...,  1550,   110,   110], dtype=int32))

In [145]:
# Group the flat column indices by the selected-row position they belong to:
# mark every place where the row position changes, then split right after it.
spliting_arr = np.diff(user_v_history[0]).astype(bool)
idx = np.where(spliting_arr == 1)[0]
# NOTE: user_v_history is rebound from a pair of arrays to a list of arrays and
# then to a list of sets, so this cell cannot be re-run on its own.
user_v_history = np.split(user_v_history[1], idx+1)
user_v_history = [set(items) for items in user_v_history]
print(len(user_v_history))
user_v_history

1467


[{97, 3517, 11915},
 {110, 568},
 {99},
 {89},
 {4},
 {143, 8753, 12697, 13951, 17754},
 {151},
 {151, 4397, 12687, 13389, 17815},
 {34, 14422},
 {110},
 {110},
 {179, 2108, 10357, 12024, 12462, 17454},
 {151, 10510, 10834},
 {110},
 {110, 8929, 18474},
 {151},
 {110},
 {110, 2811, 4161, 16128},
 {2, 780},
 {89, 1851},
 {143, 12610, 14774, 17969},
 {89, 1724},
 {157},
 {167, 1703, 1717, 1828, 13180, 15414, 15469},
 {56, 4681, 7353},
 {4, 1678, 14625},
 {151},
 {58},
 {23, 2265, 16841},
 {89},
 {141, 9931, 17690},
 {110, 3347},
 {89},
 {110},
 {110, 8005},
 {160, 17719},
 {4},
 {44, 2597},
 {34, 2532, 8323, 10467, 11331, 13309, 15005},
 {23},
 {122, 1133, 6595, 10201},
 {151, 7398},
 {151, 9006, 17423},
 {151},
 {176, 1640, 2070, 16981},
 {110, 4837, 17711},
 {38, 15583},
 {110},
 {2, 14489},
 {110, 1983},
 {106, 9853},
 {118},
 {120, 18457},
 {163, 2344, 5507},
 {92},
 {110, 2070},
 {38},
 {110, 5967},
 {72,
  774,
  1523,
  7666,
  9030,
  9312,
  9485,
  9825,
  10067,
  10216,
  111

In [124]:
def rec(u:int, user_item_interaction:csr_matrix):
    """
    Recommend one new item to user `u` using item-to-item Jaccard similarity.

    `user_item_interaction` has one row per item and one column per user.
    Each item is described by the set of users on its row. A candidate item is
    scored by the highest Jaccard similarity between its user set and the user
    set of any item already in the user's history.

    Only items used by someone who shares at least one item with `u` are
    scored; every other item has similarity 0 with the whole history. Items
    already in the user's history are never returned.

    Parameters:
        u: user label, i.e. a column index of `user_item_interaction`
        user_item_interaction: csr_matrix containing the user_item_interaction
            (rows = items, columns = users, non-zero = the user used the item)

    Return:
        (item, score): row index of the most similar new item and its Jaccard
        score. Ties go to the lowest row index. Returns (None, 0.0) when the
        user has no history or no new item is reachable.
    """
    # Users are columns: the history is every row with an entry in column u.
    history = np.unique(user_item_interaction[:, u].nonzero()[0])
    if history.size == 0:
        return None, 0.0

    # Users sharing at least one item with u (u itself is among them) ...
    neighbours = np.unique(user_item_interaction[history].nonzero()[1])
    # ... and everything those users touched, minus what u already has.
    reachable = np.unique(user_item_interaction[:, neighbours].nonzero()[0])
    candidates = np.setdiff1d(reachable, history)

    def users_of(item):
        # Set of column (user) indices with an entry on the given item row.
        return set(user_item_interaction[int(item)].nonzero()[1].tolist())

    history_users = [users_of(item) for item in history]

    bestitem, bestscore = None, 0.0
    # candidates is sorted ascending, and the strict `>` keeps the first
    # (lowest-index) item among equal scores.
    for item in candidates:
        item_users = users_of(item)
        for seen_users in history_users:
            # Both sets are non-empty (each row came out of nonzero()), so the
            # union is never empty.
            score = len(item_users & seen_users) / len(item_users | seen_users)
            if score > bestscore:
                bestitem, bestscore = int(item), score

    return bestitem, bestscore

In [None]:
u = 12
# user_item_interaction is item-major (rows = items, columns = users). Work on a
# user-major copy so that row slicing below addresses users and columns address items.
user_major = user_item_interaction.T.tocsr()
# get the items that are used by the user u
user_history = set(user_major[u].nonzero()[1])
# find all the users who have used the items present in user_history
item_history = set(user_major[:, list(user_history)].nonzero()[0])
# find the items for the users present in item_history
items = user_major[list(item_history), :].nonzero()
# find the itemsets for the users present in item_history:
# mark where the (sorted) row position changes and split right after each change
spliting_arr = np.diff(items[0]).astype(bool)
idx = np.where(spliting_arr == 1)[0]
user_v_history = np.split(items[1], idx+1)
user_v_history = [set(v_items) for v_items in user_v_history]
# find the jaccard similarity between user_history and user_v_history
user_item_jaccard = np.array([Jaccard(user_history, v_items) for v_items in user_v_history])


In [44]:
# get the items that are used by the user u
u = 12
print(user_item_interaction[u].nonzero())
user_history = set(user_item_interaction[u].nonzero()[1])

(array([0, 0, 0, 0], dtype=int32), array([4676, 7261, 8672, 8929], dtype=int32))


In [45]:
user_history

{4676, 7261, 8672, 8929}

In [46]:
user_item_interaction[:, list(user_history)].nonzero()[0]

array([   12,    12,    12,    12,    95,   680,   930,  2454,  2963,
        3268,  3608,  5099,  5286,  6098,  6136,  6460,  6652,  6699,
        6905,  7255,  7284,  7383,  8896,  8963,  8965,  9773, 10432,
       10563, 11273, 11555, 11570, 11844, 12147, 12189, 12594, 12673,
       12852, 12947, 12953, 13511, 13955, 14314, 14992, 15043, 15068,
       15142, 15622, 15780, 15905, 16084, 16172, 16172, 16489, 16516,
       16518, 17059, 17069, 17124, 17449, 17474, 17853, 20123, 20167,
       20173, 20394, 20765, 21103, 21227, 21429, 22671, 22914, 23316,
       24007, 24017, 24085, 24190, 24383, 24560, 24560, 24963, 25108,
       26508, 26614, 26992, 27277, 27430, 27436, 27980, 28176, 28459,
       28545, 29074, 29642, 29976, 30075, 30391, 30396, 30645, 30972,
       31521, 31579, 32025, 33533, 34964, 35204, 35308, 36694, 36712,
       37314, 39293, 39500, 41048, 41326, 42321, 42584, 43015, 43096,
       43939, 43939, 44058, 44081, 44417, 44554, 44803, 45551, 46056,
       46502, 46635,

In [79]:
# find all the users who have used the items present in user_history
item_history = set(user_item_interaction[:, list(user_history)].nonzero()[0])
item_history

{12,
 95,
 680,
 930,
 2454,
 2963,
 3268,
 3608,
 5099,
 5286,
 6098,
 6136,
 6460,
 6652,
 6699,
 6905,
 7255,
 7284,
 7383,
 8896,
 8963,
 8965,
 9773,
 10432,
 10563,
 11273,
 11555,
 11570,
 11844,
 12147,
 12189,
 12594,
 12673,
 12852,
 12947,
 12953,
 13511,
 13955,
 14314,
 14992,
 15043,
 15068,
 15142,
 15622,
 15780,
 15905,
 16084,
 16172,
 16489,
 16516,
 16518,
 17059,
 17069,
 17124,
 17449,
 17474,
 17853,
 20123,
 20167,
 20173,
 20394,
 20765,
 21103,
 21227,
 21429,
 22671,
 22914,
 23316,
 24007,
 24017,
 24085,
 24190,
 24383,
 24560,
 24963,
 25108,
 26508,
 26614,
 26992,
 27277,
 27430,
 27436,
 27980,
 28176,
 28459,
 28545,
 29074,
 29642,
 29976,
 30075,
 30391,
 30396,
 30645,
 30972,
 31521,
 31579,
 32025,
 33533,
 34964,
 35204,
 35308,
 36694,
 36712,
 37314,
 39293,
 39500,
 41048,
 41326,
 42321,
 42584,
 43015,
 43096,
 43939,
 44058,
 44081,
 44417,
 44554,
 44803,
 45551,
 46056,
 46502,
 46635,
 47206,
 49366,
 49387,
 49590,
 50124,
 50342,
 5082

In [80]:
len(item_history)

191

In [92]:
# find the item_sets for the users present in item_history 
item_sets = user_item_interaction[list(item_history), :].nonzero()
item_sets

(array([  0,   1,   2, ..., 189, 190, 190], dtype=int32),
 array([ 7261,  7261,  1767, ...,  7261,  8672, 15361], dtype=int32))

In [93]:
item_sets[0][:10]

array([0, 1, 2, 2, 2, 2, 3, 3, 3, 3], dtype=int32)

In [94]:
item_sets[1][:10]

array([7261, 7261, 1767, 2137, 6628, 7261, 4676, 7261, 8672, 8929],
      dtype=int32)

In [95]:
item_sets[0].shape

(3309,)

In [96]:
item_sets[0][:10]

array([0, 1, 2, 2, 2, 2, 3, 3, 3, 3], dtype=int32)

In [100]:
spliting_arr = np.diff(item_sets[0]).astype(bool)
idx = np.where(spliting_arr == 1)[0]
idx

array([   0,    1,    5,    9,   10,   12,   13,   15,   18,   20,   22,
         24,   26,   28,   32,   34,   36,   39,   40,   42,   46,   48,
         52,   53,   54,   56,   57,   59,   62,   68,   71,   73,   76,
         78,   79,   80,   81,   83,   85,   95,   96,   98,  101,  102,
        103,  105,  107,  110,  112,  114,  116,  117,  120,  122,  123,
        125,  127,  128,  130,  131,  132,  135,  136,  138,  140,  142,
        144,  145,  147,  149,  151,  153,  155,  158,  160,  162,  164,
        165,  170,  171,  173,  177,  179,  181,  183,  185,  187,  189,
        190,  192,  198,  200,  201,  203,  204,  206,  210,  211,  213,
        219,  220,  221,  223,  228,  230,  232,  236,  237,  238,  239,
        240,  241,  244,  248,  249,  252,  254,  255,  257,  259,  260,
        261,  263,  265,  266,  267,  269,  272,  273,  274,  276,  280,
        281,  282,  284,  286,  290,  291,  293,  294,  296,  298,  300,
        302,  306,  308,  309,  313,  315,  316,  3

In [102]:
item_sets_list = np.split(item_sets[1], idx+1)
item_sets_list

[array([7261], dtype=int32),
 array([7261], dtype=int32),
 array([1767, 2137, 6628, 7261], dtype=int32),
 array([4676, 7261, 8672, 8929], dtype=int32),
 array([8672], dtype=int32),
 array([4044, 7261], dtype=int32),
 array([4676], dtype=int32),
 array([4676, 9553], dtype=int32),
 array([ 4213,  8672, 10461], dtype=int32),
 array([3017, 8672], dtype=int32),
 array([1579, 8672], dtype=int32),
 array([ 8672, 17310], dtype=int32),
 array([ 4676, 12001], dtype=int32),
 array([4676, 8616], dtype=int32),
 array([ 1568,  4676, 10692, 16938], dtype=int32),
 array([2871, 4676], dtype=int32),
 array([1805, 4676], dtype=int32),
 array([3408, 8400, 8672], dtype=int32),
 array([8672], dtype=int32),
 array([1804, 4676], dtype=int32),
 array([ 1589,  8672, 10298, 17604], dtype=int32),
 array([ 8672, 17772], dtype=int32),
 array([  285,  4676,  8616, 15605], dtype=int32),
 array([4676], dtype=int32),
 array([8672], dtype=int32),
 array([ 8672, 13116], dtype=int32),
 array([8672], dtype=int32),
 array([

In [61]:
len(item_sets)

3309

In [111]:
user_history

{4676, 7261, 8672, 8929}

In [118]:
item_history

[{7261},
 {7261},
 {1767, 2137, 6628, 7261},
 {4676, 7261, 8672, 8929},
 {8672},
 {4044, 7261},
 {4676},
 {4676, 9553},
 {4213, 8672, 10461},
 {3017, 8672},
 {1579, 8672},
 {8672, 17310},
 {4676, 12001},
 {4676, 8616},
 {1568, 4676, 10692, 16938},
 {2871, 4676},
 {1805, 4676},
 {3408, 8400, 8672},
 {8672},
 {1804, 4676},
 {1589, 8672, 10298, 17604},
 {8672, 17772},
 {285, 4676, 8616, 15605},
 {4676},
 {8672},
 {8672, 13116},
 {8672},
 {8672, 10393},
 {1984, 7088, 8672},
 {3953, 5199, 6462, 7561, 8474, 8929},
 {1835, 8672, 11720},
 {1558, 4676},
 {3671, 8672, 10125},
 {4676, 5177},
 {7261},
 {8929},
 {8672},
 {8929, 12934},
 {1558, 8672},
 {4401, 4421, 4676, 5514, 5652, 6059, 10511, 12399, 12724, 17573},
 {8672},
 {4676, 10408},
 {4676, 8201, 8616},
 {4676},
 {8672},
 {2566, 7261},
 {8672, 14243},
 {110, 8929, 18474},
 {4784, 8672},
 {5898, 8672},
 {6756, 7261},
 {8929},
 {4555, 4676, 17310},
 {262, 8672},
 {8672},
 {8672, 16146},
 {4676, 10823},
 {4676},
 {1767, 4676},
 {7261},
 {7261}

In [120]:
item_history = [set(items) for items in item_sets_list]
user_item_jaccard = np.array([Jaccard(user_history, items) for items in item_history])

In [106]:
item_history

[{7261},
 {7261},
 {1767, 2137, 6628, 7261},
 {4676, 7261, 8672, 8929},
 {8672},
 {4044, 7261},
 {4676},
 {4676, 9553},
 {4213, 8672, 10461},
 {3017, 8672},
 {1579, 8672},
 {8672, 17310},
 {4676, 12001},
 {4676, 8616},
 {1568, 4676, 10692, 16938},
 {2871, 4676},
 {1805, 4676},
 {3408, 8400, 8672},
 {8672},
 {1804, 4676},
 {1589, 8672, 10298, 17604},
 {8672, 17772},
 {285, 4676, 8616, 15605},
 {4676},
 {8672},
 {8672, 13116},
 {8672},
 {8672, 10393},
 {1984, 7088, 8672},
 {3953, 5199, 6462, 7561, 8474, 8929},
 {1835, 8672, 11720},
 {1558, 4676},
 {3671, 8672, 10125},
 {4676, 5177},
 {7261},
 {8929},
 {8672},
 {8929, 12934},
 {1558, 8672},
 {4401, 4421, 4676, 5514, 5652, 6059, 10511, 12399, 12724, 17573},
 {8672},
 {4676, 10408},
 {4676, 8201, 8616},
 {4676},
 {8672},
 {2566, 7261},
 {8672, 14243},
 {110, 8929, 18474},
 {4784, 8672},
 {5898, 8672},
 {6756, 7261},
 {8929},
 {4555, 4676, 17310},
 {262, 8672},
 {8672},
 {8672, 16146},
 {4676, 10823},
 {4676},
 {1767, 4676},
 {7261},
 {7261}

In [121]:
user_item_jaccard

array([2.50000000e-01, 2.50000000e-01, 1.42857143e-01, 1.00000000e+00,
       2.50000000e-01, 2.00000000e-01, 2.50000000e-01, 2.00000000e-01,
       1.66666667e-01, 2.00000000e-01, 2.00000000e-01, 2.00000000e-01,
       2.00000000e-01, 2.00000000e-01, 1.42857143e-01, 2.00000000e-01,
       2.00000000e-01, 1.66666667e-01, 2.50000000e-01, 2.00000000e-01,
       1.42857143e-01, 2.00000000e-01, 1.42857143e-01, 2.50000000e-01,
       2.50000000e-01, 2.00000000e-01, 2.50000000e-01, 2.00000000e-01,
       1.66666667e-01, 1.11111111e-01, 1.66666667e-01, 2.00000000e-01,
       1.66666667e-01, 2.00000000e-01, 2.50000000e-01, 2.50000000e-01,
       2.50000000e-01, 2.00000000e-01, 2.00000000e-01, 7.69230769e-02,
       2.50000000e-01, 2.00000000e-01, 1.66666667e-01, 2.50000000e-01,
       2.50000000e-01, 2.00000000e-01, 2.00000000e-01, 1.66666667e-01,
       2.00000000e-01, 2.00000000e-01, 2.00000000e-01, 2.50000000e-01,
       1.66666667e-01, 2.00000000e-01, 2.50000000e-01, 2.00000000e-01,
      

In [122]:
bestitem = np.argmax(user_item_jaccard)
bestitem

3

In [31]:
item_history

[{5863},
 {5948},
 {6481},
 {8616},
 {9615},
 {10955},
 {11434},
 {11822},
 set(),
 set(),
 {12342}]

In [32]:
user_history

{21337, 24528, 33332}

In [None]:
# while True:
#     u = random.sample(users, 1)[0]
#     if len(items_per_user_train[u]) > 2:
#         break
# item, similarity_score = rec(u)
# print(f'recommended - {item_name[item]}')
# print('user history')
# for item in items_per_user_train[u]:
#     print(item_name[item])



In [116]:
s1 = {1,2,3}
s2 = {2,3,4}
s1|s2

{1, 2, 3, 4}