In [1]:
import numpy as np
from sklearn.metrics import jaccard_score
from collections import defaultdict
import json
import random
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import sys
import random
from operator import itemgetter

In [2]:
from typing import Union
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix

In [3]:
data_path = '../data/cleaned/RC_2023-01_2.json'
with open(data_path, 'r') as fh:
    comments = json.load(fh)

In [4]:
print(f'No. of comments in the data - {len(comments)}')
n_subreddit = len(set([comment['subreddit_id'] for comment in comments]))
print(f'No. of unique subreddits : {n_subreddit}')
n_users = len(set([comment['author'] for comment in comments]))
print(f'No. of unique users : {n_users}')

No. of comments in the data - 314599
No. of unique subreddits : 18645
No. of unique users : 77996


In [5]:
usersperitem = defaultdict(set)
itemsperuser = defaultdict(set)
item_name = defaultdict()
for comment in comments:
    user = comment['author_fullname']
    item = comment['subreddit_id']
    item_name[item] = comment['subreddit']
    usersperitem[item].add(user)
    itemsperuser[user].add(item)
user_encoder = LabelEncoder().fit(list(itemsperuser.keys()))
item_encoder = LabelEncoder().fit(list(usersperitem.keys()))
# data : binary indicator whether user uses an item
# idxptr : col index
# indices : row index
# create (n_item*n_user) sparse matrix each row represents one user and each column represents one item

row_idx = []
col_idx = []
for user in tqdm(itemsperuser):
    # user_idx = user_encoder.transform(user)
    for item in itemsperuser[user]:
        # item_idx = item_encoder.transform(item)
        row_idx.append(user)
        col_idx.append(item)
col_idx = item_encoder.transform(np.array(col_idx))
row_idx = user_encoder.transform(np.array(row_idx))
print(col_idx.shape)
print(row_idx.shape)
data = np.ones_like(col_idx)
data.dtype
# creating a user item interaction matrix 
# each row is an item
# each column is a matrix
user_item_interaction = csr_matrix((data, (row_idx, col_idx)))
user_item_interaction.shape

100%|██████████| 77996/77996 [00:00<00:00, 1019794.74it/s]


(149206,)
(149206,)


(77996, 18645)

In [8]:
user_item_interaction2 = user_item_interaction.copy()

In [9]:
user_item_interaction  = np.array([[1, 0, 0, 1],
                              [1, 1, 0, 0],
                              [0, 1, 1, 0]])

user_item_interaction = csr_matrix(user_item_interaction)

In [116]:
user_item_interaction = user_item_interaction2

In [141]:
target_user = 14243

In [142]:
row_sum = np.array(user_item_interaction.sum(axis=1))[:,0]
row_indices, col_indices = user_item_interaction.nonzero()
data = user_item_interaction.data/row_sum[row_indices]
normalized_sparse_matrix = csr_matrix((data, (row_indices, col_indices)))

In [143]:
item_ratings = np.zeros(user_item_interaction.shape[1])
target_user_items = user_item_interaction[target_user].nonzero()[1]


In [144]:
target_user_items

array([5314], dtype=int32)

In [145]:
intersection = normalized_sparse_matrix.multiply(normalized_sparse_matrix[target_user]).sum(axis = 1).T[0]
union = row_sum[target_user] + row_sum - intersection # |A| + |B| - intesection
jaccard_score = np.array(intersection/union).flatten()
for item in tqdm(range(user_item_interaction.shape[1])):
    if item not in target_user_items:
        # find similar users who have interacted with this item
        similar_users = user_item_interaction[:, item].nonzero()[0]
        # print(similar_users)
        # jaccard score
        # print(jaccard_score)
        # print(jaccard_score[0][[1,2]])
        # compute weighted average rating for this item
        # rating = r(u, i)*sim(u, v)/sim(u, v)
        weighted_ratings = normalized_sparse_matrix[similar_users, item].toarray().flatten() * jaccard_score[similar_users]
        # print(f'weighted_ratings = {weighted_ratings}')
        rating_sum = jaccard_score[similar_users].sum()
        if rating_sum == 0:
            item_ratings[item] = 0
        else:
            item_ratings[item] = weighted_ratings.sum() / rating_sum



100%|██████████| 18645/18645 [00:38<00:00, 484.34it/s]


In [146]:
np.unique(item_ratings)

array([0.00000000e+00, 3.44352617e-04, 9.09076846e-02, 9.09090909e-02,
       1.66665858e-01, 1.66666667e-01, 2.00000000e-01, 3.33333333e-01,
       5.00000000e-01])

In [147]:
items = np.argsort(item_ratings)[::-1]
print([item_name[i] for i in item_encoder.inverse_transform(items[:10])])

['PHGamers', 'SonicTheHedgehog', 'batman', 'XFiles', 'hamstercare', 'Shitty_Car_Mods', 'RATS', 'antiwork', 'cymbalta', 'Stretched']


In [148]:
print([item_name[i] for i in item_encoder.inverse_transform(target_user_items)])
target_user_items

['hamsters']


array([5314], dtype=int32)

In [108]:
item_ratings

array([ 1.,  0.,  0., nan])

In [81]:
n_users = normalized_sparse_matrix.shape[0]
jaccard_score = np.zeros((n_users, n_users))
for user in range(n_users):
    intersection = normalized_sparse_matrix.multiply(normalized_sparse_matrix[user]).sum(axis = 1).T[0]
    union = row_sum[user] + row_sum - intersection # |A| + |B| - intesection
    print((intersection/union)[0])
    jaccard_score[user] = intersection/union
jaccard_score

[[1.         0.33333333 0.        ]]
[[0.33333333 1.         0.33333333]]
[[0.         0.33333333 1.        ]]


array([[1.        , 0.33333333, 0.        ],
       [0.33333333, 1.        , 0.33333333],
       [0.        , 0.33333333, 1.        ]])

In [86]:
intersection = normalized_sparse_matrix.multiply(normalized_sparse_matrix[user]).sum(axis = 1).T[0]
print(intersection)
union = row_sum[user] + row_sum - intersection # |A| + |B| - intesection
intersection/union

[[0 1 2]]


matrix([[0.        , 0.33333333, 1.        ]])

In [64]:
item_ratings = np.zeros(sparse_matrix.shape[1])


array([[1.        , 0.33333333, 0.        ],
       [0.33333333, 1.        , 0.33333333],
       [0.        , 0.33333333, 1.        ]])

In [30]:
num_users = normalized_sparse_matrix.shape[0]
jaccard_sim = np.zeros((num_users, num_users))
for i in range(num_users):
    jaccard_sim[i] = normalized_sparse_matrix.multiply(normalized_sparse_matrix[i]).sum(axis=1).T[0] / \
        (row_sum[i] + row_sum - normalized_sparse_matrix.multiply(normalized_sparse_matrix[i]).sum(axis=1).T[0])

In [33]:
def test_jaccard_similarity():
    # create a sparse matrix with 3 users and 4 items
    sparse_matrix = np.array([[1, 0, 0, 1],
                              [1, 1, 0, 0],
                              [0, 1, 1, 0]])
    
    # expected Jaccard similarity matrix
    expected_sim = np.array([[1.        , 0.33333333, 0.        ],
                             [0.33333333, 1.        , 0.25      ],
                             [0.        , 0.25      , 1.        ]])
    
    # normalize sparse matrix
    row_sum = np.array(sparse_matrix.sum(axis=1))[:,0]
    row_indices, col_indices = sparse_matrix.nonzero()
    data = sparse_matrix.data / row_sum[row_indices]
    normalized_sparse_matrix = csr_matrix((data, (row_indices, col_indices)))
    
    # compute Jaccard similarity matrix
    num_users = normalized_sparse_matrix.shape[0]
    jaccard_sim = np.zeros((num_users, num_users))
    for i in range(num_users):
        jaccard_sim[i] = normalized_sparse_matrix.multiply(normalized_sparse_matrix[i]).sum(axis=1).T.toarray()[0] / \
            (row_sum[i] + row_sum - normalized_sparse_matrix.multiply(normalized_sparse_matrix[i]).sum(axis=1).T.toarray()[0])
    
    # check that the computed Jaccard similarity matrix matches the expected matrix
    assert np.allclose(jaccard_sim, expected_sim)