In [1]:
import numpy as np
from sklearn.metrics import jaccard_score
from collections import defaultdict
import json
import random
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import sys
import random
from operator import itemgetter

In [2]:
from typing import Union

In [3]:
from sklearn.preprocessing import LabelEncoder

In [4]:
from scipy.sparse import csr_matrix

In [5]:
data_path = '../data/cleaned/RC_2023-01_2.json'
with open(data_path, 'r') as fh:
    comments = json.load(fh)

### Simple Data Stats

In [6]:
print(f'No. of comments in the data - {len(comments)}')
n_subreddit = len(set([comment['subreddit_id'] for comment in comments]))
print(f'No. of unique subreddits : {n_subreddit}')
n_users = len(set([comment['author'] for comment in comments]))
print(f'No. of unique users : {n_users}')

No. of comments in the data - 314599
No. of unique subreddits : 18645
No. of unique users : 77996


### Sparse Representation of the data

In [7]:
usersperitem = defaultdict(set)
itemsperuser = defaultdict(set)
item_name = defaultdict()
for comment in comments:
    user = comment['author_fullname']
    item = comment['subreddit_id']
    item_name[item] = comment['subreddit']
    usersperitem[item].add(user)
    itemsperuser[user].add(item)
user_encoder = LabelEncoder().fit(list(itemsperuser.keys()))
item_encoder = LabelEncoder().fit(list(usersperitem.keys()))
# data : binary indicator whether user uses an item
# idxptr : col index
# indices : row index
# create (n_item*n_user) sparse matrix each row represents one user and each column represents one item

row_idx = []
col_idx = []
for user in tqdm(itemsperuser):
    # user_idx = user_encoder.transform(user)
    for item in itemsperuser[user]:
        # item_idx = item_encoder.transform(item)
        row_idx.append(user)
        col_idx.append(item)
col_idx = item_encoder.transform(np.array(col_idx))
row_idx = user_encoder.transform(np.array(row_idx))
print(col_idx.shape)
print(row_idx.shape)
data = np.ones_like(col_idx)
data.dtype
# creating a user item interaction matrix 
# each row is an item
# each column is a matrix
user_item_interaction = csr_matrix((data, (row_idx, col_idx)))
user_item_interaction.shape

100%|██████████| 77996/77996 [00:00<00:00, 1074439.64it/s]


(149206,)
(149206,)


(77996, 18645)

In [47]:
def get_item_(idx: Union[int, np.array], item_name: dict) -> Union[str, np.array]:
    if type(idx) == int:
        return item_name[idx]
    return np.array([item_name[idx_] for idx_ in idx])

def Jaccard(s1: np.ndarray, s2: np.ndarray):
    """
    Jaccard similarity between two arrays.

    Parameters:
        s1: input array 1
        s2: input array 2
    
    Returns:
        Jaccard similarity for the two arrays
    """
    numer = np.intersect1d(s1, s2).size
    denom = np.union1d(s1, s2).size

    if denom == 0:
        return 0
    return numer/denom

def score(user_history:np.array, item_history:np.array, user_item_interaction:csr_matrix):
    """
    Return the similarity score for item i and user u.

    Parameters:
        user_history: array of items user has interacted with
        item_history: array of users who have used item i
        user_item_interaction: csr_matrix containing the user_item_interaction
    """
    # user_v_history = np.unique(user_item_interaction[item_history].nonzero()[1])
    sims = np.vectorize(lambda v: Jaccard(user_history, user_item_interaction[v].nonzero()[1]))(item_history)
    bestsim = sims.max()
    if bestsim == None:
        return 0 
    return bestsim

def rec(u:int, user_item_interaction:csr_matrix, N:int = 5):
    """
    Given user `u` and user_item_interaction matrix with rows as items and columns as users,
    return the recommendation for the user beasr on jaccard similarity.

    Parameters:
        u: user label
        user_item_interaction : csr_matrix containing the user_item_interaction
        N: top N scores and items recommended
    Return:
        most similar item based on user u's history,
    """
    user_history = user_item_interaction[u].nonzero()[1] # items user `u` has interacted with
    n_users, n_items = user_item_interaction.shape
    items = np.arange(n_items)
    item_history = user_item_interaction[:, items].nonzero()
    valid_items = items[~np.isin(items, user_history)]
    idx = np.where(np.isin(item_history[1], valid_items))
    valid_item_history = (item_history[0][idx], item_history[1][idx])
    sims = np.vectorize(lambda i: score(user_history, 
                                        valid_item_history[0][valid_item_history[1] == i], 
                                        user_item_interaction))(valid_items)
    bestitems = np.argsort(sims)[::-1]
    bestscore = sims[bestitems]
    
    return bestitems[:N], bestscore[:N]


def fast_rec(u, user_item_interaction, N = 5):
    """
    Given user `u` and user_item_interaction matrix with rows as items and columns as users,
    return the recommendation for the user beasr on jaccard similarity.

    Parameters:
        u: user label
        user_item_interaction : csr_matrix containing the user_item_interaction
        N: top N scores and items recommended
    Return:
        most similar item based on user u's history,
    """
    user_history = user_item_interaction[u].nonzero()[1] # items user `u` has interacted with 
    n_users, n_items = user_item_interaction.shape
    print(f'{n_users}, {n_items}')
    items = np.arange(n_items)
    item_history_u = np.unique(user_item_interaction[:, user_history].nonzero()[0]) #
    user_history_u = np.unique(user_item_interaction[item_history_u].nonzero()[1])
    item_history = user_item_interaction[:, items].nonzero()
    valid_items = items[(~np.isin(items, user_history)) & (np.isin(items, user_history_u))]
    # valid_items = items[(np.isin(items, user_history_u))]
    idx = np.where(np.isin(item_history[1], valid_items))
    valid_item_history = (item_history[0][idx], item_history[1][idx])
    sims = np.vectorize(lambda i: score(user_history, 
                                        user_item_interaction[:,i].nonzero(), 
                                        user_item_interaction))(valid_items)
    item_idx = np.argsort(sims)[::-1]
    bestitems = valid_items[item_idx]
    bestscore = sims[item_idx]
    
    return bestitems[:N], bestscore[:N]

In [18]:
u = 14243
user_history = user_item_interaction[u].nonzero()[1]
bestitems_rec, bestscores = rec(u, user_item_interaction)
bestitems_name = get_item_(item_encoder.inverse_transform(bestitems_rec), item_name)
print(bestitems_name, bestscores)

['instantpot' 'SonicTheHedgehog' 'batman' 'XFiles' 'FitAndNatural'] [0.5        0.5        0.33333333 0.33333333 0.2       ]


In [48]:
u = 14243
user_history = user_item_interaction[u].nonzero()[1]
bestitems_fast_rec, bestscores = fast_rec(u, user_item_interaction)
bestitems_name = get_item_(item_encoder.inverse_transform(bestitems_fast_rec), item_name)
print(bestitems_name, bestscores)

77996, 18645
['PHGamers' 'SonicTheHedgehog' 'XFiles' 'batman' 'RATS'] [0.5        0.5        0.33333333 0.33333333 0.2       ]


In [26]:
user_history = user_item_interaction[u].nonzero()[1]
n_items = 18645
items = np.arange(n_items)
item_history_u = np.unique(user_item_interaction[:, user_history].nonzero()[0]) #
user_history_u = np.unique(user_item_interaction[item_history_u].nonzero()[1])
item_history = user_item_interaction[:, items].nonzero()
# valid_items = items[(~np.isin(items, user_history)) & (np.isin(items, user_history_u))]
valid_items = items[(np.isin(items, user_history_u))]

In [28]:
user_history

array([5314], dtype=int32)

In [27]:
item_history_u

array([14243, 15913, 22303, 24560, 26495, 31899, 35398, 45656, 59872,
       63806, 68380], dtype=int32)

In [31]:
user_history_u

array([    8,    18,    21, ..., 18604, 18605, 18623], dtype=int32)

In [32]:
item_history

(array([    0,     1,     1, ..., 77994, 77994, 77995], dtype=int32),
 array([ 2351,  5507, 13533, ..., 13788, 15423,  3285], dtype=int32))

In [33]:
valid_items

array([    8,    18,    21, ..., 18604, 18605, 18623])

In [34]:
idx = np.where(np.isin(item_history[1], valid_items))
valid_item_history = (item_history[0][idx], item_history[1][idx])

In [35]:
idx

(array([     0,      2,      3, ..., 149193, 149199, 149203]),)

In [36]:
valid_item_history

(array([    0,     1,     2, ..., 77988, 77991, 77994], dtype=int32),
 array([ 2351, 13533,  1803, ...,  9041,  4501, 13788], dtype=int32))