In [2]:
import numpy as np
from sklearn.metrics import jaccard_score
from collections import defaultdict
import json
import random
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import sys
import random
from operator import itemgetter

In [3]:
from sklearn.preprocessing import LabelEncoder

In [4]:
from scipy.sparse import csr_matrix

In [5]:
data_path = '../data/cleaned/RC_2023-01_2.json'
with open(data_path, 'r') as fh:
    comments = json.load(fh)

### Simple Data Stats

In [6]:
print(f'No. of comments in the data - {len(comments)}')
n_subreddit = len(set([comment['subreddit_id'] for comment in comments]))
print(f'No. of unique subreddits : {n_subreddit}')
n_users = len(set([comment['author'] for comment in comments]))
print(f'No. of unique users : {n_users}')

No. of comments in the data - 314599
No. of unique subreddits : 18645
No. of unique users : 77996


### Sparse Representation of the data

In [60]:

usersperitem = defaultdict(set)
itemsperuser = defaultdict(set)
item_name = defaultdict()
for comment in comments:
    user = comment['author_fullname']
    item = comment['subreddit_id']
    item_name[item] = comment['subreddit']
    usersperitem[item].add(user)
    itemsperuser[user].add(item)
user_encoder = LabelEncoder().fit(list(itemsperuser.keys()))
item_encoder = LabelEncoder().fit(list(usersperitem.keys()))
# data : binary indicator whether user uses an item
# idxptr : col index
# indices : row index
# create (n_item*n_user) sparse matrix each row represents one user and each column represents one item

row_idx = []
col_idx = []
for user in tqdm(itemsperuser):
    # user_idx = user_encoder.transform(user)
    for item in itemsperuser[user]:
        # item_idx = item_encoder.transform(item)
        row_idx.append(user)
        col_idx.append(item)
col_idx = item_encoder.transform(np.array(col_idx))
row_idx = user_encoder.transform(np.array(row_idx))
col_idx.shape
row_idx.shape
data = np.ones_like(col_idx)
data.dtype
# creating a user item interaction matrix 
# each row is an item
# each column is a matrix
user_item_interaction = csr_matrix((data, (row_idx, col_idx)))
user_item_interaction.shape

100%|██████████| 77996/77996 [00:00<00:00, 943429.61it/s]


(77996, 18645)

In [76]:
def Jaccard(s1: np.ndarray, s2: np.ndarray):
    """
    Jaccard similarity between two arrays.

    Parameters:
        s1: input array 1
        s2: input array 2
    
    Returns:
        Jaccard similarity for the two arrays
    """
    numer = np.intersect1d(s1, s2).size
    denom = np.union1d(s1, s2).size

    if denom == 0:
        return 0
    return numer/denom

def score(user_history, item_history, user_item_interaction):
    """
    Return the similarity score for item i and user u.

    Parameters:
        u: user_label
        i: item_label
        user_history: array of items user has interacted with
        item_history: array of users who have used item i
        user_item_interaction: csr_matrix containing the user_item_interaction
    """
    # user_v_history = np.unique(user_item_interaction[item_history].nonzero()[1])
    sims = np.vectorize(lambda v: Jaccard(user_history, user_item_interaction[v].nonzero()[1]))(item_history)
    bestsim = sims.max()
    if bestsim == None:
        return 0 
    return bestsim

def rec(u:int, user_item_interaction:csr_matrix):
    """
    Given user `u` and user_item_interaction matrix with rows as items and columns as users,
    return the recommendation for the user beasr on jaccard similarity.

    Parameters:
        u: user label
        user_item_interaction : csr_matrix containing the user_item_interaction

    Return:
        most similar item based on user u's history,
    """
    user_history = user_item_interaction[u].nonzero()[1]
    n_items, n_users = user_item_interaction.shape
    items = np.arange(n_items)
    item_history = user_item_interaction[:, items].nonzero()[0]
    valid_items = items[~np.isin(items, user_history)]
    valid_item_history = item_history[~np.isin(items, user_history)]
    sims = np.vectorize(lambda i: score(user_history, valid_item_history[i==valid_items], user_item_interaction))(valid_items)
    bestitem = valid_items[sims.argmax()]
    bestscore = sims.max()

    return bestitem, bestscore

In [81]:
u = 12
user_history = user_item_interaction[u].nonzero()[1]
user_history # items that are used by the user

array([4676, 7261, 8672, 8929], dtype=int32)

In [82]:
n_users, n_items = user_item_interaction.shape
n_users, n_items

(77996, 18645)

In [83]:
items = np.arange(n_items)
items

array([    0,     1,     2, ..., 18642, 18643, 18644])

In [84]:
item_history = user_item_interaction[:, items].nonzero()
item_history

(array([    0,     1,     1, ..., 77994, 77994, 77995], dtype=int32),
 array([ 2351,  5507, 13533, ..., 13788, 15423,  3285], dtype=int32))

In [85]:
valid_items = items[~np.isin(items, user_history)]
valid_items

array([    0,     1,     2, ..., 18642, 18643, 18644])

In [86]:
valid_items.shape

(18641,)

In [87]:
idx = np.where(np.isin(item_history[1], valid_items))
valid_item_history = (item_history[0][idx], item_history[1][idx])
valid_item_history

(array([    0,     1,     1, ..., 77994, 77994, 77995], dtype=int32),
 array([ 2351,  5507, 13533, ..., 13788, 15423,  3285], dtype=int32))

In [88]:
valid_item_history[0][valid_item_history[1] == 4676]

array([], dtype=int32)

In [90]:
sims = np.vectorize(lambda i: score(user_history, 
                                    valid_item_history[0][valid_item_history[1] == i], 
                                    user_item_interaction))(valid_items)

In [97]:
sims.shape

(18641,)

In [98]:
np.unique(sims)

array([0.        , 0.00068823, 0.07692308, 0.11111111, 0.125     ,
       0.14285714, 0.16666667, 0.2       , 0.4       ])

In [100]:
bestitems = np.argsort(sims)[::-1]
bestscore = sims[bestitems]

In [101]:
bestitems[:5]

array([ 9657, 18177, 17768,  1892,  9301])

In [102]:
bestscore[:5]

array([0.4, 0.4, 0.2, 0.2, 0.2])

In [105]:
for item in item_encoder.inverse_transform(user_history):
    print(item_name[item])

BabyBumps
AskALiberal
namenerds
AskConservatives


In [104]:
for item in item_encoder.inverse_transform(bestitems[:5]):
    print(item_name[item])

SeattleGW
ClothedPreggo
Dororo
Sacramento
gaybbcpersonals
