In [2]:
import numpy as np
from sklearn.metrics import jaccard_score
from collections import defaultdict
import json
import random
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import sys
import random
from operator import itemgetter

In [5]:
from sklearn.preprocessing import LabelEncoder

In [13]:
from scipy.sparse import csr_matrix

In [3]:
# Load the cleaned Reddit comment dump into memory as a list of comment records.
data_path = '../data/cleaned/RC_2023-01_2.json'
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open(data_path, 'r', encoding='utf-8') as fh:
    comments = json.load(fh)

### Simple Data Stats

In [4]:
# Headline counts for the loaded comments.
# NOTE(review): users are counted by 'author' here, while the interaction
# dictionaries built further down key on 'author_fullname' — confirm the two
# fields identify the same set of users.
print(f'No. of comments in the data - {len(comments)}')
n_subreddit = len({comment['subreddit_id'] for comment in comments})
print(f'No. of unique subreddits : {n_subreddit}')
n_users = len({comment['author'] for comment in comments})
print(f'No. of unique users : {n_users}')

No. of comments in the data - 314599
No. of unique subreddits : 18645
No. of unique users : 77996


### Jaccard Similarity Model

In [49]:
### Jaccard Similarity Model

def Jaccard(s1: np.ndarray, s2: np.ndarray) -> float:
    """Return the Jaccard similarity |s1 ∩ s2| / |s1 ∪ s2| of two collections.

    Both inputs are treated as sets: ``np.intersect1d`` and ``np.union1d``
    de-duplicate (and flatten) their arguments, so repeated values do not
    affect the score. Any array-like accepted by those functions works,
    including plain lists.

    Args:
        s1: First collection of hashable, comparable values (e.g. user indices).
        s2: Second collection, same kind of values as ``s1``.

    Returns:
        A float in [0, 1]. When both inputs are empty the union is empty and
        the similarity is defined as 0.0 rather than raising on division.
    """
    numer = np.intersect1d(s1, s2).size
    denom = np.union1d(s1, s2).size
    if denom == 0:
        # Return a float here as well so every code path has the same type.
        return 0.0
    return numer / denom

In [71]:
# Sanity check: the two lists share {2, 3} out of five distinct values,
# so the expected score is 2/5.
s1 = [1,2,3]
s2 = [2,3,4, 5]

Jaccard(s1, s2)

0.4

In [103]:
# Size of the union of the two example lists, i.e. the Jaccard denominator.
len(np.union1d(s1, s2))

5

### Sparse Representation of the data

In [6]:
# Build both adjacency views of the comment data in a single pass:
#   usersperitem: subreddit_id     -> set of author_fullname values seen in it
#   itemsperuser: author_fullname  -> set of subreddit_id values the user commented in
#   item_name:    subreddit_id     -> value of the comment's 'subreddit' field,
#                                     used later when printing results
usersperitem = defaultdict(set)
itemsperuser = defaultdict(set)
# A plain dict: a defaultdict created without a factory behaves exactly like
# one (missing keys raise KeyError), so the wrapper only obscured that.
item_name = {}
for comment in comments:
    user = comment['author_fullname']
    item = comment['subreddit_id']
    item_name[item] = comment['subreddit']
    usersperitem[item].add(user)
    itemsperuser[user].add(item)

In [7]:
# Fit one label encoder per axis of the interaction matrix: user ids are later
# mapped to column indices and item ids to row indices.
user_encoder = LabelEncoder()
user_encoder.fit(list(itemsperuser))
item_encoder = LabelEncoder()
item_encoder.fit(list(usersperitem))

In [9]:
# Collect the coordinates of an (n_items x n_users) binary interaction matrix:
# each row stands for one item and each column for one user.
# At this point both lists still hold the raw ids; they are converted to integer
# positions by the fitted encoders in a later cell.
row_idx, col_idx = [], []
for user, items in tqdm(itemsperuser.items()):
    # One (item, user) coordinate per item this user interacted with.
    row_idx.extend(items)
    col_idx.extend([user] * len(items))

100%|██████████| 77996/77996 [00:00<00:00, 411213.10it/s]


In [10]:
# Map the raw ids collected above to the integer positions assigned by the fitted encoders.
# NOTE(review): both names are rebound from raw ids to encoded integers, so this
# cell is presumably not safe to run twice without rebuilding the lists first — confirm.
col_idx = user_encoder.transform(np.array(col_idx))
row_idx = item_encoder.transform(np.array(row_idx))

In [12]:
# One stored value of 1 per (item, user) coordinate; shape and dtype follow col_idx.
data = np.ones_like(col_idx)

In [19]:
# Inspect the element type of the values that will be stored in the sparse matrix.
data.dtype

dtype('int64')

In [33]:
# Assemble the sparse interaction matrix from (value, (row, column)) triplets.
# Despite the variable's name, rows are items and columns are users, because
# row_idx holds encoded item ids and col_idx holds encoded user ids.
# No explicit shape is passed — presumably it is inferred from the largest indices.
user_item_interaction = csr_matrix((data, (row_idx, col_idx)))

In [40]:
# Expected layout: (number of items, number of users).
user_item_interaction.shape

(18645, 77996)

In [46]:
%%timeit
# Benchmark row access: pick a random item (row) and list its nonzero columns.
# The random draw is part of the timed body.
item_idx = np.random.randint(0,len(usersperitem))
user_item_interaction[item_idx].nonzero()

98.4 µs ± 397 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [47]:
%%timeit
# Benchmark column access: pick a random user (column) and list its nonzero rows.
# The random draw is part of the timed body.
user_idx = np.random.randint(0,len(itemsperuser))
user_item_interaction[:,user_idx].nonzero()

929 µs ± 3.05 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [48]:
# Rows of the interaction matrix index items; columns index users.
num_items = user_item_interaction.shape[0]
num_users = user_item_interaction.shape[1]

Slicing a column is roughly ten times more costly than slicing a row in CSR format, so the similarity search below iterates over rows (items):
- 98.4 µs ± 397 ns per loop for slicing rows
- 929 µs ± 3.05 µs per loop for slicing columns

In [114]:
## finding most similar items based on user-item interaction using Jaccard Similarity metric
%time
random_item = np.random.randint(0, num_items)
target_user_idx = user_item_interaction[random_item].nonzero()[1]
sim = []
for item_idx in range(num_items):
    if item_idx == random_item:
        continue
    user_idx = user_item_interaction[item_idx].nonzero()[1]
    idx_sim = Jaccard(user_idx, target_user_idx)
    sim.append(idx_sim)  

target_item_name = item_encoder.inverse_transform([random_item])
print(item_name[target_item_name[0]])
print('---------------------------------')
top_items = np.argsort(sim)[::-1]
top_item_names = item_encoder.inverse_transform(top_items[:10])
for item in top_item_names:
    print(item_name[item])
    

CPU times: user 7 µs, sys: 0 ns, total: 7 µs
Wall time: 17.2 µs
holdmycatnip
---------------------------------
Possums
StartledCats
diabetes_t2
CampingGear
WestVirginia
InternetIsBeautiful
CorollaHatchback
agedlikewine
puffco
worldnews


In [101]:
# Column (user) indices of the nonzero entries in the sampled item's row.
target_user_idx

array([20591, 22661, 48101, 66626], dtype=int32)

recruiting


MemesEnEspanol
enlistedgame
TouhouArt
disablednudes
BBWnThiccness
MurderedByWords
wholesomegifs
blondegirlsfucking
Isekai
AfterPrisonShow
