In [1]:
import numpy 
from sklearn.metrics import jaccard_score
from collections import defaultdict
import json
import random

In [2]:
from tqdm import tqdm

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
import sys

In [5]:
import random

In [6]:
data_path = '../data/cleaned/RC_2023-01_2.json'
with open(data_path, 'r') as fh:
    comments = json.load(fh)

In [7]:
comments[0]['created_utc']

1672541331

### Simple Data Stats

In [8]:
print(f'No. of comments in the data - {len(comments)}')

No. of comments in the data - 314599


In [9]:
n_subreddit = len(set([comment['subreddit_id'] for comment in comments]))
print(f'No. of unique subreddits : {n_subreddit}')

No. of unique subreddits : 18645


In [10]:
n_users = len(set([comment['author'] for comment in comments]))
print(f'No. of unique users : {n_users}')

No. of unique users : 77996


### Data Transformation

In [11]:
usersperitem = defaultdict(set)
itemsperuser = defaultdict(set)
item_name = defaultdict()
for comment in comments:
    user = comment['author_fullname']
    item = comment['subreddit_id']
    item_name[item] = comment['subreddit']
    usersperitem[item].add(user)
    itemsperuser[user].add(item)

In [12]:
users = [u for u in itemsperuser]

### Jaccard Similarity

In [13]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom == 0:
        return 0
    return numer / denom

In [14]:
# Retrieve the most similar item based on jaccard similarity
def mostSimilar(i, N):
    similarities = []
    users = usersperitem[i]
    for i2 in usersperitem:
        if i2 == i: continue
        sim = Jaccard(users, usersperitem[i2])
        #sim = Pearson(i, i2) # Could use alternate similarity metrics straightforwardly
        similarities.append((sim,i2))
    similarities.sort(reverse=True)
    return similarities[:N]

In [15]:
idx = random.randint(0, len(comments))
query = comments[idx]['subreddit_id']
similar_items = mostSimilar(query, 5)
# print(similar_items)
print(item_name[query])
for sim, _id in similar_items:
    print(sim, item_name[_id])

Flipping
0.09090909090909091 HusbandSharing
0.08333333333333333 eBaySellerAdvice
0.07692307692307693 FacebookMarketplace
0.05 Infidelity
0.03333333333333333 IllegalLifeProTips


In [16]:
## Given a user find the subreddits to reccommend

# 2: Jaccard similarity with most similar user who has consumed i
def score(u, i, userHistory):
    bestsim = None
    for v in usersperitem[i]:
        if u == v:
            continue
        sim = Jaccard(userHistory, itemsperuser[v])
        if bestsim == None or sim > bestsim:
            bestsim = sim
    if bestsim == None:
        return 0
    return bestsim

def rec(u):
    history = itemsperuser[u]
    bestitem = None
    bestscore = None
    for i in item_name:
        if i in history:
            continue
        s = score(u, i, history)
        if bestitem == None or s > bestscore:
            bestitem = i
            bestscore = s
    return bestitem, bestscore

In [18]:
while True:
    u = random.sample(users, 1)[0]
    if len(itemsperuser[u]) > 3:
        break
item, similarity_score = rec(u)
print(f'recommended - {item_name[item]}')
print('user history')
for item in itemsperuser[u]:
    print(item_name[item])

recommended - CryptoCurrency
user history
greentext
meirl
vegan
jakeandamir


## Data Splitting Strategy

In [19]:
from datetime import datetime

def unix_to_datetime(unixtime: int):
    return datetime.fromtimestamp(unixtime).strftime('%Y-%m-%d-%H-%M-%S')

def datetime_to_unix(date: datetime):
    return int((datetime(*list(map(int, date.split('-')))) - datetime(1970, 1, 1)).total_seconds())

In [20]:
print(f'No. of data points in the dataset: {len(comments)}')

No. of data points in the dataset: 314599


In [21]:
users_with_one_subreddit = 0
for user, items in itemsperuser.items():
    if len(items) == 1:
        users_with_one_subreddit+=1
print(f'No. of users who have interacted with only one subreddit: {users_with_one_subreddit}')

No. of users who have interacted with only one subreddit: 36738


In [22]:
type(comments[0]['created_utc'])

int

In [205]:
min_time = sys.maxsize
max_time = 0
for comment in tqdm(comments):
    unixtime = comment['created_utc']
    if unixtime < min_time:
        min_time = unixtime
    if unixtime > max_time:
        max_time = unixtime

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 314599/314599 [00:00<00:00, 786486.53it/s]


In [206]:
min_time

1672541331

In [207]:
max_time

1672547219

In [231]:
print(f'Earliest  date : {unix_to_datetime(min_time)}')
print(f'Latest date : {unix_to_datetime(max_time)}')

Earliest  date : 2022-12-31-23-48-51
Latest date : 2023-01-01-01-26-59


In [44]:
def get_authors(data: list):
    authors = []
    for comment in comments:
        author = comment['author_fullname']
        if author in authors:
            continue
        authors.append(author)
    return authors

def random_split(data:list, ratio: float=0.25):
    indices = range(len(data))
    train_idx, test_idx = train_test_split(indices, test_size = ratio)
    return train_idx, test_idx

def chronological_split(data:list, date: int):
    #todo
    pass
    
def leave_one_out_split(data: list):
    
    # authors = get_authors(data)
    author_idx = defaultdict(list)
    train_idx, test_idx = [], []
    for i, comment in tqdm(enumerate(comments)):
        author = comment['author_fullname']
        author_idx[author].append(i)
    # print(author_idx[author])
    for author, _idx in tqdm(author_idx.items()):
        # print(_idx)
        if len(_idx) >= 2:
            train_idx.extend(_idx[:-1])
            test_idx.append(_idx[-1])
    return train_idx, test_idx

In [45]:
# train_idx, test_idx = random_split(comments)
train_idx, test_idx = leave_one_out_split(comments)

314599it [00:00, 422251.95it/s]
100%|██████████| 77996/77996 [00:00<00:00, 530778.46it/s]


In [46]:
len(train_idx)

236603

In [47]:
len(test_idx)

77996

In [None]:
# transform data
def filter_data(data: list, idx: list):
    return data[idx]