In [1]:
import json
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from tqdm import tqdm
from pprint import pprint

In [2]:
# Numeric display settings; presumably the width / precision / type parts of a
# float format spec (e.g. f"{x:{WIDTH}.{PRECISION}{TYPE}}") — not used in the cells shown here.
WIDTH, PRECISION, TYPE = 7, 3, "f"

In [3]:
# Make the project package importable from this notebook.
# NOTE(review): this is a machine-specific absolute path; the notebook will not
# find the project on another machine unless this is adjusted.
PROJECT_ROOT = '/home/ashish/workspace/recommender/'
sys.path.append(PROJECT_ROOT)

In [4]:
# Location of the raw comment dump, relative to the notebook's working directory.
data_path = '../data/raw/RC_2023-01_2.json'

# The file is read as bytes; the json parser accepts bytes and decodes them itself.
with open(data_path, 'rb') as raw_file:
    comments = json.loads(raw_file.read())

In [5]:
# Report how many comments were loaded from the raw file.
print(f'No. of comments in the data - {len(comments)}')

No. of comments in the data - 500000


## Data Cleaning and Data Loss Analysis after Cleaning

### Removing comments from `[deleted]` users

In [6]:
# Drop every comment whose author is the '[deleted]' placeholder.
# `comments` is rebound to the filtered list, so all later cells work on the
# reduced data set.
comments = [
    entry for entry in tqdm(comments)
    if entry['author'] != '[deleted]'
]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500000/500000 [00:00<00:00, 1278329.53it/s]


In [7]:
# Report the size of `comments` after the '[deleted]' filter above rebound it.
print(f'No. of comments after removing [deleted] comments - {len(comments)}')

No. of comments after removing [deleted] comments - 458358


### Removing subreddits that contain fewer than the threshold number of comments

In [8]:
# Count comments per subreddit, keyed by 'subreddit_id'.
# defaultdict(int) starts every unseen key at 0, so no membership check is needed.
comments_per_subreddit = defaultdict(int)
for entry in tqdm(comments):
    comments_per_subreddit[entry['subreddit_id']] += 1

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 458358/458358 [00:00<00:00, 1161332.35it/s]


In [9]:
# Minimum number of comments a subreddit must have to be kept.
# NOTE: the same name is rebound to 2 for the user filter further down, so this
# value only holds while the cells are run top to bottom.
COMMENT_THRESHOLD = 10

In [10]:
# Tally what the subreddit threshold would remove: collect the comment counts of
# every subreddit below COMMENT_THRESHOLD, then total them and count them.
small_subreddit_sizes = [
    size for size in tqdm(comments_per_subreddit.values())
    if size < COMMENT_THRESHOLD
]
comments_lost = sum(small_subreddit_sizes)
subreddits_lost = len(small_subreddit_sizes)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25452/25452 [00:00<00:00, 1005807.83it/s]


In [11]:
# What is left once the below-threshold subreddits (and their comments) are
# subtracted from the current totals.
remaining_comments = len(comments) - comments_lost
remaining_subreddits  = len(comments_per_subreddit) - subreddits_lost

In [12]:
# Comments that survive the subreddit threshold, out of the current total.
print(f'Remaining data points: {remaining_comments} (/{len(comments)})')

Remaining data points: 407380 (/458358)


In [13]:
# Subreddits at or above the threshold, out of all subreddits seen.
print(f'remaining_subreddits : {remaining_subreddits} (/{len(comments_per_subreddit)})')

remaining_subreddits : 6217 (/25452)


In [14]:
# Share of subreddits kept versus the share of comments they hold, both as percentages.
print(f'{remaining_subreddits/len(comments_per_subreddit)*100:,.2f}% of the communities contain {remaining_comments/len(comments)*100:,.2f}% of comments')

24.43% of the communities contain 88.88% of comments


In [15]:
# Comments dropped by the subreddit threshold: absolute count and percentage of the total.
print(f'Loosing {comments_lost} data points out of {len(comments)} ({comments_lost/len(comments)*100:,.2f}%)')

Loosing 50978 data points out of 458358 (11.12%)


In [16]:
# Subreddits below the threshold: absolute count and percentage of all subreddits.
print(f'Loosing {subreddits_lost} communities out of {len(comments_per_subreddit)} ({subreddits_lost/len(comments_per_subreddit)*100:,.2f}%)')

Loosing 19235 communities out of 25452 (75.57%)


In [17]:
# Keep only comments whose subreddit reaches COMMENT_THRESHOLD.
# NOTE: the result is held in its own list; `comments` is NOT rebound here, so
# every later cell still operates on the data without this filter applied.
comment_rm_sr_thr = [
    entry for entry in tqdm(comments)
    if comments_per_subreddit[entry['subreddit_id']] >= COMMENT_THRESHOLD
]
print(len(comment_rm_sr_thr))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 458358/458358 [00:00<00:00, 1024483.68it/s]

407380





### Removing inactive users, i.e. users with fewer than the threshold number of comments

In [22]:
# Count comments per user, keyed by 'author_fullname'.
# defaultdict(int) starts every unseen key at 0, so no membership check is needed.
comments_per_user = defaultdict(int)
for entry in tqdm(comments):
    comments_per_user[entry['author_fullname']] += 1

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 458358/458358 [00:00<00:00, 757782.24it/s]


In [31]:
# Minimum number of comments a user needs in order to be kept.
# NOTE: this rebinds the COMMENT_THRESHOLD name that held the subreddit cutoff (10)
# earlier; re-running the subreddit cells after this one would silently use 2.
COMMENT_THRESHOLD = 2

In [32]:
# Tally what the user threshold would remove: collect the comment counts of every
# user below COMMENT_THRESHOLD, then total them and count them.
# This overwrites the `comments_lost` value computed for subreddits earlier.
inactive_user_sizes = [
    size for size in tqdm(comments_per_user.values())
    if size < COMMENT_THRESHOLD
]
comments_lost = sum(inactive_user_sizes)
users_lost = len(inactive_user_sizes)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221755/221755 [00:00<00:00, 1872895.27it/s]


In [33]:
# What is left once below-threshold users (and their comments) are subtracted.
# The base is len(comments), i.e. the list without the subreddit filter applied
# (that filter's result lives in comment_rm_sr_thr), so the two losses are not combined.
remaining_comments = len(comments) - comments_lost
remaining_users  = len(comments_per_user) - users_lost

In [34]:
# Comments that survive the user threshold, out of the current total.
print(f'Remaining data points: {remaining_comments} (/{len(comments)})')

Remaining data points: 314599 (/458358)


In [35]:
# Users at or above the threshold, out of all users seen.
print(f'Remaining Users: {remaining_users} (/{len(comments_per_user)})')

Remaining Users: 77996 (/221755)


In [36]:
# Share of users kept versus the share of comments they wrote, both as percentages.
print(f'{remaining_users/len(comments_per_user)*100:,.2f}% of the users account for {remaining_comments/len(comments)*100:,.2f}% of comments')

35.17% of the users account for 68.64% of comments


In [37]:
# Comments dropped by the user threshold: absolute count and percentage of the total.
print(f'Loosing {comments_lost} data points out of {len(comments)} ({comments_lost/len(comments)*100:,.2f}%)')

Loosing 143759 data points out of 458358 (31.36%)


In [38]:
# Users below the threshold: absolute count and percentage of all users.
# With a threshold of 2 every dropped user has exactly one comment, so this count
# equals comments_lost.
print(f'Loosing {users_lost} users out of {len(comments_per_user)} ({users_lost/len(comments_per_user)*100:,.2f}%)')

Loosing 143759 users out of 221755 (64.83%)


In [39]:
# Keep only comments whose author reaches COMMENT_THRESHOLD.
# NOTE: the result is held in its own list; `comments` is NOT rebound here, so
# `comments` itself still contains the comments of below-threshold users.
comment_rm_au_thr = [
    entry for entry in tqdm(comments)
    if comments_per_user[entry['author_fullname']] >= COMMENT_THRESHOLD
]
print(len(comment_rm_au_thr))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 458358/458358 [00:00<00:00, 975519.71it/s]

314599





In [42]:
# Fraction of the 500000 loaded comments that is gone after removing '[deleted]'
# authors and below-threshold users (314599 left). Both figures are typed in by hand
# from the outputs above and do not include the subreddit filter.
(500000 - 314599)/500000

0.370802

## Data Dump

In [130]:
# Write the comment list to ../data/cleaned/ under the same file name as the raw input.
# NOTE(review): in the cells visible here `comments` has only had the '[deleted]'
# authors removed; the subreddit and user threshold results live in
# comment_rm_sr_thr / comment_rm_au_thr. Confirm `comments` is the intended payload.
cleaned_dir = '../data/cleaned/'
# open(..., 'w') does not create missing directories, so make sure the target exists.
os.makedirs(cleaned_dir, exist_ok=True)
data_dump_path = cleaned_dir + os.path.basename(data_path)
with open(data_dump_path, 'w') as f:
    json.dump(comments, f)