In [113]:
import json
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from tqdm import tqdm
from pprint import pprint

In [114]:
# Notebook-wide formatting constants.
# NOTE(review): presumably the width / precision / type parts of a float format
# spec (e.g. f"{x:{WIDTH}.{PRECISION}{TYPE}}"); none of them is referenced in
# the cells visible here — confirm they are still needed.
WIDTH, PRECISION, TYPE = 7, 3, "f"

In [115]:
# Extend the module search path with the project directory.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# find the directory on another machine — consider deriving it from the
# notebook's own location or from configuration.
RECOMMENDER_ROOT = '/home/ashish/workspace/recommender/'
# Guard the append so that re-running this cell does not stack duplicate entries.
if RECOMMENDER_ROOT not in sys.path:
    sys.path.append(RECOMMENDER_ROOT)

In [116]:
# Location of the raw comment dump; the file name is reused later when the
# cleaned data is written out.
data_path = '../data/raw/RC_2023-01_2.json'
with open(data_path, 'rb') as f:
    # Read the raw bytes and decode them as a single JSON document.
    comments = json.loads(f.read())

In [117]:
# Size of the raw data set, before any cleaning step has run.
print(
    f'No. of comments in the data - {len(comments)}'
)

No. of comments in the data - 500000


## Data Cleaning and Data Loss Analysis after Cleaning

### Removing comments by `[deleted]` users

In [118]:
# Keep only the comments whose author field is not the '[deleted]' placeholder.
# `comments` is rebound to the filtered list, so the unfiltered list is no
# longer reachable under that name after this cell.
comments = [
    entry
    for entry in tqdm(comments)
    if entry['author'] != '[deleted]'
]

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 500000/500000 [00:00<00:00, 1353395.96it/s]


In [119]:
# Number of comments left once the '[deleted]' authors have been filtered out.
print(
    f'No. of comments after removing [deleted] comments - {len(comments)}'
)

No. of comments after removing [deleted] comments - 458358


### Removing subreddits that contain fewer than the threshold number of comments

In [120]:
# Tally how many of the current comments belong to each subreddit id.
# A defaultdict(int) starts every unseen id at 0, so no membership check is needed.
comments_per_subreddit = defaultdict(int)
for comment in tqdm(comments):
    comments_per_subreddit[comment['subreddit_id']] += 1

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 458358/458358 [00:00<00:00, 1222246.81it/s]


In [121]:
# Minimum number of comments a subreddit must have to be kept; subreddits with
# strictly fewer comments than this are counted as lost and filtered out below.
COMMENT_THRESHOLD: int = 10

In [122]:
# Comment counts of the subreddits that fall below COMMENT_THRESHOLD.
undersized_counts = [
    size
    for _, size in tqdm(comments_per_subreddit.items())
    if size < COMMENT_THRESHOLD
]
# Total comments that would be dropped, and how many subreddits they come from.
comments_lost = sum(undersized_counts)
subreddits_lost = len(undersized_counts)

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 25452/25452 [00:00<00:00, 1805035.77it/s]


In [123]:
# What survives the threshold filter: current totals minus the counted losses.
remaining_comments, remaining_subreddits = (
    len(comments) - comments_lost,
    len(comments_per_subreddit) - subreddits_lost,
)

In [124]:
# Surviving comments, shown against the current (pre-filter) comment total.
print(
    f'Remaining data points: {remaining_comments} (/{len(comments)})'
)

Remaining data points: 407380 (/458358)


In [125]:
# Surviving subreddits, shown against the number of distinct subreddits seen.
print(
    f'remaining_subreddits : {remaining_subreddits} (/{len(comments_per_subreddit)})'
)

remaining_subreddits : 6217 (/25452)


In [126]:
# Share of subreddits kept versus the share of comments they account for,
# both as percentages with two decimals.
print(
    f'{remaining_subreddits/len(comments_per_subreddit)*100:,.2f}% of the communities contain {remaining_comments/len(comments)*100:,.2f}% of comments'
)

24.43% of the communities contain 88.88% of comments


In [127]:
# Absolute and relative number of comments dropped by the threshold filter.
print(
    f'Loosing {comments_lost} data points out of {len(comments)} ({comments_lost/len(comments)*100:,.2f}%)'
)

Loosing 50978 data points out of 458358 (11.12%)


In [128]:
# Absolute and relative number of subreddits dropped by the threshold filter.
print(
    f'Loosing {subreddits_lost} communities out of {len(comments_per_subreddit)} ({subreddits_lost/len(comments_per_subreddit)*100:,.2f}%)'
)

Loosing 19235 communities out of 25452 (75.57%)


In [129]:
# Apply the threshold: keep a comment only if its subreddit has at least
# COMMENT_THRESHOLD comments according to the tally built earlier.
# `comments` is rebound to the filtered list.
comments = [
    entry
    for entry in tqdm(comments)
    if comments_per_subreddit[entry['subreddit_id']] >= COMMENT_THRESHOLD
]
# Number of comments remaining after the filter.
print(len(comments))

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 458358/458358 [00:00<00:00, 1087539.66it/s]

407380





## Data Dump

In [130]:
# Write the cleaned comments next to the raw data, under the same file name
# as the input file.
data_dump_path = '../data/cleaned/' + os.path.basename(data_path)
# open(..., 'w') does not create missing parent directories, so make sure the
# target directory exists; exist_ok avoids an error when it is already there.
os.makedirs(os.path.dirname(data_dump_path), exist_ok=True)
with open(data_dump_path, 'w') as f:
    json.dump(comments, f)