In [1]:
import zstandard
import os
import json
import sys
import csv
from datetime import datetime
import logging.handlers

In [8]:
os.getcwd()

'/Users/albertvong/Library/CloudStorage/OneDrive-NorthwesternUniversity/Github Coding Stuff/Reddit NBA Upvote Predictor'

In [11]:
#Define input and output file names
input_file = os.getcwd() + '/reddit/subreddits/nba_submissions.zst'
# put the name or path to the output file. The file extension from below will be added automatically
output_file = os.getcwd() +'/output'

In [29]:
output_file

'/Users/albertvong/Library/CloudStorage/OneDrive-NorthwesternUniversity/Github Coding Stuff/Reddit NBA Upvote Predictoroutput'

In [12]:
output_format = "txt"
# The filename from the torrent has which type it is, but you'll need to change this if you removed that from the filename
is_submission = "submission" in input_file

# Only output items between these two dates
from_date = datetime.strptime("2019-10-01", "%Y-%m-%d")
to_date = datetime.strptime("2020-06-01", "%Y-%m-%d")

In [14]:
#Setting up logging

log = logging.getLogger("bot")
log.setLevel(logging.INFO)
log_formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s')
log_str_handler = logging.StreamHandler()
log_str_handler.setFormatter(log_formatter)
log.addHandler(log_str_handler)
if not os.path.exists("logs"):
	os.makedirs("logs")
log_file_handler = logging.handlers.RotatingFileHandler(os.path.join("logs", "bot.log"), maxBytes=1024*1024*16, backupCount=5)
log_file_handler.setFormatter(log_formatter)
log.addHandler(log_file_handler)

## Defining functions for reading from zst json

In [15]:
def write_line_zst(handle, line):
	handle.write(line.encode('utf-8'))
	handle.write("\n".encode('utf-8'))

In [16]:
def write_line_json(handle, obj):
	handle.write(json.dumps(obj))
	handle.write("\n")

In [17]:
def write_line_csv(writer, obj, is_submission):
	output_list = []
	output_list.append(str(obj['score']))
	output_list.append(datetime.fromtimestamp(obj['created_utc']).strftime("%Y-%m-%d"))
	if is_submission:
		output_list.append(obj['title'])
	output_list.append(f"u/{obj['author']}")
	output_list.append(f"https://www.reddit.com{obj['permalink']}")
	if is_submission:
		if obj['is_self']:
			if 'selftext' in obj:
				output_list.append(obj['selftext'])
			else:
				output_list.append("")
		else:
			output_list.append(obj['url'])
	else:
		output_list.append(obj['body'])
	writer.writerow(output_list)

In [18]:
def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0):
	chunk = reader.read(chunk_size)
	bytes_read += chunk_size
	if previous_chunk is not None:
		chunk = previous_chunk + chunk
	try:
		return chunk.decode()
	except UnicodeDecodeError:
		if bytes_read > max_window_size:
			raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes")
		log.info(f"Decoding error with {bytes_read:,} bytes, reading another chunk")
		return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read)


In [19]:
def read_lines_zst(file_name):
	with open(file_name, 'rb') as file_handle:
		buffer = ''
		reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle)
		while True:
			chunk = read_and_decode(reader, 2**27, (2**29) * 2)

			if not chunk:
				break
			lines = (buffer + chunk).split("\n")

			for line in lines[:-1]:
				yield line.strip(), file_handle.tell()

			buffer = lines[-1]

		reader.close()


In [26]:
#Check for fields, values and exact match
field = 'score'
score_thresh = 100 #Field threshold for number of upvotes
exact_match = False




In [28]:
output_path = f"{output_file}.{output_format}"

writer = None
if output_format == "zst":
    handle = zstandard.ZstdCompressor().stream_writer(open(output_path, 'wb'))
elif output_format == "txt":
    handle = open(output_path, 'w', encoding='UTF-8')
elif output_format == "csv":
    handle = open(output_path, 'w', encoding='UTF-8', newline='')
    writer = csv.writer(handle)
else:
    log.error(f"Unsupported output format {output_format}")
    sys.exit()


file_size = os.stat(input_file).st_size
file_bytes_processed = 0
created = None
matched_lines = 0
bad_lines = 0
total_lines = 0

for line, file_bytes_processed in read_lines_zst(input_file):
    total_lines += 1
    if total_lines % 100000 == 0:
        log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {total_lines:,} : {matched_lines:,} : {bad_lines:,} : {file_bytes_processed:,}:{(file_bytes_processed / file_size) * 100:.0f}%")

    try:
        obj = json.loads(line)
        created = datetime.utcfromtimestamp(int(obj['created_utc']))

        if created < from_date:
            continue
        if created > to_date:
            continue
        
        matched = False
        score = obj['score']
        if score >= score_thresh:
            matched = True
            
        if not matched:
            continue
        #Run this if you have additional string filters
        # field_value = obj[field].lower() #For strings only
        # field_value = obj[field]
        # matched = False
        # for value in values:
        #     if exact_match:
        #         if value == field_value:
        #             matched = True
        #             break
        #     else:
        #         if value in field_value:
        #             matched = True
        #             break
        # if not matched:
        #     continue
        
        #If we fulfilled some of the criteria then we can output this line (which means something has matched)
        matched_lines += 1
        if output_format == "zst":
            write_line_zst(handle, line)
        elif output_format == "csv":
            write_line_csv(writer, obj, is_submission)
        elif output_format == "txt":
            write_line_json(handle, obj)
    except (KeyError, json.JSONDecodeError) as err:
        bad_lines += 1

handle.close()
log.info(f"Complete : {total_lines:,} : {matched_lines:,} : {bad_lines:,}")

2023-03-25 21:47:10,844 - INFO: 2013-05-21 17:25:15 : 100,000 : 0 : 0 : 32,244,450:7%
2023-03-25 21:47:11,995 - INFO: 2014-06-03 18:15:19 : 200,000 : 0 : 0 : 48,628,825:11%
2023-03-25 21:47:13,119 - INFO: 2015-03-28 01:05:24 : 300,000 : 0 : 0 : 66,717,175:15%
2023-03-25 21:47:14,349 - INFO: 2015-12-16 05:44:06 : 400,000 : 0 : 0 : 83,363,700:19%
2023-03-25 21:47:15,588 - INFO: 2016-07-07 03:41:53 : 500,000 : 0 : 0 : 101,189,900:23%
2023-03-25 21:47:17,044 - INFO: 2017-03-12 21:52:56 : 600,000 : 0 : 0 : 134,220,800:30%
2023-03-25 21:47:18,199 - INFO: 2017-07-23 05:13:16 : 700,000 : 0 : 0 : 148,507,975:33%
2023-03-25 21:47:19,429 - INFO: 2018-01-01 04:58:26 : 800,000 : 0 : 0 : 161,746,550:36%
2023-03-25 21:47:21,004 - INFO: 2018-05-07 04:51:09 : 900,000 : 0 : 0 : 184,422,525:42%
2023-03-25 21:47:22,743 - INFO: 2018-08-07 01:09:47 : 1,000,000 : 0 : 0 : 206,574,200:47%
2023-03-25 21:47:24,734 - INFO: 2019-01-17 04:10:49 : 1,100,000 : 0 : 0 : 227,546,200:51%
2023-03-25 21:47:26,870 - INFO: 2