# In [None]:
# Created by Fanmei Wang, March 30, 2025

import os
import json
import zstandard as zstd

# ========== Configuration ==========

# Input file (zstandard-compressed, newline-delimited JSON)
input_file = r"path\RS_2024-02.zst"

# Output file (only save posts from target subreddits)
output_file = "path/extracted_RS_2024-02_filtered.jsonl"

# ✅ Target subreddits to extract (lowercase)
target_subreddits = {"immigrationcanada", "canadaimmigrant"}

# Number of decompressed bytes requested from the stream per read (1 MiB)
CHUNK_SIZE = 2**20

# Print a progress line every this many input lines
PROGRESS_EVERY = 100_000

# Largest zstd window the decompressor is allowed to allocate (2 GiB)
MAX_WINDOW_SIZE = 2_147_483_648


# ========== Helpers ==========

def iter_lines(reader, chunk_size=CHUNK_SIZE):
    """Yield newline-delimited byte lines from a binary stream.

    Args:
        reader: Object with a ``read(n)`` method returning ``bytes``
            (``b""`` at end of stream).
        chunk_size: Number of bytes requested per ``read`` call.

    Yields:
        Each line as ``bytes`` without its trailing ``b"\\n"``. A final line
        that is not newline-terminated is yielded too, so the last record of
        the stream is never lost.
    """
    pending = b""
    while True:
        chunk = reader.read(chunk_size)
        if not chunk:
            break
        # The last piece of the split may be an incomplete line; carry it
        # over and prepend it to the next chunk.
        *complete, pending = (pending + chunk).split(b"\n")
        yield from complete
    if pending:
        yield pending


def match_post(line, subreddits):
    """Parse one JSON line and return it if it belongs to a target subreddit.

    Args:
        line: Raw JSON text (``bytes`` or ``str``) for a single record.
        subreddits: Collection of lowercase subreddit names to keep.

    Returns:
        The decoded ``dict`` when its ``"subreddit"`` value, lowercased, is in
        ``subreddits``; otherwise ``None``. Lines that are not valid JSON, not
        valid UTF-8, not JSON objects, or whose ``"subreddit"`` is missing or
        not a string are skipped by returning ``None``.
    """
    try:
        post = json.loads(line)
    except ValueError:
        # Covers json.JSONDecodeError and UnicodeDecodeError (both are
        # ValueError subclasses) so one corrupt line cannot abort the run.
        return None

    if not isinstance(post, dict):
        return None

    name = post.get("subreddit")
    if isinstance(name, str) and name.lower() in subreddits:
        return post
    return None


def extract_posts(input_path, output_path, subreddits):
    """Stream a .zst NDJSON file and write the matching posts as JSON lines.

    Args:
        input_path: Path to the zstandard-compressed input file.
        output_path: Path of the JSONL file to create (overwritten).
        subreddits: Collection of lowercase subreddit names to keep.

    Returns:
        A ``(lines_read, lines_written)`` tuple: how many input lines were
        seen and how many posts were written to ``output_path``.
    """
    compressed_size = os.path.getsize(input_path)
    lines_read = 0
    lines_written = 0

    with open(output_path, "w", encoding="utf-8") as out, \
            open(input_path, "rb") as compressed_file:
        dctx = zstd.ZstdDecompressor(max_window_size=MAX_WINDOW_SIZE)
        with dctx.stream_reader(compressed_file) as reader:
            for line in iter_lines(reader):
                lines_read += 1

                post = match_post(line, subreddits)
                if post is not None:
                    out.write(json.dumps(post) + "\n")
                    lines_written += 1

                if lines_read % PROGRESS_EVERY == 0:
                    # Progress is measured on the *compressed* side: the
                    # underlying file position says how much of the input
                    # the decompressor has consumed so far.
                    consumed = compressed_file.tell()
                    percent = (consumed / compressed_size) * 100 if compressed_size else 100.0
                    print(f"Processed {lines_read:,} lines, {percent:.2f}% of compressed file, matched: {lines_written:,}")

    return lines_read, lines_written


# ========== Script starts ==========

# Runs when executed as a script or as a notebook cell (both set __name__ to
# "__main__"); importing this file only defines the helpers above.
if __name__ == "__main__":
    file_size = os.path.getsize(input_file)

    print(f"Starting extraction: {input_file}")
    print(f"Compressed file size: {file_size / (1024*1024):.2f} MB")
    print(f"Filtering for subreddits: {target_subreddits}")

    lines_read, lines_written = extract_posts(input_file, output_file, target_subreddits)

    print("\n✅ Extraction completed!")
    print(f"🔢 Total lines read    : {lines_read:,}")
    print(f"✅ Total posts matched : {lines_written:,}")
    print(f"📄 Output file         : {output_file}")
