In [None]:
# CELL 1: SETUP
from google.colab import drive
import os
import shutil

# 1. Attach Google Drive to the Colab filesystem
print("Mounting Google Drive...")
drive.mount('/content/drive')

# 2. Make sure the Drive folder that will receive the results exists
output_dir = "/content/drive/MyDrive/Processed_Reddit_Data"
os.makedirs(output_dir, exist_ok=True)

# 3. Start from an empty local scratch directory: drop any previous one,
#    then recreate it
raw_dir = "/content/raw_data"
if os.path.exists(raw_dir):
    shutil.rmtree(raw_dir)
os.makedirs(raw_dir, exist_ok=True)

# 4. Write a small probe file to confirm the Drive folder is writable.
#    Any failure is reported on stdout instead of stopping the cell.
test_file = os.path.join(output_dir, "access_test.txt")
try:
    with open(test_file, mode='w') as probe:
        probe.write("System Ready.")
    print(
        f"SUCCESS: Connected to {output_dir}",
        "You are safe to run the Master Script below.",
        sep="\n",
    )
except Exception as err:
    print(f"ERROR: Cannot write to Drive. Reason: {err}")

In [None]:
# ==============================================================================
# FINAL TURBO SCRIPT: ARIA2C DOWNLOADER + AUTO EXTRACTION
# ==============================================================================
import os
import shutil
import zstandard as zstd
import json
import csv
import io
import time
import sys
from pathlib import Path
from google.colab import drive

# --- 1. SETUP ENV ---
# Status messages are plain ASCII: the previous emoji prefixes had been
# corrupted into mojibake and printed as garbage.
print("Mounting Google Drive...")
drive.mount('/content/drive')

print("Installing aria2c (High-Speed Downloader)...")
install_status = os.system("apt-get install aria2 -y")

# Fail fast: every download below shells out to aria2c, so a missing binary
# would otherwise only show up as "no .zst files found" for each month.
if shutil.which("aria2c") is None:
    raise RuntimeError(
        f"aria2c is not available (apt-get exit status: {install_status}); "
        "cannot download any data."
    )

# --- 2. CONFIGURATION ---
# Half-width, in seconds, of the time window kept around each release timestamp.
WINDOW_SECONDS = 14 * 24 * 60 * 60  # 14 days
# Drive folder that receives the per-subreddit CSV files.
OUTPUT_DIR = "/content/drive/MyDrive/Processed_Reddit_Data"
# Local scratch folder for downloaded archives; wiped between months.
RAW_DIR = "/content/raw_data"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Extra tracker announce URLs. They are appended to each magnet link as
# "&tr=<url>" parameters (helps find peers faster).
TRACKERS = [
    "udp://tracker.opentrackr.org:1337/announce",
    "udp://9.rarbg.com:2810/announce",
    "udp://tracker.openbittorrent.com:6969/announce",
    "http://tracker.openbittorrent.com:80/announce",
    "udp://opentracker.i2p.rocks:6969/announce",
    "udp://tracker.internetwarriors.net:1337/announce",
    "udp://tracker.leechers-paradise.org:6969/announce",
    "udp://coppersurfer.tk:6969/announce",
    "udp://tracker.zer0day.to:1337/announce",
]

# Pre-built query-string suffix, one "&tr=" parameter per tracker, in list order.
TRACKER_STR = "".join("&tr=" + announce_url for announce_url in TRACKERS)

# Download/extraction schedule, one entry per monthly archive.
#   "month":   label used in progress messages
#   "hash":    BitTorrent info-hash placed in the magnet link
#   "targets": subreddit name -> list of release timestamps (Unix seconds);
#              an item is kept when its created_utc lies within
#              WINDOW_SECONDS of one of them
# A release whose window spans a month boundary is listed under both months.
# NOTE(review): the AUG/SEP/OCT timestamps decode to 2024 dates while the
# JAN/FEB/MAR ones decode to 2025 -- confirm each hash points at the archive
# for the matching year.
# NOTE(review): the SEP and OCT hashes look like placeholder patterns --
# verify them against the real torrents before running.
TASKS = [
    {
        "month": "JAN",
        "hash": "4fd14d4c3d792e0b1c5cf6b1d9516c48ba6c4a24",
        "targets": {
            "MacMiller": [1737072000],
            "TheWeeknd": [1738281600],
        },
    },
    {
        "month": "FEB",
        "hash": "2f873e0b15da5ee29b63e586c0ab1dedd3508870",
        "targets": {
            "TheWeeknd": [1738281600],
            "drizzy": [1739491200],
        },
    },
    {
        "month": "MAR",
        "hash": "bec5590bd3bc6c0f2d868f36ec92bec1aff4480e",
        "targets": {
            "playboicarti": [1741910400],
        },
    },
    {
        "month": "AUG",
        "hash": "b6a7ccf72368a7d39c018c423e01bc15aa551122",
        "targets": {
            "SabrinaCarpenter": [1724371200],
        },
    },
    {
        "month": "SEP",
        "hash": "e5a1c4d2b3f9e8d7c6b5a4f3e2d1c0b9a8f7e6d5",
        "targets": {
            "SabrinaCarpenter": [1724371200],
            "TaylorSwift": [1727913600],
        },
    },
    {
        "month": "OCT",
        "hash": "9d8e7f6a5b4c3d2e1f0a9b8c7d6e5f4a3b2c1d0e",
        "targets": {
            "TaylorSwift": [1727913600],
            "TameImpala": [1729123200],
        },
    },
]

# --- 3. HELPER FUNCTIONS ---
def clean_workspace():
    """Reset RAW_DIR to an empty directory, discarding anything already in it."""
    stale_data_present = os.path.exists(RAW_DIR)
    if stale_data_present:
        shutil.rmtree(RAW_DIR)
    # Recreate the (now absent) scratch directory.
    os.makedirs(RAW_DIR, exist_ok=True)

def _extract_rows(line, targets):
    """Turn one JSON line into CSV rows for a targeted subreddit.

    Returns ``(subreddit, rows)`` where ``rows`` holds one
    ``[utc, text, score, album_date]`` list per release timestamp whose
    +/- WINDOW_SECONDS window contains the item, or ``None`` when the line is
    malformed, belongs to an untargeted subreddit, or matches no window.
    """
    try:
        obj = json.loads(line)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    if not isinstance(obj, dict):
        return None

    sub = obj.get('subreddit')
    # The isinstance check also keeps unhashable values away from the lookup.
    if not isinstance(sub, str) or sub not in targets:
        return None

    try:
        ts = int(obj.get('created_utc', 0))
    except (TypeError, ValueError, OverflowError):
        return None

    releases = [
        release_ts
        for release_ts in targets[sub]
        if (release_ts - WINDOW_SECONDS) <= ts <= (release_ts + WINDOW_SECONDS)
    ]
    if not releases:
        return None

    # Use 'body' when present and non-empty; otherwise fall back to
    # "title selftext".
    content = obj.get('body') or f"{obj.get('title', '')} {obj.get('selftext', '')}"
    # Collapse all whitespace runs (including newlines) so each item stays on
    # a single CSV line.
    clean_text = " ".join(str(content).split())
    score = obj.get('score', 0)
    return sub, [[ts, clean_text, score, release_ts] for release_ts in releases]


def _open_master_csv(sub):
    """Open ``<sub>_MASTER.csv`` in OUTPUT_DIR for appending.

    Returns ``(handle, writer)``. The header row is written only when the file
    is missing or empty, so repeated runs keep appending under one header.
    """
    out_file = os.path.join(OUTPUT_DIR, f"{sub}_MASTER.csv")
    needs_header = not os.path.isfile(out_file) or os.path.getsize(out_file) == 0
    handle = open(out_file, 'a', newline='', encoding='utf-8')
    try:
        writer = csv.writer(handle)
        if needs_header:
            writer.writerow(['utc', 'text', 'score', 'album_date'])
    except Exception:
        handle.close()
        raise
    return handle, writer


def process_zst_files(targets):
    """Scan every ``.zst`` archive under RAW_DIR and append matches to CSV.

    Each archive is streamed as zstd-compressed, newline-delimited JSON. An
    item is kept when its ``subreddit`` is a key of ``targets`` and its
    ``created_utc`` lies within WINDOW_SECONDS of one of that subreddit's
    release timestamps; one row is written per matching release timestamp to
    ``OUTPUT_DIR/<subreddit>_MASTER.csv``.

    Args:
        targets: mapping of subreddit name -> list of release timestamps
            (Unix seconds).

    Returns:
        None. Progress and per-file totals are printed to stdout.

    Raises:
        OSError: if an archive cannot be read or an output CSV cannot be
            written. These are deliberately not swallowed: silently dropping
            every row would be worse than stopping.
    """
    zst_files = list(Path(RAW_DIR).rglob("*.zst"))
    if not zst_files:
        print("   ERROR: Download finished but no .zst files found.")
        return

    for file_path in zst_files:
        print(f"   Scanning {file_path.name}...")
        match_count = 0
        line_count = 0
        # subreddit -> (file handle, csv writer). Opened lazily and kept open
        # for the whole archive instead of reopening the file for every row.
        outputs = {}

        try:
            with open(file_path, 'rb') as f:
                # Raise the window limit: Reddit dump archives are commonly
                # compressed with a long zstd window, which the default limit
                # rejects -- TODO confirm for these torrents.
                dctx = zstd.ZstdDecompressor(max_window_size=2**31)
                with dctx.stream_reader(f) as reader:
                    stream = io.TextIOWrapper(reader, encoding='utf-8', errors='ignore')
                    for line in stream:
                        line_count += 1
                        if line_count % 1000000 == 0:
                            sys.stdout.write(f"\r      Lines: {line_count//1000000}M | Matches: {match_count}")
                            sys.stdout.flush()

                        extracted = _extract_rows(line, targets)
                        if extracted is None:
                            continue

                        sub, rows = extracted
                        if sub not in outputs:
                            outputs[sub] = _open_master_csv(sub)
                        writer = outputs[sub][1]
                        for row in rows:
                            writer.writerow(row)
                            # Count only rows that were actually written.
                            match_count += 1
        finally:
            # Flush and close every output, even if the scan aborted midway.
            for handle, _writer in outputs.values():
                handle.close()

        print(f"\n   Finished {file_path.name}. Saved {match_count} items.")

# --- 4. MAIN EXECUTION LOOP ---
# For each scheduled month: wipe the scratch dir, download the archive via
# its magnet link, extract matching items to Drive, then wipe the scratch dir.
for task in TASKS:
    print("\n==========================================")
    print(f" PROCESSING MONTH: {task['month']}")
    print("==========================================")

    # A. PREP
    clean_workspace()

    # B. DOWNLOAD (ARIA2C)
    print(f" Downloading {task['month']} via aria2c...")
    magnet = f"magnet:?xt=urn:btih:{task['hash']}{TRACKER_STR}"

    # -x 16 / -s 16: up to 16 connections per server / per download (Aggressive)
    # --seed-time=0: Stop seeding immediately (Save bandwidth)
    cmd = f'aria2c --dir="{RAW_DIR}" --seed-time=0 --allow-overwrite=true --summary-interval=30 -x 16 -s 16 "{magnet}"'
    exit_status = os.system(cmd)

    try:
        if exit_status != 0:
            # A failed or interrupted download can leave truncated archives
            # behind. Scanning them would crash or append partial data, so
            # skip extraction for this month.
            print(f"   ERROR: aria2c exited with status {exit_status}. Skipping extraction for {task['month']}.")
        else:
            # C. EXTRACT
            process_zst_files(task['targets'])
    finally:
        # D. CLEANUP -- always free the scratch disk, even if extraction raised.
        clean_workspace()
    print(f"{task['month']} Data Wiped. Ready for next month.")

print("\n JOB DONE! CHECK GOOGLE DRIVE. ")
# Beep when done
from google.colab import output
output.eval_js('new Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg").play()')