# 1. Install and Load Libraries

In [None]:
!pip install praw

Collecting praw
  Downloading praw-7.8.1-py3-none-any.whl.metadata (9.4 kB)
Collecting prawcore<3,>=2.4 (from praw)
  Downloading prawcore-2.4.0-py3-none-any.whl.metadata (5.0 kB)
Collecting update_checker>=0.18 (from praw)
  Downloading update_checker-0.18.0-py3-none-any.whl.metadata (2.3 kB)
Downloading praw-7.8.1-py3-none-any.whl (189 kB)
Downloading prawcore-2.4.0-py3-none-any.whl (17 kB)
Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Installing collected packages: update_checker, prawcore, praw
Successfully installed praw-7.8.1 prawcore-2.4.0 update_checker-0.18.0



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import praw
import json
import os
import time
import random
import logging
from datetime import datetime
from typing import List
from IPython.display import display, clear_output

# 2. Configuration

In [None]:
# Root folder for every harvest artefact (batch files, checkpoint, log).
# An empty string makes all paths below relative to the working directory.
ROOT_DIR = r""

# Reddit credentials are read from environment variables so that secrets do not
# have to be typed into (and saved with) the notebook. A variable that is not
# set falls back to the empty string.
CONFIG = {
    "client_id": os.environ.get("REDDIT_CLIENT_ID", ""),
    "client_secret": os.environ.get("REDDIT_CLIENT_SECRET", ""),
    "user_agent": os.environ.get("REDDIT_USER_AGENT", ""),
    "username": os.environ.get("REDDIT_USERNAME", ""),
    "password": os.environ.get("REDDIT_PASSWORD", ""),
    # Search terms and the subreddits each term is searched in.
    "keywords": ["tariff"],
    "subreddits": ["all"],
    # Number of posts written to one batch file.
    "posts_per_batch": 50,
    # Passed as the `limit` of each keyword/subreddit search.
    "total_posts": 400,
    # Maximum number of (highest-scored) comments kept per post.
    "comments_per_post": 50,
    # Output locations, all derived from ROOT_DIR.
    "batch_dir": os.path.join(ROOT_DIR, "batches"),
    "checkpoint_file": os.path.join(ROOT_DIR, "checkpoint.json"),
    "log_file": os.path.join(ROOT_DIR, "harvest.log")
}

# 3. Logger Setup

In [None]:
# Create the output folders up front. The directory part of the log path is
# empty when the log file is a bare file name (ROOT_DIR == ""), and
# os.makedirs("") raises FileNotFoundError, so it is only created when present.
os.makedirs(CONFIG["batch_dir"], exist_ok=True)
log_dir = os.path.dirname(CONFIG["log_file"])
if log_dir:
    os.makedirs(log_dir, exist_ok=True)

# File handler: appends to the log file. UTF-8 is set explicitly because the
# log messages contain emoji that a locale-dependent default encoding may not
# be able to represent.
file_handler = logging.FileHandler(CONFIG["log_file"], mode="a", encoding="utf-8")

# Console handler: same records, shorter format (no level name).
console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(message)s')
console.setFormatter(formatter)

# force=True (Python 3.8+) removes handlers already attached to the root
# logger, so re-running this cell does not stack duplicate handlers and the
# configuration takes effect even if the root logger was configured earlier.
# `format` is applied only to handlers without their own formatter, i.e. the
# file handler; the console handler keeps the formatter set above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[file_handler, console],
    force=True,
)

# 4. Initialization

In [None]:
# Build the PRAW client; every keyword argument is taken from the CONFIG entry
# of the same name.
reddit = praw.Reddit(
    **{
        option: CONFIG[option]
        for option in ("client_id", "client_secret", "user_agent", "username", "password")
    }
)

In [None]:
# Sanity check: report whether the PRAW client is in read-only mode.
print(reddit.read_only)

False


# 5. State Management

In [None]:
def load_checkpoint() -> set:
    """Return the post IDs stored in the checkpoint file.

    Returns:
        A set built from the JSON content of CONFIG["checkpoint_file"], or an
        empty set when that file does not exist.
    """
    checkpoint_path = CONFIG["checkpoint_file"]
    if not os.path.exists(checkpoint_path):
        return set()
    with open(checkpoint_path, "r") as handle:
        stored_ids = json.load(handle)
    return set(stored_ids)

def save_checkpoint(post_ids: set):
    """Persist the processed post IDs to the checkpoint file as a JSON list.

    The list is first written to a temporary sibling file and then moved over
    the real checkpoint with os.replace, so an interruption in the middle of
    the write cannot leave a truncated checkpoint behind. IDs are sorted so the
    file content is deterministic.

    Args:
        post_ids: Collection of post IDs to store.
    """
    checkpoint_path = CONFIG["checkpoint_file"]
    tmp_path = checkpoint_path + ".tmp"
    with open(tmp_path, "w") as f:
        # key=str keeps the sort well-defined even if the IDs are not all strings.
        json.dump(sorted(post_ids, key=str), f)
    os.replace(tmp_path, checkpoint_path)

# 6. Comment Filter

In [None]:
def is_valid_comment(c) -> bool:
    """Decide whether a comment should be kept.

    A comment is rejected when it has no ``body`` attribute, when its body is
    exactly "[removed]" or "[deleted]", when it has no author, when the author
    is AutoModerator (case-insensitive), when the stripped body is 30
    characters or shorter, or when the lower-cased body contains one of the
    bot / link markers checked below.

    Args:
        c: Comment-like object exposing ``body`` and ``author``.

    Returns:
        True if the comment passes every check, otherwise False.
    """
    if not hasattr(c, 'body'):
        return False
    if c.body in ("[removed]", "[deleted]"):
        return False
    if not c.author:
        return False
    if str(c.author).lower() == "automoderator":
        return False

    text = c.body.lower()
    if len(text.strip()) <= 30:
        return False

    banned_fragments = ('i am a bot', 'performed automatically', 'http')
    return not any(fragment in text for fragment in banned_fragments)

def extract_comment_data(c) -> dict:
    """Flatten a comment object into a plain dictionary.

    Args:
        c: Comment-like object exposing ``id``, ``parent_id``, ``author``,
            ``body``, ``score``, ``created_utc`` and ``depth``.

    Returns:
        Dict with the keys comment_id, parent_id, author (converted with
        ``str``), body, score, created_utc and depth.
    """
    record = {
        "comment_id": c.id,
        "parent_id": c.parent_id,
        "author": str(c.author),
    }
    # The remaining fields are copied under their own attribute names.
    for field in ("body", "score", "created_utc", "depth"):
        record[field] = getattr(c, field)
    return record

def get_filtered_comments(submission, max_comments: int) -> List[dict]:
    """Collect the highest-scored valid comments of a submission.

    Args:
        submission: Object exposing ``id`` and a ``comments`` forest with
            ``replace_more`` and ``list``.
        max_comments: Maximum number of comment dicts to return.

    Returns:
        Up to ``max_comments`` dicts produced by extract_comment_data for the
        comments accepted by is_valid_comment, ordered by score descending.
        Any exception is logged as a warning and an empty list is returned.
    """
    try:
        # NOTE(review): limit=0 is expected to make PRAW drop the unresolved
        # "more comments" placeholders instead of fetching them — confirm
        # against the PRAW documentation.
        submission.comments.replace_more(limit=0)

        kept = []
        for comment in submission.comments.list():
            if is_valid_comment(comment):
                kept.append(extract_comment_data(comment))

        # Stable sort: comments with equal scores keep their original order.
        kept.sort(key=lambda item: item["score"], reverse=True)
        return kept[:max_comments]
    except Exception as e:
        logging.warning(f"⚠️ Failed to process comments for post {submission.id}: {e}")
        return []

# 7. Rate Limit Handling

In [None]:
def throttle(min_delay=1.2, max_delay=1.8):
    """Pause for a random duration between requests.

    Args:
        min_delay: One bound of the sleep duration, in seconds.
        max_delay: The other bound of the sleep duration, in seconds.
    """
    pause_seconds = random.uniform(min_delay, max_delay)
    time.sleep(pause_seconds)

# 8. Batch Save

In [None]:
def save_batch(data, batch_count):
    """Write one batch of collected posts to a timestamped JSON file.

    The JSON is written to a temporary ".tmp" sibling and then moved into place
    with os.replace, so a failed or interrupted write never leaves a truncated
    batch_*.json file in the batch directory.

    Args:
        data: JSON-serialisable collection of post dicts.
        batch_count: Batch number, zero-padded to three digits in the file name.

    Returns:
        True when the batch file was written, False when saving failed. Errors
        are logged rather than raised.
    """
    tmp_file = None
    try:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        batch_file = os.path.join(CONFIG["batch_dir"], f"batch_{batch_count:03}_{timestamp}.json")
        tmp_file = batch_file + ".tmp"
        with open(tmp_file, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        os.replace(tmp_file, batch_file)
        logging.info(f"✅ Saved batch {batch_count} with {len(data)} posts to {batch_file}")
        return True
    except Exception as e:
        logging.error(f"❌ Failed to save batch {batch_count}: {e}")
        if tmp_file is not None:
            # Best-effort cleanup of the partial temporary file; it may not
            # exist if the failure happened before it was created.
            try:
                os.remove(tmp_file)
            except OSError:
                pass
        return False

# 9. Monitor Display

In [None]:
def update_monitor(total, batch, last_post, errors):
    """Redraw the progress readout in the cell output.

    Clears the previous output (waiting for new output before clearing) and
    displays four status lines.

    Args:
        total: Number of posts collected so far.
        batch: Current batch number.
        last_post: ID of the most recently collected post.
        errors: Number of errors encountered so far.
    """
    clear_output(wait=True)
    status_lines = (
        f"✔️ Total Posts Collected: {total}",
        f"📦 Current Batch: {batch}",
        f"🆔 Last Post ID: {last_post}",
        f"⚠️ Errors So Far: {errors}",
    )
    for line in status_lines:
        display(line)

# 10. Main Harvest Function

In [None]:
def harvest_posts():
    """Search Reddit for each keyword/subreddit pair and save the posts in batches.

    For every configured keyword and subreddit a "top of the year" search is
    run with ``limit=CONFIG["total_posts"]``. Posts whose IDs are already in the
    checkpoint are skipped. Each new post is stored together with its filtered
    comments; every ``CONFIG["posts_per_batch"]`` collected posts the batch is
    written with save_batch and the checkpoint is updated.

    A failure while processing a single post is logged and counted, and the
    harvest continues. Any other exception is logged as fatal. Whatever ends
    the loop — normal completion, a fatal error, or an interrupt such as
    stopping the kernel — the posts collected since the last batch save are
    flushed to disk before the function exits.
    """
    all_collected = []
    total_count = 0
    batch_count = 1
    error_count = 0
    processed_posts = load_checkpoint()

    try:
        for keyword in CONFIG["keywords"]:
            for sub in CONFIG["subreddits"]:
                query = reddit.subreddit(sub).search(
                    query=keyword,
                    sort='top',
                    time_filter='year',
                    limit=CONFIG["total_posts"]
                )

                for submission in query:
                    # Skip posts handled in an earlier run or by an earlier
                    # keyword/subreddit pair of this run.
                    if submission.id in processed_posts:
                        continue

                    try:
                        filtered_comments = get_filtered_comments(submission, CONFIG["comments_per_post"])

                        post_data = {
                            "subreddit": sub,
                            "keyword": keyword,
                            "post_id": submission.id,
                            "title": submission.title or "",
                            "selftext": submission.selftext or "",
                            "created_utc": submission.created_utc,
                            "score": submission.score,
                            "url": submission.url or "",
                            "comments": filtered_comments
                        }

                        all_collected.append(post_data)
                        processed_posts.add(submission.id)
                        total_count += 1
                        logging.info(f"📄 Collected post {submission.id} ({total_count}) with {len(filtered_comments)} comments")

                        update_monitor(total_count, batch_count, submission.id, error_count)

                        # A full batch: write it out, start a new one, and
                        # record progress before pausing.
                        if total_count % CONFIG["posts_per_batch"] == 0:
                            save_batch(all_collected, batch_count)
                            all_collected = []
                            batch_count += 1
                            save_checkpoint(processed_posts)
                            time.sleep(8)

                        throttle()

                    except Exception as e:
                        error_count += 1
                        logging.error(f"❌ Error processing post {submission.id}: {e}")
                        time.sleep(5)

    except Exception as e:
        # exc_info=True keeps the traceback in the log for diagnosis.
        logging.critical(f"🔥 Fatal error: {e}", exc_info=True)
        save_checkpoint(processed_posts)

    finally:
        # Runs for KeyboardInterrupt as well, so interrupting a long harvest
        # does not discard the posts collected since the last batch save.
        if all_collected:
            save_batch(all_collected, batch_count)
            save_checkpoint(processed_posts)
            logging.info(f"✅ Final save batch {batch_count} with {len(all_collected)} posts.")

    logging.info("🎉 Scraping completed.")

# 11. Test

In [None]:
def test_single_post_save():
    """Smoke-test the pipeline by fetching and saving a single post.

    Runs a one-result "top of the year" search for the first configured keyword
    in the first configured subreddit, collects the post's filtered comments
    and writes the record to a timestamped ``test_post_*.json`` file in the
    batch directory. If the search returns no post, a warning is logged and
    nothing is written. A failure while writing the file is logged, not raised.
    """
    keyword = CONFIG["keywords"][0]
    subreddit = CONFIG["subreddits"][0]

    logging.info("🚀 Starting single post test")

    results = reddit.subreddit(subreddit).search(query=keyword, sort='top', time_filter='year', limit=1)
    # A default for next() avoids a bare StopIteration when the search is empty.
    submission = next(iter(results), None)
    if submission is None:
        logging.warning(f"⚠️ Test aborted: no posts found for keyword '{keyword}' in r/{subreddit}")
        return

    filtered_comments = get_filtered_comments(submission, CONFIG["comments_per_post"])

    test_data = {
        "subreddit": subreddit,
        "keyword": keyword,
        "post_id": submission.id,
        "title": submission.title or "",
        "selftext": submission.selftext or "",
        "created_utc": submission.created_utc,
        "score": submission.score,
        "url": submission.url or "",
        "comments": filtered_comments
    }

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    test_file = os.path.join(CONFIG["batch_dir"], f"test_post_{timestamp}.json")

    try:
        # Stored as a one-element list, the same shape as a batch file.
        with open(test_file, "w", encoding="utf-8") as f:
            json.dump([test_data], f, ensure_ascii=False, indent=2)
        logging.info(f"✅ Test save success: {test_file}")
    except Exception as e:
        logging.error(f"❌ Test save failed: {e}")

In [None]:
# Smoke test: fetch a single post and write it to a test_post_*.json file
# before starting the full harvest.
test_single_post_save()

2025-05-09 14:54:36,636 - 🚀 Starting single post test
2025-05-09 14:54:41,269 - ✅ Test save success: E:\\Users\\76044\\Desktop\\tariff\batches\test_post_20250509_145441.json


# 12. Run

In [None]:
# Run the full harvest: searches every configured keyword/subreddit pair and
# writes the collected posts to batch files.
harvest_posts()

'✔️ Total Posts Collected: 245'

'📦 Current Batch: 5'

'🆔 Last Post ID: 1jo6v9m'

'⚠️ Errors So Far: 0'

2025-05-09 15:15:14,616 - ✅ Saved batch 5 with 45 posts to E:\\Users\\76044\\Desktop\\tariff\batches\batch_005_20250509_151514.json
2025-05-09 15:15:14,617 - ✅ Final save batch 5 with 45 posts.
2025-05-09 15:15:14,618 - 🎉 Scraping completed.
