In [None]:
import sys
sys.path.append("..")

from src.preprocess import basic_clean

# Run the project's `basic_clean` helper on posts and comments, naming the
# text columns each frame carries.
df_posts_clean = basic_clean(
    df_posts,
    text_cols=("title", "body"),
)
df_comments_clean = basic_clean(
    df_comments,
    text_cols=("body",),
)

# Display the shapes of both results as the cell output.
(df_posts_clean.shape, df_comments_clean.shape)

In [None]:
from transformers import pipeline

# Zero-shot classification pipeline backed by the facebook/bart-large-mnli checkpoint.
clf = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

# Candidate labels handed to the classifier; the first entry is the
# "relevant" class.
labels = [
    "relevant to bugs/pack quality/issues",
    "not relevant (showcase/story)",
]

def _text_or_empty(value):
    """Return ``str(value)``, or ``""`` when the value is ``None`` or a float NaN."""
    if value is None:
        return ""
    # NaN is the only float that compares unequal to itself.
    if isinstance(value, float) and value != value:
        return ""
    return str(value)


def combine_text(row):
    """Join a row's ``title`` and ``body`` into one whitespace-trimmed string.

    Parameters
    ----------
    row
        Any object with a dict-style ``.get(key, default)`` method.

    Returns
    -------
    str
        ``"<title> <body>"`` with leading/trailing whitespace stripped.
        A missing key, ``None`` or float NaN contributes an empty string
        instead of the literal text ``"None"`` / ``"nan"``.
    """
    parts = [
        _text_or_empty(row.get("title", "")),
        _text_or_empty(row.get("body", "")),
    ]
    return " ".join(parts).strip()

# Build the text column and classify it. An empty frame is handled
# explicitly so the model is never called with an empty list and the
# summary below cannot divide by zero.
if len(df) > 0:
    df["text"] = df.apply(combine_text, axis=1)
    zs = clf(df["text"].tolist(), candidate_labels=labels, multi_label=False)
else:
    df["text"] = []
    zs = []

# First label and score of every result (the pipeline lists labels by
# descending score, so index 0 is the winning label).
df["zs_label"] = [res["labels"][0] for res in zs]
df["zs_score"] = [res["scores"][0] for res in zs]

# Keep only rows whose winning label is the "relevant" one (first entry of
# `labels`) and whose score reaches the threshold. `.copy()` makes the
# result an independent frame rather than a slice of `df`.
relevant_label = labels[0]
min_score = 0.75
df_filtered = df[(df["zs_label"] == relevant_label) & (df["zs_score"] >= min_score)].copy()

n_total = len(df)
n_kept = len(df_filtered)
kept_share = n_kept / n_total if n_total else 0.0
print(f"Zero-shot kept {n_kept}/{n_total} posts ({kept_share:.1%}).")

# Save to processed
df_filtered.to_csv("../data/processed/reddit_sims4_filtered.csv", index=False)

In [None]:
import pandas as pd
from datetime import datetime

def collect_reddit_posts(
    client_id,
    client_secret,
    user_agent,
    subreddit_name="Sims4",
    limit=500,
    time_filter="year",
    mode="top"
):
    """Fetch submissions from one subreddit and return them as a DataFrame.

    Parameters
    ----------
    client_id, client_secret, user_agent
        Passed unchanged to ``praw.Reddit``.
    subreddit_name
        Subreddit to read; also written to the ``subreddit`` column.
    limit
        Passed as ``limit`` to the selected listing.
    time_filter
        Passed to the ``top`` listing only; unused for ``new`` and ``hot``.
    mode
        Which listing to read: ``"top"``, ``"new"`` or ``"hot"``.

    Returns
    -------
    pandas.DataFrame
        One row per submission with the columns ``id``, ``created_utc``,
        ``created_date``, ``author``, ``title``, ``body``, ``score``,
        ``num_comments``, ``permalink`` and ``subreddit``. The columns are
        present even when the listing yields no submissions.

    Raises
    ------
    ValueError
        If ``mode`` is not a supported value; raised before the client is
        created.
    """
    # NOTE(review): `praw` is not imported in this cell — confirm that an
    # earlier cell imports it before this function is called.
    from datetime import timezone

    columns = [
        "id",
        "created_utc",
        "created_date",
        "author",
        "title",
        "body",
        "score",
        "num_comments",
        "permalink",
        "subreddit",
    ]

    # Fail fast on a bad mode instead of after building the client.
    if mode not in ("top", "new", "hot"):
        raise ValueError("mode must be one of {'top','new','hot'}")

    reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent=user_agent)
    sr = reddit.subreddit(subreddit_name)

    if mode == "top":
        stream = sr.top(limit=limit, time_filter=time_filter)
    elif mode == "new":
        stream = sr.new(limit=limit)
    else:
        stream = sr.hot(limit=limit)

    rows = []
    for post in stream:
        # `post.author` can be None; keep that as a real missing value
        # rather than the string "None".
        author_name = getattr(post.author, "name", None)
        # Naive UTC datetime, same value the deprecated
        # `datetime.utcfromtimestamp` produced.
        created_date = datetime.fromtimestamp(post.created_utc, tz=timezone.utc).replace(tzinfo=None)
        rows.append({
            "id": post.id,
            "created_utc": post.created_utc,  # raw unix timestamp
            "created_date": created_date,  # human-readable
            "author": str(author_name) if author_name is not None else None,
            "title": post.title or "",
            "body": post.selftext or "",
            "score": post.score,
            "num_comments": post.num_comments,
            "permalink": f"https://reddit.com{post.permalink}",
            "subreddit": subreddit_name
        })
    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(rows, columns=columns)