## 1.Imports & Configuration 🛠️

In [1]:
# ### Cell 1: Imports & Configuration ###

import praw
import pandas as pd
import time
import re
import os
from tqdm.notebook import tqdm # The notebook-friendly progress bar

# --- 🕷️ PRAWLER CONFIGURATION ---
# PASTE YOUR REDDIT API CREDENTIALS HERE
CLIENT_ID = "cqUf_9T9v0tHisDODAz2rA"
CLIENT_SECRET = "V5Mt83tpp1vTyBiPlW5vF6xqOLzedw"
USER_AGENT = "Python:PeerHiveV2-Scraper:v1.0 (by /u/Significant-Luck6567)" # Change to your Reddit username

# --- 🎯 TARGETS ---
SUBREDDITS = ['gradschool', 'college', 'PhD', 'csMajors', 'Professors', 'medicalschool']
KEYWORDS = ['burnout', 'overwhelmed', 'exhausted', 'stressed', "can't cope", 'dropping out', 'imposter syndrome', 'completely drained', 'running on fumes', 'lost the passion']
POST_LIMIT_PER_KEYWORD = 750

# --- 💾 OUTPUT ---
OUTPUT_FILENAME = "reddit_burnout_anonymized_for_annotation.csv"

print("✅ Configuration loaded.")

✅ Configuration loaded.


## 2.The PII Scrubber 🧼

In [2]:
# ### Cell 2: The PII Scrubber Function ###

def scrub_pii(text: str) -> str:
    """Anonymizes text by removing potential PII."""
    if not isinstance(text, str):
        return ""
    # Regex to find and replace u/username or /u/username patterns, case-insensitive
    text = re.sub(r'(?:\s|^)(?:u/|/u/)([a-zA-Z0-9_-]+)', ' [USER]', text, flags=re.IGNORECASE)
    return text

print("✅ PII Scrubber function defined.")

✅ PII Scrubber function defined.


## 3.Connect to Reddit API 🔌

In [3]:
# ### Cell 3: Connect to Reddit API ###

reddit = None
if not CLIENT_ID or not CLIENT_SECRET:
    print("🛑 ERROR: Please fill in your Reddit API credentials in the first code cell.")
else:
    try:
        reddit = praw.Reddit(
            client_id=CLIENT_ID,
            client_secret=CLIENT_SECRET,
            user_agent=USER_AGENT,
        )
        print(f"✅ Successfully connected to Reddit as: {reddit.user.me()}")
    except Exception as e:
        print(f"🔥 Reddit connection failed: {e}")

✅ Successfully connected to Reddit as: None


## 4.The Collection Engine 🚀

In [4]:
# ### Cell 4: The Collection Engine ###

collected_posts = []
if reddit:
    subreddit_query = "+".join(SUBREDDITS)
    print(f"Hunting in subreddits: r/{subreddit_query}\n")

    for keyword in tqdm(KEYWORDS, desc="Processing keywords"):
        search_query = f'selftext:"{keyword}"'
        search_results = reddit.subreddit(subreddit_query).search(
            search_query,
            limit=POST_LIMIT_PER_KEYWORD,
            sort='relevance',
            time_filter='all'
        )

        for post in search_results:
            if post.selftext:
                collected_posts.append({
                    'id': post.id,
                    'title': post.title,
                    'body': post.selftext,
                    'subreddit': str(post.subreddit),
                    'url': post.url
                })
        time.sleep(1)

    print("\n--- Collection Complete ---")
else:
    print("⚠️ Reddit client not connected. Please run the previous cell successfully.")

Hunting in subreddits: r/gradschool+college+PhD+csMajors+Professors+medicalschool



Processing keywords:   0%|          | 0/10 [00:00<?, ?it/s]


--- Collection Complete ---


## 5.Process, Anonymize, and Save 💾

In [5]:
# ### Cell 5: Process, Anonymize, and Save ###

if not collected_posts:
    print("⚠️ No posts were collected. Cannot proceed.")
else:
    df_raw = pd.DataFrame(collected_posts)
    print(f"Collected {len(df_raw)} posts in total.")

    df_raw.drop_duplicates(subset='id', inplace=True)
    print(f"Found {len(df_raw)} unique posts after de-duplication.")

    print("\n--- Running Anonymization Protocol ---")
    df_anonymized = pd.DataFrame()
    df_anonymized['id'] = df_raw['id']
    df_anonymized['clean_title'] = df_raw['title'].apply(scrub_pii)
    df_anonymized['clean_body'] = df_raw['body'].apply(scrub_pii)
    df_anonymized['subreddit'] = df_raw['subreddit']
    df_anonymized['url'] = df_raw['url']
    print("✅ PII scrubbing complete.")

    df_anonymized.to_csv(OUTPUT_FILENAME, index=False, encoding='utf-8')
    print(f"\n💾 Gold standard dataset ready for annotation! Saved to '{OUTPUT_FILENAME}'")

Collected 1471 posts in total.
Found 1390 unique posts after de-duplication.

--- Running Anonymization Protocol ---
✅ PII scrubbing complete.

💾 Gold standard dataset ready for annotation! Saved to 'reddit_burnout_anonymized_for_annotation.csv'


## 6.Verify the Output ✨

In [6]:
# ### Cell 6: Verify the Output ###

if os.path.exists(OUTPUT_FILENAME):
    df_final = pd.read_csv(OUTPUT_FILENAME)
    display(df_final.head())
else:
    print(f"File '{OUTPUT_FILENAME}' not found. Make sure the previous cells ran correctly.")

Unnamed: 0,id,clean_title,clean_body,subreddit,url
0,1myx7vx,My career is over. What a relief.,"I am starting my 18th year, and am allowed to ...",Professors,https://www.reddit.com/r/Professors/comments/1...
1,1jctt1m,Anyone else just… not want to grade?,"I know, I know… it’s part of the job. But with...",Professors,https://www.reddit.com/r/Professors/comments/1...
2,1lrpnvw,First year as a lecturer here. Student absente...,I'm new to teaching and I genuinely care about...,Professors,https://www.reddit.com/r/Professors/comments/1...
3,1lciz2j,Successfully defended just an hour ago!,After spending 5 years in a STEM PhD program f...,PhD,https://i.redd.it/zf3wxlopm77f1.jpeg
4,1lf0t2n,Emergency medicine sounds too good to be true ...,EDIT: Thanks to all the ED attendings for lett...,medicalschool,https://www.reddit.com/r/medicalschool/comment...


In [7]:
df_final.describe()

Unnamed: 0,id,clean_title,clean_body,subreddit,url
count,1390,1390,1390,1390,1390
unique,1390,1385,1390,6,1390
top,1myx7vx,Recent CS Grad? Lets lock in together and bag ...,"I am starting my 18th year, and am allowed to ...",PhD,https://www.reddit.com/r/Professors/comments/1...
freq,1,2,1,368,1


In [1]:
import pandas as pd
import os

# --- CONFIGURATION ---
RAW_DATA_FILE = "reddit_burnout_anonymized_for_annotation.csv" # The output from your PRAWler script
FINAL_DATA_FILE = "data_for_annotation.csv" # The file you will feed to The Guillotine
TARGET_SIZE = 600
RANDOM_STATE = 42 # The answer to life, the universe, and reproducible science.

# --- SCRIPT ---
if not os.path.exists(RAW_DATA_FILE):
    print(f"🔥 FATAL ERROR: Raw data file '{RAW_DATA_FILE}' not found.")
    print("Please run the PRAWler script first to generate the raw, de-duplicated data.")
else:
    df = pd.read_csv(RAW_DATA_FILE)
    print(f"Loaded {len(df)} raw, de-duplicated posts.")

    if len(df) < TARGET_SIZE:
        print(f"⚠️ Warning: You have fewer than {TARGET_SIZE} posts. Using all {len(df)} posts.")
        final_df = df
    else:
        # The one and only shuffle. Reproducible every time.
        print(f"Shuffling dataset and taking a reproducible sample of {TARGET_SIZE}...")
        final_df = df.sample(n=TARGET_SIZE, random_state=RANDOM_STATE).reset_index(drop=True)

    print(f"✅ Final dataset created with {len(final_df)} posts.")

    final_df.to_csv(FINAL_DATA_FILE, index=False)
    print(f"💾 Your final, shuffled dataset is ready. Feed '{FINAL_DATA_FILE}' to The Guillotine.")



Loaded 1390 raw, de-duplicated posts.
Shuffling dataset and taking a reproducible sample of 600...
✅ Final dataset created with 600 posts.
💾 Your final, shuffled dataset is ready. Feed 'data_for_annotation.csv' to The Guillotine.
