In [None]:
import os
import time
import json
import re
import random
from tqdm import tqdm
import praw

# ---------------- CONFIG ----------------
REDDIT_CLIENT_ID = 'brWU7p9j_4pulCPxMK2fqw'
REDDIT_CLIENT_SECRET = 'SYu7-3f0iUl8QRHJvgo9X80T4efBPg'
REDDIT_USERNAME = 'Careful-Relation-574'
REDDIT_PASSWORD = 'TbQPu6GVbGqbQC_'
USER_AGENT = 'reddit_data_collector by /u/Careful-Relation-574'

output_path = OUTPUT_JSON = "../data/raw/data.json"

LIMIT_PER_SEARCH = 1500
SLEEP_BETWEEN_QUERIES = 1.0
RANDOM_STATE = 42
# ----------------------------------------

random.seed(RANDOM_STATE)


In [None]:
# If file exists, load it, else start fresh
if os.path.exists(output_path):
    with open(output_path, "r", encoding="utf-8") as f:
        all_posts = json.load(f)
else:
    all_posts = {}

print(f"Loaded {len(all_posts)} posts already in dataset.")


In [5]:
subreddits = [
    "depression", "depressed", "mentalhealth",
    "offmychest", "TrueOffMyChest", "confession", "self",
    "stress", "lonely", "sad", "burnout", "Anxiety",
    "SuicideWatch", "depression_help", "kindvoice", "needadvice",
    "relationships", "bipolarreddit", "traumatoolbox"
]

suicidal_keywords = [
    "suicidal", "i am suicidal", "i'm suicidal", "want to die", "want to end it all",
    "kill myself", "i will kill myself", "i want to die", "end it all", "ending my life",
    "i cant take it", "i can't take it", "i don't want to live", "dont want to live",
    "dont want to be here", "i give up", "ready to die", "wish i was dead", "better off dead",
    "thinking about suicide", "thoughts of suicide", "i want to end it", "i'm done",
    "im done", "going to end it", "overdose", "want to disappear", "no reason to live",
    "i want out", "i want to die right now"
]

depression_keywords = [
    "depressed", "feeling depressed", "i feel depressed", "feeling low", "im feeling down",
    "i'm feeling down", "feeling down", "low mood", "hopeless", "hopelessness", "worthless",
    "i hate myself", "i'm broken", "i am broken", "i'm lonely", "im lonely", "numb",
    "nothing matters", "i'm so sad", "i am so sad", "i'm so tired", "i'm tired of living",
    "crying again", "i cry", "feeling empty", "lost interest", "no motivation", "can't cope",
    "cant cope", "can't enjoy", "cant enjoy"
]

anxiety_keywords = [
    "anxious", "feeling anxious", "i'm anxious", "im anxious", "panic attack",
    "panic attacks", "panic", "overwhelmed", "i'm overwhelmed", "im overwhelmed",
    "heart racing", "racing heart", "cant breathe", "can't breathe", "hyperventilate",
    "overthinking", "worrying", "constant worry", "nervous", "social anxiety",
    "can't sleep", "insomnia", "sweating", "shaking", "feeling panicked"
]

# Normalize
suicidal_keywords = [k.lower() for k in suicidal_keywords]
depression_keywords = [k.lower() for k in depression_keywords]
anxiety_keywords = [k.lower() for k in anxiety_keywords]


In [6]:
def make_pattern(words):
    return re.compile("|".join(re.escape(w) for w in words), re.IGNORECASE)

pattern_suicidal = make_pattern(suicidal_keywords)
pattern_depression = make_pattern(depression_keywords)
pattern_anxiety = make_pattern(anxiety_keywords)

def label_text(text):
    text = text.lower()
    if pattern_suicidal.search(text):
        return "suicidal"
    elif pattern_depression.search(text):
        return "depression"
    elif pattern_anxiety.search(text):
        return "anxiety"
    else:
        return None


In [7]:
reddit = praw.Reddit(
    client_id=REDDIT_CLIENT_ID,
    client_secret=REDDIT_CLIENT_SECRET,
    username=REDDIT_USERNAME,
    password=REDDIT_PASSWORD,
    user_agent=USER_AGENT
)
print("✅ Connected to Reddit")


✅ Connected to Reddit


In [None]:
for sub in tqdm(subreddits, desc="Subreddits"):
    subreddit = reddit.subreddit(sub)
    for kw in (suicidal_keywords + depression_keywords + anxiety_keywords):
        try:
            for submission in subreddit.search(kw, limit=LIMIT_PER_SEARCH):
                pid = submission.id
                if pid not in all_posts:
                    text = f"{submission.title} {submission.selftext}"
                    label = label_text(text)
                    if label is None:   # 🚨 skip if not suicidal / depression / anxiety
                        continue
                    all_posts[pid] = {
                        "id": pid,
                        "subreddit": sub,
                        "title": submission.title,
                        "selftext": submission.selftext,
                        "label": label,
                        "created_utc": submission.created_utc,
                        "url": submission.url
                    }
            time.sleep(SLEEP_BETWEEN_QUERIES)
        except Exception as e:
            print(f"⚠️ Error in r/{sub} kw={kw[:20]}... : {e}")
            continue

print(f"Total collected: {len(all_posts)} (suicidal / depression / anxiety only)")


Subreddits: 100%|██████████| 19/19 [2:44:29<00:00, 519.45s/it]  

Total collected: 101345 (suicidal / depression / anxiety only)





In [None]:
# os.makedirs(os.path.dirname(OUTPUT_JSON), exist_ok=True)
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
    json.dump(list(all_posts.values()), f, indent=2, ensure_ascii=False)

print(f"✅ Saved {len(all_posts)} posts to {OUTPUT_JSON}")


✅ Saved 101345 posts to ../data/raw/reddit_labeled.json


In [12]:
json_file = "../data/raw/data.json"

# Load existing posts
with open(json_file, "r", encoding="utf-8") as f:
    existing_data = json.load(f)

# Build set of already saved IDs to avoid duplicates
existing_ids = {post["id"] for post in existing_data}

print(f"Loaded {len(existing_data)} posts from {json_file}")
print(f"Unique IDs loaded into memory: {len(existing_ids)}")

Loaded 101345 posts from ../data/raw/data.json
Unique IDs loaded into memory: 101345


In [16]:
# -------- Collect NORMAL posts --------
print("Collecting normal posts...")

normal_subs = [
    "AskReddit", "CasualConversation", "Showerthoughts", "NoStupidQuestions",
    "todayilearned", "explainlikeimfive", "AskScience", "AskHistorians",
    "funny", "pics", "aww", "memes",
    "food", "Cooking", "travel", "books", "movies", "television", "Music",
    "gaming", "sports", "fitness"
]

normal_posts = []

for sub in tqdm(normal_subs, desc="Fetching normal posts"):
    subreddit = reddit.subreddit(sub)
    for post in subreddit.hot(limit=5000):  # adjust limit if needed
        if post.id in existing_ids:
            continue  # avoid duplicates

        text = f"{post.title} {post.selftext}".strip().lower()

        # Skip if text contains depression/anxiety/suicidal keywords
        if (pattern_suicidal.search(text) or 
            pattern_depression.search(text) or 
            pattern_anxiety.search(text)):
            continue

        normal_posts.append({
            "id": post.id,
            "subreddit": sub,
            "title": post.title,
            "selftext": post.selftext,
            "label": "normal",
            "created_utc": float(post.created_utc),
            "url": f"https://www.reddit.com{post.permalink}"
        })
        existing_ids.add(post.id)

print(f"Collected {len(normal_posts)} normal posts")


Collecting normal posts...


Fetching normal posts: 100%|██████████| 22/22 [05:02<00:00, 13.75s/it]

Collected 35 normal posts





In [17]:
# -------- Append normal posts to existing JSON --------

# Path to previously saved file
json_file = "../data/raw/data.json"

# Load existing dataset
with open(json_file, "r", encoding="utf-8") as f:
    existing_data = json.load(f)

# Append normal posts
existing_data.extend(normal_posts)

# Save back to the same file
with open(json_file, "w", encoding="utf-8") as f:
    json.dump(existing_data, f, ensure_ascii=False, indent=2)

# Show updated label counts
label_counts = {lbl: sum(1 for p in existing_data if p['label'] == lbl) 
                for lbl in ['suicidal', 'depression', 'anxiety', 'normal']}
print(f"Updated dataset → {json_file}")
print("Final label counts:", label_counts)


Updated dataset → ../data/raw/data.json
Final label counts: {'suicidal': 33736, 'depression': 36884, 'anxiety': 30725, 'normal': 11203}
