In [None]:
!pip install deepface

Collecting insightface
  Downloading insightface-0.7.3.tar.gz (439 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m439.5/439.5 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting onnx (from insightface)
  Downloading onnx-1.17.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Downloading onnx-1.17.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.0/16.0 MB[0m [31m54.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: insightface
  Building wheel for insightface (pyproject.toml) ... [?25l[?25hdone
  Created wheel for insightface: filename=insightface-0.7.3-cp311-cp311-linux_x86_64.whl size=1064921 sha256=886a8bc922e18275f3966f552678f5f05923bc6d2d48

In [None]:
!pip install retina-face
!pip install tqdm


Collecting onnxruntime-gpu
  Downloading onnxruntime_gpu-1.21.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting coloredlogs (from onnxruntime-gpu)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime-gpu)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading onnxruntime_gpu-1.21.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (280.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.8/280.8 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hIns

In [None]:
pip install python-dotenv


In [None]:
# NOTE: `python-dotenv` is a package name, not a Python expression. Evaluating it
# is parsed as `python - dotenv` and raised NameError on Run All, so the stray
# expression is removed. The package itself is installed by the previous cell.


In [None]:
import requests
import csv
import time
import re
import io
import numpy as np
from PIL import Image
from deepface import DeepFace
from tqdm import tqdm
import signal
import sys
import random
import os
from dotenv import load_dotenv


# python-dotenv: presumably loads key=value pairs from a local .env file into
# the process environment so the os.getenv calls below can see them.
load_dotenv()

# 🔧 Config
# Credentials are read from the environment; os.getenv returns None when a
# variable is not set, and that None is passed to authenticate() unchanged.
# NOTE(review): the CSV written by this notebook links account handles to
# health-related post text; treat the output files as sensitive data.
USERNAME = os.getenv("BLUESKY_USERNAME")
APP_PASSWORD = os.getenv("BLUESKY_PASSWORD")
BASE_URL = "https://bsky.social/xrpc"  # prefix of every XRPC endpoint called below
MAX_USERS = 1000  # overall cap on the number of collected rows
MAX_USERS_PER_KEYWORD = 70  # cap on rows added while processing one keyword
MIN_AGE = 45  # posts whose author has a lower predicted age are skipped
AUTOSAVE_INTERVAL = 20  # autosave whenever len(collected) is a multiple of this
SAVE_FILENAME = "depressed_45plus.csv"  # final output; also the file load_existing() resumes from

# 🧠 Depression-related keywords
# Search phrases sent one at a time as the `q` parameter of the post search.
# main() shuffles this list in place before iterating, so its order is not stable.
KEYWORDS = [
    "diagnosed with depression", "i was diagnosed with depression", "psychiatrist diagnosed me with depression",
    "history of depression diagnosis", "i got diagnosed with depression",
    "mental health diagnosis: depression",  "dealing with depression", "diagnosed with depression and anxiety",

    # Additional variations
    "i have been diagnosed with depression", "was officially diagnosed with depression",
    "doctor said i have depression", "i got a depression diagnosis",
    "received a diagnosis of depression", "my diagnosis is depression",
    "depression diagnosis confirmed", "recently diagnosed with depression",
    "diagnosis: depression", "diagnosed depressive episode",
    "got diagnosed with mdd", "diagnosed with major depression",

    # Indirect phrasings (no explicit "diagnos..." wording in most of them)
    "they told me i have depression", "turns out it's depression",
    "i found out i have depression", "my therapist said it's depression",
    "been told i'm depressed", "finally figured out it's depression",
    "i now know it's depression", "it’s confirmed – depression",
    "so apparently i’m depressed", "they diagnosed me last week",
    "depression is what i was told i have"
]



# Post-level text filter used by main(): a post is kept only if it contains
# "dx", a "diagnos..." word form, "mdd" or "depressive disorder" as a whole word
# (case-insensitive).
# NOTE(review): there is no "depress..." alternative here, so posts returned for
# keywords such as "dealing with depression" are dropped unless they also contain
# one of the terms above — confirm this is intended.
DEPRESSION_REGEX = re.compile(r"\b(dx|diagnos(e|ed|ing|is|es|sis)?|mdd|depressive disorder)\b", re.IGNORECASE)

# Global state
# Both containers are mutated in place by load_existing() and main().
collected = []  # one list per kept user: username, age, gender, text, thread_replies, avatar_url
seen_users = set()  # handles already stored in `collected`, used to skip repeat authors

# 🔑 Authentication
def authenticate(username, password, timeout=10):
    """Create a Bluesky session and return its access token.

    Args:
        username: Account identifier, sent as ``identifier``.
        password: App password for that account.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The ``accessJwt`` value from the createSession response.

    Raises:
        ValueError: If either credential is missing or empty (for example the
            environment variable was never set). Raised before any request.
        requests.HTTPError: If the server answers with an error status.
    """
    # Fail fast with a readable message instead of posting null credentials.
    if not username or not password:
        raise ValueError(
            "Missing Bluesky credentials: set BLUESKY_USERNAME and BLUESKY_PASSWORD."
        )
    url = f"{BASE_URL}/com.atproto.server.createSession"
    response = requests.post(
        url,
        json={"identifier": username, "password": password},
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()["accessJwt"]

# 🧠 Enhanced self-reference checker
def mentions_self(text):
    """Tell whether ``text`` contains a first-person reference (case-insensitive)."""
    first_person = re.compile(
        r"\b(i|me|my|mine|i'm|i’ve|i'd|i was|i am|i got|i have|myself)\b",
        re.IGNORECASE,
    )
    return first_person.search(text) is not None

def validate_user(username, text):
    """Decide whether a post/author pair passes the basic quality filters.

    Args:
        username: Author handle; must end in ``.bsky.social``.
        text: Post text.

    Returns:
        True only if the handle matches the expected pattern, the text is
        20-2000 characters long, does not contain "no depression", and
        contains a first-person reference. Otherwise False.
    """
    # Search results can lack a handle or text; treat those as invalid rather
    # than letting re.match() / len() raise TypeError.
    if not isinstance(username, str) or not isinstance(text, str):
        return False
    if not re.match(r"^[a-zA-Z0-9-_.]+\.bsky\.social$", username):
        return False
    if not 20 <= len(text) <= 2000:
        return False
    # Explicit negation ("no depression") disqualifies the post.
    if re.search(r"\bno\s+depression\b", text, re.I):
        return False
    return mentions_self(text)

# 🔍 Search functions
def search_posts(access_token, keyword, cursor=None, sort="latest", timeout=10):
    """Fetch one page of post search results for ``keyword``.

    Args:
        access_token: Bearer token returned by ``authenticate``.
        keyword: Search phrase sent as ``q``.
        cursor: Pagination cursor from the previous page, if any.
        sort: Ranking order. The endpoint documents "top" and "latest"; the
            value must stay the same across the pages of one query.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON response; callers read its "posts" and "cursor" keys.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    url = f"{BASE_URL}/app.bsky.feed.searchPosts"
    headers = {"Authorization": f"Bearer {access_token}"}
    # The sort order used to be re-drawn at random ("newest"/"oldest") on every
    # call. Neither is a documented value, and a cursor only makes sense for
    # the query that produced it, so the order is now fixed per call chain.
    params = {"q": keyword, "limit": 100, "sort": sort}
    if cursor:
        params["cursor"] = cursor
    response = requests.get(url, headers=headers, params=params, timeout=timeout)
    response.raise_for_status()
    return response.json()

def get_thread(uri, access_token):
    """Return the texts of the direct replies to a post, joined with " | ".

    Best-effort: any failure (network, HTTP status, unexpected payload)
    yields an empty string instead of an exception.
    """
    try:
        endpoint = f"{BASE_URL}/app.bsky.feed.getPostThread"
        auth_header = {"Authorization": f"Bearer {access_token}"}
        resp = requests.get(endpoint, headers=auth_header, params={"uri": uri}, timeout=10)
        resp.raise_for_status()
        thread = resp.json().get("thread", {})
        reply_texts = []
        for reply in thread.get("replies", []):
            body = reply.get("post", {}).get("record", {}).get("text", "")
            if body:  # drop replies without text
                reply_texts.append(body)
        return " | ".join(reply_texts)
    except Exception:
        return ""

# 🧬 Age/Gender prediction
def predict_age_gender(avatar_url):
    """Estimate ``(age, gender)`` from an avatar image using DeepFace.

    Returns ``(None, None)`` when no URL is given or when downloading,
    decoding or analysing the image fails for any reason.
    """
    nothing = (None, None)
    try:
        if not avatar_url:
            return nothing
        download = requests.get(avatar_url, timeout=10)
        picture = Image.open(io.BytesIO(download.content)).convert("RGB")
        analysis = DeepFace.analyze(
            img_path=np.array(picture),
            actions=["age", "gender"],
            enforce_detection=False,
            detector_backend="retinaface",
        )
        # DeepFace may return a list of results; only the first one is used.
        first = analysis[0] if isinstance(analysis, list) else analysis
        return first.get("age"), first.get("gender")
    except Exception:
        return nothing

# 💾 Data management
def save_to_csv(data, filename=SAVE_FILENAME):
    """Overwrite ``filename`` with a header row followed by ``data`` and report the count."""
    header = ["username", "age", "gender", "text", "thread_replies", "avatar_url"]
    with open(filename, "w", newline="", encoding="utf-8") as out_file:
        csv_out = csv.writer(out_file)
        csv_out.writerow(header)
        csv_out.writerows(data)
    print(f"✅ Saved {len(data)} entries to {filename}")

def load_existing(filename=SAVE_FILENAME):
    """Resume from an earlier run by loading its rows from ``filename``.

    Rows are appended to the global ``collected`` list and their handles are
    added to ``seen_users``. Does nothing when the file does not exist.

    Args:
        filename: CSV file previously written by ``save_to_csv``.
    """
    if not os.path.exists(filename):
        return
    # newline="" lets the csv module handle line breaks embedded in quoted
    # fields (post text can span several lines).
    with open(filename, "r", newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        next(reader, None)  # skip header; tolerate a completely empty file
        for row in reader:
            # Skip blank rows and handles that are already loaded, so calling
            # this again in the same kernel does not duplicate rows.
            if not row or row[0] in seen_users:
                continue
            collected.append(row)
            seen_users.add(row[0])
    print(f"🔄 Resumed from {filename}, found {len(collected)} users")

# 🚦 Interrupt handling
def autosave_handler(signal, frame):
    """Signal handler: write the collected rows to the autosave file, then exit with status 0."""
    autosave_path = "autosave_depressed_45plus.csv"
    print("\n⚠️ Interrupted! Saving progress...")
    save_to_csv(collected, filename=autosave_path)
    sys.exit(0)


# Install the handler for Ctrl+C (SIGINT) and termination requests (SIGTERM).
signal.signal(signal.SIGINT, autosave_handler)
signal.signal(signal.SIGTERM, autosave_handler)

# 🚀 Main application
def _gender_label(gender):
    """Collapse a gender prediction (string or score dict) to one label; "Unknown" if absent."""
    if isinstance(gender, str):
        return gender.title()
    if isinstance(gender, dict) and gender:
        return max(gender, key=gender.get)
    return "Unknown"


def main():
    """Search every keyword, filter the posts and collect matching users into a CSV.

    For each keyword (in shuffled order) pages of search results are fetched
    until the per-keyword cap, the global cap, or the end of the results is
    reached. A post is kept when its author is new, its text matches
    DEPRESSION_REGEX and the predicted age is at least MIN_AGE. Progress is
    autosaved every AUTOSAVE_INTERVAL rows and the final result is written to
    SAVE_FILENAME.

    NOTE(review): interrupted runs write to "autosave_depressed_45plus.csv",
    but load_existing() only reads SAVE_FILENAME, so autosaved progress is not
    picked up on the next run — confirm whether that is intended.

    Raises:
        Exception: Any unexpected error is re-raised after an emergency save
            to "error_autosave_depressed_45plus.csv".
    """
    max_search_failures = 5  # consecutive failed page fetches tolerated per keyword
    access_token = authenticate(USERNAME, APP_PASSWORD)
    load_existing()

    try:
        random.shuffle(KEYWORDS)
        for keyword in KEYWORDS:
            user_count = 0
            print(f"\n🔍 Searching: {keyword}")
            cursor = None

            # Random page skipping
            if random.random() > 0.7:
                for _ in range(random.randint(1, 3)):
                    result = search_posts(access_token, keyword, cursor)
                    cursor = result.get("cursor")
                    if not cursor:
                        break

            failures = 0
            with tqdm(desc=f"Processing {keyword[:15]}...") as pbar:
                while len(collected) < MAX_USERS and user_count < MAX_USERS_PER_KEYWORD:
                    # Only the page fetch is retried. Previously any error while
                    # handling a post re-requested the same page forever.
                    try:
                        time.sleep(random.uniform(0.5, 2.5))
                        result = search_posts(access_token, keyword, cursor)
                    except Exception as e:
                        failures += 1
                        print(f"⚠️ Error: {e}")
                        if failures >= max_search_failures:
                            print("❌ Skipping keyword due to repeated errors.")
                            break
                        time.sleep(random.uniform(5, 15))
                        continue
                    failures = 0

                    posts = result.get("posts", [])

                    if random.random() > 0.8:
                        random.shuffle(posts)

                    if not posts:
                        break

                    for post in posts:
                        # Enforce both caps inside a page too; one page holds up to 100 posts.
                        if len(collected) >= MAX_USERS or user_count >= MAX_USERS_PER_KEYWORD:
                            break

                        try:
                            record = post.get("record", {})
                            author = post.get("author", {})
                            text = record.get("text") or ""
                            username = author.get("handle")
                            uri = post.get("uri")
                            avatar_url = author.get("avatar", "")

                            if not username or username in seen_users or not DEPRESSION_REGEX.search(text):
                                continue

                            age, gender = predict_age_gender(avatar_url)
                            if age is None or age < MIN_AGE:
                                continue

                            thread_replies = get_thread(uri, access_token)
                            row = [
                                username,
                                int(age),
                                _gender_label(gender),
                                text[:500],
                                thread_replies[:1000],
                                avatar_url,
                            ]
                        except Exception as e:
                            # A malformed post must not stall the whole page.
                            print(f"⚠️ Error: {e}")
                            continue

                        collected.append(row)
                        seen_users.add(username)
                        user_count += 1
                        pbar.update(1)

                        if len(collected) % AUTOSAVE_INTERVAL == 0:
                            save_to_csv(collected, filename="autosave_depressed_45plus.csv")

                    cursor = result.get("cursor")
                    if not cursor:
                        break

            if len(collected) >= MAX_USERS:
                break

    except Exception as e:
        print(f"❌ Fatal error: {e}")
        save_to_csv(collected, filename="error_autosave_depressed_45plus.csv")
        raise

    save_to_csv(collected)

if __name__ == "__main__":
    main()

download_path: /root/.insightface/models/buffalo_l
Downloading /root/.insightface/models/buffalo_l.zip from https://github.com/deepinsight/insightface/releases/download/v0.7/buffalo_l.zip...


100%|██████████| 281857/281857 [00:05<00:00, 48852.54KB/s]


In [None]:
import requests
import csv
import time
import re
import io
import os
import random
import signal
import sys
from PIL import Image
from deepface import DeepFace
from tqdm import tqdm
import numpy as np
from concurrent.futures import ThreadPoolExecutor

# 🔧 Config
# Credentials are read from the environment instead of being stored in the
# notebook. os.getenv returns None when a variable is not set.
# NOTE(review): the app password that used to be hardcoded here has been exposed
# in the notebook and should be revoked in the account settings.
USERNAME = os.getenv("BLUESKY_USERNAME")
APP_PASSWORD = os.getenv("BLUESKY_PASSWORD")
BASE_URL = "https://bsky.social/xrpc"  # prefix of every XRPC endpoint called below
SAVE_FILENAME = "depressed_45plus.csv"  # final output; also the file load_existing() resumes from
MAX_USERS = 1000  # overall cap on the number of collected rows
MAX_USERS_PER_KEYWORD = 70  # cap on rows added while processing one keyword
MIN_AGE = 45  # posts whose author has a lower predicted age are skipped
AUTOSAVE_INTERVAL = 20  # autosave whenever len(collected) is a multiple of this

# Keywords related to depression
# Each phrase is sent on its own as the `q` parameter of the post search.
# main() shuffles this list in place before iterating, so its order is not stable.
KEYWORDS = [
    "diagnosed with depression", "i was diagnosed with depression", "psychiatrist diagnosed me with depression",
    "history of depression diagnosis", "i got diagnosed with depression", "mental health diagnosis: depression",
    "dealing with depression", "diagnosed with depression and anxiety", "i have been diagnosed with depression",
    "doctor said i have depression", "received a diagnosis of depression", "recently diagnosed with depression",
    "diagnosed depressive episode", "got diagnosed with mdd", "diagnosed with major depression",
    "they told me i have depression", "turns out it's depression", "i found out i have depression",
    "my therapist said it's depression", "been told i'm depressed", "finally figured out it's depression",
    "it’s confirmed – depression", "they diagnosed me last week", "depression is what i was told i have"
]

# Post-level text filter used by main(): a post is kept only if it contains
# "dx", a "diagnos..." word form, "mdd" or "depressive disorder" as a whole word
# (case-insensitive).
# NOTE(review): there is no "depress..." alternative here, so posts returned for
# keywords such as "dealing with depression" are dropped unless they also contain
# one of the terms above — confirm this is intended.
DEPRESSION_REGEX = re.compile(r"\b(dx|diagnos(e|ed|ing|is|es|sis)?|mdd|depressive disorder)\b", re.IGNORECASE)

# Global State
# Both containers are mutated in place by load_existing() and main().
collected = []  # one list per kept user: username, age, gender, text, thread_replies, avatar_url
seen_users = set()  # handles already stored in `collected`, used to skip repeat authors
# Pool used by main() for the age/gender prediction. main() waits for each
# result right after submitting it, so predictions still run one at a time.
executor = ThreadPoolExecutor(max_workers=5)

# Authentication

def authenticate(username, password, timeout=10):
    """Create a Bluesky session and return its access token.

    Args:
        username: Account identifier, sent as ``identifier``.
        password: App password for that account.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The ``accessJwt`` value from the createSession response.

    Raises:
        ValueError: If either credential is missing or empty. Raised before
            any request is made.
        requests.HTTPError: If the server answers with an error status.
    """
    # Fail fast with a readable message instead of posting null credentials.
    if not username or not password:
        raise ValueError("Missing Bluesky credentials (username and app password are required).")
    url = f"{BASE_URL}/com.atproto.server.createSession"
    response = requests.post(
        url,
        json={"identifier": username, "password": password},
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()["accessJwt"]

# Search

def search_posts(access_token, keyword, cursor=None, sort="latest", timeout=10):
    """Fetch one page of post search results for ``keyword``.

    Args:
        access_token: Bearer token returned by ``authenticate``.
        keyword: Search phrase sent as ``q``.
        cursor: Pagination cursor from the previous page, if any.
        sort: Ranking order. The endpoint documents "top" and "latest"; the
            previously hardcoded "newest" is not one of them.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON response; callers read its "posts" and "cursor" keys.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    url = f"{BASE_URL}/app.bsky.feed.searchPosts"
    headers = {"Authorization": f"Bearer {access_token}"}
    params = {"q": keyword, "limit": 100, "sort": sort}
    if cursor:
        params["cursor"] = cursor
    # A timeout keeps a stalled connection from blocking the scrape forever.
    response = requests.get(url, headers=headers, params=params, timeout=timeout)
    response.raise_for_status()
    return response.json()

# Thread fetching

def get_thread(uri, access_token):
    """Return the texts of the direct replies to a post, joined with " | ".

    Best-effort: any failure (network, HTTP status, unexpected payload)
    yields an empty string instead of an exception.
    """
    try:
        endpoint = f"{BASE_URL}/app.bsky.feed.getPostThread"
        auth_header = {"Authorization": f"Bearer {access_token}"}
        resp = requests.get(endpoint, headers=auth_header, params={"uri": uri}, timeout=10)
        resp.raise_for_status()
        thread = resp.json().get("thread", {})
        reply_texts = []
        for reply in thread.get("replies", []):
            body = reply.get("post", {}).get("record", {}).get("text", "")
            if body:  # drop replies without text
                reply_texts.append(body)
        return " | ".join(reply_texts)
    except Exception:
        return ""

# Age & Gender prediction

def predict_age_gender(avatar_url):
    """Estimate ``(age, gender)`` from an avatar image using DeepFace.

    Returns ``(None, None)`` when no URL is given or when downloading,
    decoding or analysing the image fails for any reason.
    """
    nothing = (None, None)
    try:
        if not avatar_url:
            return nothing
        download = requests.get(avatar_url, timeout=10)
        picture = Image.open(io.BytesIO(download.content)).convert("RGB")
        analysis = DeepFace.analyze(
            img_path=np.array(picture),
            actions=["age", "gender"],
            enforce_detection=False,
            detector_backend="retinaface",
        )
        # DeepFace may return a list of results; only the first one is used.
        first = analysis[0] if isinstance(analysis, list) else analysis
        return first.get("age"), first.get("gender")
    except Exception:
        return nothing

# Helper - mentions self

def mentions_self(text):
    """Tell whether ``text`` contains a first-person reference (case-insensitive)."""
    first_person = re.compile(
        r"\b(i|me|my|mine|i'm|i’ve|i'd|i was|i am|i got|i have|myself)\b",
        re.IGNORECASE,
    )
    return first_person.search(text) is not None

# Helper - validate post

def validate_user(username, text):
    """Decide whether a post/author pair passes the basic quality filters.

    Args:
        username: Author handle; must end in ``.bsky.social``.
        text: Post text.

    Returns:
        True only if the handle matches the expected pattern, the text is
        20-2000 characters long, contains no "http", does not say
        "no longer depressed/depression", and contains a first-person
        reference. Otherwise False.
    """
    # Search results can lack a handle or text; treat those as invalid rather
    # than letting re.match() / len() raise TypeError (fatal for main()).
    if not isinstance(username, str) or not isinstance(text, str):
        return False
    if not re.match(r"^[a-zA-Z0-9-_.]+\.bsky\.social$", username):
        return False
    if not 20 <= len(text) <= 2000:
        return False
    # Posts carrying links are excluded.
    if "http" in text.lower():
        return False
    if re.search(r"no longer (depressed|depression)", text, re.I):
        return False
    return mentions_self(text)

# Save

def save_to_csv(data, filename=SAVE_FILENAME, mode="w"):
    """Write ``data`` rows to ``filename`` as CSV and report the count.

    Args:
        data: Rows in the column order
            username, age, gender, text, thread_replies, avatar_url.
        filename: Target CSV path.
        mode: File mode passed to ``open`` ("w" to overwrite, "a" to append).

    The header row is written whenever the file starts out empty: always for
    truncating/creating modes, and in append mode when the file does not exist
    yet or has no content. Previously an append to a new file produced a
    header-less CSV, whose first data row load_existing() then skipped.
    """
    starts_empty = (
        mode.startswith(("w", "x"))
        or not os.path.exists(filename)
        or os.path.getsize(filename) == 0
    )
    with open(filename, mode, newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        if starts_empty:
            writer.writerow(["username", "age", "gender", "text", "thread_replies", "avatar_url"])
        writer.writerows(data)
    print(f"💾 Saved {len(data)} entries to {filename}")

# Load existing

def load_existing(filename=SAVE_FILENAME):
    """Resume from an earlier run by loading its rows from ``filename``.

    Rows are appended to the global ``collected`` list and their handles are
    added to ``seen_users``. Does nothing when the file does not exist.

    Args:
        filename: CSV file previously written by ``save_to_csv``.
    """
    if not os.path.exists(filename):
        return
    # newline="" lets the csv module handle line breaks embedded in quoted
    # fields (post text can span several lines).
    with open(filename, "r", newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        next(reader, None)  # skip header; tolerate a completely empty file
        for row in reader:
            # Skip blank rows and handles that are already loaded, so calling
            # this again in the same kernel does not duplicate rows.
            if not row or row[0] in seen_users:
                continue
            collected.append(row)
            seen_users.add(row[0])
    print(f"🔄 Resumed from {filename}, {len(collected)} records")

# Handle CTRL+C

def autosave_handler(sig, frame):
    """Signal handler: write the collected rows to the autosave file, then exit with status 0."""
    autosave_path = "autosave_depressed_45plus.csv"
    print("\n⚠️ Interrupted! Autosaving...")
    save_to_csv(collected, filename=autosave_path, mode="w")
    sys.exit(0)


# Install the handler for Ctrl+C (SIGINT) and termination requests (SIGTERM).
signal.signal(signal.SIGINT, autosave_handler)
signal.signal(signal.SIGTERM, autosave_handler)

# Main Function

def _gender_label(gender):
    """Collapse a gender prediction (string or score dict) to one label; "Unknown" if absent."""
    if isinstance(gender, str):
        return gender.title()
    if isinstance(gender, dict) and gender:
        return max(gender, key=gender.get)
    return "Unknown"


def main():
    """Search every keyword, filter the posts and collect matching users into a CSV.

    For each keyword (in shuffled order) pages of search results are fetched,
    with up to three attempts per page, until the per-keyword cap, the global
    cap, or the end of the results is reached. A post is kept when its author
    is new, its text matches DEPRESSION_REGEX, validate_user() accepts it and
    the predicted age is at least MIN_AGE. Progress is autosaved every
    AUTOSAVE_INTERVAL rows and the final result is written to SAVE_FILENAME.

    NOTE(review): interrupted runs write to "autosave_depressed_45plus.csv",
    but load_existing() only reads SAVE_FILENAME, so autosaved progress is not
    picked up on the next run — confirm whether that is intended.

    Raises:
        Exception: Any unexpected error is re-raised after an emergency save
            to "error_autosave_depressed_45plus.csv".
    """
    access_token = authenticate(USERNAME, APP_PASSWORD)
    load_existing()
    random.shuffle(KEYWORDS)

    try:
        for keyword in KEYWORDS:
            user_count = 0
            cursor = None
            print(f"\n🔍 Searching: {keyword}")
            with tqdm(desc=f"{keyword[:15]}", leave=False) as pbar:
                while len(collected) < MAX_USERS and user_count < MAX_USERS_PER_KEYWORD:
                    for attempt in range(3):
                        try:
                            time.sleep(random.uniform(0.5, 2.0))
                            result = search_posts(access_token, keyword, cursor)
                            cursor = result.get("cursor")
                            posts = result.get("posts", [])
                            break
                        except Exception as e:
                            print(f"⚠️ Retry {attempt + 1}: {e}")
                            time.sleep(5)
                    else:
                        # All three attempts failed: give up on this keyword.
                        print("❌ Skipping keyword due to repeated errors.")
                        break

                    if not posts:
                        break

                    random.shuffle(posts)

                    for post in posts:
                        # Enforce both caps inside a page too; one page holds up to 100 posts.
                        if len(collected) >= MAX_USERS or user_count >= MAX_USERS_PER_KEYWORD:
                            break

                        try:
                            record = post.get("record", {})
                            author = post.get("author", {})
                            text = record.get("text") or ""
                            username = author.get("handle")
                            uri = post.get("uri")
                            avatar_url = author.get("avatar", "")

                            if (
                                not username
                                or username in seen_users
                                or not DEPRESSION_REGEX.search(text)
                                or not validate_user(username, text)
                            ):
                                continue

                            # The result is awaited immediately, so predictions
                            # effectively run one at a time.
                            future = executor.submit(predict_age_gender, avatar_url)
                            age, gender = future.result()

                            if age is None or age < MIN_AGE:
                                continue

                            thread_replies = get_thread(uri, access_token)
                            row = [
                                username,
                                int(age),
                                _gender_label(gender),
                                text[:500],
                                thread_replies[:1000],
                                avatar_url,
                            ]
                        except Exception as e:
                            # One malformed post must not abort the whole run.
                            print(f"⚠️ Skipping post: {e}")
                            continue

                        collected.append(row)
                        seen_users.add(username)
                        user_count += 1
                        pbar.update(1)

                        if len(collected) % AUTOSAVE_INTERVAL == 0:
                            save_to_csv(collected, filename="autosave_depressed_45plus.csv", mode="w")

                    if not cursor:
                        break

            if len(collected) >= MAX_USERS:
                break

    except Exception as e:
        print(f"❌ Fatal error: {e}")
        save_to_csv(collected, filename="error_autosave_depressed_45plus.csv", mode="w")
        raise

    save_to_csv(collected)

if __name__ == "__main__":
    main()
