## *Web Scrapping of Reddit post through PRAW API*

In [None]:
import praw
import pandas as pd
from datetime import datetime
import time
import os

# Initialize Reddit instance with your credentials
reddit = praw.Reddit(
    client_id="IffN842DjoBPAubXeob9ew",           
    client_secret="SZfgpnRXyE_wRi-wHuoGGdWNqHpe_g",  
    user_agent="python:MentalHealthScraper:v1.0 (by u/u/FamiliarAd3777)"  
)

# Define subreddits and target
subreddits = [
    "depression", "anxiety", "mentalhealth", "BPD", "SuicideWatch",
    "OCD", "AnxietyDepression", "ptsd", "bipolar", "socialanxiety"
]
target_posts = 50000
posts_per_subreddit = target_posts // len(subreddits)

# Data storage with new columns
data = {
    "post_id": [], "subreddit": [], "title": [], "text": [], "author": [],
    "timestamp": [], "score": [], "comment_count": [], "comments": [],
    "location": [],  # For geospatial analysis
    "label": []      # For training/evaluation (to be filled later)
}

# List of specific locations to search for
specific_locations = [
    "Belfast", "Derry", "Dundalk", "Limerick", "Dublin", "Cork", "Galway",
    "Waterford", "Newtownabbey", "Bangor", "London", "Manchester", 
    "Birmingham", "Leeds", "Glasgow"
]

# Function to extract specific locations from text
def extract_location(text):
    if not text or pd.isna(text):
        return None
    text_lower = text.lower()  # Case-insensitive matching
    for location in specific_locations:
        if location.lower() in text_lower:
            return location
    return None

# Set to keep track of unique post IDs (to avoid duplicates)
seen_post_ids = set()

# Scrape data using multiple methods
for subreddit_name in subreddits:
    print(f"Scraping r/{subreddit_name}...")
    subreddit = reddit.subreddit(subreddit_name)
    post_count = 0

    # Define methods to scrape from
    methods = [
        ("new", None),
        ("hot", None),
        ("top", "all"),
        ("top", "year"),
        ("top", "month"),
        ("rising", None)
    ]

    for method, time_filter in methods:
        if post_count >= posts_per_subreddit:
            break
        print(f"  Using method: {method} (time_filter: {time_filter})")
        
        # Select the method dynamically
        if method == "new":
            submissions = subreddit.new(limit=None)
        elif method == "hot":
            submissions = subreddit.hot(limit=None)
        elif method == "top":
            submissions = subreddit.top(time_filter=time_filter, limit=None)
        elif method == "rising":
            submissions = subreddit.rising(limit=None)

        for submission in submissions:
            # Skip if post ID already seen (avoid duplicates)
            if submission.id in seen_post_ids:
                continue
            seen_post_ids.add(submission.id)

            try:
                data["post_id"].append(submission.id)
                data["subreddit"].append(subreddit_name)
                data["title"].append(submission.title)
                data["text"].append(submission.selftext)
                data["author"].append(str(submission.author) if submission.author else "[deleted]")
                data["timestamp"].append(datetime.fromtimestamp(submission.created_utc).isoformat())
                data["score"].append(submission.score)
                submission.comments.replace_more(limit=0)
                comments = [comment.body for comment in submission.comments.list()[:10]]
                data["comment_count"].append(len(comments))
                data["comments"].append(" | ".join(comments))
                
                # Extract location from title, text, or comments
                combined_text = f"{submission.title} {submission.selftext} {' '.join(comments)}"
                location = extract_location(combined_text)
                data["location"].append(location)
                
                # Placeholder for label (to be filled manually later)
                data["label"].append(None)  # Will be filled after scraping
                
                post_count += 1
                if post_count % 100 == 0:
                    print(f"  Scraped {post_count} posts from r/{subreddit_name}")
                if post_count >= posts_per_subreddit:
                    break
            except Exception as e:
                print(f"Error scraping post {submission.id}: {e}")
            time.sleep(1)

    print(f"Finished scraping r/{subreddit_name}. Total posts: {post_count}")

# Specify the save location
save_directory = "C:\\Users\\Asus\\OneDrive\\Desktop\\Sem2\\Research Practicum\\New"
output_filename = "mental_health_data.csv"
output_path = os.path.join(save_directory, output_filename)

# Create the directory if it doesn't exist
if not os.path.exists(save_directory):
    os.makedirs(save_directory)
    print(f"Created directory: {save_directory}")

# Save to CSV
df = pd.DataFrame(data)
df.to_csv(output_path, index=False)
print(f"Data saved to {output_path}. Total posts: {len(df)}")
print(df.head())

# Reminder to manually label the data
print("Next step: Manually label a subset of the 'label' column with 'positive', 'negative', or 'distress' for training and evaluation.")

Scraping r/depression...
  Using method: new (time_filter: None)
  Scraped 100 posts from r/depression
  Scraped 200 posts from r/depression
  Scraped 300 posts from r/depression
  Scraped 400 posts from r/depression
  Scraped 500 posts from r/depression
  Scraped 600 posts from r/depression
  Scraped 700 posts from r/depression
  Scraped 800 posts from r/depression
  Scraped 900 posts from r/depression
  Using method: hot (time_filter: None)
  Scraped 1000 posts from r/depression
  Using method: top (time_filter: all)
  Scraped 1100 posts from r/depression
  Scraped 1200 posts from r/depression
  Scraped 1300 posts from r/depression
  Scraped 1400 posts from r/depression
  Scraped 1500 posts from r/depression
  Scraped 1600 posts from r/depression
  Scraped 1700 posts from r/depression
  Scraped 1800 posts from r/depression
  Scraped 1900 posts from r/depression
  Using method: top (time_filter: year)
  Scraped 2000 posts from r/depression
  Scraped 2100 posts from r/depression
  Scra

In [None]:
import pandas as pd
from transformers import pipeline
import os
# Load your saved data
file_path = "C:\\Users\\Asus\\OneDrive\\Desktop\\Sem2\\Research Practicum\\New\\mental_health_data.csv"
df = pd.read_csv(file_path)

In [9]:
# Initialize the sentiment analysis pipeline with PyTorch framework
print("Loading sentiment model...")
sentiment_classifier = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    framework="pt"  # Explicitly use PyTorch to avoid TensorFlow/Keras issues
)

Loading sentiment model...


Device set to use cpu


In [10]:
# Define distress keywords
distress_keywords = ["help", "crisis", "suicide", "panic", "desperate", "kill", "hopeless", "emergency"]

In [11]:
# Function to classify text and integrate score
def classify_text(text, score):
    # Handle empty or NaN text
    if pd.isna(text) or text.strip() == "":
        return None
    
    # Truncate text to 512 tokens (model limit)
    combined_text = text[:512]
    
    # Get sentiment prediction
    result = sentiment_classifier(combined_text)[0]
    label = result["label"].lower()  # 'positive' or 'negative'
    confidence = result["score"]
    
    # Convert text to lowercase for keyword matching
    text_lower = text.lower()
    
    # Custom logic with distress keywords and score
    if any(keyword in text_lower for keyword in distress_keywords):
        if score > 1000:  # High score indicates significant distress
            return "distress"
        return "negative"  # Default to negative if distress keywords present but score is low
    
    # Refine sentiment with confidence and score
    if confidence > 0.7:  # Only accept high-confidence predictions
        if label == "negative" and score > 5000:  # High score + negative = potential distress
            return "distress" if "help" in text_lower else "negative"
        elif label == "positive" and score > 500:  # High score reinforces positive
            return "positive"
        return label
    return None  # Return None if confidence is too low

# Batch processing to avoid memory issues
def batch_classify(df, batch_size=100):
    print(f"Processing {len(df)} posts in batches of {batch_size}...")
    for i in range(0, len(df), batch_size):
        batch = df.iloc[i:i + batch_size]
        df.loc[batch.index, 'label'] = batch.apply(
            lambda row: classify_text(f"{row['title']} {row['text']} {row['comments']}", row['score']), axis=1
        )
        print(f"Processed {min(i + batch_size, len(df))} posts...")

In [12]:
# Apply classification
batch_classify(df, batch_size=100)

Processing 34550 posts in batches of 100...


 'negative' 'negative' 'negative' 'negative' 'negative' 'negative'
 'negative' 'negative' 'negative' 'negative' 'negative' 'negative'
 'negative' 'negative' 'negative' 'negative' 'negative' 'negative'
 'negative' 'negative' 'negative' 'negative' 'negative' 'negative'
 'negative' 'negative' 'negative' 'negative' 'negative' None 'negative'
 'negative' 'negative' 'negative' 'negative' 'negative' 'negative'
 'negative' 'negative' 'negative' 'negative' 'negative' 'negative'
 'negative' 'negative' 'negative' 'negative' 'negative' 'negative'
 'negative' None 'negative' 'negative' 'negative' 'negative' 'negative'
 'negative' 'negative' 'negative' 'negative' 'negative' 'negative'
 'positive' 'negative' 'negative' 'negative' 'negative' 'positive'
 'negative' 'negative' 'negative' 'negative' 'negative' 'negative'
 'negative' 'negative' 'negative' 'negative' 'negative' 'negative'
 'negative' 'negative' 'negative' 'negative' 'negative' 'negative'
 'negative' 'positive' 'negative' 'negative' 'negati

Processed 100 posts...
Processed 200 posts...
Processed 300 posts...
Processed 400 posts...
Processed 500 posts...
Processed 600 posts...
Processed 700 posts...
Processed 800 posts...
Processed 900 posts...
Processed 1000 posts...
Processed 1100 posts...
Processed 1200 posts...
Processed 1300 posts...
Processed 1400 posts...
Processed 1500 posts...
Processed 1600 posts...
Processed 1700 posts...
Processed 1800 posts...
Processed 1900 posts...
Processed 2000 posts...
Processed 2100 posts...
Processed 2200 posts...
Processed 2300 posts...
Processed 2400 posts...
Processed 2500 posts...
Processed 2600 posts...
Processed 2700 posts...
Processed 2800 posts...
Processed 2900 posts...
Processed 3000 posts...
Processed 3100 posts...
Processed 3200 posts...
Processed 3300 posts...
Processed 3400 posts...
Processed 3500 posts...
Processed 3600 posts...
Processed 3700 posts...
Processed 3800 posts...
Processed 3900 posts...
Processed 4000 posts...
Processed 4100 posts...
Processed 4200 posts...
P

In [13]:
# Save the updated DataFrame
output_path = "C:\\Users\\Asus\\OneDrive\\Desktop\\Sem2\\Research Practicum\\New\\mental_health_data_labeled.csv"
df.to_csv(output_path, index=False)
print(f"Labeled data saved to {output_path}")

Labeled data saved to C:\Users\Asus\OneDrive\Desktop\Sem2\Research Practicum\New\mental_health_data_labeled.csv


In [14]:
# Display label distribution
print("\nLabel distribution:")
print(df['label'].value_counts(dropna=False))


Label distribution:
label
negative    29842
distress     2202
positive     2130
None          376
Name: count, dtype: int64


In [15]:
# Optional: Display a sample of labeled data
print("\nSample of labeled data:")
print(df[['title', 'score', 'label']].head(10))


Sample of labeled data:
                                               title  score     label
0  Can someone please help this guy ( AmoebaRepul...      1  negative
1                                    Dopamine Crash?      1  negative
2                How to help friend to do something?      1  negative
3                 thank you to anyone who reads this      2  negative
4                  Fun fact: I can't take it anymore      2  negative
5            Today’s not bad. Is it just false hope?      1  negative
6  Everyone keeps telling me how well I’m doing b...      1  negative
7                                        im so tired      1  negative
8                           Literally just gonna kms      1  negative
9  killing myself. wilI be better than staying Io...      2  negative


## *Data Cleaning*

In [16]:
import pandas as pd
import re
from datetime import datetime

# Load the labeled dataset
file_path = "C:\\Users\\Asus\\OneDrive\\Desktop\\Sem2\\Research Practicum\\New\\mental_health_data_labeled.csv"
df = pd.read_csv(file_path)

In [17]:
# Step 1: Handle Missing Values
print("Initial missing values:\n", df.isnull().sum())
df['title'] = df['title'].fillna("")  # Empty string for missing titles
df['text'] = df['text'].fillna("")    # Empty string for missing text
df['comments'] = df['comments'].fillna("")  # Empty string for missing comments
df['label'] = df['label'].fillna("None")  # Flag unlabeled rows
df['location'] = df['location'].fillna("British Isles")  # Flag missing locations

Initial missing values:
 post_id              0
subreddit            0
title                0
text              3612
author               0
timestamp            0
score                0
comment_count        0
comments          3014
location         34456
label              376
dtype: int64


In [18]:
# Step 2: Remove Duplicates
df = df.drop_duplicates(subset=['post_id'], keep='first')
print(f"Rows after removing duplicates: {len(df)}")

Rows after removing duplicates: 34550


In [19]:
# Step 3: Clean Text Data
def clean_text(text):
    if pd.isna(text):
        return ""
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Normalize whitespace
    return text

df['title'] = df['title'].apply(clean_text)
df['text'] = df['text'].apply(clean_text)
df['comments'] = df['comments'].apply(clean_text)

In [20]:
# Step 4: Standardize Timestamps
df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')

In [21]:
# Step 5: Handle Outliers in Score
print("Score stats before cleaning:\n", df['score'].describe())
df = df[df['score'].between(0, 28000)]  # Remove negative or extreme outliers
print("Score stats after cleaning:\n", df['score'].describe())

Score stats before cleaning:
 count    34550.000000
mean       329.019595
std        746.044201
min          0.000000
25%          4.000000
50%         43.000000
75%        284.000000
max      27945.000000
Name: score, dtype: float64
Score stats after cleaning:
 count    34550.000000
mean       329.019595
std        746.044201
min          0.000000
25%          4.000000
50%         43.000000
75%        284.000000
max      27945.000000
Name: score, dtype: float64


In [22]:
# Step 6: Normalize Labels
df['label'] = df['label'].str.lower()  # Ensure consistent casing
valid_labels = ['positive', 'negative', 'distress']
df = df[df['label'].isin(valid_labels)]  # Drop invalid labels

In [23]:
# Step 7: Filter Irrelevant Data
df = df[df['author'] != '[deleted]']  # Remove deleted authors
df = df[df['title'].str.len() > 5]   # Keep titles longer than 5 chars
df = df[df['text'].str.len() > 10]   # Keep text longer than 10 chars

In [24]:
# Step 8: Simplify Comments (optional: split into list)
df['comments'] = df['comments'].str.split(" | ")  # Split comments into list
df['comment_count_cleaned'] = df['comments'].apply(len)  # Update comment count

In [25]:
# Step 9: Validate Location
df['location'] = df['location'].str.lower().str.strip()
valid_locations = [
    "belfast", "derry", "dundalk", "limerick", "dublin", "cork", "galway",
    "waterford", "newtownabbey", "bangor", "london", "manchester", 
    "birmingham", "leeds", "glasgow", "british isles"
]
df['location'] = df['location'].apply(lambda x: x if x in valid_locations else "british isles")

In [27]:
# Step 10: Final Data Validation
# Handle NaN or non-numeric values in 'score' and 'comment_count' before conversion
df['score'] = pd.to_numeric(df['score'], errors='coerce').fillna(0).astype(int)  # Convert to int, NaN -> 0
df['comment_count'] = pd.to_numeric(df['comment_count'], errors='coerce').fillna(0).astype(int)  # Convert to int, NaN -> 0
df = df.dropna(subset=['timestamp'])  # Drop rows with invalid timestamps

In [28]:
# Save the cleaned dataset
output_path = "C:\\Users\\Asus\\OneDrive\\Desktop\\Sem2\\Research Practicum\\New\\mental_health_data_cleaned.csv"
df.to_csv(output_path, index=False)
print(f"Cleaned data saved to {output_path}")
print(f"Final shape: {df.shape}")
print("Final missing values:\n", df.isnull().sum())
print("Label distribution:\n", df['label'].value_counts())

Cleaned data saved to C:\Users\Asus\OneDrive\Desktop\Sem2\Research Practicum\New\mental_health_data_cleaned.csv
Final shape: (27808, 12)
Final missing values:
 post_id                  0
subreddit                0
title                    0
text                     0
author                   0
timestamp                0
score                    0
comment_count            0
comments                 0
location                 0
label                    0
comment_count_cleaned    0
dtype: int64
Label distribution:
 label
negative    25323
positive     1350
distress     1135
Name: count, dtype: int64
