In [1]:
import warnings
warnings.simplefilter('ignore')

**TASK are divided based on the provided structure in
 https://docs.google.com/document/d/e/2PACX-1vQfC8gkrSx_ycYkIOdae5sJ-fuqn2UA9nLtGqA5egBuwNKMNZpi_NBR0MRnnqdWt8WYqznE6x9_DIO0/pub**

These are scripts although they are also provided as separate files in script folder

# TASK-1

**Extract Reddit data**
Then Apply PreProcessing


In [2]:
# Provide your CLIENT_ID and CLIENT_SECRET in .env file
from dotenv import load_dotenv
import os
load_dotenv()

client_id = os.getenv("CLIENT_ID")
client_secret = os.getenv("CLIENT_SECRET")

In [3]:
import praw
import pandas as pd
import json
from datetime import datetime
from tqdm import tqdm
import os
from dotenv import load_dotenv
load_dotenv()
class RedditMentalHealthScraper:
    def __init__(self, client_id, client_secret, user_agent="SocialAnalysis<v0>"):
        """
        Initialize Reddit API connection
        """
        self.reddit = praw.Reddit(
            client_id=client_id,
            client_secret=client_secret,
            user_agent=user_agent
        )
        
        self.keywords = [
            # Mental Health
            "depressed", "anxiety", "mental health", "ptsd", "trauma", 
            "burnout", "emotional support", "mental breakdown", 
            "psychological distress", "intrusive thoughts",
            
            # Substance Use
            "addiction", "substance abuse", "recovery", "relapse", 
            "alcohol addiction", "drug addiction", "sober", 
            "addiction help", "substance use disorder",
            
            # Emotional Distress
            "overwhelmed", "struggling", "feeling hopeless", 
            "emotional pain", "self-harm", "suicidal thoughts", 
            "suicide prevention", "mental health crisis",
            
            # Treatment and Support
            "therapy", "counseling", "medication", "support group", 
            "mental health resources", "coping mechanisms"
        ]
    
    def extract_posts(self, target_subreddits, limit=100):
        """
        Extract posts from subreddits containing target keywords
        """
        extracted_posts = []
        
        # Find all subreddits matching target names
        matching_subreddits = []
        for target in target_subreddits:
            try:
                subreddits = self.reddit.subreddits.search_by_name(target, include_nsfw=True)
                for i in subreddits:
                    matching_subreddits.append(i.display_name)
            except Exception as e:
                print(f"Could not find subreddit matching {target}: {e}")
        
        print(f"Matching subreddits found: {matching_subreddits}")
        for subreddit_name in tqdm(matching_subreddits):
            try:
                subreddit = self.reddit.subreddit(subreddit_name)
                
                # Search hot posts and new posts
                for post_stream in [subreddit.hot(limit=limit), subreddit.new(limit=limit)]:
                    for post in post_stream:
                        # Check if post contains any of the keywords
                        if self._contains_keywords(post.title.lower()) or \
                           self._contains_keywords(post.selftext.lower()):
                            
                            post_data = {
                                'post_id': post.id,
                                'timestamp': datetime.fromtimestamp(post.created_utc).isoformat(),
                                'title': post.title,
                                'content': post.selftext,
                                'subreddit': subreddit_name,
                                'ups': post.ups,
                                'num_comments': post.num_comments,
                                'url': post.url
                            }
                            
                            extracted_posts.append(post_data)
            
            except Exception as e:
                print(f"Error processing subreddit {subreddit_name}: {e}")
        
        return extracted_posts
    
    # Method istested but not used in analysis, as it needs higher api calls
    def extract_posts_with_comments(self, target_subreddits, limit=100, comment_limit=50, replace_more_limit=0):
        """
        Extract posts from subreddits containing target keywords along with their comments
        
        Parameters:
        - target_subreddits: List of subreddit names to search
        - limit: Maximum number of posts to retrieve per subreddit
        - comment_limit: Maximum number of comments to retrieve per post
        
        Returns:
        - List of dictionaries containing post data and comments
        """
        extracted_posts = []
        
        # Find all subreddits matching target names
        matching_subreddits = []
        for target in target_subreddits:
            try:
                subreddits = self.reddit.subreddits.search_by_name(target, include_nsfw=True)
                for i in subreddits:
                    matching_subreddits.append(i.display_name)
            except Exception as e:
                print(f"Could not find subreddit matching {target}: {e}")
        
        print(f"Matching subreddits found: {matching_subreddits}")
        for subreddit_name in tqdm(matching_subreddits):
            try:
                subreddit = self.reddit.subreddit(subreddit_name)
                
                # Search hot posts and new posts
                for post_stream in [subreddit.hot(limit=limit), subreddit.new(limit=limit)]:
                    for post in post_stream:
                        # Check if post contains any of the keywords
                        if self._contains_keywords(post.title.lower()) or \
                        self._contains_keywords(post.selftext.lower()):
                            
                            # Extract comments
                            comments_data = []
                            post.comments.replace_more(limit=replace_more_limit)  # Replace MoreComments objects
                            
                            for comment in post.comments.list()[:comment_limit]:
                                comment_data = {
                                    'comment_id': comment.id,
                                    'author': str(comment.author) if comment.author else '[deleted]',
                                    'body': comment.body,
                                    'score': comment.score,
                                    'timestamp': datetime.fromtimestamp(comment.created_utc).isoformat(),
                                    'parent_id': comment.parent_id
                                }
                                comments_data.append(comment_data)
                            
                            post_data = {
                                'post_id': post.id,
                                'timestamp': datetime.fromtimestamp(post.created_utc).isoformat(),
                                'title': post.title,
                                'content': post.selftext,
                                'subreddit': subreddit_name,
                                'ups': post.ups,
                                'num_comments': post.num_comments,
                                'url': post.url,
                                'comments': comments_data
                            }
                            
                            extracted_posts.append(post_data)
            
            except Exception as e:
                print(f"Error processing subreddit {subreddit_name}: {e}")
        
        return extracted_posts
    
    def _contains_keywords(self, text):
        """
        Check if text contains any of the predefined keywords
        """
        return any(keyword.lower() in text for keyword in self.keywords)

    def save_to_csv(self, posts, filename='mental_health_posts.csv'):
        """
        Save extracted posts to CSV
        """
        df = pd.DataFrame(posts)
        df.to_csv(filename, index=False, encoding='utf-8')
        print(f"Saved {len(posts)} posts to {filename}")

    def save_to_json(self, posts, filename='mental_health_posts.json'):
        """
        Save extracted posts to JSON
        """
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(posts, f, ensure_ascii=False, indent=4)
        print(f"Saved {len(posts)} posts to {filename}")

def main():
    # Reddit API credentials
    CLIENT_ID = os.getenv("CLIENT_ID")
    CLIENT_SECRET = os.getenv("CLIENT_SECRET")
    USER_AGENT = 'SocialAnalysis<v0>(by /u/Sad-Net-4568)'
    
    # Target subreddit partial names to search
    target_subreddits = [
        'mentalhealth', 'depression', 'anxiety', 
        'addiction', 'support', 'mental', 'psychiatry', 'ketamine', 'SuicideWatch',
        'bipolar', 'ptsd', 'trauma', 'burnout', 'emotional', 'intrusivethoughts', 'stopdrinking',
        'leaves', 'stopsmoking', 'stopdrugs', 'stopselfharm', 'opiatesrecovery', 'recovery', 'therapy', 'crisis',
        'selfharm', 'stress', 'panic'
    ]

    scraper = RedditMentalHealthScraper(CLIENT_ID, CLIENT_SECRET, USER_AGENT)
    
    # Used in analysis
    extracted_posts = scraper.extract_posts(target_subreddits, limit=1000)

    # Tested with comment extraction but not used in further analysis
    # extracted_posts = scraper.extract_posts_with_comments(target_subreddits, limit=100, comment_limit=50, replace_more_limit=1)
    
    scraper.save_to_csv(posts=extracted_posts, filename='mental_health_postsV1.csv')
    # scraper.save_to_json(posts=extracted_posts, filename='mental_health_postsV1.json') # Not used as CSV is easier to work with in Pandas
    print("Extraction complete.")

In [None]:
# main()

Matching subreddits found: ['mentalhealth', 'MentalHealthPH', 'MentalHealthSupport', 'MentalHealthUK', 'MentalHealthIsland', 'MentalHealthBabies', 'MentalHealthProviders', 'MentalHealthPros', 'mentalhealthadvice']


100%|██████████| 9/9 [09:37<00:00, 64.18s/it]


Saved 1018 posts to mental_health_postswith_comments.csv


Would analyze comments in the Project,

I am positive about how it can gives us more data around more users, not only the person who posted the post.

## preprocessing

It's been done for data that doesn't have comments-content

Reason- Avoided for now, because of it's high api hit

In [4]:
%%writefile scripts/nlp_preprocessing.py

import nltk
from transformers import AutoTokenizer, AutoModelForTokenClassification
from nltk.corpus import stopwords
# from nltk.stem import PorterStemmer # didn't used
import re
import emoji
from tqdm import tqdm
import torch
import pandas as pd

# this is file that I saved, provided in repo also
df = pd.read_csv('mental_health_postsV1.csv') # use the file saved via reddit extraction code

data = df.copy()

# Download stopwords if not already downloaded
nltk.download('stopwords')
stop_words = set(stopwords.words('english')) - {'no', 'not', 'nor', 'never'} # don't remove negation words

def preprocess_text_with_counts(text):
    if pd.isnull(text):
        return text, 0, 0, 0

    emoji_count = len([char for char in text if char in emoji.EMOJI_DATA])
    special_char_count = len(re.findall(r'[^A-Za-z\s]', text))
    stopword_count = len([word for word in text.split() if word.lower() in stop_words])

    text = emoji.replace_emoji(text, replace="")
    # Remove special characters and numbers
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()

    # Remove stopwords
    text = ' '.join(word for word in text.split() if word not in stop_words)

    return text, emoji_count, special_char_count, stopword_count

original_content = data['content'].copy()
emoji_changes = []
special_char_changes = []
stopword_changes = []

def process_row(row):
    processed_text, emoji_count, special_char_count, stopword_count = preprocess_text_with_counts(row)
    emoji_changes.append(emoji_count)
    special_char_changes.append(special_char_count)
    stopword_changes.append(stopword_count)
    return processed_text

for index, row in tqdm(data['content'].items(), total=len(data['content']), desc="Processing rows"):
    data.at[index, 'content'] = process_row(row)

changed_rows = (original_content != data['content']).sum()
data.to_csv('mental_health_postsV1_preprocessed.csv', index=False)

# Calculate total counts and rows with changes
total_emoji_removed = sum(emoji_changes)
total_special_chars_removed = sum(special_char_changes)
total_stopwords_removed = sum(stopword_changes)

rows_with_emoji_changes = sum(1 for count in emoji_changes if count > 0)
rows_with_special_char_changes = sum(1 for count in special_char_changes if count > 0)
rows_with_stopword_changes = sum(1 for count in stopword_changes if count > 0)

# Print the results
print(f"Number of rows changed: {changed_rows}")
print(f"Total emoji removed: {total_emoji_removed}")
print(f"Rows with emoji changes: {rows_with_emoji_changes}")
print(f"Total special characters removed: {total_special_chars_removed}")
print(f"Rows with special character changes: {rows_with_special_char_changes}")
print(f"Total stopwords removed: {total_stopwords_removed}")
print(f"Rows with stopword changes: {rows_with_stopword_changes}")

Writing scripts/nlp_preprocessing.py


In [21]:
data = pd.read_csv('mental_health_postsV1_preprocessed.csv')
data.head(10)

Unnamed: 0,post_id,timestamp,title,content,subreddit,ups,num_comments,url
0,1gd9l9c,2024-10-27T17:43:33,Elections and Politics,hello friends time year always intended rmenta...,mentalhealth,24,17,https://www.reddit.com/r/mentalhealth/comments...
1,1e297nd,2024-07-13T17:55:58,r/MentalHealth is looking for moderators,hey rmentalhealth looking grow moderation team...,mentalhealth,21,27,https://www.reddit.com/r/mentalhealth/comments...
2,1jl6wjb,2025-03-27T21:11:46,Being weak physically as a man makes me depres...,yo cant even lateral raises kg dumbbell basica...,mentalhealth,22,23,https://www.reddit.com/r/mentalhealth/comments...
3,1jlb42o,2025-03-28T00:05:58,Do you think you are losing your youth because...,mid twenties mental health declining much affe...,mentalhealth,9,9,https://www.reddit.com/r/mentalhealth/comments...
4,1jl422s,2025-03-27T19:05:37,"Medication withdrawl (Effexor, Venlafaxine)",call help hello im f french ive taking venlafa...,mentalhealth,20,9,https://i.redd.it/gnn4t73bj8re1.jpeg
5,1jl5xhf,2025-03-27T20:30:15,Burnt Out On Self Care,diagnosed cptsd gad mood disorder depressive f...,mentalhealth,14,2,https://i.redd.it/b7zgxgrey8re1.jpeg
6,1jkxutf,2025-03-27T12:14:06,"Shoutout Mental Health Muscle, I am proud to b...",,mentalhealth,41,2,https://i.redd.it/nhadpq2wh6re1.jpeg
7,1jl5tea,2025-03-27T20:25:32,Feeling hopeless,im feeling completely hopeless made many wrong...,mentalhealth,5,4,https://www.reddit.com/r/mentalhealth/comments...
8,1jkwbcw,2025-03-27T10:26:07,What’s One Small Habit That Has Helped Your Me...,sometimes small changes make big difference ma...,mentalhealth,30,41,https://www.reddit.com/r/mentalhealth/comments...
9,1jl97wc,2025-03-27T22:48:03,I just want to leave my phone and runaway. I f...,feel lost alone overwhelmed feel like ive abso...,mentalhealth,3,1,https://www.reddit.com/r/mentalhealth/comments...


## Things to consider
As said in Task, i have removed special-characters and emojis.

Although they may help in getting better context of post-how much difference it will create. that needs to be look upon.

I think it won't be much, main challenge can be understanding of emoticons with text, **with understanding of sarcasm**

**Comments content not used in this analysis, But in main Project I will definitely use it - script for it is provided too.**
- Work Upon creating a lexicon for crisis-terms, if it works sufficiently in comparison to analyze whole text via ML-methods.

# Task-2

## Risk Classification using bart

In [5]:
import pandas as pd
from transformers import pipeline
import torch
from datasets import Dataset
from tqdm import tqdm

def classify_post_risk(df:pd.DataFrame) -> pd.DataFrame:
    """
    Classify social media posts into risk levels using zero-shot classification.
    
    Parameters:
    df (pandas.DataFrame): DataFrame with 'post_id' and 'content' columns
    
    Returns:
    pandas.DataFrame: Original DataFrame with added 'risk_level' column
    """
    # Initialize zero-shot classification pipeline
    # Using a robust model for nuanced classification
    classifier = pipeline(
        "zero-shot-classification", 
        model="facebook/bart-large-mnli"
    )
    
    # Define risk level categories with descriptive labels
    risk_categories = [
        "High-Risk: Immediate Crisis",
        "Moderate Concern: Seeking Help",
        "Low Concern: General Discussion"
    ]
    
    # Classification function with detailed criteria
    def determine_risk_level(text):
        try:
            # Perform zero-shot classification
            result = classifier(
                text, 
                candidate_labels=risk_categories, 
                hypothesis_template="This text indicates {}"
            )
            
            # Get the top predicted category
            top_category = result['labels'][0]
            
            if "High-Risk" in top_category:
                return "High-Risk"
            elif "Moderate Concern" in top_category:
                return "Moderate Concern"
            else:
                return "Low Concern"
        
        except Exception as e:
            print(f"Error classifying text: {text}")
            return "Unclassified"
    
    # Apply risk classification to the DataFrame
    tqdm.pandas(desc="Classifying posts")
    df['risk_level'] = df['content'].progress_apply(determine_risk_level)
    return df

# Example usage
def main(df:pd.DataFrame): 
    # Classify posts
    classified_df = classify_post_risk(df)
    # classified_df.to_csv('mental_health_postsV1_classified.csv', index=False)
    # Display results
    print(classified_df)

# Additional helper functions for advanced analysis
def get_risk_level_summary(df):
    """
    Generate summary statistics of risk levels
    """
    risk_summary = df['risk_level'].value_counts(normalize=True) * 100
    print("\nRisk Level Distribution:")
    print(risk_summary)
    return risk_summary

def identify_high_risk_posts(df):
    """
    Extract and highlight high-risk posts
    """
    high_risk_posts = df[df['risk_level'] == 'High-Risk']
    print("\nHigh-Risk Posts:")
    print(high_risk_posts)
    return high_risk_posts


Writing scripts/risk_classification_bart.py


In [None]:
# It's a time consuming process, better to evaluate it from provided saved file

# Uncomment to run
# if __name__ == "__main__":
#     data = pd.read_csv('mental_health_postsV1_preprocessed.csv')
#     df = classify_post_risk(data)
#     df.to_csv('mental_health_postsV1_classified.csv', index=False)

In [None]:
df = pd.read_csv('mental_health_postsV1_classified.csv') # change with the name file you saved
get_risk_level_summary(df)

## Sentiment Classification

Code is distributed for parallel

But better to not run locally as I ran on kaagle's T4x2 GPU, so this can consume more time locally.

Plots are available via plotly -> .html files

You can just refer **dashboard2.png** for plots of this section

used model: all-MiniLM-L6-v2

**Things to try:**
A higher-dimension model like: sentence-transformers/gtr-t5-large

In [6]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import torch
from torch.utils.data import Dataset, DataLoader
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from nltk.tokenize import word_tokenize
import nltk
import os
from tqdm import tqdm
import multiprocessing
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from functools import partial
nltk.download('vader_lexicon')

# Parallel VADER sentiment analysis
def process_chunk(chunk_df, text_column='content'):
    sid = SentimentIntensityAnalyzer()
    chunk_df['sentiment_scores'] = chunk_df[text_column].apply(
        lambda text: sid.polarity_scores(str(text)) if pd.notna(text) else {'compound': 0}
    )
    chunk_df['sentiment_score'] = chunk_df['sentiment_scores'].apply(lambda x: x['compound'])
    
    def categorize_sentiment(score):
        if score >= 0.05: return 'Positive'
        elif score <= -0.05: return 'Negative'
        else: return 'Neutral'
    
    chunk_df['sentiment'] = chunk_df['sentiment_score'].apply(categorize_sentiment)
    chunk_df = chunk_df.drop('sentiment_scores', axis=1)
    return chunk_df

def analyze_sentiment_vader_parallel(df, text_column='content', batch_size=1000, n_jobs=None):
    """
    Parallel implementation of VADER sentiment analysis
    """
    if n_jobs is None:
        n_jobs = multiprocessing.cpu_count()
    
    result_df = df.copy()
    
    # Split dataframe into chunks for parallel processing
    df_chunks = np.array_split(result_df, n_jobs)
    
    # Process chunks in parallel
    with ProcessPoolExecutor(max_workers=n_jobs) as executor:
        processed_chunks = list(tqdm(
            executor.map(partial(process_chunk, text_column=text_column), df_chunks),
            total=len(df_chunks),
            desc="Sentiment Analysis"
        ))
    
    # Combine results
    result_df = pd.concat(processed_chunks)
    return result_df
    
# Custom PyTorch Dataset for BERT embeddings
class TextDataset(Dataset):
    def __init__(self, texts):
        self.texts = texts
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        return self.texts[idx]

# Multi-GPU BERT embeddings function
def get_bert_embeddings(texts, model_name='all-MiniLM-L6-v2', batch_size=32, num_gpus=None):
    """
    Generate embeddings using BERT with multi-GPU support
    """
    if num_gpus is None:
        num_gpus = torch.cuda.device_count()
    
    if num_gpus == 0:
        # Fall back to CPU if no GPUs available
        model = SentenceTransformer(model_name)
        return model.encode(texts, batch_size=batch_size, show_progress_bar=True)
    
    # Create multiple model instances for different GPUs
    models = []
    for gpu_id in range(num_gpus):
        device = f"cuda:{gpu_id}"
        model = SentenceTransformer(model_name)
        model.to(device)
        models.append((model, device))
    
    # Create DataLoader
    dataset = TextDataset(texts)
    dataloader = DataLoader(dataset, batch_size=batch_size * num_gpus, shuffle=False)
    
    # Generate embeddings in batches
    all_embeddings = []
    
    for batch in tqdm(dataloader, desc="Generating BERT embeddings"):
        # Split batch for each GPU
        batch_size_per_gpu = len(batch) // num_gpus
        batch_splits = []
        
        for i in range(num_gpus):
            start_idx = i * batch_size_per_gpu
            end_idx = start_idx + batch_size_per_gpu if i < num_gpus - 1 else len(batch)
            batch_splits.append(batch[start_idx:end_idx])
        
        # Process in parallel across GPUs
        batch_embeddings = []
        
        def encode_on_gpu(model_device, batch_text):
            model, device = model_device
            return model.encode(batch_text)
        
        with ThreadPoolExecutor(max_workers=num_gpus) as executor:
            batch_results = list(executor.map(
                encode_on_gpu, 
                models,
                batch_splits
            ))
        
        # Combine results
        for result in batch_results:
            batch_embeddings.append(result)
            
        combined_embeddings = np.vstack(batch_embeddings)
        all_embeddings.append(combined_embeddings)
    
    return np.vstack(all_embeddings)



def preprocess(text):
    if pd.isna(text) or text == '':
        return []
    return word_tokenize(text.lower())
    
def calculate_crisis_score(tokens, crisis_terms_set):
    if not tokens:
        return 0, []
    
    # Find crisis terms in post
    found_terms = [t for t in tokens if t in crisis_terms_set]
    
    # Calculate score based on number of crisis terms
    score = len(found_terms) / len(tokens) if tokens else 0
    
    return score, found_terms

def extract_terms_from_text(text_series, min_words=2, workers=None):
    if workers is None:
        workers = multiprocessing.cpu_count()
    
    
    with ProcessPoolExecutor(max_workers=workers) as executor:
        all_words_lists = list(executor.map(tokenize_text, text_series))
    
    # Flatten list of lists
    all_words = [word for sublist in all_words_lists for word in sublist]
    
    # Count word frequencies
    word_counts = pd.Series(all_words).value_counts()
    
    # Return top words
    return word_counts.head(20).index.tolist()
    
# Tokenize in parallel
def tokenize_text(text):
    if pd.isna(text) or text == '':
        return []
    words = word_tokenize(text.lower())
    min_words = 2
    return [w for w in words if len(w) > min_words and w.isalpha()]
    

def find_crisis_terms(text, terms):
    if pd.isna(text) or text == '' or not terms:
        return []
    
    text_lower = text.lower()
    found_terms = [term for term in terms if term in text_lower]
    return found_terms


# Detect high-risk terms using parallel BERT
def detect_crisis_terms_bert_parallel(df, text_column='content', title_column='title', batch_size=32, num_gpus=None):
    """
    Detect high-risk crisis terms using BERT with multi-GPU support
    """
    if num_gpus is None:
        num_gpus = torch.cuda.device_count()
    
    result_df = df.copy()
    
    # Combine title and content
    result_df['combined_text'] = result_df.apply(
        lambda row: str(row[title_column] or '') + ' ' + str(row[text_column] or ''), 
        axis=1
    )
    
    # Get post embeddings using multi-GPU
    posts = result_df['combined_text'].fillna('').tolist()
    if not posts:
        print("Warning: No valid text found for BERT model")
        result_df['crisis_score'] = 0
        result_df['high_risk_terms'] = [[] for _ in range(len(result_df))]
        return result_df
    
    post_embeddings = get_bert_embeddings(
        texts=posts,
        batch_size=batch_size,
        num_gpus=num_gpus
    )
    
    print("Clustering posts to identify high-risk content...")
    
    # Use KMeans to cluster posts
    n_clusters = min(5, len(posts))  # Use fewer clusters for smaller datasets
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(post_embeddings)
    
    # Identify high-risk cluster by looking for posts labeled as high-risk
    high_risk_posts_idx = result_df['risk_level'] == 'High-Risk'
    
    if high_risk_posts_idx.any():
        # Count posts in each cluster by risk level
        cluster_risk_counts = pd.crosstab(
            index=cluster_labels, 
            columns=result_df['risk_level']
        )
        
        # Identify clusters with higher proportion of high-risk posts
        if 'High-Risk' in cluster_risk_counts.columns:
            # Calculate proportion of high-risk posts in each cluster
            cluster_risk_props = cluster_risk_counts.div(
                cluster_risk_counts.sum(axis=1), axis=0
            )
            
            # Get clusters with high proportion of high-risk posts
            high_risk_clusters = cluster_risk_props[
                cluster_risk_props['High-Risk'] > 0.3
            ].index.tolist()
        else:
            high_risk_clusters = []
    else:
        # If no known high-risk posts, look for outlier clusters (smallest clusters)
        cluster_counts = pd.Series(cluster_labels).value_counts()
        high_risk_clusters = cluster_counts[cluster_counts < cluster_counts.median()].index.tolist()
    
    # Set crisis score based on cluster membership
    result_df['crisis_score'] = [
        0.9 if label in high_risk_clusters else 0.1 for label in cluster_labels
    ]
    
    # Extract keywords from high-risk clusters in parallel
    print("Extracting crisis terms from high-risk clusters...")
    
    # Function to extract most common words
    
    # Extract terms from high-risk clusters
    if high_risk_clusters:
        high_risk_mask = [label in high_risk_clusters for label in cluster_labels]
        high_risk_texts = result_df.loc[high_risk_mask, 'combined_text']
        
        workers = multiprocessing.cpu_count()
        crisis_terms = extract_terms_from_text(high_risk_texts, workers=workers)
        print(f"Extracted {len(crisis_terms)} crisis terms")
    else:
        crisis_terms = []

    # If crisis_terms is defined
    if not crisis_terms:
        print("Warning: No crisis terms found. Using fallback terms.")
        crisis_terms = [
            'suicide', 'kill myself', 'end my life', 'die', 'better off dead', 
            'no reason to live', 'can\'t go on', 'take my own life', 'ending it all',
            'want to die', 'don\'t want to be here', 'give up', 'hopeless',
            'self harm', 'cut myself', 'hurt myself', 'overdose', 'pills',
            'worthless', 'burden', 'no purpose', 'no point', 'saying goodbye',
            'last post', 'final note', 'throwaway account', 'final post', 
            'delete this later', 'not going to respond', 'just needed to say this'
        ]
        result_df = result_df.drop(['crisis_score'], axis=1)
        
    # Saving crisis_terms
    with open('crisis_terms.txt', 'w') as f:
        f.write(str(crisis_terms))
    # Apply to each post in parallel
    find_terms = partial(find_crisis_terms, terms=crisis_terms)
    
    with ProcessPoolExecutor(max_workers=multiprocessing.cpu_count()) as executor:
        high_risk_terms = list(tqdm(
            executor.map(find_terms, result_df['combined_text']),
            total=len(result_df),
            desc="Finding crisis terms in posts"
        ))
    
    result_df['high_risk_terms'] = high_risk_terms
    
    # Drop temporary columns
    result_df = result_df.drop('combined_text', axis=1)
    
    return result_df

# Interactive HTML Visualizations with Plotly
def create_interactive_visualizations(df, output_dir='.', method:str='bert'):
    os.makedirs(output_dir, exist_ok=True)
    
    # Sentiment distribution
    sentiment_counts = df['sentiment'].value_counts().reset_index()
    sentiment_counts.columns = ['Sentiment', 'Count']
    
    fig1 = px.bar(
        sentiment_counts, 
        x='Sentiment', 
        y='Count',
        color='Sentiment',
        title='Distribution of Posts by Sentiment',
        color_discrete_map={'Positive': 'green', 'Neutral': 'gray', 'Negative': 'red'}
    )
    
    fig1.update_layout(
        xaxis_title='Sentiment',
        yaxis_title='Number of Posts',
        legend_title='Sentiment'
    )
    
    fig1.write_html(f"{output_dir}/sentiment_{method}distribution.html")
    
    # Sentiment by risk level
    fig2 = px.histogram(
        df, 
        x='risk_level',
        color='sentiment',
        barmode='group',
        title='Distribution of Posts by Risk Level and Sentiment',
        color_discrete_map={'Positive': 'green', 'Neutral': 'gray', 'Negative': 'red'}
    )
    
    fig2.update_layout(
        xaxis_title='Risk Level',
        yaxis_title='Number of Posts',
        legend_title='Sentiment'
    )
    
    fig2.write_html(f"{output_dir}/sentiment_{method}_by_risk.html")

    if 'crisis_score' in df.columns:
        # Crisis score by risk level
        fig3 = px.box(
            df,
            x='risk_level',
            y='crisis_score',
            color='risk_level',
            title='Crisis Scores by Risk Level'
        )
        
        fig3.update_layout(
            xaxis_title='Risk Level',
            yaxis_title='Crisis Score'
        )
        
        fig3.write_html(f"{output_dir}/crisis_score_{method}_by_risk.html")
    
    # Heatmap of sentiment vs risk level
    crosstab = pd.crosstab(df['risk_level'], df['sentiment'])
    
    fig4 = px.imshow(
        crosstab,
        text_auto=True,
        aspect="auto",
        title='Heatmap of Risk Level vs Sentiment',
        color_continuous_scale='Viridis'
    )
    
    fig4.update_layout(
        xaxis_title='Sentiment',
        yaxis_title='Risk Level'
    )
    
    fig4.write_html(f"{output_dir}/sentiment_risk_{method}_heatmap.html")
    
    # Top high-risk terms
    all_terms = []
    for terms in df['high_risk_terms']:
        all_terms.extend(terms)
    
    if all_terms:
        term_counts = pd.Series(all_terms).value_counts().reset_index()
        term_counts.columns = ['Term', 'Frequency']
        term_counts = term_counts.sort_values('Frequency', ascending=False).head(15)
        
        fig5 = px.bar(
            term_counts,
            x='Term',
            y='Frequency',
            title='Top 15 High-Risk Terms',
            color='Frequency',
            color_continuous_scale='Reds'
        )
        
        fig5.update_layout(
            xaxis_title='Term',
            yaxis_title='Frequency',
            xaxis={'categoryorder':'total descending'}
        )
    else:
        # Create empty figure if no terms
        fig5 = go.Figure()
        fig5.update_layout(
            title="No High-Risk Terms Detected",
            xaxis_title="Term",
            yaxis_title="Frequency"
        )
    
    fig5.write_html(f"{output_dir}/top_crisis_{method}_terms.html")
    
    # Create dashboard
    from plotly.subplots import make_subplots
    if 'crisis_score'in df.columns:
        fig = make_subplots(
            rows=2, 
            cols=3,
            subplot_titles=(
                "Sentiment Distribution",
                "Risk Level vs Sentiment",
                "Crisis Scores by Risk Level",
                "Risk Level vs Sentiment Heatmap",
                "Top High-Risk Terms"
            )
        )
        
        # Add traces to subplots
        for trace in fig1.data:
            fig.add_trace(trace, row=1, col=1)
        
        for trace in fig2.data:
            fig.add_trace(trace, row=1, col=2)
    
        for trace in fig3.data:
            fig.add_trace(trace, row=1, col=3)
            
        for trace in fig4.data:
            fig.add_trace(trace, row=2, col=1)
            
        for trace in fig5.data:
            fig.add_trace(trace, row=2, col=2)
        
        # Update layout
        fig.update_layout(
            title_text="Reddit Post Analysis Dashboard",
            height=900,
            width=1500,
            showlegend=False
        )
        
        fig.write_html(f"{output_dir}/dashboard_{method}_.html")
        
        return fig1, fig2, fig3, fig4, fig5

    else:
        fig = make_subplots(
            rows=2, 
            cols=2,
            subplot_titles=(
                "Sentiment Distribution",
                "Risk Level vs Sentiment",
                "Risk Level vs Sentiment Heatmap",
                "Top High-Risk Terms"
            )
        )
        
        # Add traces to subplots
        for trace in fig1.data:
            fig.add_trace(trace, row=1, col=1)
        
        for trace in fig2.data:
            fig.add_trace(trace, row=1, col=2)
            
        for trace in fig4.data:
            fig.add_trace(trace, row=2, col=1)
            
        for trace in fig5.data:
            fig.add_trace(trace, row=2, col=2)
        
        # Update layout
        fig.update_layout(
            title_text="Reddit Post Analysis Dashboard",
            height=900,
            width=1500,
            showlegend=False
        )
        
        fig.write_html(f"{output_dir}/dashboard_{method}_.html")
        return fig1, fig2, fig4, fig5

# Main execution function
def process_reddit_data(df, output_dir='.', method='bert', batch_size=32, num_gpus=None):
    """
    Process Reddit data with parallel sentiment analysis and crisis term detection.
    
    Args:
        df: DataFrame containing Reddit posts
        output_dir: Directory to save HTML plots
        method: Detection method - 'bert' or 'word2vec'
        batch_size: Batch size for processing
        num_gpus: Number of GPUs to use (None = auto-detect)
    
    Returns:
        Processed DataFrame with sentiment and crisis term data
    """
    # Auto-detect GPU count if not specified
    if num_gpus is None:
        num_gpus = torch.cuda.device_count()
    print(f"Using {num_gpus} GPUs for processing")
    
    workers = multiprocessing.cpu_count()
    print(f"Using {workers} CPU cores for parallel processing")
    
    # Step 1: Parallel Sentiment Analysis
    print("Performing parallel sentiment analysis...")
    df_with_sentiment = analyze_sentiment_vader_parallel(df, n_jobs=workers)
    
    print("Detecting high-risk crisis terms with multi-GPU BERT...")
    final_df = detect_crisis_terms_bert_parallel(
        df_with_sentiment, 
        batch_size=batch_size,
        num_gpus=num_gpus
    )
    
    # Step 3: Create and save interactive visualizations
    print("Creating interactive visualizations...")
    create_interactive_visualizations(final_df, output_dir, method)
    
    # Print summary statistics
    print("\nSentiment Distribution:")
    print(final_df['sentiment'].value_counts())
    
    print("\nSentiment by Risk Level:")
    print(pd.crosstab(final_df['risk_level'], final_df['sentiment']))
    

    all_terms = []
    for terms in final_df['high_risk_terms']:
        all_terms.extend(terms)
    
    if all_terms:
        print("\nTop 20 Detected Crisis Terms:")
        print(pd.Series(all_terms).value_counts().head(20))
    
    # Sample high-risk posts with negative sentiment
    high_risk_negative = final_df[
        (final_df['risk_level'] == 'High-Risk') & 
        (final_df['sentiment'] == 'Negative')
    ]
    
    print(f"\nTop 5 High-Risk Negative Posts (by crisis score):")
    if len(high_risk_negative) > 0:
        for idx, row in high_risk_negative.head().iterrows():
            print(f"Post ID: {row['post_id']}")
            print(f"Title: {row['title']}")
            print(f"Risk Terms: {row['high_risk_terms']}")
            print(f"Sentiment Score: {row['sentiment_score']:.4f}")
            print("-" * 50)
    
    return final_df

Writing scripts/sentiment_classification.py


In [None]:
import pandas as pd

# Load your data
df = pd.read_csv('mental_health_postsV1_classified.csv')

# this line ran the above process, uncomment to run it
# processed_df = process_reddit_data(df, method='bert', output_dir='results_bert')
# processed_df.to_csv('crisis_terms_processed.csv')

"""You can just refer plots"""

'You can just refer dashboard_bert.html for plots'

## Conclusions
- Moderate Concern takes the majority of posts > Low Concern> High Concern
- Negative Posts > Positive Posts

# Task-3

**Extract Geographical data**

## Method - 1

using standard nlp techniques

In [2]:
%%writefile scripts/geographical_extract_nlp.py

import pandas as pd
import numpy as np
import spacy
import folium
from folium.plugins import HeatMap
import plotly.express as px
from collections import Counter
import re
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderUnavailable
import time
from tqdm import tqdm

# Load spaCy model for NLP-based place recognition
nlp = spacy.load("en_core_web_lg")

def extract_locations(text):
    """Extract location entities from text using spaCy"""
    if not isinstance(text, str):
        return []
    
    doc = nlp(text)
    locations = []
    
    # Extract GPE (Geo-Political Entity) and LOC (Location) entities
    for ent in doc.ents:
        if ent.label_ in ["GPE", "LOC"]:
            locations.append(ent.text)
    
    return locations

def geocode_location(location_name, geolocator):
    """Convert location name to coordinates using geocoding"""
    try:
        # Add 'USA' to improve geocoding accuracy for US locations
        location = geolocator.geocode(location_name, exactly_one=True) # removed US bias, let's see results
        if location is None:
            location = geolocator.geocode(location_name, exactly_one=True)
        
        if location:
            return {
                'location': location_name,
                'lat': location.latitude,
                'lon': location.longitude,
                'address': location.address
            }
        return None
    except (GeocoderTimedOut, GeocoderUnavailable):
        # Handle timeout errors by waiting and retrying once
        time.sleep(1)
        try:
            location = geolocator.geocode(location_name, exactly_one=True)
            if location:
                return {
                    'location': location_name,
                    'lat': location.latitude,
                    'lon': location.longitude,
                    'address': location.address
                }
            return None
        except (GeocoderTimedOut, GeocoderUnavailable):
            return None

def process_dataset(df:pd.DataFrame):
    """Process the dataset to extract and geocode locations"""
    print("Extracting locations from posts...")
    
    # Initialize a geolocator with a custom user agent
    geolocator = Nominatim(user_agent="crisis_mapping_tool")
    
    # Combine title and content for better location extraction
    df['full_text'] = df['title'].fillna('') + ' ' + df['content'].fillna('')
    
    # Extract locations from text
    df['extracted_locations'] = df['full_text'].apply(extract_locations)
    
    df.to_csv('mental_health_postsV1_extracted_unbiased_locations.csv', index=False)
    # Flatten the list of locations and count occurrences
    all_locations = []
    for locations in df['extracted_locations']:
        all_locations.extend(locations)
    
    location_counts = Counter(all_locations)
    
    # Get the top 50 locations for geocoding (to avoid excessive API calls)
    top_locations = location_counts.most_common(50)
    
    print(f"Top 10 most mentioned locations: {top_locations[:10]}")
    
    # Geocode the top locations
    geocoded_locations = {}
    print("Geocoding top locations...")
    for location_name, count in tqdm(top_locations):
        if location_name not in geocoded_locations:
            geo_info = geocode_location(location_name, geolocator)
            if geo_info:
                geo_info['count'] = count
                geocoded_locations[location_name] = geo_info
            time.sleep(1)  # Be nice to the open geocoding service :)
    
    return geocoded_locations, df

def create_folium_heatmap(geocoded_locations, output_file="crisis_heatmap.html"):
    """Create a Folium heatmap of crisis locations"""
    # Create a list of [lat, lon, weight] for heatmap
    heatmap_data = []
    for loc in geocoded_locations.values():
        # Use count as weight
        for _ in range(loc['count']):
            random_lat = loc['lat'] + np.random.normal(0, 0.01)
            random_lon = loc['lon'] + np.random.normal(0, 0.01)
            heatmap_data.append([random_lat, random_lon, 1])
    
    # Create base map centered on the USA
    m = folium.Map(location=[39.8283, -98.5795], zoom_start=4)
    
    # Add heatmap layer
    HeatMap(heatmap_data).add_to(m)
    
    # Add markers for top 5 locations
    top_5_locations = sorted(geocoded_locations.values(), key=lambda x: x['count'], reverse=True)[:5]
    
    for loc in top_5_locations:
        folium.Marker(
            location=[loc['lat'], loc['lon']],
            popup=f"{loc['location']}: {loc['count']} mentions",
            tooltip=loc['location']
        ).add_to(m)
    
    # Save map
    m.save(output_file)
    print(f"Folium heatmap saved to {output_file}")
    
    return top_5_locations

def create_plotly_heatmap(geocoded_locations, df, output_file="crisis_plotly_map.html"):
    """Create a Plotly choropleth map of crisis locations"""
    # Create a dataframe for plotting
    top_locations_df = pd.DataFrame([
        {
            'location': loc['location'],
            'lat': loc['lat'],
            'lon': loc['lon'],
            'count': loc['count']
        } for loc in geocoded_locations.values()
    ])
    
    # Create scatter mapbox
    fig = px.scatter_mapbox(
        top_locations_df,
        lat="lat",
        lon="lon",
        size="count",
        color="count",
        hover_name="location",
        size_max=25,
        zoom=3,
        mapbox_style="carto-positron",
        title="Crisis Mentions by Location",
        color_continuous_scale=px.colors.sequential.Inferno,
    )
    
    fig.update_layout(
        margin={"r": 0, "t": 30, "l": 0, "b": 0},
        coloraxis_colorbar=dict(title="Post Count")
    )
    
    # Save to HTML
    fig.write_html(output_file)
    print(f"Plotly visualization saved to {output_file}")
    
    return top_locations_df

def analyze_crisis_by_location(df, geocoded_locations):
    """Analyze crisis patterns by location"""
    # Create a mapping of each post to its detected locations
    post_locations = {}
    for idx, row in df.iterrows():
        for loc in row['extracted_locations']:
            if loc in geocoded_locations:
                if idx not in post_locations:
                    post_locations[idx] = []
                post_locations[idx].append(loc)
    
    # For posts with locations, analyze risk level patterns
    location_risk_levels = {}
    for idx, locations in post_locations.items():
        risk_level = df.loc[idx, 'risk_level']
        for loc in locations:
            if loc not in location_risk_levels:
                location_risk_levels[loc] = []
            location_risk_levels[loc].append(risk_level)
    
    # Calculate average risk level by location
    location_avg_risk = {}
    for loc, risks in location_risk_levels.items():
        # Convert any risk levels to numeric if they're not already
        numeric_risks = []
        for risk in risks:
            if isinstance(risk, (int, float)):
                numeric_risks.append(risk)
            elif isinstance(risk, str) and risk.replace('.', '', 1).isdigit():
                numeric_risks.append(float(risk))
        
        if numeric_risks:  # Only calculate if we have valid numeric risk values
            location_avg_risk[loc] = sum(numeric_risks) / len(numeric_risks)
    
    # Sort and return top locations by average risk
    sorted_locations = sorted(location_avg_risk.items(), key=lambda x: x[1], reverse=True)
    return sorted_locations

def main(data_file_path):
    """Main function to process data and generate visualizations"""
    # Load the dataset
    print(f"Loading data from {data_file_path}")
    df = pd.read_csv(data_file_path)
    
    print(f"Dataset loaded with {len(df)} posts")
    print(f"Columns: {df.columns.tolist()}")
    
    # Process dataset to extract and geocode locations
    geocoded_locations, processed_df = process_dataset(df)
    
    # Create heatmap visualizations
    top_5_locations = create_folium_heatmap(geocoded_locations, output_file="plots_png/crisis_heatmap_nlp_geoExtracted.html")
    top_locations_df = create_plotly_heatmap(geocoded_locations, processed_df, output_file="crisis_plotly_map_unbiased.html")
    
    # Analyze crisis patterns by location
    location_risk_analysis = analyze_crisis_by_location(processed_df, geocoded_locations)
    
    # Print top 5 locations with highest crisis discussions
    print("\nTop 5 locations with highest crisis discussions:")
    for i, (loc_name, loc_data) in enumerate(sorted(
            geocoded_locations.items(), 
            key=lambda x: x[1]['count'], 
            reverse=True
        )[:5]):
        print(f"{i+1}. {loc_name}: {loc_data['count']} mentions")
    
    # Print top 5 locations with highest average risk level
    if location_risk_analysis:
        print("\nTop 5 locations with highest average risk level:")
        for i, (loc, avg_risk) in enumerate(location_risk_analysis[:5]):
            print(f"{i+1}. {loc}: {avg_risk:.2f} average risk")
    
    print("\nAnalysis complete!")
    print("- Crisis heatmap saved as 'crisis_heatmap_unbiased.html'")
    print("- Plotly visualization saved as 'crisis_plotly_map_unbiased.html'")

Writing scripts/geographical_extract_nlp.py


In [None]:
# Uncomment to run the above process
# main(data_file_path='mental_health_postsV1_classified.csv')

Loading data from mental_health_postsV1_classified.csv
Dataset loaded with 104483 posts
Columns: ['post_id', 'timestamp', 'title', 'content', 'subreddit', 'ups', 'num_comments', 'url', 'risk_level']
Extracting locations from posts...
Top 10 most mentioned locations: [('uk', 977), ('us', 394), ('canada', 318), ('california', 296), ('va', 286), ('europe', 239), ('nc', 230), ('UK', 222), ('emdr', 219), ('florida', 210)]
Geocoding top locations...


100%|██████████| 50/50 [01:05<00:00,  1.31s/it]

*scatter_mapbox* is deprecated! Use *scatter_map* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/



Folium heatmap saved to crisis_heatmap_unbiased.html
Plotly visualization saved to crisis_plotly_map_unbiased.html

Top 5 locations with highest crisis discussions:
1. uk: 977 mentions
2. us: 394 mentions
3. canada: 318 mentions
4. california: 296 mentions
5. va: 286 mentions

Analysis complete!
- Crisis heatmap saved as 'crisis_heatmap.html'
- Plotly visualization saved as 'crisis_plotly_map.html'


In [7]:
df[df['extracted_location'].str.len() > 2]

Unnamed: 0,post_id,timestamp,title,content,subreddit,ups,num_comments,url,risk_level,text_for_location,extracted_location
0,1ek7px9,2024-08-05T03:54:23,I want to go off my meds for my wedding,title pretty much covers getting married next ...,bipolar1,0,12,https://www.reddit.com/r/bipolar1/comments/1ek...,Moderate Concern,I want to go off my meds for my wedding title ...,vega
1,1ek7px9,2024-08-05T03:54:23,I want to go off my meds for my wedding,title pretty much covers getting married next ...,bipolar1,0,12,https://www.reddit.com/r/bipolar1/comments/1ek...,Moderate Concern,I want to go off my meds for my wedding title ...,vega
3,1att0dy,2024-02-18T17:57:37,New to Abilify,hi started abilify im already lithium lamictal...,bipolar1,4,2,https://www.reddit.com/r/bipolar1/comments/1at...,Moderate Concern,New to Abilify hi started abilify im already l...,home depot
4,1869tyv,2023-11-29T04:38:19,I was diagnosed bipolar 1 now year later I’m g...,uhm anyone else feel like world inside glass c...,bipolar1,2,1,https://www.reddit.com/r/bipolar1/comments/186...,Moderate Concern,I was diagnosed bipolar 1 now year later I’m g...,california
5,17rceyh,2023-11-09T18:30:56,Feeling isolated from my disorders,feel like theres no one understands im going e...,bipolar1,1,5,https://www.reddit.com/r/bipolar1/comments/17r...,Moderate Concern,Feeling isolated from my disorders feel like t...,psychiatric hospital
...,...,...,...,...,...,...,...,...,...,...,...
15859,1ieoubh,2025-02-01T02:15:26,Destroyed my dating life with manic tattoos,destroyed dating life horrible manic tattoos p...,bipolar1,7,14,https://www.reddit.com/gallery/1ieoubh,Moderate Concern,Destroyed my dating life with manic tattoos de...,switzerland
15862,1gkk70l,2024-11-06T04:40:38,never ending depression.,feel alone many good things appreciative grate...,bipolar1,8,4,https://www.reddit.com/r/bipolar1/comments/1gk...,Moderate Concern,never ending depression. feel alone many good...,baseball stadium
15863,1gffyt9,2024-10-30T11:39:12,BUSPAR,anyone prescribed buspar bipolar related anxie...,bipolar1,2,8,https://www.reddit.com/r/bipolar1/comments/1gf...,Moderate Concern,BUSPAR anyone prescribed buspar bipolar relat...,BUSPAR
15864,1fjt5qc,2024-09-18T19:14:13,My bipolar symptoms History based on what I ca...,first manic episode happened grade final year ...,bipolar1,5,0,https://www.reddit.com/r/bipolar1/comments/1fj...,Low Concern,My bipolar symptoms History based on what I ca...,america


## Method-2

using roberta-large-ner-english

In [None]:
%%writefile scripts/geographic_extraction_with_roberta.py
import pandas as pd
import folium
from folium.plugins import HeatMap
import plotly.express as px
import requests
import time
import plotly.graph_objects as go
from tqdm import tqdm
from collections import Counter
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
import concurrent.futures
import os

# Create NER pipeline on specified device
def create_ner_pipeline(device_id=None):
    device = f"cuda:{device_id}" if device_id is not None else "cpu"
    
    tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/roberta-large-ner-english")
    model = AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/roberta-large-ner-english")
    
    model.to(device)
    
    return pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple", device=device)

def extract_locations_with_transformer(text, ner_pipeline):
    """
    Use a transformer-based NER model to identify location mentions in text.
    Returns a list of location names found in the text.
    """
    if not text or pd.isna(text) or text.strip() == "":
        return []
    
    try:
        # Extract named entities
        results = ner_pipeline(text)
        # Filter for location entities (LOC or GPE)
        locations = []
        for entity in results:
            if entity['entity_group'] in ['LOC', 'GPE']:
                # Clean up the location string
                location = entity['word'].strip()
                if location:
                    locations.append(location)
        return locations
    
    except Exception as e:
        print(f"Error in NER extraction: {e}")
        return []

def process_batch(batch_df, device_id=None):
    # Create pipeline on appropriate device
    nlp = create_ner_pipeline(device_id)
    
    batch_results = []
    
    for _, row in batch_df.iterrows():
        # Extract locations from the combined text
        locations = extract_locations_with_transformer(row['text_for_location'], nlp)
        
        # Add each location as a separate entry with all original row data
        for loc in locations:
            # Create a copy of the original row
            result_row = row.copy()
            # Add the location
            result_row['extracted_location'] = loc
            # Add to results
            batch_results.append(result_row)
    
    return batch_results

# Function to geocode locations using a geocoding API
def geocode_location(location_name):
    """
    Convert location name to latitude and longitude using a geocoding API.
    Returns (lat, lng) tuple or None if geocoding fails.
    """
    try:
        base_url = "https://nominatim.openstreetmap.org/search"
        params = {
            "q": location_name,
            "format": "json",
            "limit": 1
        }
        headers = {
            "User-Agent": "CrisisLocationAnalysis/1.0"
        }
        
        response = requests.get(base_url, params=params, headers=headers)
        
        if response.status_code == 200:
            results = response.json()
            if results:
                lat = float(results[0]["lat"])
                lon = float(results[0]["lon"])
                return (lat, lon)
        
        return None
    
    except Exception as e:
        print(f"Geocoding error for {location_name}: {e}")
        return None

def map_risk_level_to_numeric(risk_text):
    # Define mapping of text values to numeric scores
    risk_mapping = {
        'Low Concern': 1.0,
        'Moderate Concern': 2.5,
        'High-Risk': 5.0,
        'Unclassified': 0.0  # Assign a small value for unclassified
    }
    
    # Return the mapped value or a default if not found
    return risk_mapping.get(risk_text, 0.5)
    
def main():
    # Load the dataset
    file_path = "mental_health_postsV1_classified.csv"
    df = pd.read_csv(file_path)
    
    # Combine title and content for better location extraction
    df['text_for_location'] = df['title'].fillna('') + ' ' + df['content'].fillna('')
    
    # Check for GPU availability
    gpu_available = torch.cuda.is_available()
    num_gpus = torch.cuda.device_count() if gpu_available else 0
    
    if gpu_available:
        print(f"GPU processing enabled! Found {num_gpus} GPU devices.")
        device_type = "GPU"
    else:
        print("No GPUs found. Using CPU processing.")
        device_type = "CPU"
        num_gpus = 0
    
    # Determine workers based on available hardware
    if gpu_available:
        # Use GPU processing - create one batch per GPU
        num_workers = num_gpus
    else:
        # Use CPU processing - create batches based on CPU cores
        num_workers = os.cpu_count() or 4
    
    # Create batches
    batch_size = max(1, len(df) // max(1, num_workers))
    batches = [df.iloc[i:i+batch_size] for i in range(0, len(df), batch_size)]
    print(f"Using {num_workers} {device_type} workers with batch size {batch_size}")
    
    # Extract locations using parallel processing
    print("Extracting locations using parallel processing...")
    all_results = []
    
    if gpu_available:
        # GPU processing using ThreadPoolExecutor
        with concurrent.futures.ThreadPoolExecutor(max_workers=num_gpus) as executor:
            futures = []
            for i, batch in enumerate(batches):
                # Assign each batch to a GPU (cycling if more batches than GPUs)
                gpu_id = i % num_gpus
                future = executor.submit(process_batch, batch, gpu_id)
                futures.append(future)
            
            # Collect results as they complete
            for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
                try:
                    batch_results = future.result()
                    all_results.extend(batch_results)
                except Exception as e:
                    print(f"Error in batch processing: {e}")
    else:
        # CPU processing using ProcessPoolExecutor
        with concurrent.futures.ProcessPoolExecutor(max_workers=num_workers) as executor:
            futures = []
            for batch in batches:
                future = executor.submit(process_batch, batch)
                futures.append(future)
            
            # Collect results as they complete
            for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
                try:
                    batch_results = future.result()
                    all_results.extend(batch_results)
                except Exception as e:
                    print(f"Error in batch processing: {e}")
    
    # Create a dataframe from all results
    if all_results:
        locations_df = pd.DataFrame(all_results)
    else:
        print("No locations found!")
        return
    
    # Save the complete dataset with extracted locations
    locations_df.to_csv("crisis_locations_extracted.csv", index=False)
    print(f"Saved {len(locations_df)} entries with location data to crisis_locations_extracted.csv")

    # # Count occurrences of each location
    location_counts = Counter(locations_df['extracted_location'])
    
    # Geocode each unique location
    print("Geocoding unique locations...")
    geocoded_locations = {}
    unique_locations = list(location_counts.keys())
    
    for loc in tqdm(unique_locations):
        if loc not in geocoded_locations:
            coords = geocode_location(loc)
            if coords:
                geocoded_locations[loc] = coords
            time.sleep(1)  # Respect API rate limits
    
    # Add geocoded coordinates to the locations dataframe
    locations_df['coordinates'] = locations_df['extracted_location'].map(lambda x: geocoded_locations.get(x, None))
    locations_df = locations_df.dropna(subset=['coordinates'])
    
    # Extract latitude and longitude from coordinates
    locations_df['latitude'] = locations_df['coordinates'].map(lambda x: x[0] if x else None)
    locations_df['longitude'] = locations_df['coordinates'].map(lambda x: x[1] if x else None)
    
    # Save the geocoded data
    locations_df.to_csv("crisis_locations_geocoded.csv", index=False)
    print(f"Saved {len(locations_df)} geocoded entries to crisis_locations_geocoded.csv")

    # this is the data we saved two lines above
    # locations_df = pd.read_csv("/kaggle/input/issr-reddit-locations-data/crisis_locations_geocoded.csv")

    # Calculate total risk per location (if risk_level exists in the dataset)
    geocoded_locations = {}

    # Iterate through unique locations in the dataframe
    for _, row in locations_df.drop_duplicates(subset=['extracted_location']).iterrows():
        if pd.notna(row['latitude']) and pd.notna(row['longitude']):
            # Store the coordinates as a tuple (lat, lng)
            geocoded_locations[row['extracted_location']] = (row['latitude'], row['longitude'])
    
    print(f"Built dictionary with {len(geocoded_locations)} unique locations")
    
    if 'risk_level' in locations_df.columns:

        print("Checking location names...")
        if any(loc.count('Unclassified') > 1 for loc in locations_df['extracted_location'].unique() if isinstance(loc, str)):
            print("Detected corrupted location names, fixing...")
        
            # Fix the corrupted location names
            def clean_location_name(name):
                if isinstance(name, str):
                    if name.count('Unclassified') > 1:
                        return 'Unclassified'
                    if name.count('Moderate Concern') > 1:
                        return 'Moderate Concern'
                    if name.count('High-Risk') > 1:
                        return 'High-Risk'
                    if name.count('Low Concern') > 1:
                        return 'Low Concern'
                return name
            
            # Apply the cleaning function
            locations_df['clean_location'] = locations_df['extracted_location'].apply(clean_location_name)
            # Use the cleaned column for visualization
            locations_df = locations_df.rename(columns={'extracted_location': 'original_location', 'clean_location': 'extracted_location'})
            
            # Update the geocoded_locations dictionary with the cleaned names
            new_geocoded_locations = {}
            for loc, coords in geocoded_locations.items():
                clean_loc = clean_location_name(loc)
                new_geocoded_locations[clean_loc] = coords
            geocoded_locations = new_geocoded_locations

        # Prepare data for heatmap
        locations_df['numeric_risk'] = locations_df['risk_level'].apply(map_risk_level_to_numeric)
        
        location_risk = locations_df.groupby('extracted_location')['numeric_risk'].sum().reset_index()
        location_risk = location_risk.sort_values('numeric_risk', ascending=False)
        
        # Get top 5 locations with highest crisis discussions
        top_5_locations = location_risk.head(5)
        print("\nTop 5 locations with highest crisis discussions:")
        print(top_5_locations)
        
        # Create heatmap using Folium
        print("Creating heatmap visualization...")
        m = folium.Map(location=[39.50, -98.35], zoom_start=4)
        
        # Prepare data for heatmap - ensure all values are numeric
        heat_data = []
        for _, row in locations_df.iterrows():
            try:
                lat = float(row['latitude'])
                lng = float(row['longitude'])
                risk = float(row['numeric_risk'])
                
                # Only add if all values are valid numbers
                if not (pd.isna(lat) or pd.isna(lng) or pd.isna(risk)):
                    heat_data.append([lat, lng, risk])
            except (ValueError, TypeError):
                continue
        
        # Add heatmap to the map if we have valid data
        if heat_data:
            import numpy as np
            heat_data_array = np.array(heat_data)
            HeatMap(heat_data_array, radius=15).add_to(m)
        else:
            print("Warning: No valid data for heatmap")
        
        # Add markers for top 5 locations with cleaner labels
        for _, row in top_5_locations.iterrows():
            if row['extracted_location'] in geocoded_locations:
                coords = geocoded_locations[row['extracted_location']]
                # Get the original risk text for this location (first occurrence)
                original_risk = locations_df[locations_df['extracted_location'] == row['extracted_location']]['risk_level'].iloc[0]
                
                # Create a cleaner popup
                popup_html = f"""
                <div style="font-family: Arial, sans-serif; padding: 5px;">
                    <h4>{row['extracted_location']}</h4>
                    <p>Risk Level: {original_risk}</p>
                    <p>Risk Score: {row['numeric_risk']:.1f}</p>
                </div>
                """
                
                folium.Marker(
                    location=coords,
                    popup=folium.Popup(popup_html, max_width=300),
                    tooltip=row['extracted_location'],
                    icon=folium.Icon(color='red', icon='info-sign')
                ).add_to(m)
        
        m.save('crisis_heatmap.html')
        
        # Create a Plotly visualization for top locations
        fig = px.bar(
            top_5_locations,
            x='extracted_location',
            y='numeric_risk',
            title='Top 5 Locations with Highest Crisis Discussions',
            labels={'extracted_location': 'Location', 'numeric_risk': 'Crisis Risk Score'},
            text='numeric_risk'
        )
        
        fig.update_layout(
            xaxis_title="Location",
            yaxis_title="Crisis Risk Score",
            xaxis={'categoryorder': 'total descending'},  # Sort by highest risk
            font=dict(size=12),
            xaxis_tickangle=-45,
            margin=dict(l=50, r=50, t=80, b=100)
        )
        
        # Format the bar text
        fig.update_traces(
            texttemplate='%{text:.1f}',
            textposition='outside'
        )
        
        fig.write_html('top_locations.html')
    
    print("\nAnalysis complete! Files saved:")
    print("- crisis_locations_extracted.csv (all records with location data)")
    print("- crisis_locations_geocoded.csv (records with geocoded locations)")
    if 'risk_level' in locations_df.columns:
        print("- crisis_heatmap.html (interactive map)")
        print("- top_locations.html (bar chart of top locations)")

Writing scripts/geographic_extraction_with_roberta.py


## Conclusions
- robert-extracted more locations in 16K posts, using SpaCy : 10K posts
- GeoCoded coords are visually flawed in Method-2
- openstreetmap's lack proper semantic search, GoogleMapsAPI can be used to overcome this.

In [None]:
## Uncomment this and then cell to extract locations
# if __name__ == "__main__":
#     main()

# Some more Analysis and Plots

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime


"""This script generates various visualizations for the crisis terms dataset: crisis_terms_bert.csv.
We are analyzing the risk levels and sentiment scores over time, as well as the distribution of these metrics.

Consclusion infered via plots:
 - Their is more data for the recent year, so we can't infer that high-risk posts are increasing over year.
 - The sentiment score is not a good indicator of risk level. Needs to study their methods relevance.
 - There is a clear seasonal pattern in the risk levels, with slightly more high-risk-levels in later second-half of year."""
try:
    df = pd.read_csv("crisis_terms_bert.csv") # Provide proper path to the dataset
except FileNotFoundError:
    print("File not found. Please check the file path.")
    exit()

df['timestamp'] = pd.to_datetime(df['timestamp'])

# Extract year and month for analysis
df['year'] = df['timestamp'].dt.year
df['month'] = df['timestamp'].dt.month
df['year_month'] = df['timestamp'].dt.strftime('%Y-%m')

# 1. Monthly Average Risk Level Analysis
# Convert risk_level to numeric for averaging
risk_map = {'Low Concern': 1, 'Moderate Concern': 2, 'High-Risk': 3, 'Unclassified': 0}
df['risk_numeric'] = df['risk_level'].map(risk_map)

# Group by year and month
monthly_risk = df.groupby('year_month').agg({
    'risk_numeric': 'mean',
    'sentiment_score': 'mean',
    'timestamp': 'first'  # Keep first timestamp for proper ordering
}).reset_index()

# Sort by timestamp
monthly_risk = monthly_risk.sort_values('timestamp')

# 2. Risk Level Distribution by Month and Year
risk_counts_monthly = df.groupby(['year_month', 'risk_level']).size().reset_index(name='count')
risk_counts_yearly = df.groupby(['year', 'risk_level']).size().reset_index(name='count')

# 3. Sentiment Analysis Over Time
monthly_sentiment = df.groupby('year_month').agg({
    'sentiment_score': 'mean',
    'timestamp': 'first'
}).reset_index().sort_values('timestamp')

sentiment_counts_monthly = df.groupby(['year_month', 'sentiment']).size().reset_index(name='count')
sentiment_counts_yearly = df.groupby(['year', 'sentiment']).size().reset_index(name='count')

# 4. Combined Risk and Sentiment Analysis
combined_monthly = df.groupby('year_month').agg({
    'risk_numeric': 'mean',
    'sentiment_score': 'mean',
    'timestamp': 'first'
}).reset_index().sort_values('timestamp')

# 5. High Risk Terms Analysis
df['has_high_risk_terms'] = df['high_risk_terms'].apply(lambda x: 1 if len(x) > 0 else 0)
high_risk_monthly = df.groupby('year_month').agg({
    'has_high_risk_terms': 'sum',
    'timestamp': 'first'
}).reset_index().sort_values('timestamp')

# Create visualizations

# 1. Monthly Average Risk Level Trend
fig1 = px.line(monthly_risk, x='year_month', y='risk_numeric', 
              title='Average Risk Level Over Time',
              labels={'risk_numeric': 'Risk Level (1=Low, 4=Critical)', 'year_month': 'Month'})

fig1.update_layout(xaxis_tickangle=-45)

# 2. Risk Level Distribution Stacked Area Chart
fig2 = px.area(risk_counts_monthly, x='year_month', y='count', color='risk_level',
              title='Risk Level Distribution by Month',
              labels={'count': 'Number of Posts', 'year_month': 'Month', 'risk_level': 'Risk Level'},
              color_discrete_map={'low': 'green', 'medium': 'yellow', 'high': 'orange', 'critical': 'red'})

fig2.update_layout(xaxis_tickangle=-45)

# 3. Sentiment Score Trend
fig3 = px.line(monthly_sentiment, x='year_month', y='sentiment_score',
              title='Average Sentiment Score Over Time',
              labels={'sentiment_score': 'Sentiment Score (-1 to 1)', 'year_month': 'Month'})

fig3.update_layout(xaxis_tickangle=-45)

# 4. Sentiment Distribution
fig4 = px.area(sentiment_counts_monthly, x='year_month', y='count', color='sentiment',
              title='Sentiment Distribution by Month',
              labels={'count': 'Number of Posts', 'year_month': 'Month', 'sentiment': 'Sentiment'},
              color_discrete_map={'negative': 'red', 'neutral': 'gray', 'positive': 'green'})

fig4.update_layout(xaxis_tickangle=-45)

# 5. Combined Risk and Sentiment by Month
fig5 = make_subplots(specs=[[{"secondary_y": True}]])

fig5.add_trace(
    go.Scatter(x=combined_monthly['year_month'], y=combined_monthly['risk_numeric'], name="Risk Level"),
    secondary_y=False,
)

fig5.add_trace(
    go.Scatter(x=combined_monthly['year_month'], y=combined_monthly['sentiment_score'], name="Sentiment Score"),
    secondary_y=True,
)

fig5.update_layout(
    title_text="Risk Level vs Sentiment Score Over Time",
    xaxis_tickangle=-45
)

fig5.update_yaxes(title_text="Risk Level (1-4)", secondary_y=False)
fig5.update_yaxes(title_text="Sentiment Score (-1 to 1)", secondary_y=True)

# 6. Yearly Risk Level Distribution
fig6 = px.bar(risk_counts_yearly, x='year', y='count', color='risk_level', barmode='group',
             title='Risk Level Distribution by Year',
             labels={'count': 'Number of Posts', 'year': 'Year', 'risk_level': 'Risk Level'},
             color_discrete_map={'low': 'green', 'medium': 'yellow', 'high': 'orange', 'critical': 'red'})

# 7. Yearly Sentiment Distribution
fig7 = px.bar(sentiment_counts_yearly, x='year', y='count', color='sentiment', barmode='group',
             title='Sentiment Distribution by Year',
             labels={'count': 'Number of Posts', 'year': 'Year', 'sentiment': 'Sentiment'},
             color_discrete_map={'negative': 'red', 'neutral': 'gray', 'positive': 'green'})

# 8. High Risk Terms Occurrence Over Time
fig8 = px.bar(high_risk_monthly, x='year_month', y='has_high_risk_terms',
             title='Posts with High Risk Terms Over Time',
             labels={'has_high_risk_terms': 'Number of Posts', 'year_month': 'Month'})

fig8.update_layout(xaxis_tickangle=-45)

# Display the figures
# fig1.show()
# fig2.show()
# fig3.show()
# fig4.show()
# fig5.show()
# fig6.show()
# fig7.show()
# fig8.show()

# Create a dashboard layout
dashboard_fig = make_subplots(
    rows=4, cols=2,
    subplot_titles=(
        "Average Risk Level Over Time",
        "Risk Level Distribution by Month",
        "Average Sentiment Score Over Time",
        "Sentiment Distribution by Month",
        "Risk Level vs Sentiment Score Over Time",
        "Risk Level Distribution by Year",
        "Sentiment Distribution by Year",
        "Posts with High Risk Terms Over Time"
    ),
    vertical_spacing=0.1,
    horizontal_spacing=0.1
)

# Add each figure as a subplot
dashboard_fig.add_traces(fig1.data, rows=1, cols=1)
dashboard_fig.add_traces(fig2.data, rows=1, cols=2)
dashboard_fig.add_traces(fig3.data, rows=2, cols=1)
dashboard_fig.add_traces(fig4.data, rows=2, cols=2)
dashboard_fig.add_traces(fig5.data, rows=3, cols=1)
dashboard_fig.add_traces(fig6.data, rows=3, cols=2)
dashboard_fig.add_traces(fig7.data, rows=4, cols=1)
dashboard_fig.add_traces(fig8.data, rows=4, cols=2)

# Update layout for better visualization
dashboard_fig.update_layout(
    height=1200, width=1800,
    title_text="Dashboard: Risk and Sentiment Analysis",
    showlegend=False
)

# Display the dashboard
dashboard_fig.write_image("dashboard.png", scale=2)
dashboard_fig.write_html("dashboard.html", include_plotlyjs='cdn')

def create_risk_heatmap(df):
    
    # Ensure risk_level is properly converted to numeric
    if df['risk_level'].dtype == 'object':
        risk_map = {'Low Concern': 1, 'Moderate Concern': 2, 'High-Risk': 3, 'Unclassified': 0}
        df['risk_numeric'] = df['risk_level'].map(risk_map)

    # Extract numeric month for proper ordering
    df['month_num'] = df['timestamp'].dt.month

    # Create pivot table with proper ordering
    heatmap_data = df.pivot_table(
        values='risk_numeric', 
        index='month_num',  # Use numeric month for correct ordering
        columns='year', 
        aggfunc='mean'
    ).round(2)

    # Create month labels for y-axis
    month_names = {
        1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
        7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'
    }

    # Sort by month number to ensure correct order
    heatmap_data = heatmap_data.sort_index()

    # Create the heatmap
    fig_heatmap = px.imshow(
        heatmap_data,
        labels=dict(x="Year", y="Month", color="Avg Risk Level"),
        x=heatmap_data.columns,
        y=[month_names[i] for i in heatmap_data.index],
        title="Monthly Risk Level Heatmap",
        color_continuous_scale="RdYlGn_r"
    )

    # Add custom layout
    fig_heatmap.update_layout(
        height=600,
        width=800,
        xaxis={'side': 'bottom'},
        xaxis_title="Year",
        yaxis_title="Month"
    )

    # Fix axis display issues
    fig_heatmap.update_xaxes(tickangle=0)

    fig_heatmap.write_image("monthly_risk_heatmap.png", scale=2)
    fig_heatmap.write_html("monthly_risk_heatmap.html", include_plotlyjs='cdn')

    return fig_heatmap

def seasonal_analysis(df):
    if df['risk_level'].dtype == 'object':
        risk_map = {'Low Concern': 1, 'Moderate Concern': 2, 'High-Risk': 3, 'Unclassified': 0}
        df['risk_numeric'] = df['risk_level'].map(risk_map)

    # Add season column with better ordering
    season_order = {'Winter': 1, 'Spring': 2, 'Summer': 3, 'Fall': 4}
    df['season'] = df['timestamp'].dt.month.map({
        1: 'Winter', 2: 'Winter', 3: 'Spring', 
        4: 'Spring', 5: 'Spring', 6: 'Summer',
        7: 'Summer', 8: 'Summer', 9: 'Fall',
        10: 'Fall', 11: 'Fall', 12: 'Winter'
    })

    seasonal_risk = df.groupby(['year', 'season']).agg({
        'risk_numeric': 'mean'
    }).reset_index()

    fig_seasonal = px.line(
        seasonal_risk, 
        x='season', 
        y='risk_numeric', 
        color='year',
        title='Seasonal Risk Level Analysis',
        labels={'risk_numeric': 'Average Risk Level', 'season': 'Season'},
        category_orders={"season": ["Winter", "Spring", "Summer", "Fall"]}
    )

    fig_seasonal.update_layout(
        height=600,
        width=900,
        legend_title="Year",
        yaxis_range=[1, 4]
    )

    fig_seasonal.write_image("seasonal_risk_analysis.png", scale=2)
    fig_seasonal.write_html("seasonal_risk_analysis.html", include_plotlyjs='cdn')

    return fig_seasonal

fig_seasonal = seasonal_analysis(df)
fig_heatmap = create_risk_heatmap(df)
