# Mental Health Sentiment Analysis

This notebook fine-tunes DistilBERT (`distilbert-base-uncased`) to classify mental-health-related text, saves the trained model, and runs a few sample predictions.

## Setup and Installation

In [None]:
# Install required packages
!pip install -q transformers datasets scikit-learn pandas torch streamlit gdown

## Step 1: Download and Load Dataset

In [None]:
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Download dataset from Google Drive
# Dataset ID: 1B5QpclAWO_x78sYTx63Q05yv1kwc_hno
dataset_file = "rmhd_labeled_merged.csv"

# Check if running in Colab
is_colab = 'COLAB_GPU' in os.environ or 'google.colab' in str(get_ipython())

if is_colab:
    !pip install -q gdown
    !gdown 1B5QpclAWO_x78sYTx63Q05yv1kwc_hno -O {dataset_file}
    file_path = f"/content/{dataset_file}"
else:
    file_path = dataset_file

# Load and prepare the dataset
if not os.path.exists(file_path):
    raise FileNotFoundError(f"Dataset file not found at {file_path}. Please download it first.")

df = pd.read_csv(file_path)

# Handle different possible column names.
# Rename only when the canonical column is absent; renaming unconditionally
# would create two columns with the same name and break the selection below.
if "text" not in df.columns:
    for candidate in ("selftext", "Text"):
        if candidate in df.columns:
            df = df.rename(columns={candidate: "text"})
            break

if "label" not in df.columns and "Label" in df.columns:
    df = df.rename(columns={"Label": "label"})

# Fail early with a message that lists what was actually found.
missing_columns = [name for name in ("text", "label") if name not in df.columns]
if missing_columns:
    raise KeyError(
        f"Dataset is missing required column(s) {missing_columns}; "
        f"found columns: {list(df.columns)}"
    )

# Clean the dataset: drop missing values, force text to str so the .str
# accessor is always available, remove blank rows, and reset the index so it
# is contiguous again after filtering.
df = df[["text", "label"]].dropna()
df = df.assign(text=df["text"].astype(str))
df = df[df["text"].str.strip() != ""].reset_index(drop=True)

# Encode labels as integer ids in a new "labels" column.
# assign() builds a new frame instead of writing into a filtered one, which
# avoids pandas' SettingWithCopyWarning.
le = LabelEncoder()
df = df.assign(labels=le.fit_transform(df["label"]))

# tolist() converts NumPy scalars to native Python values, so the class names
# remain JSON-serialisable when the label map is written out later on.
label_names = le.classes_.tolist()
num_labels = len(label_names)

print("Dataset loaded successfully!")
print(f"Number of samples: {len(df)}")
print(f"Label names: {label_names}")
print(f"Number of labels: {num_labels}")
print("\nFirst few rows:")
print(df.head())

## Step 2: Convert to Hugging Face Dataset and Split

In [None]:
from datasets import Dataset

# Convert to Hugging Face Dataset.
# preserve_index=False keeps the pandas index (non-contiguous after the
# cleaning step) from being stored as an extra "__index_level_0__" column.
dataset = Dataset.from_pandas(df[["text", "labels"]], preserve_index=False)

# Train-test split (90% / 10%, fixed seed for reproducibility).
# NOTE: the result is named tokenized_dataset because the tokenization cell
# below maps over this variable in place; at this point it still holds raw text.
tokenized_dataset = dataset.train_test_split(test_size=0.1, seed=42)

print(f"Train samples: {len(tokenized_dataset['train'])}")
print(f"Test samples: {len(tokenized_dataset['test'])}")

## Step 3: Initialize Tokenizer and Model

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Use DistilBERT for faster training and inference
model_id = "distilbert-base-uncased"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Store the class names in the model config so the saved checkpoint is
# self-describing (its config reports real label names rather than LABEL_0,
# LABEL_1, ...). str() keeps the mapping serialisable for any label type.
id2label = {idx: str(name) for idx, name in enumerate(label_names)}
label2id = {name: idx for idx, name in id2label.items()}

# Load model with a freshly initialised classification head of num_labels outputs
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

print(f"Model and tokenizer loaded: {model_id}")

## Step 4: Tokenize Dataset

In [None]:
def tokenize_function(example):
    """Encode the ``"text"`` field of ``example`` with the notebook's tokenizer.

    Args:
        example: Mapping with a ``"text"`` entry (a single record or a batch,
            as supplied by ``Dataset.map``).

    Returns:
        Whatever the tokenizer returns for that text, padded and truncated to
        a fixed length of 256 tokens.
    """
    # Fixed-length padding: every encoded sequence has the same length.
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 256,
    }
    return tokenizer(example["text"], **encode_options)

# Encode both splits in batches, then drop the raw text column, which is no
# longer needed once the token ids exist.
tokenized_dataset = tokenized_dataset.map(
    tokenize_function,
    batched=True,
).remove_columns(["text"])

# Return the remaining columns as torch tensors when indexed
tokenized_dataset.set_format(type="torch")

print("Dataset tokenized successfully!")
first_train_example = tokenized_dataset["train"][0]
print(f"Sample tokenized data: {first_train_example}")

## Step 5: Training Configuration

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./results",
    logging_dir="./logs",
    # Schedule: three epochs, batches of 8 for both training and evaluation
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch, keeping a single checkpoint
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    # Reload the checkpoint selected by evaluation loss when training ends
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    # Log every 10 steps; no external experiment tracker
    logging_steps=10,
    report_to="none",
    # Reproducibility
    seed=42,
)

print("Training arguments configured!")

## Step 6: Train the Model

In [None]:
from transformers import Trainer

# Pick the two splits explicitly, then hand everything to the Trainer
train_split = tokenized_dataset["train"]
eval_split = tokenized_dataset["test"]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

# Run fine-tuning
print("Starting training...")
trainer.train()

print("Training completed!")

## Step 7: Save Model and Tokenizer

In [None]:
import json

output_dir = "./sentiment_model"

# Write the model and the tokenizer files into the same directory
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

# Persist the id -> label lookup next to the model for use at inference time
label_map = dict(enumerate(label_names))
with open(f"{output_dir}/label_map.json", "w") as f:
    f.write(json.dumps(label_map, indent=2))

print(f"Model, tokenizer, and label map saved to {output_dir}")

## Step 8: Test the Model

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import json

# Reload everything from disk, as a separate inference script would
output_dir = "./sentiment_model"
loaded_tokenizer = AutoTokenizer.from_pretrained(output_dir)
loaded_model = AutoModelForSequenceClassification.from_pretrained(output_dir)

# Read the id -> label lookup; JSON object keys come back as strings
with open(f"{output_dir}/label_map.json", "r") as f:
    label_map = json.loads(f.read())

def predict_sentiment(text):
    """
    Predicts the sentiment of one or more texts using the loaded model.

    Args:
        text (str | list[str]): A single input text, or a list/iterable of
            texts to classify as one batch.

    Returns:
        str | list[str]: The predicted label for a single text, or a list of
        predicted labels (one per input, in order) when a list was given.
        Ids missing from the label map yield "Unknown Label".
    """
    is_single = isinstance(text, str)
    batch = [text] if is_single else list(text)
    if not batch:
        # Nothing to classify; avoid calling the tokenizer with an empty batch.
        return []

    # Tokenize the input text(s); padding aligns sequences within the batch
    inputs = loaded_tokenizer(batch, padding=True, truncation=True, max_length=256, return_tensors="pt")

    # Perform inference without tracking gradients
    with torch.no_grad():
        outputs = loaded_model(**inputs)

    # Take the argmax per row (dim=-1). Without dim, argmax runs over the
    # flattened logits and is only correct when the batch holds one example.
    predicted_label_ids = torch.argmax(outputs.logits, dim=-1).tolist()

    # Map each label id back to its label string (JSON keys are strings)
    predictions = [
        label_map.get(str(label_id), "Unknown Label")
        for label_id in predicted_label_ids
    ]

    return predictions[0] if is_single else predictions

# A few hand-written inputs to sanity-check the reloaded model
test_texts = [
    "I am feeling very anxious and stressed about my exams.",
    "This is the best day of my life!",
    "I've been struggling with depression lately.",
]

print("Testing the model:")
print("=" * 50)
for text in test_texts:
    predicted_sentiment = predict_sentiment(text)
    # One print per example: input, prediction, then a separator line
    print(
        f"Text: '{text}'\n"
        f"Predicted Sentiment: {predicted_sentiment}\n"
        + "-" * 50
    )