# Mental Health Sentiment Analysis

This notebook trains a sentiment analysis model for mental health text using LLaMA 2.

## Setup and Installation

**Important:** The official LLaMA 2 checkpoints require a Hugging Face account and an access token.

1. Create an account at https://huggingface.co
2. Request access to LLaMA 2 at https://huggingface.co/meta-llama/Llama-2-7b-chat-hf (needed for the official `meta-llama` checkpoint; this notebook's default `model_id` is `NousResearch/Llama-2-7b-chat-hf`)
3. Create a token at https://huggingface.co/settings/tokens
4. In Google Colab: add the token as a secret named `HF_TOKEN`
5. Locally: set it as the `HF_TOKEN` environment variable or run `huggingface-cli login`

In [None]:
# Install required packages
!pip install -q transformers datasets scikit-learn pandas torch streamlit gdown accelerate bitsandbytes

## Step 1: Download and Load Dataset

In [None]:
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Download dataset from Google Drive
# Dataset ID: 1B5QpclAWO_x78sYTx63Q05yv1kwc_hno
dataset_file = "rmhd_labeled_merged.csv"

# Check if running in Colab
is_colab = 'COLAB_GPU' in os.environ or 'google.colab' in str(get_ipython())

if is_colab:
    !pip install -q gdown
    !gdown 1B5QpclAWO_x78sYTx63Q05yv1kwc_hno -O {dataset_file}
    file_path = f"/content/{dataset_file}"
else:
    file_path = dataset_file

# Load and prepare the dataset
if not os.path.exists(file_path):
    raise FileNotFoundError(f"Dataset file not found at {file_path}. Please download it first.")

df = pd.read_csv(file_path)

# Handle different possible column names
if 'selftext' in df.columns:
    df = df.rename(columns={"selftext": "text"})
elif 'Text' in df.columns:
    df = df.rename(columns={"Text": "text"})

if 'Label' in df.columns:
    df = df.rename(columns={"Label": "label"})

# Clean the dataset
df = df[["text", "label"]].dropna()
df = df[df['text'].str.strip() != '']  # Remove blank rows

# Encode labels
le = LabelEncoder()
df["labels"] = le.fit_transform(df["label"])
label_names = list(le.classes_)
num_labels = len(label_names)

print(f"Dataset loaded successfully!")
print(f"Number of samples: {len(df)}")
print(f"Label names: {label_names}")
print(f"Number of labels: {num_labels}")
print(f"\nFirst few rows:")
print(df.head())

## Step 2: Convert to Hugging Face Dataset

In [None]:
from datasets import Dataset

# Convert to Hugging Face Dataset.
# preserve_index=False stops the pandas index (non-contiguous once rows have
# been filtered out) from being stored as an extra `__index_level_0__` column.
dataset = Dataset.from_pandas(df[["text", "labels"]], preserve_index=False)

# Train-test split (90% train / 10% test, fixed seed for reproducibility).
# NOTE: despite its name, `tokenized_dataset` still holds raw text here; it is
# tokenized in Step 4. The name is kept because later cells refer to it.
tokenized_dataset = dataset.train_test_split(test_size=0.1, seed=42)

print(f"Train samples: {len(tokenized_dataset['train'])}")
print(f"Test samples: {len(tokenized_dataset['test'])}")

## Step 3: Setup Hugging Face Authentication

**Note:** LLaMA 2 requires authentication. Make sure you have:
- Accepted the model license on Hugging Face
- Created a token at https://huggingface.co/settings/tokens
- Added it to Colab Secrets (name: `HF_TOKEN`) or set as environment variable

In [None]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig
from huggingface_hub import login

# LLaMA 2 model ID (the Llama-2-7b-chat-hf checkpoint published under the
# NousResearch organisation on the Hugging Face Hub).
model_id = "NousResearch/Llama-2-7b-chat-hf"

# Setup Hugging Face authentication
is_colab = 'COLAB_GPU' in os.environ or 'google.colab' in str(get_ipython())

# Always defined, so later code can inspect it even if no token source exists.
hf_token = None

if is_colab:
    # Try to get token from Colab secrets
    try:
        from google.colab import userdata
    except ImportError:
        print("⚠️ Not in Colab environment. Please set HF_TOKEN environment variable or use login()")
    else:
        try:
            hf_token = userdata.get('HF_TOKEN')
        except Exception as err:
            # userdata.get() may raise (for example when the secret is missing
            # or the notebook was not granted access) rather than return a
            # falsy value. Authentication is best-effort here, so report the
            # problem and fall through to the "not found" guidance below.
            print(f"⚠️ Could not read HF_TOKEN from Colab secrets: {err!r}")
            hf_token = None

        if hf_token:
            login(token=hf_token)
            print("✓ Authenticated with Hugging Face using Colab secret")
        else:
            print("⚠️ HF_TOKEN not found in Colab secrets. Please add it in Colab Secrets.")
            print("You can also manually login using: login(token='your_token_here')")
else:
    # For local environment, try to get from environment variable
    hf_token = os.environ.get('HF_TOKEN')
    if hf_token:
        login(token=hf_token)
        print("✓ Authenticated with Hugging Face using environment variable")
    else:
        print("⚠️ HF_TOKEN not found. Please set it as environment variable or use login()")
        print("You can run: huggingface-cli login")

# Load tokenizer
print(f"\nLoading tokenizer for {model_id}...")
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

# Set pad token if it doesn't exist (LLaMA doesn't have one by default);
# the end-of-sequence token is reused for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Load model with quantization for memory efficiency (optional but recommended)
print(f"Loading model {model_id}...")
print("Note: This may take a few minutes and requires significant GPU memory.")

# NOTE(review): Step 6 passes this model straight to Trainer. Fine-tuning a
# purely 4-bit-quantized model is expected to require trainable adapters
# (e.g. PEFT/LoRA) — confirm before relying on the quantized path for training.
try:
    # Configure 4-bit quantization to reduce memory usage
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )

    model = AutoModelForSequenceClassification.from_pretrained(
        model_id,
        num_labels=num_labels,
        quantization_config=quantization_config,
        device_map="auto"
    )
    print("✓ Model loaded with 4-bit quantization")
except Exception as e:
    # Deliberate best-effort: any failure of the quantized path (missing
    # bitsandbytes, unsupported hardware, ...) is reported and the model is
    # loaded unquantized instead. Errors unrelated to quantization will
    # surface again from the call below.
    print(f"⚠️ Could not load with quantization: {e}")
    print("Loading without quantization (requires more memory)...")
    model = AutoModelForSequenceClassification.from_pretrained(
        model_id,
        num_labels=num_labels,
        device_map="auto"
    )

# Keep the model config in sync with the tokenizer: the sequence-classification
# head uses config.pad_token_id to find the last non-padding token of each row
# and cannot process batches larger than one when it is unset.
model.config.pad_token_id = tokenizer.pad_token_id

print(f"✓ Model and tokenizer loaded: {model_id}")

## Step 4: Tokenize Dataset

In [None]:
def tokenize_function(example):
    """Tokenize the "text" field of a dataset example (or batch of examples).

    Every sequence is truncated and padded to exactly 256 tokens, so all rows
    come out with the same length.

    Args:
        example: Mapping with a "text" entry, as supplied by ``Dataset.map``.

    Returns:
        The encoding produced by the notebook-level ``tokenizer``.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 256,
    }
    return tokenizer(example["text"], **encode_options)

# Tokenize both splits in batches, then drop the raw text column, which is no
# longer needed once the token ids exist.
tokenized_dataset = tokenized_dataset.map(
    tokenize_function,
    batched=True,
).remove_columns(["text"])

# Have the dataset hand back torch tensors when indexed.
tokenized_dataset.set_format(type="torch")

print("Dataset tokenized successfully!")
first_train_example = tokenized_dataset["train"][0]
print(f"Sample tokenized data: {first_train_example}")

## Step 5: Training Configuration

In [None]:
from transformers import TrainingArguments

# Training configuration. Batch sizes are kept small because of LLaMA 2's
# memory footprint; gradient accumulation restores an effective batch of 8.
training_args = TrainingArguments(
    # --- output / bookkeeping ---
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    seed=42,
    # --- evaluation and checkpointing (once per epoch, keep only the best) ---
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    # --- batching: 2 per device x 4 accumulation steps = 8 effective ---
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    # --- optimisation schedule ---
    num_train_epochs=3,
    learning_rate=2e-5,
    warmup_steps=100,
    optim="adamw_torch",
    fp16=True,  # mixed-precision training
)

print("Training arguments configured for LLaMA 2!")
print("Note: Training LLaMA 2 requires significant GPU memory and time.")

## Step 6: Train the Model

In [None]:
from transformers import Trainer

# Build the trainer from the model, the training arguments and the two splits
# (the "test" split doubles as the evaluation set).
# NOTE(review): if `model` was loaded with 4-bit quantization, Trainer is
# expected to require trainable adapters (e.g. PEFT/LoRA) — confirm.
train_split = tokenized_dataset["train"]
eval_split = tokenized_dataset["test"]
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

# Run fine-tuning.
print("Starting training...")
trainer.train()

print("Training completed!")

## Step 7: Save Model and Tokenizer

In [None]:
import json

output_dir = "./sentiment_model"

# Save model and tokenizer
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

# Save label mapping (class id -> label) for later use.
# str(label) guarantees JSON-serialisable values even when the encoder's
# classes are numpy scalars (e.g. integer labels). JSON stores the integer
# keys as strings, so readers must look ids up as str.
label_map = {i: str(label) for i, label in enumerate(label_names)}
with open(f"{output_dir}/label_map.json", "w", encoding="utf-8") as f:
    json.dump(label_map, f, indent=2)

print(f"Model, tokenizer, and label map saved to {output_dir}")

## Step 8: Test the Model

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import json

# Load the saved model and tokenizer from the directory written in Step 7.
output_dir = "./sentiment_model"
loaded_model = AutoModelForSequenceClassification.from_pretrained(output_dir)
loaded_tokenizer = AutoTokenizer.from_pretrained(output_dir)

# Load label map (class id as a string -> label name). The encoding is given
# explicitly so reading does not depend on the platform's default locale.
with open(f"{output_dir}/label_map.json", "r", encoding="utf-8") as f:
    label_map = json.load(f)

def predict_sentiment(text):
    """
    Predict the sentiment label of a single text with the loaded model.

    Args:
        text (str): The input text for sentiment analysis.

    Returns:
        str: The predicted sentiment label, or "Unknown Label" when the
        predicted class id has no entry in ``label_map``.
    """
    # Tokenize the input text
    inputs = loaded_tokenizer(text, padding=True, truncation=True, max_length=256, return_tensors="pt")

    # The tokenizer returns CPU tensors; move them to wherever the model's
    # weights live so inference also works when the model sits on a GPU.
    inputs = inputs.to(loaded_model.device)

    # Perform inference
    with torch.no_grad():
        outputs = loaded_model(**inputs)

    # One text in -> one row of logits out. Take that row and pick the class
    # with the highest score along the class axis, not over the flattened tensor.
    predicted_label_id = outputs.logits[0].argmax(dim=-1).item()

    # Map the label ID back to the sentiment label string (JSON keys are str)
    predicted_sentiment = label_map.get(str(predicted_label_id), "Unknown Label")

    return predicted_sentiment

# A few hand-written sentences used as a quick smoke test of the saved model.
test_texts = [
    "I am feeling very anxious and stressed about my exams.",
    "This is the best day of my life!",
    "I've been struggling with depression lately.",
]

print("Testing the model:", "=" * 50, sep="\n")
for text in test_texts:
    predicted_sentiment = predict_sentiment(text)
    # One report per sentence, closed by a dashed separator line.
    print(
        f"Text: '{text}'",
        f"Predicted Sentiment: {predicted_sentiment}",
        "-" * 50,
        sep="\n",
    )