<a href="https://colab.research.google.com/github/Ayushverma41/Mental-State-Prediction-using-NLP/blob/main/Code/MentalBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =============================
# 1. Install and Import Libraries
# =============================
!pip install transformers torch pandas scikit-learn matplotlib seaborn joblib -q

import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os
import joblib

In [None]:
# =============================
# 2. Load Dataset
# =============================
# Read the labelled training CSV from Google Drive.
data_path = "/content/drive/MyDrive/Mental State model/Data/Train_Data.csv"
df = pd.read_csv(data_path)

print("âœ… Dataset Loaded Successfully!")
print("Shape:", df.shape)
print(df.head())

# Drop rows with a missing text or label. Otherwise a missing statement is
# later turned into the literal string "nan" by `str(...)` in
# MentalHealthDataset and trained on as if it were real text.
rows_before = len(df)
df = df.dropna(subset=['statement', 'status']).reset_index(drop=True)
if len(df) != rows_before:
    print(f"Dropped {rows_before - len(df)} rows with missing statement/status.")

# Encode labels (string class names -> integer ids; `le` is reused later to decode)
le = LabelEncoder()
df['status'] = le.fit_transform(df['status'])
num_classes = len(le.classes_)
print("\nLabel Mapping:", dict(zip(le.classes_, range(num_classes))))

# Train-test split: 80/20, stratified so each class keeps its proportion in both parts
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['statement'].values, df['status'].values, test_size=0.2, random_state=42, stratify=df['status']
)

In [None]:
# =============================
# 3. Tokenization & Dataset Class
# =============================
# Tokenizer for the same checkpoint name that the classifier loads its encoder from.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="mental/mental-bert-base-uncased"
)

class MentalHealthDataset(Dataset):
    """Dataset that tokenizes one statement per access and pairs it with its label.

    Args:
        texts: indexable collection of statements (each is coerced with ``str``).
        labels: indexable collection of integer class ids, aligned with ``texts``.
        tokenizer: callable tokenizer returning ``input_ids`` and ``attention_mask``.
        max_len: length every sequence is truncated/padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for item ``idx``."""
        raw_text = str(self.texts[idx])
        target = self.labels[idx]
        encoded = self.tokenizer(
            raw_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )
        # The tokenizer output is flattened to 1-D per field before batching.
        sample = {field: encoded[field].flatten() for field in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

# Wrap both splits with the shared tokenizer (default max_len of the dataset class).
train_dataset, val_dataset = (
    MentalHealthDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in ((train_texts, train_labels), (val_texts, val_labels))
)

# Batches of 16; only the training split is shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16, shuffle=False)

In [None]:
# =============================
# 4. Define Model
# =============================
class MentalBERTClassifier(nn.Module):
    """MentalBERT encoder -> bidirectional LSTM -> mean pooling -> linear classifier.

    The encoder is run under ``torch.no_grad()``, so only the LSTM and the
    linear head receive gradients.

    Padded positions are excluded from both the LSTM and the pooling, so the
    logits for a text do not depend on how much padding was appended to it.
    This matters because training pads every sequence to a fixed length while
    single-sentence inference pads only to the longest sequence in the call.

    Args:
        embedding_dim: unused; kept so existing call sites keep working (the
            LSTM input size is read from the encoder config instead).
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of classes.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability between LSTM layers and before the head.
    """

    def __init__(self, embedding_dim=128, hidden_dim=128, output_dim=5, n_layers=2, dropout=0.3):
        super().__init__()
        self.bert = AutoModel.from_pretrained("mental/mental-bert-base-uncased")
        self.lstm = nn.LSTM(self.bert.config.hidden_size, hidden_dim, num_layers=n_layers,
                            batch_first=True, dropout=dropout, bidirectional=True)
        # Bidirectional LSTM: forward and backward outputs are concatenated.
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class logits of shape (batch, output_dim).

        Args:
            input_ids: token ids, one row per sequence.
            attention_mask: 1 for real tokens, 0 for padding, same shape as
                ``input_ids``. Assumed to be right-padded (ones first, then
                zeros) -- TODO confirm if the tokenizer's padding side changes.
        """
        with torch.no_grad():
            outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = outputs.last_hidden_state

        # Number of real tokens per sequence; clamp so an all-zero mask row
        # cannot produce a zero-length sequence (packing rejects those).
        lengths = attention_mask.long().sum(dim=1).clamp(min=1)

        # Pack so the LSTM never steps over padding in either direction.
        packed = nn.utils.rnn.pack_padded_sequence(
            hidden_state, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        packed_out, _ = self.lstm(packed)
        # Unpacking fills padded positions with zeros, so a plain sum over time
        # only accumulates real tokens.
        lstm_out, _ = nn.utils.rnn.pad_packed_sequence(
            packed_out, batch_first=True, total_length=hidden_state.size(1)
        )

        # Mean over real tokens only (divide by true length, not padded length).
        avg_pool = lstm_out.sum(dim=1) / lengths.unsqueeze(1).to(lstm_out.dtype)
        output = self.dropout(avg_pool)
        return self.fc(output)

In [None]:
# =============================
# 5. Training Setup
# =============================
# Run on the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MentalBERTClassifier(output_dim=num_classes, dropout=0.3).to(device)

criterion = nn.CrossEntropyLoss()
# Use PyTorch's own AdamW: the AdamW class shipped with `transformers` is
# deprecated in favour of `torch.optim.AdamW`.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# Scheduler & Early Stopping
epochs = 5
# The scheduler is stepped once per batch, so the schedule length is batches * epochs.
num_training_steps = len(train_loader) * epochs
# Linear warm-up over the first 10% of steps, then linear decay.
num_warmup_steps = int(0.1 * num_training_steps)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

best_val_loss = float('inf')
patience = 2  # epochs without validation-loss improvement before stopping
early_stop_counter = 0

# Per-epoch history, consumed by the plotting cells.
train_losses, val_losses, train_accs, val_accs = [], [], [], []

In [None]:
# =============================
# 6. Training Loop (with Early Stopping)
# =============================
print("\nðŸš€ Starting Training...\n")

# Snapshot of the best weights seen so far. Stays None until some epoch
# improves on best_val_loss, so the restore step below can be skipped safely.
best_model_state = None

for epoch in range(epochs):
    # ---- Training pass ----
    model.train()
    total_loss, total_correct = 0, 0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        total_loss += loss.item()

        preds = torch.argmax(outputs, dim=1)
        total_correct += torch.sum(preds == labels).item()

        loss.backward()
        # Cap the gradient norm at 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()  # per-batch learning-rate schedule

    # Accuracy is per sample, loss is the mean of per-batch losses.
    train_acc = total_correct / len(train_dataset)
    train_loss = total_loss / len(train_loader)

    # ---- Validation pass ----
    model.eval()
    val_correct, val_loss = 0, 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            preds = torch.argmax(outputs, dim=1)
            val_correct += torch.sum(preds == labels).item()

    val_acc = val_correct / len(val_dataset)
    val_loss /= len(val_loader)

    train_losses.append(train_loss)
    val_losses.append(val_loss)
    train_accs.append(train_acc)
    val_accs.append(val_acc)

    print(f"Epoch [{epoch+1}/{epochs}] | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} "
          f"| Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")

    # ---- Early stopping ----
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        early_stop_counter = 0
        # state_dict() hands back references to the live parameter tensors, so
        # keeping it as-is would silently track every later update. Clone each
        # tensor (on CPU, to avoid holding a second copy on the GPU) to freeze
        # the weights of this epoch.
        best_model_state = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }
    else:
        early_stop_counter += 1
        if early_stop_counter >= patience:
            print("ðŸ›‘ Early stopping triggered â€” validation loss not improving.\n")
            break

# Load the best model (skipped if no epoch ever improved, e.g. the loss was NaN)
if best_model_state is not None:
    model.load_state_dict(best_model_state)
print("\nâœ… Training Completed Successfully with Early Stopping.\n")


In [None]:
# =============================
# 7. Evaluation & Visualization
# =============================
# Output folder for every figure produced by this cell.
img_save_dir = "/content/drive/MyDrive/Mental State model/Images/MentalBERT/"
os.makedirs(img_save_dir, exist_ok=True)


def _save_history_plot(train_series, val_series, metric, filename):
    """Plot one per-epoch metric for training vs validation, save it and show it."""
    plt.figure(figsize=(8,6))
    plt.plot(train_series, label=f'Training {metric}', marker='o')
    plt.plot(val_series, label=f'Validation {metric}', marker='o')
    plt.title(f"{metric} Comparison (Training vs Validation)")
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(os.path.join(img_save_dir, filename))
    plt.show()


# ---- Collect predictions on the validation split ----
model.eval()
val_preds, val_labels_true = [], []

with torch.no_grad():
    for batch in val_loader:
        outputs = model(batch['input_ids'].to(device), batch['attention_mask'].to(device))
        val_preds.extend(outputs.argmax(dim=1).cpu().numpy())
        val_labels_true.extend(batch['labels'].numpy())

# ---- Metrics ----
val_acc = accuracy_score(val_labels_true, val_preds)
val_f1 = f1_score(val_labels_true, val_preds, average='weighted')
print("ðŸ“Š Validation Performance")
print(f"Accuracy: {val_acc:.4f}")
print(f"F1-Score: {val_f1:.4f}")
report = classification_report(val_labels_true, val_preds, target_names=le.classes_)
print("\nClassification Report:\n", report)

# ---- Confusion Matrix ----
cm = confusion_matrix(val_labels_true, val_preds)
plt.figure(figsize=(8,6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=le.classes_,
    yticklabels=le.classes_,
)
plt.title("Confusion Matrix - MentalBERT (Validation Data)")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.tight_layout()
plt.savefig(os.path.join(img_save_dir, "confusion_matrix_validation.png"))
plt.show()

# ---- Accuracy Plot ----
_save_history_plot(train_accs, val_accs, "Accuracy", "accuracy_comparison.png")

# ---- Loss Plot ----
_save_history_plot(train_losses, val_losses, "Loss", "loss_comparison.png")

print(f"\nâœ… All evaluation plots saved to: {img_save_dir}")

In [None]:
# =============================
# 8. Save Trained Model
# =============================
# Everything needed for inference goes into one Drive folder.
save_directory = "/content/drive/MyDrive/Mental State model/Model/Mentalbert/"
os.makedirs(save_directory, exist_ok=True)

weights_file = os.path.join(save_directory, "mentalbert_model.pt")
encoder_file = os.path.join(save_directory, "label_encoder.pkl")

# Only the weights are stored, so the model class must be redefined before loading them.
torch.save(model.state_dict(), weights_file)
tokenizer.save_pretrained(save_directory)
# The fitted LabelEncoder is needed to map predicted ids back to class names.
joblib.dump(le, encoder_file)

print(f"\nðŸ’¾ Model, Tokenizer, and Label Encoder saved at:\n{save_directory}")

**Testing**

In [None]:
# =============================
# 1. Import Required Libraries
# =============================
import torch
import pandas as pd
from transformers import AutoTokenizer
from sklearn.preprocessing import LabelEncoder
import joblib
import os

# =============================
# 2. Load Saved Model, Tokenizer, and Encoder
# =============================
from torch import nn

class MentalBERTClassifier(nn.Module):
    """MentalBERT encoder -> bidirectional LSTM -> mean pooling -> linear classifier.

    Must declare the same submodules as the class used for training so the
    saved ``state_dict`` loads into it.

    Padded positions are excluded from both the LSTM and the pooling, so the
    logits for a text do not depend on how much padding was appended to it.
    For an input without padding this is identical to a plain mean over time.

    Args:
        embedding_dim: unused; kept so existing call sites keep working (the
            LSTM input size is read from the encoder config instead).
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of classes.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability between LSTM layers and before the head.
    """

    def __init__(self, embedding_dim=128, hidden_dim=128, output_dim=5, n_layers=2, dropout=0.3):
        super().__init__()
        from transformers import AutoModel
        self.bert = AutoModel.from_pretrained("mental/mental-bert-base-uncased")
        self.lstm = nn.LSTM(self.bert.config.hidden_size, hidden_dim, num_layers=n_layers,
                            batch_first=True, dropout=dropout, bidirectional=True)
        # Bidirectional LSTM: forward and backward outputs are concatenated.
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class logits of shape (batch, output_dim).

        Args:
            input_ids: token ids, one row per sequence.
            attention_mask: 1 for real tokens, 0 for padding, same shape as
                ``input_ids``. Assumed to be right-padded (ones first, then
                zeros) -- TODO confirm if the tokenizer's padding side changes.
        """
        with torch.no_grad():
            outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = outputs.last_hidden_state

        # Number of real tokens per sequence; clamp so an all-zero mask row
        # cannot produce a zero-length sequence (packing rejects those).
        lengths = attention_mask.long().sum(dim=1).clamp(min=1)

        # Pack so the LSTM never steps over padding in either direction.
        packed = nn.utils.rnn.pack_padded_sequence(
            hidden_state, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        packed_out, _ = self.lstm(packed)
        # Unpacking fills padded positions with zeros, so a plain sum over time
        # only accumulates real tokens.
        lstm_out, _ = nn.utils.rnn.pad_packed_sequence(
            packed_out, batch_first=True, total_length=hidden_state.size(1)
        )

        # Mean over real tokens only (divide by true length, not padded length).
        avg_pool = lstm_out.sum(dim=1) / lengths.unsqueeze(1).to(lstm_out.dtype)
        output = self.dropout(avg_pool)
        return self.fc(output)
# Load everything
# Folder the training notebook wrote the weights, tokenizer and label encoder to.
model_dir = "/content/drive/MyDrive/Mental State model/Model/Mentalbert/"
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(model_dir)
encoder_file = os.path.join(model_dir, "label_encoder.pkl")
le = joblib.load(encoder_file)

# Rebuild model with correct output dimensions (one logit per encoded class)
weights_file = os.path.join(model_dir, "mentalbert_model.pt")
model = MentalBERTClassifier(output_dim=len(le.classes_))
model.load_state_dict(torch.load(weights_file, map_location=device))
# Move to the target device and switch to inference mode (disables dropout).
model = model.to(device).eval()

print("âœ… Model and tokenizer loaded successfully from:", model_dir)

# =============================
# 3. Load Test Data
# =============================
test_path = "/content/drive/MyDrive/Mental State model/Data/Test_Data.csv"
df_test = pd.read_csv(test_path)

print("ðŸ“„ Test Data Loaded â€” Shape:", df_test.shape)
print(df_test.head())

# =============================
# 4. Predict on Test Data
# =============================
# NOTE(review): training tokenizes with padding='max_length' whereas here
# padding=True adds no padding to a single text -- confirm the classifier is
# insensitive to padded positions.
pred_ids = []

for text in df_test['statement']:
    # Coerce to str exactly as MentalHealthDataset does during training, so a
    # missing or non-string cell is handled the same way in both places
    # instead of being handed to the tokenizer as a float NaN.
    encoding = tokenizer(str(text), return_tensors='pt', truncation=True, padding=True, max_length=128).to(device)
    with torch.no_grad():
        outputs = model(encoding['input_ids'], encoding['attention_mask'])
    pred_ids.append(torch.argmax(outputs, dim=1).item())

# Decode every class id in a single call rather than once per row.
predictions = list(le.inverse_transform(pred_ids))

df_test["Predicted_Status_MentalBERT"] = predictions

# =============================
# 5. Save Output CSV
# =============================
output_path = "/content/drive/MyDrive/Mental State model/Data/MentalBERT_Test_Predictions.csv"
df_test.to_csv(output_path, index=False)

print(f"\nâœ… Predictions completed and saved to:\n{output_path}")
print("\nSample Predictions:")
print(df_test.head())

# =============================
# 6. Single Sentence Prediction Function
# =============================
def predict_sentence(text):
    """Classify one piece of text and return its decoded class label.

    Uses the module-level ``model``, ``tokenizer``, ``le`` and ``device``.
    """
    model.eval()
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128).to(device)
    with torch.no_grad():
        logits = model(inputs['input_ids'], inputs['attention_mask'])
        class_id = logits.argmax(dim=1).item()
    decoded = le.inverse_transform([class_id])
    return decoded[0]

# Example usage
example = "I feel worthless and anxious all the time."
print("\nðŸ§© Example Single Sentence Prediction:")
print("Input:", example)
print("Predicted Mental State:", predict_sentence(example))


**Evaluation**

In [None]:
# =============================
# 1. Import Libraries
# =============================
import torch
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
import os

# =============================
# 2. Load Paths and Setup
# =============================
# Same folder (including letter case) as the one the training cell saves its
# figures to. The previous "Mentalbert" spelling creates a second, separate
# directory on a case-sensitive filesystem and splits the plots across both.
img_save_dir = "/content/drive/MyDrive/Mental State model/Images/MentalBERT/"
os.makedirs(img_save_dir, exist_ok=True)

# Assuming you have:
# df_test with actual + predicted
# df (training dataframe)
# model, tokenizer, le, train_loader, val_loader available

# =============================
# 3. Evaluate on Training Data
# =============================
# Predicted and true class ids over the whole training loader.
train_preds, train_true = [], []

model.eval()
with torch.no_grad():
    for batch in train_loader:
        logits = model(batch['input_ids'].to(device), batch['attention_mask'].to(device))
        train_preds.extend(logits.argmax(dim=1).cpu().numpy())
        train_true.extend(batch['labels'].numpy())

train_acc = accuracy_score(train_true, train_preds)
# Weighted F1: per-class F1 averaged by class support.
train_f1 = f1_score(train_true, train_preds, average='weighted')

print(f"\nðŸ“˜ Training Accuracy: {train_acc:.4f}")
print(f"ðŸ“˜ Training F1-score: {train_f1:.4f}")

# =============================
# 4. Evaluate on Test Data
# =============================
# Ground-truth and predicted class names, as stored in the test dataframe.
test_true = df_test["status"]
test_preds = df_test["Predicted_Status_MentalBERT"]

# Convert labels to numeric for metrics (same encoder as used for training)
test_true_enc, test_preds_enc = le.transform(test_true), le.transform(test_preds)

test_acc = accuracy_score(y_true=test_true_enc, y_pred=test_preds_enc)
test_f1 = f1_score(y_true=test_true_enc, y_pred=test_preds_enc, average='weighted')

print(f"\nðŸ“— Testing Accuracy: {test_acc:.4f}")
print(f"ðŸ“— Testing F1-score: {test_f1:.4f}")

# =============================
# 5. Accuracy Bar Chart (Train vs Test)
# =============================
# Two bars: accuracy on the training split next to accuracy on the test split.
split_names = ["Training", "Testing"]
split_scores = [train_acc, test_acc]

fig, ax = plt.subplots(figsize=(7,6))
bars = ax.bar(split_names, split_scores, color=['#4c72b0', '#55a868'])
ax.set_title("Accuracy Comparison: Training vs Testing")
ax.set_ylabel("Accuracy")
ax.set_ylim(0, 1)
# Print each bar's value above it.
ax.bar_label(bars, fmt="%.3f", label_type='edge')
fig.tight_layout()
fig.savefig(os.path.join(img_save_dir, "accuracy_bar_comparison.png"))
plt.show()

# =============================
# 6. Confusion Matrix (Training)
# =============================
def _plot_confusion_matrix(y_true, y_pred, title, cmap, filename):
    """Compute, draw, save and show a confusion matrix; return the matrix.

    The matrix is always built over every class known to ``le``. Without an
    explicit ``labels`` argument scikit-learn only keeps classes that occur in
    ``y_true`` or ``y_pred``, so a class missing from a split would shrink the
    matrix while the tick labels still list every class.
    """
    class_ids = np.arange(len(le.classes_))
    matrix = confusion_matrix(y_true, y_pred, labels=class_ids)
    plt.figure(figsize=(8,6))
    sns.heatmap(matrix, annot=True, fmt='d', cmap=cmap,
                xticklabels=le.classes_, yticklabels=le.classes_)
    plt.title(title)
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.tight_layout()
    plt.savefig(os.path.join(img_save_dir, filename))
    plt.show()
    return matrix


cm_train = _plot_confusion_matrix(
    train_true, train_preds,
    "Confusion Matrix - Training Data (MentalBERT)",
    'Blues',
    "confusion_matrix_training.png",
)

# =============================
# 7. Confusion Matrix (Testing)
# =============================
cm_test = _plot_confusion_matrix(
    test_true_enc, test_preds_enc,
    "Confusion Matrix - Testing Data (MentalBERT)",
    'Greens',
    "confusion_matrix_testing.png",
)

# =============================
# 8. Loss Visualization (Already from training)
# =============================
# Per-epoch loss history recorded by the training loop (train_losses / val_losses).
fig, ax = plt.subplots(figsize=(8,6))
for series, series_label in ((train_losses, 'Training Loss'), (val_losses, 'Validation Loss')):
    ax.plot(series, label=series_label, marker='o')
ax.set_title("Training & Validation Loss Over Epochs")
ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
ax.legend()
ax.grid(True)
fig.tight_layout()
fig.savefig(os.path.join(img_save_dir, "loss_curve.png"))
plt.show()

# =============================
# 9. Print Final Summary
# =============================
print("\nðŸ“Š Final Performance Summary")
# One (training, testing) pair per metric, printed in that order.
for metric_name, train_value, test_value in (
    ("Accuracy", train_acc, test_acc),
    ("F1-Score", train_f1, test_f1),
):
    print(f"Training {metric_name}: {train_value:.4f}")
    print(f"Testing {metric_name}: {test_value:.4f}")

print(f"\nâœ… All plots and images saved at:\n{img_save_dir}")
