<a href="https://colab.research.google.com/github/Ayushverma41/Mental-State-Prediction-using-NLP/blob/main/Code/Roberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =====================================
# Custom RoBERTa + Neural Classifier for Mental Health Prediction
# =====================================

!pip install transformers torch scikit-learn matplotlib seaborn tqdm -q

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import RobertaTokenizer, RobertaModel
from tqdm.auto import tqdm
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [None]:
# =====================================
# 1. Load Dataset
# =====================================
# Path of the training CSV under /content/drive/MyDrive.
# NOTE(review): the path has an empty segment ("MyDrive//Data") while the other
# paths in this notebook live under "MyDrive/Mental State model/" - confirm this
# is the intended folder.
data_path = "/content/drive/MyDrive//Data/Train_Data.csv"
df = pd.read_csv(data_path)

print("‚úÖ Data loaded successfully!")
# Keep only rows that have both the input text and its target class.
df = df.dropna(subset=['statement', 'status'])

In [None]:
# =====================================
# 2. Preprocessing
# =====================================
# Encode the string classes found in `status` as integer ids 0..num_classes-1.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['status'])
# NOTE: later cells should read class names from `label_encoder.classes_`;
# `labels` is kept only for backward compatibility with existing cells.
labels = label_encoder.classes_
num_classes = len(labels)

# Stratify the hold-out split so both parts keep the class proportions of the
# full dataset. scikit-learn can only stratify when every class has at least
# two rows, so fall back to a plain random split otherwise.
class_counts = df['label'].value_counts()
stratify_on = df['label'] if len(class_counts) > 0 and class_counts.min() >= 2 else None

train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['statement'].tolist(),
    df['label'].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=stratify_on,
)


In [None]:
# =====================================
# 3. Tokenization
# =====================================
# Tokenizer loaded from 'roberta-base', the same checkpoint name that
# RoBERTaClassifier loads its encoder from.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

class MentalHealthDataset(Dataset):
    """Map-style dataset that tokenizes one statement per lookup.

    Args:
        texts: Indexable collection of input statements.
        labels: Indexable collection of class ids, aligned with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=..., padding=..., max_length=...,
            return_tensors='pt')`` that returns a mapping of tensors with a
            leading batch dimension of size one.
        max_len: Length every encoded statement is truncated/padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.texts = texts
        self.labels = labels

    def __len__(self):
        """Return the number of statements."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded statement at ``idx`` plus its label under ``'labels'``."""
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )
        sample = {}
        for name, tensor in encoded.items():
            # Drop the batch dimension added by return_tensors='pt'; the
            # DataLoader adds its own when collating.
            sample[name] = tensor.squeeze(0)
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap both splits with the shared tokenizer (max_len left at its default of 128).
train_dataset = MentalHealthDataset(train_texts, train_labels, tokenizer)
test_dataset = MentalHealthDataset(test_texts, test_labels, tokenizer)

# Mini-batches of 8; only the training loader shuffles, so evaluation order
# follows the order of test_texts.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)


In [None]:
# =====================================
# 4. Model Architecture
# =====================================
class RoBERTaClassifier(nn.Module):
    """Frozen RoBERTa encoder followed by a projection, a BiLSTM and a linear head.

    Only the projection, the LSTM and the final linear layer are trained. The
    encoder is kept frozen *and* in evaluation mode, so its dropout layers do
    not inject noise into the features while the head is being trained.

    Args:
        embedding_dim: Size the RoBERTa hidden states are projected to.
        hidden_dim: Hidden size of each LSTM direction.
        output_dim: Number of target classes (size of the logits).
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability used between LSTM layers and before the
            final linear layer.

    Note:
        Pooling averages over real tokens only (using ``attention_mask``).
        Checkpoints trained with the earlier unmasked mean still load (the
        parameter names are unchanged) but should be retrained for best results.
    """

    def __init__(self, embedding_dim=128, hidden_dim=128, output_dim=5, n_layers=2, dropout=0.3):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        # Freeze the encoder explicitly so no gradients are tracked for it.
        for param in self.roberta.parameters():
            param.requires_grad = False
        self.roberta.eval()

        self.embedding_layer = nn.Linear(self.roberta.config.hidden_size, embedding_dim)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            batch_first=True,
            # Inter-layer dropout is only defined for stacked LSTMs.
            dropout=dropout if n_layers > 1 else 0.0,
            bidirectional=True,
        )
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def train(self, mode=True):
        """Switch the trainable head between train/eval; the encoder stays in eval mode."""
        super().train(mode)
        self.roberta.eval()
        return self

    def forward(self, input_ids, attention_mask):
        """Return class logits of shape ``(batch, output_dim)``.

        Args:
            input_ids: Token ids, ``(batch, seq_len)``.
            attention_mask: 1 for real tokens, 0 for padding, ``(batch, seq_len)``.
        """
        with torch.no_grad():  # encoder is frozen
            outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state
        x = self.embedding_layer(hidden_states)
        lstm_out, _ = self.lstm(x)

        # Mean-pool over real tokens only, so the result does not depend on how
        # much padding a batch happens to carry.
        mask = attention_mask.unsqueeze(-1).to(lstm_out.dtype)
        token_sum = (lstm_out * mask).sum(dim=1)
        token_count = mask.sum(dim=1).clamp(min=1.0)  # guard against all-padding rows
        avg_pool = token_sum / token_count

        x = self.dropout(avg_pool)
        logits = self.fc(x)
        return logits

In [None]:
# =====================================
# 5. Training Setup (Fixed)
# =====================================

import torch
import torch.nn as nn
from torch.optim.lr_scheduler import ReduceLROnPlateau

# ‚úÖ Define device
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Build the classifier and move it to the selected device.
# Relies on `RoBERTaClassifier` and `num_classes` from earlier cells.
model = RoBERTaClassifier(
    embedding_dim=128,
    hidden_dim=128,
    output_dim=num_classes,
    n_layers=2,
    dropout=0.3
).to(device)

# Loss, optimizer and learning-rate scheduler.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# mode='max' because the training loop steps the scheduler with validation
# accuracy; the LR is halved after more than one epoch without improvement.
# No `verbose` argument is passed (the original author noted it as unsupported).
scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=1)

# Training parameters: `patience` is the early-stopping patience used by the
# training loop (separate from the scheduler's own patience above).
epochs = 5
patience = 2
best_val_acc = 0.0
early_stop_counter = 0

# Per-epoch history, appended to by the training loop and plotted afterwards.
train_losses = []
val_accuracies = []

print("‚úÖ Training setup complete and ready to start training!")


In [None]:
# =====================================
# 6. Training Loop with Progress Bars
# =====================================
import os

# Where the best checkpoint is written. Create the folder up front so that
# torch.save cannot fail after a full epoch of training.
best_model_path = "/content/drive/MyDrive/Mental State model/Model/RoBERTa_Custom_BestModel.pth"
os.makedirs(os.path.dirname(best_model_path), exist_ok=True)

# NOTE: the held-out split is used both for model selection (early stopping,
# LR scheduling) and for the metrics printed here, so those metrics are
# optimistic estimates.
for epoch in range(epochs):
    print(f"\nüß† Epoch {epoch+1}/{epochs}")
    model.train()
    total_loss = 0.0

    train_progress = tqdm(train_loader, desc=f"Training Epoch {epoch+1}", leave=False)
    for batch in train_progress:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Named `batch_labels` (not `labels`) so the global array of class
        # names defined during preprocessing is not overwritten.
        batch_labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, batch_labels)
        loss.backward()

        # Clip the gradient norm to keep LSTM updates stable.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        total_loss += loss.item()
        train_progress.set_postfix({'Batch Loss': f"{loss.item():.4f}"})

    avg_train_loss = total_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    # Evaluation on the held-out split
    model.eval()
    preds, actuals = [], []
    eval_progress = tqdm(test_loader, desc=f"Evaluating Epoch {epoch+1}", leave=False)
    with torch.no_grad():
        for batch in eval_progress:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask)
            preds += torch.argmax(outputs, dim=1).cpu().tolist()
            actuals += batch['labels'].tolist()

    acc = accuracy_score(actuals, preds)
    f1 = f1_score(actuals, preds, average='weighted')
    val_accuracies.append(acc)
    scheduler.step(acc)  # scheduler was created with mode='max'

    print(f"‚úÖ Epoch {epoch+1}/{epochs} | Train Loss: {avg_train_loss:.4f} | Val Accuracy: {acc:.4f} | F1: {f1:.4f}")

    # Early stopping: keep the best checkpoint, stop after `patience` epochs
    # without improvement.
    if acc > best_val_acc:
        best_val_acc = acc
        early_stop_counter = 0
        torch.save(model.state_dict(), best_model_path)
        print("üíæ Model improved and saved!")
    else:
        early_stop_counter += 1
        print(f"‚ö†Ô∏è No improvement. Early stop patience: {early_stop_counter}/{patience}")
        if early_stop_counter >= patience:
            print("‚èπÔ∏è Early stopping triggered!")
            break

In [None]:
# =====================================
# 7. Final Evaluation
# =====================================
import os
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# ‚úÖ Define image save directory
save_dir = "/content/drive/MyDrive/Mental State model/Images/RoBERTa"
os.makedirs(save_dir, exist_ok=True)  # create the folder if it does not exist

print("\nüìä Final Evaluation on Test Set")

# Class names come from the encoder fitted during preprocessing. Passing the
# full list of class ids keeps names and rows aligned even when a class never
# appears in `actuals`/`preds`.
# NOTE: `actuals`/`preds` hold the predictions of the last evaluated epoch,
# which is not necessarily the epoch whose checkpoint was saved.
class_names = [str(name) for name in label_encoder.classes_]
class_ids = list(range(len(class_names)))
labels = class_names  # later cells read class names from `labels`

print(classification_report(actuals, preds, labels=class_ids,
                            target_names=class_names, zero_division=0))

# Confusion matrix
cm = confusion_matrix(actuals, preds, labels=class_ids)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion Matrix - Custom RoBERTa Model')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
plt.savefig(os.path.join(save_dir, "confusion_matrix_final.png"))
plt.show()


# =====================================
# 8. Loss & Accuracy Visualization
# =====================================
# One figure per tracked series:
# (values, legend label, line color, title, y-axis label, output file name)
curve_specs = [
    (train_losses, 'Training Loss', 'blue',
     "Training Loss per Epoch", "Loss", "loss_plot.png"),
    (val_accuracies, 'Validation Accuracy', 'green',
     "Validation Accuracy per Epoch", "Accuracy", "accuracy_plot.png"),
]

for series, legend_label, line_color, plot_title, y_label, file_name in curve_specs:
    plt.figure(figsize=(6, 4))
    plt.plot(series, label=legend_label, color=line_color)
    plt.title(plot_title)
    plt.xlabel("Epoch")
    plt.ylabel(y_label)
    plt.legend()
    plt.tight_layout()
    # Save before show(): show() may clear the current figure.
    plt.savefig(os.path.join(save_dir, file_name))
    plt.show()


In [None]:
# =====================================
# 9. Predict Single Sentence
# =====================================
def predict_sentence(sentence):
    """Return the predicted class name for a single sentence.

    Uses the notebook-level ``model``, ``tokenizer``, ``device`` and
    ``label_encoder``.

    Args:
        sentence: The text to classify.

    Returns:
        The entry of ``label_encoder.classes_`` with the highest logit.
    """
    model.eval()
    # Pad to the same fixed length used by MentalHealthDataset so the model
    # sees inputs shaped like the ones it was trained on.
    tokens = tokenizer(
        sentence,
        return_tensors='pt',
        truncation=True,
        padding='max_length',
        max_length=128,
    ).to(device)
    with torch.no_grad():
        logits = model(tokens['input_ids'], tokens['attention_mask'])
    pred = torch.argmax(logits, dim=1).item()
    # Read names from the encoder: the global `labels` is reassigned by other
    # cells and cannot be trusted to hold class names.
    return label_encoder.classes_[pred]

example_text = "I feel so low and tired these days."
print("\nüß© Predicted Mental State:", predict_sentence(example_text))


In [None]:
# =====================================
# 🧠 Load Test Data and Make Predictions
# =====================================

import pandas as pd
import torch
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import os

# Input CSV, trained checkpoint, and the CSV the predictions are written to.
test_data_path = "/content/drive/MyDrive/Mental State model/Data/Test_Data.csv"
model_path = "/content/drive/MyDrive/Mental State model/Model/RoBERTa_Custom_BestModel.pth"
save_path = "/content/drive/MyDrive/Mental State model/Data/RoBERTa_Custom_BestModel_Predictions.csv"

# Make sure the output folder exists before anything is written to it.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# Load the test data and drop rows missing the text or the target class.
test_df = pd.read_csv(test_data_path)
test_df = test_df.dropna(subset=['statement', 'status'])

# Encode the true classes with the encoder fitted on the training data, so ids
# match the model's output indices. A `status` value not seen when the encoder
# was fitted makes `transform` raise.
test_df['label'] = label_encoder.transform(test_df['status'])

# Create test dataset and dataloader
class MentalHealthDataset(Dataset):
    """Map-style dataset that tokenizes one statement per lookup.

    Args:
        texts: Indexable collection of input statements.
        labels: Indexable collection of class ids, aligned with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=..., padding=..., max_length=...,
            return_tensors='pt')`` that returns a mapping of tensors with a
            leading batch dimension of size one.
        max_len: Length every encoded statement is truncated/padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of statements."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded statement at ``idx`` plus its label under ``'labels'``."""
        # Use the tokenizer handed to the constructor, not a module-level global.
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        # Drop the batch dimension added by return_tensors='pt'; the DataLoader
        # adds its own when collating.
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

test_dataset = MentalHealthDataset(test_df['statement'].tolist(), test_df['label'].tolist(), tokenizer)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Rebuild the architecture and load the best checkpoint. The output size is
# taken from the fitted encoder: the global `labels` is reassigned by other
# cells, and a wrong length would make load_state_dict fail on the final layer.
model = RoBERTaClassifier(
    embedding_dim=128,
    hidden_dim=128,
    output_dim=len(label_encoder.classes_),
    n_layers=2,
    dropout=0.3
)
model.load_state_dict(torch.load(model_path, map_location=device))
model.to(device)
model.eval()

# Predict class ids for every row of the test file.
preds = []
actuals = []
test_progress = tqdm(test_loader, desc="Predicting on Test Data", leave=False)
with torch.no_grad():
    for batch in test_progress:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids, attention_mask)
        # Store plain Python ints; the batch labels are read without rebinding
        # the global `labels`.
        preds.extend(torch.argmax(outputs, dim=1).cpu().tolist())
        actuals.extend(batch['labels'].tolist())

# Map numeric ids back to class names.
predicted_labels = [label_encoder.classes_[i] for i in preds]
actual_labels = [label_encoder.classes_[i] for i in actuals]

# The loader is not shuffled, so predictions line up with the dataframe rows.
test_df['Predicted_Status'] = predicted_labels

# Save predictions
test_df.to_csv(save_path, index=False)
print(f"‚úÖ Predictions saved successfully to:\n{save_path}")

# Display first few results
print("\nüîç Sample Predictions:")
print(test_df[['statement', 'status', 'Predicted_Status']].head())


In [None]:
# =====================================
# 📊 Model Evaluation & Visualization (Error-Free)
# =====================================

import matplotlib.pyplot as plt
import seaborn as sns
import torch
import numpy as np
import os
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, log_loss

# ‚úÖ Create folder for saving images
# NOTE(review): this folder is directly under MyDrive, while the earlier
# figures are saved under "MyDrive/Mental State model/Images/RoBERTa" - confirm
# the two locations are intentional.
img_dir = "/content/drive/MyDrive/Images/RoBERTa/"
os.makedirs(img_dir, exist_ok=True)

# =========================
# Test accuracy & loss
# =========================
# Encode class names back to ids with one vectorized call each. An unknown
# name raises instead of being silently counted as class 0.
y_true_test = label_encoder.transform(actual_labels).tolist()
y_pred_test = label_encoder.transform(predicted_labels).tolist()

test_accuracy = accuracy_score(y_true_test, y_pred_test)

# CAUTION: only hard predictions are available here, so this "loss" is
# log_loss of clipped one-hot vectors. It is a function of the error rate, not
# the model's cross-entropy, and is NOT comparable to the training loss.
num_classes = len(label_encoder.classes_)
y_pred_onehot = np.eye(num_classes)[y_pred_test]
epsilon = 1e-9  # keeps log(0) out of the computation
y_pred_onehot = np.clip(y_pred_onehot, epsilon, 1 - epsilon)
test_loss = log_loss(y_true_test, y_pred_onehot, labels=list(range(num_classes)))

print(f"\n‚úÖ Test Accuracy: {test_accuracy:.4f}")
print(f"‚úÖ Test Loss: {test_loss:.4f}")

# =========================
# Compare Training vs Testing Accuracy
# =========================
# Score the currently loaded model on the training split. By this point
# `actuals`/`preds` hold the test-file predictions and `val_accuracies` holds
# held-out accuracy, so neither describes the training data.
model.eval()
train_true, train_pred = [], []
with torch.no_grad():
    for batch in tqdm(train_loader, desc="Scoring Training Data", leave=False):
        logits = model(batch['input_ids'].to(device), batch['attention_mask'].to(device))
        train_pred.extend(torch.argmax(logits, dim=1).cpu().tolist())
        train_true.extend(batch['labels'].tolist())

train_acc = accuracy_score(train_true, train_pred) if train_true else None
train_loss = train_losses[-1] if len(train_losses) > 0 else None

# Bar chart comparison (a missing training accuracy is drawn as 0)
plt.figure(figsize=(6,5))
plt.bar(['Training Accuracy', 'Testing Accuracy'],
        [train_acc if train_acc is not None else 0.0, test_accuracy],
        color=['skyblue', 'salmon'])
plt.title('Accuracy Comparison: Training vs Testing')
plt.ylabel('Accuracy')
plt.ylim(0, 1)
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.savefig(os.path.join(img_dir, "Accuracy_Comparison.png"))
plt.show()

# =========================
# Confusion Matrix - Training
# =========================
cm_train = confusion_matrix(train_true, train_pred, labels=list(range(len(label_encoder.classes_))))
plt.figure(figsize=(8,6))
sns.heatmap(cm_train, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.title("Confusion Matrix - Training Data")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.tight_layout()
plt.savefig(os.path.join(img_dir, "ConfusionMatrix_Training.png"))
plt.show()

# =========================
# Confusion Matrix - Testing
# =========================
# Rows are true classes, columns predicted classes, both in encoder order.
tick_names = label_encoder.classes_
cm_test = confusion_matrix(
    y_true_test,
    y_pred_test,
    labels=list(range(len(tick_names))),
)
plt.figure(figsize=(8,6))
# heatmap() draws on the current axes and returns them.
test_axes = sns.heatmap(
    cm_test,
    annot=True,
    fmt='d',
    cmap='Greens',
    xticklabels=tick_names,
    yticklabels=tick_names,
)
test_axes.set_title("Confusion Matrix - Testing Data")
test_axes.set_xlabel("Predicted")
test_axes.set_ylabel("Actual")
plt.tight_layout()
plt.savefig(os.path.join(img_dir, "ConfusionMatrix_Testing.png"))
plt.show()

# =========================
# Loss Visualization
# =========================
# Epochs are numbered from 1 on the x-axis.
epoch_numbers = range(1, len(train_losses) + 1)
plt.figure(figsize=(8,5))
loss_axes = plt.gca()
loss_axes.plot(epoch_numbers, train_losses, label='Training Loss', marker='o')
loss_axes.set_title('Training Loss over Epochs')
loss_axes.set_xlabel('Epochs')
loss_axes.set_ylabel('Loss')
loss_axes.legend()
loss_axes.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.savefig(os.path.join(img_dir, "Training_Loss_Over_Epochs.png"))
plt.show()

# =========================
# Classification Report
# =========================
# Pass the full class list as `labels` so `target_names` always has a matching
# length, even when a class is missing from both the true and predicted values.
report = classification_report(
    actual_labels,
    predicted_labels,
    labels=list(label_encoder.classes_),
    target_names=[str(name) for name in label_encoder.classes_],
    zero_division=0,
)
print("\nüìÑ Classification Report:\n", report)

# =========================
# Save Summary
# =========================
def _format_metric(value):
    """Format a metric with four decimals, or 'n/a' when it is unavailable."""
    return "n/a" if value is None else f"{value:.4f}"


summary_path = os.path.join(img_dir, "RoBERTa_Model_Performance_Summary.txt")
# train_acc / train_loss may be None when no training history exists in this
# session; formatting None with ':.4f' would raise TypeError.
with open(summary_path, "w", encoding="utf-8") as f:
    f.write("RoBERTa Model Evaluation Summary\n")
    f.write("="*40 + "\n\n")
    f.write(f"Training Accuracy: {_format_metric(train_acc)}\n")
    f.write(f"Testing Accuracy: {_format_metric(test_accuracy)}\n")
    f.write(f"Training Loss: {_format_metric(train_loss)}\n")
    f.write(f"Testing Loss: {_format_metric(test_loss)}\n\n")
    f.write("Classification Report:\n")
    f.write(report)

print(f"\nüíæ All images and summary saved to:\nüìÅ {img_dir}")
