<a href="https://colab.research.google.com/github/Ayushverma41/Mental-State-Prediction-using-NLP/blob/main/Code/BiLSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ==========================================================
# STEP 1: Import Dependencies
# ==========================================================
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, precision_score, recall_score
from torch.nn.utils.rnn import pad_sequence
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import BertTokenizer
import joblib

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


In [None]:
# Mount Google Drive; the dataset, checkpoint and image paths used below all
# live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
train_data_path = "/content/drive/MyDrive/Mental State model/Data/Train_Data.csv"
# Malformed CSV rows are skipped rather than aborting the whole load.
df = pd.read_csv(train_data_path, on_bad_lines='skip')

# Fail fast when the expected columns are absent.
assert "statement" in df.columns and "status" in df.columns, "Dataset must have 'statement' and 'status' columns"

# Rows without a statement or a status cannot be tokenized or labelled: the
# tokenizer rejects non-string input (NaN is a float). Drop them here and make
# sure every remaining statement really is a string.
rows_before = len(df)
df = df.dropna(subset=["statement", "status"]).reset_index(drop=True)
df["statement"] = df["statement"].astype(str)
if len(df) != rows_before:
    print(f"Dropped {rows_before - len(df)} rows with missing statement/status")

# Map the string status values to integer class ids.
le = LabelEncoder()
df['label'] = le.fit_transform(df['status'])
num_classes = len(le.classes_)
print("Classes:", le.classes_)

# 80/20 train/validation split with a fixed seed for reproducibility.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['statement'].values, df['label'].values, test_size=0.2, random_state=42
)

print("Data loaded and split successfully!")


In [None]:
# Pretrained 'bert-base-uncased' tokenizer, used below to turn statements into
# token ids and to size the embedding table.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Upper bound on token ids per statement; longer inputs are truncated.
MAX_LEN = 64

def encode_sentences(texts):
    """Convert every text in ``texts`` to a list of token ids.

    Special tokens are added and each sequence is truncated to at most
    ``MAX_LEN`` ids. The result holds one id list per input, in input order.
    """
    encoded = []
    for sentence in texts:
        token_ids = tokenizer.encode(
            sentence,
            add_special_tokens=True,
            truncation=True,
            max_length=MAX_LEN,
        )
        encoded.append(token_ids)
    return encoded

# Tokenize both splits once up front so the datasets only index stored id lists.
train_encodings = encode_sentences(train_texts)
val_encodings = encode_sentences(val_texts)

class MentalHealthDataset(Dataset):
    """Dataset that pairs stored token-id sequences with their labels."""

    def __init__(self, encodings, labels):
        # The two containers are indexed in parallel by __getitem__.
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of samples is taken from the label container.
        return len(self.labels)

    def __getitem__(self, idx):
        # Both items are converted to tensors on access.
        token_ids = torch.tensor(self.encodings[idx])
        target = torch.tensor(self.labels[idx])
        return token_ids, target

def collate_fn(batch):
    """Collate ``(token_ids, label)`` pairs into one padded batch.

    Sequences are right-padded with 0 to the longest sequence in the batch
    (batch-first layout); labels are gathered into a single tensor.
    """
    token_seqs = [pair[0] for pair in batch]
    targets = tuple(pair[1] for pair in batch)
    padded = pad_sequence(token_seqs, batch_first=True, padding_value=0)
    return padded, torch.tensor(targets)

# Wrap the encoded splits so they can be indexed as (token_ids, label) pairs.
train_dataset = MentalHealthDataset(train_encodings, train_labels)
val_dataset = MentalHealthDataset(val_encodings, val_labels)

# Batches of 32; collate_fn pads each batch to its longest sequence.
# Only the training loader shuffles.
# NOTE(review): if the final training batch ends up with a single sample, the
# model's BatchNorm1d layer would likely raise in train mode — consider
# drop_last=True on the training loader; confirm against the dataset size.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, collate_fn=collate_fn)


In [None]:
class BiLSTMClassifier(nn.Module):
    """Bidirectional LSTM text classifier with padding-aware mean pooling.

    Pipeline: embedding -> BiLSTM -> mean over the real (non-padding) time
    steps -> dropout -> linear -> batch norm over the class logits.

    Token id 0 is the padding id (``padding_idx=0``). Padding is assumed to be
    trailing, so the number of non-zero ids in a row is its true length.
    Sequences are packed before the LSTM and the mean is taken over real
    tokens only, so the logits for a sentence do not depend on how much
    padding the rest of its batch forced onto it.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        num_classes: Number of output logits.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(0.5)
        # Forward and backward states are concatenated, hence hidden_dim * 2.
        self.fc = nn.Linear(hidden_dim * 2, num_classes)
        # Kept so previously saved checkpoints (state_dict keys) still load.
        self.batch_norm = nn.BatchNorm1d(num_classes)

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for ids ``x`` of shape (batch, seq_len)."""
        # True length of every row; clamp so an all-padding row can still be packed.
        lengths = (x != self.embedding.padding_idx).sum(dim=1).clamp(min=1)

        embeds = self.embedding(x)
        # Packing stops the LSTM (both directions) from consuming padding steps.
        packed = nn.utils.rnn.pack_padded_sequence(
            embeds, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        packed_out, _ = self.lstm(packed)
        lstm_out, _ = nn.utils.rnn.pad_packed_sequence(
            packed_out, batch_first=True, total_length=x.size(1)
        )

        # Unpacked padding positions are zeros, so sum / length is the mean
        # over real tokens only.
        avg_pool = lstm_out.sum(dim=1) / lengths.unsqueeze(1).to(lstm_out.dtype)
        out = self.dropout(avg_pool)
        out = self.fc(out)
        out = self.batch_norm(out)
        return out

# The embedding table is sized to the tokenizer's vocabulary; 128 is used for
# both the embedding size and the per-direction LSTM hidden size.
vocab_size = tokenizer.vocab_size
model = BiLSTMClassifier(vocab_size, 128, 128, num_classes).to(device)


In [None]:
import os

# Loss, optimizer and LR schedule. The scheduler halves the learning rate when
# the monitored value (validation loss) has not improved for 2 epochs.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-4, weight_decay=1e-5)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)

# Early-stopping state: stop after `patience` epochs without a better val loss.
EPOCHS = 5
patience = 3
best_val_loss = float('inf')
epochs_no_improve = 0

# Per-epoch history, consumed by the plotting cell.
train_losses, val_losses, val_f1_scores = [], [], []
save_path = "/content/drive/MyDrive/Mental State model/Model/BiLSTM/bilstm_model.pth"
# torch.save does not create missing directories; make sure the checkpoint
# folder exists now instead of failing after the first epoch has trained.
os.makedirs(os.path.dirname(save_path), exist_ok=True)


In [None]:
# Train for up to EPOCHS epochs with validation, LR scheduling on the
# validation loss, best-checkpoint saving and early stopping.
for epoch in range(EPOCHS):
    model.train()
    total_train_loss, train_samples = 0.0, 0
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}", leave=False)

    for inputs, labels in loop:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()

        # Clip the global gradient norm before the optimizer step.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so the
        # epoch figure is a true per-sample mean even when the last batch is
        # smaller than the others.
        batch_size = labels.size(0)
        total_train_loss += loss.item() * batch_size
        train_samples += batch_size
        loop.set_postfix(loss=loss.item())

    avg_train_loss = total_train_loss / max(train_samples, 1)
    train_losses.append(avg_train_loss)

    # Validation
    model.eval()
    val_loss, val_samples, all_preds, all_labels = 0.0, 0, [], []
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            batch_size = labels.size(0)
            val_loss += loss.item() * batch_size
            val_samples += batch_size
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_val_loss = val_loss / max(val_samples, 1)
    f1 = f1_score(all_labels, all_preds, average="weighted")
    val_losses.append(avg_val_loss)
    val_f1_scores.append(f1)

    # ReduceLROnPlateau is driven by the validation loss.
    scheduler.step(avg_val_loss)

    print(f"Epoch [{epoch+1}/{EPOCHS}] | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | F1: {f1:.4f}")

    # Early stopping: checkpoint on improvement, otherwise count towards patience.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), save_path)
        epochs_no_improve = 0
        print("‚úÖ Model improved ‚Äî saved!")
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print("‚èπÔ∏è Early stopping triggered.")
            break


In [None]:
# ==========================================================
# STEP X: Centralized Save Directory for All Images
# ==========================================================
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Create the save directory if it doesn't exist; later cells write every
# figure into it.
save_dir = "/content/drive/MyDrive/Mental State model/Images/BiLSTM/"
os.makedirs(save_dir, exist_ok=True)

print(f"üìÅ All plots will be saved to: {save_dir}")

In [None]:
# ==========================================================
# STEP 7 (UPDATED): Training Visualization
# ==========================================================
# One figure, two side-by-side panels: losses on the left, validation F1 on
# the right. Each panel is described as (position, title, curves).
plt.figure(figsize=(12,5))

panels = (
    (1, 'Loss over Epochs', (
        (train_losses, {'label': 'Train Loss'}),
        (val_losses, {'label': 'Validation Loss'}),
    )),
    (2, 'F1 Score over Epochs', (
        (val_f1_scores, {'label': 'Validation F1 Score', 'color': 'orange'}),
    )),
)

for position, heading, curves in panels:
    plt.subplot(1, 2, position)
    for series, line_kwargs in curves:
        plt.plot(series, **line_kwargs)
    plt.title(heading)
    plt.legend()

plt.tight_layout()

# Write the figure to disk and release it instead of displaying it inline.
loss_f1_plot_path = os.path.join(save_dir, "Training_Loss_F1.png")
plt.savefig(loss_f1_plot_path, bbox_inches='tight', dpi=300)
plt.close()

print(f"‚úÖ Saved: {loss_f1_plot_path}")


In [None]:
# Load best model
# map_location lets a checkpoint that was saved on a GPU runtime be restored
# on whatever device this session is using (including CPU-only).
model.load_state_dict(torch.load(save_path, map_location=device))
model.eval()

# Load test data
test_data_path = "/content/drive/MyDrive/Mental State model/Data/Test_Data.csv"
output_path = "/content/drive/MyDrive/Mental State model/Data/MentalHealth_BiLSTM_Predictions.csv"

test_df = pd.read_csv(test_data_path)
assert "statement" in test_df.columns, "Test CSV must have a 'statement' column."

# Encode test statements
# Missing statements become empty strings so the tokenizer always receives
# str input; rows are kept (not dropped) so predictions stay aligned with test_df.
test_statements = test_df["statement"].fillna("").astype(str).values
encoded_test = [tokenizer.encode(text, add_special_tokens=True, truncation=True, max_length=MAX_LEN)
                for text in test_statements]

# Predict one statement at a time (batch of 1, so no padding is involved).
predicted_ids = []
with torch.no_grad():
    for enc in encoded_test:
        input_tensor = torch.tensor(enc).unsqueeze(0).to(device)
        output = model(input_tensor)
        _, pred = torch.max(output, 1)
        predicted_ids.append(pred.item())

# Decode all class ids in a single call instead of once per row.
predicted_ids = np.asarray(predicted_ids, dtype=int)
predicted_labels = list(le.inverse_transform(predicted_ids))

# Save results
test_df["Predicted_Status"] = predicted_labels

# When ground truth is present, report metrics and show a confusion matrix.
if "status" in test_df.columns:
    test_df.rename(columns={"status": "Actual_Status"}, inplace=True)
    actual = le.transform(test_df["Actual_Status"])
    pred = predicted_ids
    acc = accuracy_score(actual, pred)
    f1 = f1_score(actual, pred, average="weighted")
    prec = precision_score(actual, pred, average="weighted")
    rec = recall_score(actual, pred, average="weighted")
    print(f"\n‚úÖ Test Set Results:\nAccuracy: {acc:.4f} | F1: {f1:.4f} | Precision: {prec:.4f} | Recall: {rec:.4f}")

    # Pass every class id explicitly so the matrix always has one row/column
    # per entry of le.classes_, even if a class never occurs in this test set.
    cm = confusion_matrix(actual, pred, labels=np.arange(len(le.classes_)))
    plt.figure(figsize=(8,6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=le.classes_, yticklabels=le.classes_)
    plt.title("Confusion Matrix (Test Set)")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.show()

# Save CSV
test_df.to_csv(output_path, index=False)
print(f"‚úÖ Predictions saved to:\n{output_path}")


In [None]:
# ==========================================================
# STEP 9: Compare Predictions with Actual Status
# ==========================================================
from sklearn.metrics import classification_report

# Full evaluation on the test set; requires the actual and predicted label
# columns produced by the prediction cell.
if "Actual_Status" in test_df.columns and "Predicted_Status" in test_df.columns:
    # Encode actual and predicted labels
    y_true = le.transform(test_df["Actual_Status"])
    y_pred = le.transform(test_df["Predicted_Status"])

    # Every class id known to the encoder. Passing these explicitly keeps the
    # report and the confusion matrix aligned with le.classes_ even when a
    # class is absent from both y_true and y_pred (otherwise
    # classification_report rejects target_names of a different length).
    class_ids = np.arange(len(le.classes_))

    # Compute loss on test set (one statement per forward pass).
    test_loss = 0
    model.eval()
    with torch.no_grad():
        for enc, label in zip(encoded_test, y_true):
            input_tensor = torch.tensor(enc).unsqueeze(0).to(device)
            label_tensor = torch.tensor([label]).to(device)
            output = model(input_tensor)
            loss = criterion(output, label_tensor)
            test_loss += loss.item()
    # Guard against an empty test set.
    avg_test_loss = test_loss / max(len(encoded_test), 1)

    # Metrics
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='weighted')
    prec = precision_score(y_true, y_pred, average='weighted')
    rec = recall_score(y_true, y_pred, average='weighted')

    print("\n==================== Test Evaluation ====================")
    print(f"‚úÖ Test Accuracy: {acc:.4f}")
    print(f"‚úÖ Test F1-Score: {f1:.4f}")
    print(f"‚úÖ Test Precision: {prec:.4f}")
    print(f"‚úÖ Test Recall: {rec:.4f}")
    print(f"‚úÖ Average Test Loss: {avg_test_loss:.4f}")

    print("\nDetailed Classification Report:")
    print(classification_report(y_true, y_pred, labels=class_ids, target_names=le.classes_))

    # Confusion Matrix
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    plt.figure(figsize=(8,6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Greens',
                xticklabels=le.classes_, yticklabels=le.classes_)
    plt.title("üß† Confusion Matrix ‚Äî Test Set")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

else:
    print("‚ö†Ô∏è Test data must include both 'Actual_Status' and 'Predicted_Status' columns to compute metrics.")


In [None]:
# ==========================================================
# STEP 10: Single Sentence Prediction Function
# ==========================================================

def predict_single_sentence(sentence, model, tokenizer, label_encoder, max_len=64):
    """Predict the label of a single sentence.

    Args:
        sentence: Raw input text.
        model: Classifier called with a ``(1, seq_len)`` tensor of token ids
            and returning ``(1, num_classes)`` scores.
        tokenizer: Object whose ``encode`` returns a list of token ids.
        label_encoder: Object whose ``inverse_transform`` maps class ids back
            to labels.
        max_len: Maximum number of token ids; longer input is truncated.

    Returns:
        The decoded label of the highest-scoring class.

    Side effects:
        Puts ``model`` into eval mode.
    """
    model.eval()

    # Run on the device the model actually lives on instead of relying on a
    # module-level `device` variable; fall back to CPU for parameter-free models.
    try:
        target_device = next(model.parameters()).device
    except StopIteration:
        target_device = torch.device("cpu")

    with torch.no_grad():
        # Tokenize and encode the input, then add the batch dimension.
        encoded = tokenizer.encode(sentence, add_special_tokens=True, truncation=True, max_length=max_len)
        input_tensor = torch.tensor(encoded).unsqueeze(0).to(target_device)

        # Forward pass
        output = model(input_tensor)
        _, predicted_class = torch.max(output, dim=1)

        # Decode predicted label
        predicted_label = label_encoder.inverse_transform(predicted_class.cpu().numpy())[0]
    return predicted_label


# ==========================================================
# Example: Use the function interactively
# ==========================================================
# Run the single-sentence helper on one hard-coded example and print the result.
example_sentence = "I feel so hopeless and tired of everything lately."
predicted_status = predict_single_sentence(example_sentence, model, tokenizer, le)

print(f"\nüó£Ô∏è Input Sentence: {example_sentence}")
print(f"ü§ñ Predicted Mental State: {predicted_status}")


In [None]:
# ==========================================================
# STEP 11 (UPDATED): Accuracy Comparison between Training and Testing
# ==========================================================
from sklearn.metrics import accuracy_score

# Collect predictions over the training loader (inference only, no gradients).
model.eval()
train_preds, train_labels_all = [], []

with torch.no_grad():
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = model(inputs)
        _, preds = torch.max(outputs, 1)
        train_preds.extend(preds.cpu().numpy())
        train_labels_all.extend(labels.cpu().numpy())

train_accuracy = accuracy_score(train_labels_all, train_preds)

# The comparison chart needs both label columns from the prediction cell.
if not ("Actual_Status" in test_df.columns and "Predicted_Status" in test_df.columns):
    print("‚ö†Ô∏è Test accuracy could not be computed (missing columns).")
else:
    test_accuracy = accuracy_score(
        le.transform(test_df["Actual_Status"]),
        le.transform(test_df["Predicted_Status"])
    )

    # Bar chart: one bar per split.
    labels = ['Training Accuracy', 'Testing Accuracy']
    accuracies = [train_accuracy, test_accuracy]

    plt.figure(figsize=(7, 5))
    bars = plt.bar(labels, accuracies, color=['skyblue', 'lightgreen'], edgecolor='black')
    plt.ylim(0, 1.0)
    plt.ylabel("Accuracy")
    plt.title("Training vs Testing Accuracy Comparison")

    # Print each bar's value just above its top edge.
    for bar in bars:
        height = bar.get_height()
        plt.text(
            bar.get_x() + bar.get_width()/2,
            height + 0.01,
            f"{height:.4f}",
            ha='center',
            fontsize=10,
            fontweight='bold',
        )

    # Save to disk and release the figure.
    acc_chart_path = os.path.join(save_dir, "Accuracy_Comparison.png")
    plt.savefig(acc_chart_path, bbox_inches='tight', dpi=300)
    plt.close()

    print(f"‚úÖ Saved Accuracy Comparison Chart: {acc_chart_path}")


In [None]:
# ==========================================================
# STEP 12 (UPDATED): Confusion Matrix & Heatmap for Train and Test Data
# ==========================================================
from sklearn.metrics import confusion_matrix

# Every class id known to the encoder. Passing these to confusion_matrix keeps
# each matrix at one row/column per entry of le.classes_, so the heatmap tick
# labels stay aligned even if a class is missing from a split.
class_ids = np.arange(len(le.classes_))


def _draw_confusion_matrix(cm, cmap, title, xlabel, ylabel):
    """Draw one annotated confusion-matrix heatmap on the current axes."""
    sns.heatmap(cm, annot=True, fmt="d", cmap=cmap,
                xticklabels=le.classes_, yticklabels=le.classes_)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)


# ---- Training ----
# Collect predictions over the training loader (inference only).
train_preds, train_labels_all = [], []
model.eval()
with torch.no_grad():
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, preds = torch.max(outputs, 1)
        train_preds.extend(preds.cpu().numpy())
        train_labels_all.extend(labels.cpu().numpy())

train_cm = confusion_matrix(train_labels_all, train_preds, labels=class_ids)
train_acc = accuracy_score(train_labels_all, train_preds)

# ---- Testing ----
# Uses the label columns written by the prediction cell.
y_true_test = le.transform(test_df["Actual_Status"])
y_pred_test = le.transform(test_df["Predicted_Status"])
test_cm = confusion_matrix(y_true_test, y_pred_test, labels=class_ids)
test_acc = accuracy_score(y_true_test, y_pred_test)

train_title = f"Training Confusion Matrix (Accuracy: {train_acc:.4f})"
test_title = f"Testing Confusion Matrix (Accuracy: {test_acc:.4f})"

# ---- Training Confusion Matrix ----
plt.figure(figsize=(7, 6))
_draw_confusion_matrix(train_cm, "Blues", train_title, "Predicted Labels", "True Labels")
train_cm_path = os.path.join(save_dir, "Training_Confusion_Matrix.png")
plt.savefig(train_cm_path, bbox_inches='tight', dpi=300)
plt.close()

# ---- Testing Confusion Matrix ----
plt.figure(figsize=(7, 6))
_draw_confusion_matrix(test_cm, "Greens", test_title, "Predicted Labels", "True Labels")
test_cm_path = os.path.join(save_dir, "Testing_Confusion_Matrix.png")
plt.savefig(test_cm_path, bbox_inches='tight', dpi=300)
plt.close()

# ---- Combined Side-by-Side ----
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
_draw_confusion_matrix(train_cm, "Blues", train_title, "Predicted", "True")

plt.subplot(1, 2, 2)
_draw_confusion_matrix(test_cm, "Greens", test_title, "Predicted", "True")

plt.tight_layout()
combined_cm_path = os.path.join(save_dir, "Combined_Confusion_Matrix.png")
plt.savefig(combined_cm_path, bbox_inches='tight', dpi=300)
plt.close()

print(f"""
‚úÖ All confusion matrices saved:
   - Training: {train_cm_path}
   - Testing:  {test_cm_path}
   - Combined: {combined_cm_path}
""")
