<a href="https://colab.research.google.com/github/Ayushverma41/Mental-State-Prediction-using-NLP/blob/main/Code/DISTILBert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =======================================
# üì¶ Install Dependencies
# =======================================
!pip install transformers datasets torch scikit-learn matplotlib seaborn -q


In [None]:
# =======================================
# üìö Import Libraries
# =======================================
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from transformers import DistilBertTokenizerFast, DistilBertModel, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import os


In [None]:
# =======================================
# 📂 Load Dataset
# =======================================
# Training CSV. The path lives under /content/drive, so Google Drive presumably
# has to be mounted before this cell runs — TODO confirm.
data_path = "/content/drive/MyDrive/Mental State model/Data/Train_Data.csv"
df = pd.read_csv(filepath_or_buffer=data_path)

# Sanity check: (rows, columns) followed by the first five rows.
print("Dataset shape:", df.shape)
print(df.head(n=5))


In [None]:
# =======================================
# 🔖 Encode Labels
# =======================================
# Learn the class vocabulary from the `status` column, then store the integer
# code of every row in a new `label` column.
le = LabelEncoder()
le.fit(df['status'])
df['label'] = le.transform(df['status'])

# Number of distinct classes; used later to size the classifier output.
num_labels = len(le.classes_)
print("\nLabel Mapping:", {name: code for code, name in enumerate(le.classes_)})

In [None]:
# =======================================
# 🧩 Train-Test Split
# =======================================
from datasets import Dataset

# 80/20 split, stratified on the encoded label and seeded for repeatability.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['statement'].tolist(),
    df['label'].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# ============================
# 🧩 Tokenization
# ============================
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Both splits share the same settings: truncate to 128 tokens and pad.
tokenize_kwargs = dict(truncation=True, padding=True, max_length=128)
train_encodings = tokenizer(train_texts, **tokenize_kwargs)
val_encodings = tokenizer(val_texts, **tokenize_kwargs)

# Wrap encodings and integer labels into `datasets.Dataset` objects for the Trainer.
train_dataset = Dataset.from_dict({**train_encodings, "labels": train_labels})
val_dataset = Dataset.from_dict({**val_encodings, "labels": val_labels})


In [None]:
# =======================================
# 🧠 Custom DistilBERT Model Architecture
# =======================================
class DistilBERT_MentalHealth(nn.Module):
    """DistilBERT encoder followed by a small MLP classification head.

    The hidden state at sequence position 0 is projected to ``embedding_dim``,
    passed through ``n_layers`` blocks of Linear -> ReLU -> Dropout, and mapped
    to ``output_dim`` logits.

    Args:
        embedding_dim: Width of the projection applied to the encoder output.
        hidden_dim: Width of every hidden block.
        output_dim: Number of classes (size of the logits vector).
        n_layers: Number of Linear/ReLU/Dropout blocks; 0 means the classifier
            is applied directly to the projection.
        dropout: Dropout probability used inside each hidden block.
        pretrained_name: Checkpoint passed to ``DistilBertModel.from_pretrained``.
    """

    def __init__(self, embedding_dim=128, hidden_dim=128, output_dim=5, n_layers=2, dropout=0.3,
                 pretrained_name='distilbert-base-uncased'):
        super().__init__()
        self.distilbert = DistilBertModel.from_pretrained(pretrained_name)
        # Read the encoder width from its config instead of hard-coding 768.
        self.embedding = nn.Linear(self.distilbert.config.dim, embedding_dim)

        layers = []
        input_dim = embedding_dim
        for _ in range(n_layers):
            layers.append(nn.Linear(input_dim, hidden_dim))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            input_dim = hidden_dim
        self.hidden_layers = nn.Sequential(*layers)

        # `input_dim` is the width of whatever feeds the classifier: hidden_dim
        # when there is at least one hidden block, embedding_dim when n_layers == 0.
        self.classifier = nn.Linear(input_dim, output_dim)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Run the encoder and head.

        Returns:
            dict with ``"logits"`` and ``"loss"``; ``"loss"`` is the
            cross-entropy against ``labels`` or ``None`` when no labels are given.
        """
        outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0]  # CLS token
        x = self.embedding(pooled_output)
        x = self.hidden_layers(x)
        logits = self.classifier(x)

        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)
        return {"loss": loss, "logits": logits}


In [None]:
# =======================================
# ⚙️ Initialize Model
# =======================================
# Head hyperparameters; the output size follows the number of encoded classes.
model_config = dict(
    embedding_dim=128,
    hidden_dim=128,
    output_dim=num_labels,
    n_layers=2,
    dropout=0.3,
)
model = DistilBERT_MentalHealth(**model_config)

In [None]:
# =======================================
# 📊 Metrics Function (Accuracy + F1)
# =======================================
def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into accuracy and F1 scores.

    The predicted class is the arg-max of the logits along axis 1.

    Returns:
        dict with keys ``accuracy``, ``f1_macro`` and ``f1_weighted``.
    """
    raw_scores, gold = eval_pred
    predicted = np.argmax(raw_scores, axis=1)
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1_macro": f1_score(gold, predicted, average='macro'),
        "f1_weighted": f1_score(gold, predicted, average='weighted'),
    }

In [None]:
# =======================================
# 🏋️ Training Configuration
# =======================================
import inspect

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# (TrainingArguments) and `tokenizer` to `processing_class` (Trainer). The
# install above is unpinned, so pick whichever keyword the installed version accepts.
_training_args_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = "eval_strategy" if "eval_strategy" in _training_args_params else "evaluation_strategy"

_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/Mental State model/DistilBERT/results',
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,                 # 5 epochs
    weight_decay=0.01,
    load_best_model_at_end=True,        # needs matching eval/save strategies
    logging_dir='/content/drive/MyDrive/Mental State model/DistilBERT/logs',
    logging_steps=50,
    report_to="none",
    disable_tqdm=False,
    metric_for_best_model="eval_loss",
    greater_is_better=False,            # lower eval_loss is better
    **{_eval_strategy_key: "epoch"},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # Stop when eval_loss has not improved for 2 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    **{_tokenizer_key: tokenizer},
)


In [None]:
# =======================================
# 🚀 Train Model (with progress bar)
# =======================================
# Status messages had mis-encoded emoji; restored to the intended characters.
print("🚀 Training started... (5 epochs, early stopping enabled)\n")
train_output = trainer.train()
print("\n✅ Training complete! Best model automatically loaded.")


In [None]:
# =======================================
# 📈 Evaluate on Validation Data
# =======================================
# Aggregate metrics on the Trainer's eval dataset (the validation split).
results = trainer.evaluate()
print("\n📊 Evaluation Results:", results)

# Per-example predictions on the same split, for the report and confusion matrix.
predictions = trainer.predict(val_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)

# Detailed Classification Report
print("\n🧾 Classification Report:\n")
# Pass `labels` explicitly so the report still lines up with `target_names`
# when a class is absent from both the true and predicted labels.
print(classification_report(
    val_labels,
    pred_labels,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
))


In [None]:
# =======================================
# 🔢 Confusion Matrix
# =======================================
# Rows are the true validation labels, columns the predicted ones.
cm = confusion_matrix(val_labels, pred_labels)

# Output folder for every figure of the training section.
save_path = "/content/drive/MyDrive/Mental State model/Images/DistilBERT/"
os.makedirs(save_path, exist_ok=True)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=le.classes_,
    yticklabels=le.classes_,
    ax=ax,
)
ax.set_title("DistilBERT Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
fig.savefig(os.path.join(save_path, "confusion_matrix.png"))
plt.show()


In [None]:
# =======================================
# 📉 Training vs Validation Loss Plot
# =======================================
train_loss = []
train_epochs = []
eval_loss = []
epochs = []

for log in trainer.state.log_history:
    if 'loss' in log:
        train_loss.append(log['loss'])
        # Record the (fractional) epoch of each training log so both curves
        # share the same x-axis.
        train_epochs.append(log['epoch'])
    # Calls to trainer.evaluate() after training appear to log a second entry
    # at the final epoch; keep only the first evaluation per epoch so the
    # curve does not double back.
    if 'eval_loss' in log and log['epoch'] not in epochs:
        eval_loss.append(log['eval_loss'])
        epochs.append(log['epoch'])

plt.figure(figsize=(8,6))
plt.plot(epochs, eval_loss, label='Validation Loss', marker='o')
plt.plot(train_epochs, train_loss, label='Training Loss', linestyle='--', alpha=0.7)
plt.title("Training vs Validation Loss (DistilBERT)")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.grid(True)
plt.savefig(os.path.join(save_path, "loss_plot.png"))
plt.show()


In [None]:
# =======================================
# 📊 Accuracy & F1 Comparison
# =======================================
# Pull the three scores out of `results`, falling back to 0 for a missing key.
# NOTE(review): `results` presumably holds validation metrics despite the
# `train_acc` name — confirm.
train_acc, f1_macro, f1_weighted = (
    results.get(key, 0)
    for key in ("eval_accuracy", "eval_f1_macro", "eval_f1_weighted")
)

metrics = {
    "Metric": ["Accuracy", "F1-Macro", "F1-Weighted"],
    "Score": [train_acc, f1_macro, f1_weighted],
}

fig, ax = plt.subplots(figsize=(6, 5))
sns.barplot(x="Metric", y="Score", data=pd.DataFrame(metrics), palette="crest", ax=ax)
ax.set_ylim(0, 1)
ax.set_title("Model Performance Metrics (DistilBERT)")
fig.savefig(os.path.join(save_path, "accuracy_f1_comparison.png"))
plt.show()


In [None]:
# =======================================
# 💾 Save Trained Model & Tokenizer
# =======================================
model_save_path = "/content/drive/MyDrive/Mental State model/Model/DistilBERT/"
os.makedirs(model_save_path, exist_ok=True)

# The model is a plain nn.Module, so only its state dict is stored; the
# architecture has to be recreated in code before loading it back.
torch.save(model.state_dict(), os.path.join(model_save_path, "pytorch_model.bin"))
tokenizer.save_pretrained(model_save_path)

# Status message had a mis-encoded check mark; restored.
print(f"✅ Trained model and tokenizer saved at:\n{model_save_path}")


In [None]:
# =======================================
# 🧩 Single Sentence Prediction
# =======================================
def predict_mental_state(sentence):
    """Predict the mental-state label for one sentence.

    Uses the notebook-level ``tokenizer``, ``model`` and ``le``.

    Args:
        sentence: Raw text to classify.

    Returns:
        The decoded class label from ``le.inverse_transform``.
    """
    # The model may have been moved to another device during training, so the
    # inputs must be placed on the same device.
    device = next(model.parameters()).device
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Inference only: disable dropout and gradient tracking.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    pred = torch.argmax(outputs["logits"], dim=1).item()
    return le.inverse_transform([pred])[0]

# Smoke test of the single-sentence helper (output label had a mis-encoded emoji; restored).
example_text = "I feel hopeless and anxious about my future."
print("\n🧠 Predicted Mental State:", predict_mental_state(example_text))


**Test Data Prediction & Output CSV**

In [None]:
# =======================================
# 📦 Imports
# =======================================
import torch
import pandas as pd
from transformers import DistilBertTokenizerFast
import os

# =======================================
# 📂 Paths
# =======================================
model_path = "/content/drive/MyDrive/Mental State model/Model/DistilBERT/"
test_path = "/content/drive/MyDrive/Mental State model/Data/Test_Data.csv"
output_path = "/content/drive/MyDrive/Mental State model/Data/"
os.makedirs(output_path, exist_ok=True)

# =======================================
# 🧠 Load Model and Tokenizer
# =======================================
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)

# Load the weights first so the number of classes can be read from the
# checkpoint instead of being hard-coded (training sized the classifier with
# `num_labels`, which is not necessarily 5).
state_dict = torch.load(os.path.join(model_path, "pytorch_model.bin"), map_location=torch.device('cpu'))
output_dim = state_dict["classifier.weight"].shape[0]

# Recreate the model architecture (must match training)
model = DistilBERT_MentalHealth(
    embedding_dim=128,
    hidden_dim=128,
    output_dim=output_dim,
    n_layers=2,
    dropout=0.3
)
model.load_state_dict(state_dict)
model.eval()  # inference mode: disables dropout

print("✅ Model and tokenizer loaded successfully!")


In [None]:
# =======================================
# 📑 Load Test Data
# =======================================
# Read the held-out CSV and show its size plus the first five rows.
test_df = pd.read_csv(filepath_or_buffer=test_path)
print("Test Data Shape:", test_df.shape)
print(test_df.head(n=5))

In [None]:
# =======================================
# 🔢 Encode Test Labels (for comparison)
# =======================================
# The encoder is rebuilt from the training labels so this section also works
# after a fresh runtime, where the original `le` no longer exists.
from sklearn.preprocessing import LabelEncoder

train_data_path = "/content/drive/MyDrive/Mental State model/Data/Train_Data.csv"
train_df = pd.read_csv(train_data_path)
le = LabelEncoder().fit(train_df["status"])

# Integer codes of the true test labels, used by the metric cells below.
test_df["encoded_label"] = le.transform(test_df["status"])

In [None]:
# =======================================
# 🧩 Tokenize Test Sentences
# =======================================
# Same truncation/padding settings as in training; tensors are returned so the
# encodings can be fed to the model directly.
test_sentences = test_df["statement"].tolist()
test_encodings = tokenizer(
    test_sentences,
    max_length=128,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

In [None]:
# =======================================
# 🔮 Predict on Test Data
# =======================================
# Run inference in mini-batches: pushing the whole test set through the model
# in one forward pass can exhaust memory on larger files.
batch_size = 32
device = next(model.parameters()).device
num_examples = test_encodings["input_ids"].shape[0]

logit_chunks = []
model.eval()
with torch.no_grad():
    for start in range(0, num_examples, batch_size):
        batch = {
            name: tensor[start:start + batch_size].to(device)
            for name, tensor in test_encodings.items()
        }
        logit_chunks.append(model(**batch)["logits"].cpu())

# Keep `outputs` in the same dict shape the model returns so later cells that
# read outputs["logits"] keep working.
outputs = {"loss": None, "logits": torch.cat(logit_chunks, dim=0)}
preds = torch.argmax(outputs["logits"], dim=1).cpu().numpy()

# Convert predicted labels back to text
predicted_labels = le.inverse_transform(preds)
test_df["Predicted_Status"] = predicted_labels


In [None]:
# =======================================
# 📊 Compare & Save Results
# =======================================
# Write the test frame (including encoded and predicted labels) to CSV.
output_csv = os.path.join(output_path, "DistilBERT_Test_Predictions.csv")
test_df.to_csv(output_csv, index=False)
print(f"✅ Predictions saved to:\n{output_csv}")

# Show sample comparison (status messages had mis-encoded emoji; restored)
print("\n🔍 Sample Predictions:")
print(test_df.head())

In [None]:
# =======================================
# 📦 Imports
# =======================================
import os

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, log_loss

# =======================================
# 📂 Save Path
# =======================================
# Folder that receives the test-evaluation figures; created when missing.
image_save_path = "/content/drive/MyDrive/Mental State model/Images/DistilBERT/"
os.makedirs(image_save_path, exist_ok=True)

In [None]:
# Convert encoded labels to numpy
true_labels = np.array(test_df["encoded_label"])
pred_labels = np.array(preds)

# Compute probabilities for loss calculation.
# Reuse the logits produced by the prediction cell (`outputs`) instead of
# running a second forward pass over the whole test set.
with torch.no_grad():
    logits = outputs["logits"]
    probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy()

# Metrics
test_accuracy = accuracy_score(true_labels, pred_labels)
test_f1_macro = f1_score(true_labels, pred_labels, average='macro')
# Pass the full label set so log_loss does not fail when a class is missing
# from the test labels.
test_loss = log_loss(true_labels, probs, labels=np.arange(probs.shape[1]))

print(f"\n📊 Test Accuracy: {test_accuracy:.4f}")
print(f"🎯 Test F1 (Macro): {test_f1_macro:.4f}")
print(f"💥 Test Loss: {test_loss:.4f}")


In [None]:
# Confusion matrix on the test set: rows are true labels, columns predictions.
cm_test = confusion_matrix(true_labels, pred_labels)

plt.figure(figsize=(8,6))
sns.heatmap(cm_test, annot=True, fmt='d', cmap='Oranges',
            xticklabels=le.classes_, yticklabels=le.classes_)
# Title had a mis-encoded em dash; restored.
plt.title("DistilBERT Confusion Matrix — Test Data")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.savefig(os.path.join(image_save_path, "confusion_matrix_test.png"))
plt.show()


In [None]:
# Get predictions of the trained model on the training split.
# Note: this rebinds `train_labels` to the label ids returned by the Trainer.
train_predictions = trainer.predict(train_dataset)
train_preds = np.argmax(train_predictions.predictions, axis=1)
train_labels = np.array(train_predictions.label_ids)

cm_train = confusion_matrix(train_labels, train_preds)

plt.figure(figsize=(8,6))
sns.heatmap(cm_train, annot=True, fmt='d', cmap='Blues',
            xticklabels=le.classes_, yticklabels=le.classes_)
# Title had a mis-encoded em dash; restored.
plt.title("DistilBERT Confusion Matrix — Training Data")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.savefig(os.path.join(image_save_path, "confusion_matrix_train.png"))
plt.show()


In [None]:
# Compute accuracy and loss on the TRAINING split.
# A bare trainer.evaluate() scores the Trainer's eval dataset (the validation
# split), which would mislabel validation metrics as "Training" below.
train_results = trainer.evaluate(eval_dataset=train_dataset)
train_accuracy = train_results["eval_accuracy"]
train_loss = train_results["eval_loss"]

# Accuracy comparison
plt.figure(figsize=(6,5))
sns.barplot(x=["Training", "Testing"],
            y=[train_accuracy, test_accuracy],
            palette=["skyblue", "salmon"])
plt.title("DistilBERT Accuracy Comparison")
plt.ylabel("Accuracy")
plt.ylim(0, 1)
plt.savefig(os.path.join(image_save_path, "accuracy_comparison.png"))
plt.show()


In [None]:
# Side-by-side bars of the loss reported as "Training" and the test-set loss.
fig, ax = plt.subplots(figsize=(6, 5))
sns.barplot(
    x=["Training", "Testing"],
    y=[train_loss, test_loss],
    palette=["skyblue", "salmon"],
    ax=ax,
)
ax.set_title("DistilBERT Loss Comparison")
ax.set_ylabel("Loss")
fig.savefig(os.path.join(image_save_path, "loss_comparison.png"))
plt.show()


In [None]:
# Final text summary of the metrics computed above (mis-encoded emoji restored).
print("✅ Evaluation Summary")
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Training Loss: {train_loss:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")
print(f"Testing Loss: {test_loss:.4f}")
print(f"Testing F1 (Macro): {test_f1_macro:.4f}")

print(f"\n📁 All images saved to:\n{image_save_path}")