In [22]:
# Install/upgrade the libraries used by this notebook.
# NOTE(review): `--upgrade` without version pins can replace packages that a running kernel has
# already imported; the ImportError at the end of this cell's output (transformers.video_utils)
# looks like that kind of mismatch — consider pinning versions and restarting the kernel after
# installing. TODO confirm.
!pip install -q --upgrade transformers datasets accelerate scikit-learn torch

import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from transformers.trainer_callback import EarlyStoppingCallback
from torch import nn
import os, warnings
# Runtime configuration, applied before any tokenizer/model work happens.
# Keep Weights & Biases logging switched off for this notebook.
os.environ["WANDB_DISABLED"] = "true"
# Set tokenizer parallelism explicitly: the tokenizers library prints a
# "process just got forked" warning whenever a shell command (`!pip`, `!zip`)
# forks the kernel after tokenizers has been used, unless this variable is set.
# `setdefault` keeps any value the user exported beforehand.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
# Silence UserWarning noise from the libraries.
warnings.filterwarnings("ignore", category=UserWarning)

# ============================================================
# 1️⃣ Custom Weighted Trainer
# ============================================================

class WeightedTrainer(Trainer):
    """Trainer whose loss is class-weighted cross-entropy.

    Args:
        *args, **kwargs: forwarded unchanged to ``Trainer.__init__``.
        class_weights: optional 1-D sequence/tensor holding one weight per
            class. When omitted, the module-level ``weights`` tensor is looked
            up at loss time, so existing ``WeightedTrainer(...)`` calls that do
            not pass this argument keep working.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: the model to run; may be wrapped (anything exposing
                ``.module``), in which case the wrapped model's config is used.
            inputs: batch dict. The target is taken from ``"labels"``, or from
                ``"label_id"`` when ``"labels"`` is absent. Target keys and
                Trainer-only keys are removed from the dict before the
                forward pass.
            return_outputs: when true, return ``(loss, outputs)``.
            **kwargs: extra arguments passed by Trainer (for example
                ``num_items_in_batch``); accepted and ignored.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs``.

        Raises:
            ValueError: if the batch carries neither ``"labels"`` nor
                ``"label_id"``.
        """
        # Pull the targets out so they are not forwarded to the model.
        labels = inputs.pop("labels", None)
        if labels is None:
            labels = inputs.pop("label_id", None)
        if labels is None:
            raise ValueError(
                "WeightedTrainer.compute_loss needs 'labels' (or 'label_id') in the batch."
            )

        # Trainer-only keys must not reach model.forward().
        for key in ("num_items_in_batch", "loss_reduction"):
            inputs.pop(key, None)

        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Read the config from the wrapped model when a wrapper is in use.
        actual_model = model.module if hasattr(model, "module") else model
        num_labels = actual_model.config.num_labels

        # Prefer the weights given to the constructor; otherwise fall back to
        # the notebook-level `weights` tensor.
        class_weights = self.class_weights if self.class_weights is not None else weights
        # Place the weights where the logits are: that is the device the loss
        # is actually computed on.
        class_weights = torch.as_tensor(
            class_weights, dtype=torch.float, device=logits.device
        )

        # Compute the loss in float32 so half-precision logits cannot clash
        # with the float32 class weights.
        loss = nn.functional.cross_entropy(
            logits.view(-1, num_labels).float(),
            labels.view(-1).to(logits.device),
            weight=class_weights,
        )

        return (loss, outputs) if return_outputs else loss


# ============================================================
# 2️⃣ Load and prepare dataset
# ============================================================
# Read the CSV and keep only rows that have both a statement and a status.
df = pd.read_csv(
    "/kaggle/input/sentiment-analysis-for-mental-health/Combined Data.csv"
).dropna(subset=["statement", "status"])

# Normalised label text: lower-cased status with surrounding whitespace removed.
df = df.assign(label=df["status"].str.lower().str.strip())

# Label <-> id mappings, with ids assigned in sorted label order.
labels = sorted(df["label"].unique())
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}
df = df.assign(label_id=df["label"].map(label2id))

print("üß© Labels:", label2id)
print("üìä Dataset size:", len(df))

# Hold out 15% for validation, stratified on the label id.
train_df, val_df = train_test_split(
    df,
    test_size=0.15,
    random_state=42,
    stratify=df["label_id"],
)

# Wrap both splits as Hugging Face datasets.
train_ds, val_ds = (Dataset.from_pandas(frame) for frame in (train_df, val_df))

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize(batch):
    """Tokenize the "statement" texts of a batch, truncated/padded to 128 tokens."""
    return tokenizer(
        batch["statement"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )


# For each split: tokenize in batches, rename the integer target column to
# "labels" (so Trainer recognizes it), and return torch tensors for only the
# three listed columns.
train_ds, val_ds = (
    split.map(tokenize, batched=True)
    .rename_column("label_id", "labels")
    .with_format("torch", columns=["input_ids", "attention_mask", "labels"])
    for split in (train_ds, val_ds)
)

# ============================================================
# 3️⃣ Compute class weights
# ============================================================
# Number of training examples per class, ordered by label id.
class_counts = train_df["label_id"].value_counts().sort_index().to_numpy()
# Inverse-frequency weights, rescaled so that they sum to the number of classes.
weights = torch.tensor(class_counts, dtype=torch.float).reciprocal()
weights = weights.div(weights.sum()).mul(len(class_counts))
print("‚öñÔ∏è Class Weights:", weights.tolist())

# ============================================================
# 4️⃣ Load model
# ============================================================
# Load the checkpoint as a sequence classifier with one output per label;
# the id/label maps are handed to from_pretrained together with num_labels.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=len(labels), id2label=id2label, label2id=label2id
)

# ============================================================
# 5️⃣ Metrics and training args
# ============================================================
def compute_metrics(pred):
    """Return accuracy and macro-averaged F1 for one evaluation pass.

    Predicted class ids are the arg-max of ``pred.predictions`` along axis 1;
    they are scored against ``pred.label_ids``.
    """
    y_true = pred.label_ids
    y_pred = np.argmax(pred.predictions, axis=1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1_macro": f1_score(y_true, y_pred, average="macro"),
    }

# Training configuration: evaluate and checkpoint once per epoch, and reload
# the checkpoint with the best macro-F1 when training finishes.
training_args = TrainingArguments(
    output_dir="/kaggle/working/mental_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    load_best_model_at_end=True,
    logging_dir="/kaggle/working/logs",
    # Mixed precision is only requested when a CUDA device is present, so the
    # notebook still runs on a CPU-only session.
    fp16=torch.cuda.is_available(),
    warmup_ratio=0.1,
    # "f1_macro" is the key returned by compute_metrics.
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    save_total_limit=2,
    # Explicitly disable logging integrations here; the WANDB_DISABLED
    # environment variable is reported as deprecated in this notebook's output.
    report_to="none",
)

# ============================================================
# 6️⃣ Trainer + EarlyStopping
# ============================================================
# Assemble the weighted-loss trainer from the model, arguments, tokenized
# splits, metric function, and an early-stopping callback with a patience of 2.
trainer = WeightedTrainer(
    model=model, args=training_args,
    train_dataset=train_ds, eval_dataset=val_ds,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# ============================================================
# 7️⃣ Train
# ============================================================
# Run the fine-tuning loop.
trainer.train()

# ============================================================
# 8️⃣ Evaluate
# ============================================================
# Predict on the validation split and turn logits into class ids.
preds = trainer.predict(val_ds)
pred_labels = np.argmax(preds.predictions, axis=1)
print("\nüìã Classification Report:\n")
print(
    classification_report(
        # Use the label ids returned with the predictions so the ground truth
        # is aligned row-for-row with `pred_labels`.
        preds.label_ids,
        pred_labels,
        # Passing every class id explicitly keeps `target_names` matched to the
        # right rows even if some class never appears in this split.
        labels=list(range(len(labels))),
        target_names=labels,
    )
)

# ============================================================
# 9️⃣ Save
# ============================================================
save_path = "/kaggle/working/mentalwell_model_final"
# Write the tokenizer files and the model files into the same directory.
tokenizer.save_pretrained(save_path)
model.save_pretrained(save_path)
print(f"\n‚úÖ Model saved to: {save_path}")

# ============================================================
# 🔟 Test Prediction
# ============================================================
from transformers import pipeline

# Reload the saved artifacts from disk and build a text-classification pipeline.
# The device is passed explicitly: without it the pipeline keeps the model on
# CPU even when a GPU is available.
clf = pipeline(
    "text-classification",
    model=save_path,
    tokenizer=save_path,
    return_all_scores=True,
    device=0 if torch.cuda.is_available() else -1,
)

examples = [
    "I feel completely worthless and tired.",
    "I am doing okay, just a little stressed.",
    "I'm so happy I talked to my therapist today!",
]

for text in examples:
    # Stored under its own name so the `preds` object returned by
    # trainer.predict is not overwritten with pipeline output.
    scores = clf(text)
    print(f"\nüß† Text: {text}")
    for e in scores[0]:
        print(f"  {e['label']}: {e['score']:.3f}")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


üß© Labels: {'anxiety': 0, 'bipolar': 1, 'depression': 2, 'normal': 3, 'personality disorder': 4, 'stress': 5, 'suicidal': 6}
üìä Dataset size: 52681


Map:   0%|          | 0/44778 [00:00<?, ? examples/s]

Map:   0%|          | 0/7903 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


‚öñÔ∏è Class Weights: [0.8456476330757141, 1.1699321269989014, 0.21087905764579773, 0.1987646520137787, 3.014235496520996, 1.2555887699127197, 0.304952472448349]


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.73,0.598715,0.764773,0.742266
2,0.4898,0.533104,0.811211,0.790274
3,0.3475,0.549305,0.812603,0.798602
4,0.2553,0.548064,0.823864,0.810938



üìã Classification Report:

                      precision    recall  f1-score   support

             anxiety       0.87      0.88      0.87       576
             bipolar       0.83      0.88      0.86       417
          depression       0.81      0.72      0.76      2311
              normal       0.97      0.93      0.95      2452
personality disorder       0.71      0.78      0.74       161
              stress       0.71      0.84      0.77       388
            suicidal       0.68      0.77      0.72      1598

            accuracy                           0.82      7903
           macro avg       0.80      0.83      0.81      7903
        weighted avg       0.83      0.82      0.83      7903


‚úÖ Model saved to: /kaggle/working/mentalwell_model_final
üö® `do_pad` is part of DefaultFastImageProcessorKwargs, but not documented. Make sure to add it to the docstring of the function in /usr/local/lib/python3.11/dist-packages/transformers/image_processing_utils_fast.py.
üö® `

ImportError: cannot import name 'make_batched_metadata' from 'transformers.video_utils' (/usr/local/lib/python3.11/dist-packages/transformers/video_utils.py)

In [28]:
# Reinstall transformers and safetensors at fixed versions before reloading the saved model.
# NOTE(review): this cell's pip output reports dependency conflicts (numpy 2.3.4 and pyarrow 22.0.0
# versus several preinstalled packages); presumably `--force-reinstall` also reinstalled
# dependencies — confirm, and restart the kernel after installing so imports match what is on disk.
!pip install -q --force-reinstall transformers==4.44.2 safetensors==0.4.5

from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Imported here so this cell also works on a fresh kernel (it is only needed
# for the device check below).
import torch

model_dir = "/kaggle/working/mentalwell_model_final"

# Load the fine-tuned model and its tokenizer from the saved directory.
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)

# Build the inference pipeline. The device is passed explicitly: without it
# the pipeline keeps the model on CPU even when a GPU is available.
clf = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
    device=0 if torch.cuda.is_available() else -1,
)

# A few hand-written examples for a quick sanity check.
texts = [
    "I feel completely worthless and tired.",
    "I'm so happy I talked to my therapist today!",
    "Why do I feel like I have no control over anything",
    "I want to kill myself",
    "Things are so stressful. I feel so restless"
]

for text in texts:
    # One string in -> a list holding one list of {label, score} dicts.
    preds = clf(text)[0]
    print(f"\nüß† {text}")
    for p in preds:
        print(f"  {p['label']}: {p['score']:.3f}")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
mkl-umath 0.1.1 requires numpy<1.27.0,>=1.26.4, but you have numpy 2.3.4 which is incompatible.
mkl-random 1.2.4 requires numpy<1.27.0,>=1.26.4, but you have numpy 2.3.4 which is incompatible.
mkl-fft 1.3.8 requires numpy<1.27.0,>=1.26.4, but you have numpy 2.3.4 which is incompatible.
pylibcudf-cu12 25.2.2 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 22.0.0 which is incompatible.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.3.4 which is incompatible.
cudf-cu12 25.2.2 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 22.0.0 which is incompatible.
ydata-profiling 4.17.0 requires numpy<2.2,>=1.16.0, but you have numpy 2.3.4

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.



üß† I feel completely worthless and tired.
  anxiety: 0.005
  bipolar: 0.001
  depression: 0.590
  normal: 0.051
  personality disorder: 0.002
  stress: 0.003
  suicidal: 0.348

üß† I'm so happy I talked to my therapist today!
  anxiety: 0.030
  bipolar: 0.403
  depression: 0.073
  normal: 0.436
  personality disorder: 0.043
  stress: 0.007
  suicidal: 0.009

üß† Why do I feel like I have no control over anything
  anxiety: 0.071
  bipolar: 0.004
  depression: 0.771
  normal: 0.009
  personality disorder: 0.004
  stress: 0.010
  suicidal: 0.130

üß† I want to kill myself
  anxiety: 0.001
  bipolar: 0.000
  depression: 0.088
  normal: 0.002
  personality disorder: 0.000
  stress: 0.000
  suicidal: 0.909

üß† Things are so stressful. I feel so restless
  anxiety: 0.997
  bipolar: 0.000
  depression: 0.001
  normal: 0.000
  personality disorder: 0.000
  stress: 0.001
  suicidal: 0.000


In [29]:
# Archive the saved model directory into mentalwell_model_final.zip inside /kaggle/working.
!cd /kaggle/working && zip -r mentalwell_model_final.zip mentalwell_model_final


  adding: mentalwell_model_final/ (stored 0%)
  adding: mentalwell_model_final/vocab.txt (deflated 53%)
  adding: mentalwell_model_final/tokenizer.json (deflated 71%)
  adding: mentalwell_model_final/config.json (deflated 50%)
  adding: mentalwell_model_final/model.safetensors

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


 (deflated 8%)
  adding: mentalwell_model_final/tokenizer_config.json (deflated 75%)
  adding: mentalwell_model_final/special_tokens_map.json (deflated 42%)
