In [1]:
!pip install -q transformers datasets torch scikit-learn pandas shap lime

import os, re
os.environ["WANDB_DISABLED"] = "true"

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments
)
from sklearn.metrics import f1_score


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/275.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for lime (setup.py) ... [?25l[?25hdone


In [3]:
# If file is in /content directly
df = pd.read_csv('/content/go_emotions_dataset.csv')

def clean_text(text):
    text = str(text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.lower().strip()

df['clean_text'] = df['text'].apply(clean_text)

non_label_cols = ['id', 'text', 'example_very_unclear', 'clean_text']
label_cols = [c for c in df.columns if c not in non_label_cols]

train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)


In [4]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
max_length = 128

class GoEmotionsDataset(Dataset):
    def __init__(self, dataframe, tokenizer, label_cols, max_length):
        self.texts = dataframe['clean_text'].tolist()
        self.labels = dataframe[label_cols].values.astype(float)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        labels = torch.tensor(self.labels[idx], dtype=torch.float)
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        item = {k: v.squeeze(0) for k, v in encoding.items()}
        item['labels'] = labels
        return item

train_dataset = GoEmotionsDataset(train_df, tokenizer, label_cols, max_length)
val_dataset   = GoEmotionsDataset(val_df, tokenizer, label_cols, max_length)
test_dataset  = GoEmotionsDataset(test_df, tokenizer, label_cols, max_length)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [8]:
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(label_cols),
    problem_type="multi_label_classification"
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = torch.sigmoid(torch.tensor(logits))
    preds = (probs > 0.5).int().numpy()
    labels = labels.astype(int)
    micro_f1 = f1_score(labels, preds, average='micro')
    return {'micro_f1': micro_f1}

training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',          # <-- change this line
    save_strategy='epoch',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model='micro_f1'
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

trainer.train()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Micro F1
1,0.1151,0.1138,0.31143
2,0.1073,0.111903,0.351822
3,0.0987,0.113611,0.369399


TrainOutput(global_step=63369, training_loss=0.10964590572442118, metrics={'train_runtime': 6214.714, 'train_samples_per_second': 81.571, 'train_steps_per_second': 10.197, 'total_flos': 1.679604003108864e+16, 'train_loss': 0.10964590572442118, 'epoch': 3.0})

In [None]:
# Test set evaluation (run this after training)
test_results = trainer.evaluate(eval_dataset=test_dataset)
print("Full test results:", test_results)

# Also check validation results from last epoch
val_results = trainer.evaluate(eval_dataset=val_dataset)
print("Validation results:", val_results)


In [9]:
test_results = trainer.evaluate(eval_dataset=test_dataset)
print("Test metrics:", test_results)

# Save model + tokenizer to Drive or /content
save_path = "/content/mental_health_emotion_model"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)


Test metrics: {'eval_loss': 0.11380043625831604, 'eval_micro_f1': 0.3682558907729575, 'eval_runtime': 75.259, 'eval_samples_per_second': 280.671, 'eval_steps_per_second': 17.553, 'epoch': 3.0}


('/content/mental_health_emotion_model/tokenizer_config.json',
 '/content/mental_health_emotion_model/special_tokens_map.json',
 '/content/mental_health_emotion_model/vocab.txt',
 '/content/mental_health_emotion_model/added_tokens.json',
 '/content/mental_health_emotion_model/tokenizer.json')

In [10]:
import numpy as np

def predict_emotions(text):
    model.eval()
    inputs = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding='max_length',
        max_length=128
    )
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.sigmoid(logits).cpu().numpy()[0]

    predicted_indices = np.where(probs > 0.5)[0]
    predicted_emotions = [label_cols[i] for i in predicted_indices]
    predicted_scores   = [float(probs[i]) for i in predicted_indices]
    return list(zip(predicted_emotions, predicted_scores))

emotion_to_risk = {
    'admiration': 'low', 'amusement': 'low', 'anger': 'high',
    'annoyance': 'moderate', 'approval': 'low', 'caring': 'low',
    'confusion': 'moderate', 'curiosity': 'low', 'desire': 'low',
    'disappointment': 'moderate', 'disapproval': 'moderate',
    'disgust': 'high', 'embarrassment': 'moderate', 'excitement': 'low',
    'fear': 'high', 'gratitude': 'low', 'grief': 'high', 'joy': 'low',
    'love': 'low', 'nervousness': 'moderate', 'optimism': 'low',
    'pride': 'low', 'realization': 'low', 'relief': 'low',
    'remorse': 'high', 'sadness': 'high', 'surprise': 'low',
    'neutral': 'low'
}

def map_risk(predictions):
    risks = [emotion_to_risk.get(emotion, 'low') for emotion, _ in predictions]
    if 'high' in risks:
        return 'high'
    elif 'moderate' in risks:
        return 'moderate'
    else:
        return 'low'

sample_text = "Feeling anxious and overwhelmed today"
preds = predict_emotions(sample_text)
print("Predictions:", preds)
print("Risk level:", map_risk(preds))


Predictions: [('nervousness', 0.7248819470405579)]
Risk level: moderate


In [11]:
from transformers import pipeline
from lime.lime_text import LimeTextExplainer
import shap

nlp_pipe = pipeline(
    'text-classification',
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True
)

def lime_predict(texts):
    outputs = nlp_pipe(texts)
    probs = np.array([[item['score'] for item in out] for out in outputs])
    return probs

lime_explainer = LimeTextExplainer(class_names=label_cols)

example_text = "I'm feeling really anxious and sad today."
lime_exp = lime_explainer.explain_instance(
    example_text,
    lime_predict,
    num_features=10,
    top_labels=3
)
top_label = lime_exp.top_labels[0]
print("Top label:", label_cols[top_label])
print(lime_exp.as_list(label=top_label))

shap_explainer = shap.Explainer(nlp_pipe)
shap_values = shap_explainer([example_text])
shap.plots.text(shap_values[0])


Device set to use cuda:0


Top label: nervousness
[(np.str_('anxious'), 0.6257619330360809), (np.str_('sad'), -0.0689724105017361), (np.str_('I'), 0.06081926320608145), (np.str_('m'), 0.02134293635201155), (np.str_('really'), 0.019081201591422667), (np.str_('feeling'), 0.01542034509586995), (np.str_('and'), 0.011517499406790402), (np.str_('today'), 0.0026100527111543553)]


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
