# Implement monte-carlo dropout to estimate uncertainty

This can be implemented here because the google-bert/bert-base-uncased models, which are the models fine-tuned for this project, have dropout layers after each transformer layer output and in their classifier head. Therefore, the models can simply be imported and dropout can be enabled at inference time. I can then run the predictions several times, each with a different random dropout mask, and get SEs for those predictions in each row of the PRISM dataset.

This script was run in Google Colab in order to easily get access to a GPU and speed up the uncertainty estimation. Because of limited time, it was run using pay-as-you-go compute units on an Nvidia A100 GPU.

**How is this document structured?**

0. Setup
1. Define helper functions
2. Implement Monte-Carlo Dropout to estimate uncertainty

# 0. Setup

In [1]:
# Imports
import io

import pandas as pd
import torch
import torch.nn.functional as F
from google.colab import files
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the final classified dataset - with pandas if run locally (this variant
# additionally needs `import os`).
# PROJECT_ROOT = "/Users/carolinewagner/Desktop/Local/MY498-capstone-main"
# classified_path = os.path.join(PROJECT_ROOT, "01_data", "07_final_classified_data", "PRISM_classified_with_DSL_outcomes.csv")
# classified_data = pd.read_csv(classified_path)

# Use this line to manually upload the data file into the Google Colab session.
uploaded = files.upload()

# Read the dataset from the bytes returned by the upload rather than from a
# fixed path on disk: when a file with the same name already exists in the
# session, Colab stores the new upload under a renamed path (e.g. "... (2).csv"),
# so reading the fixed name would silently load the older copy.
DATA_FILENAME = "PRISM_classified_with_DSL_outcomes.csv"
if DATA_FILENAME in uploaded:
    csv_bytes = uploaded[DATA_FILENAME]
elif len(uploaded) == 1:
    # Exactly one file was uploaded under another name - use it.
    csv_bytes = next(iter(uploaded.values()))
else:
    raise FileNotFoundError(
        f"Expected an upload named {DATA_FILENAME!r}, got: {sorted(uploaded)}"
    )

classified_data = pd.read_csv(io.BytesIO(csv_bytes))

# Preview & check implementation.
# Only the last expression of a notebook cell is rendered, i.e. the column index.
classified_data.head()
classified_data.columns

Saving PRISM_classified_with_DSL_outcomes.csv to PRISM_classified_with_DSL_outcomes (2).csv


Index(['utterance_id', 'interaction_id', 'conversation_id', 'user_id',
       'conversation_type', 'user_prompt', 'score', 'label_1A', 'score_1A',
       'label_1B',
       ...
       'gold_standard_labels_why_q', 'gold_standard_labels_whathow_q',
       'gold_standard_labels_hobsons_c', 'gold_standard_labels_M',
       'design_adjusted_whether_q', 'design_adjusted_which_q',
       'design_adjusted_why_q', 'design_adjusted_whathow_q',
       'design_adjusted_hobsons_c', 'design_adjusted_M'],
      dtype='object', length=144)

# 1. Define helper functions

In [2]:
# Define functions.
# These are not sourced from a helper file because this file is so short, and to facilitate understanding.

def enable_dropout(model):
    """Put every ``torch.nn.Dropout`` submodule of ``model`` into training mode.

    Only modules that are instances of ``torch.nn.Dropout`` are touched; the
    mode of every other submodule (and of ``model`` itself) is left unchanged.
    Calling this after ``model.eval()`` therefore keeps dropout active while
    the rest of the network stays in evaluation mode.

    Args:
        model: A ``torch.nn.Module`` (anything exposing ``.modules()``).

    Returns:
        None. The dropout submodules are modified in place.
    """
    dropout_layers = (
        layer for layer in model.modules() if isinstance(layer, torch.nn.Dropout)
    )
    for layer in dropout_layers:
        layer.train()


# This runs num_samples forward passes through the network, each pass using a different random dropout mask,
# which simulates an ensemble of networks.
# This makes it possible to measure epistemic uncertainty, i.e., how much predictions vary across runs.
def mc_dropout_predict(model, tokenizer, texts, num_samples=100, class_labels=None):
    """Summarise class probabilities over repeated dropout-enabled forward passes.

    The batch is tokenized once, then passed through ``model`` ``num_samples``
    times with its dropout layers in training mode. The softmax probabilities
    of all passes are stacked and reduced to a per-class mean and standard
    deviation; the top class and its probability are also kept for every pass.

    Args:
        model: Model exposing ``.device`` and returning an object with
            ``.logits`` when called with the tokenizer output. Its dropout
            layers are switched to training mode via ``enable_dropout`` and
            are left in that mode when the function returns.
        tokenizer: Callable invoked as ``tokenizer(texts, return_tensors="pt",
            padding=True, truncation=True)`` whose result supports ``.to(device)``.
        texts: The batch of input texts.
        num_samples: Number of stochastic forward passes; must be at least 1.
        class_labels: Lookup from class index to label (indexed as
            ``class_labels[j]``). If ``None``, ``model.config.id2label`` is
            used when present, otherwise the class index itself is the label.

    Returns:
        A tuple ``(results, mc_classes, mc_max_probs)`` of three lists with one
        entry per input text:

        * ``results``: dict mapping each class label to
          ``{"mean": float, "std": float}`` across the passes.
        * ``mc_classes``: list (length ``num_samples``) of the top label of
          each pass.
        * ``mc_max_probs``: list (length ``num_samples``) of the top
          probability of each pass.

        An empty list/tuple of texts yields three empty lists.

    Raises:
        ValueError: If ``num_samples`` is smaller than 1.
    """
    if num_samples < 1:
        raise ValueError(f"num_samples must be >= 1, got {num_samples}")
    if isinstance(texts, (list, tuple)) and not texts:
        # Nothing to tokenize or predict for an empty batch.
        return [], [], []

    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True).to(model.device)

    # Switching the dropout layers to training mode once is enough: an active
    # dropout layer draws a fresh random mask on every forward pass by itself.
    enable_dropout(model)

    pass_probs = []
    with torch.no_grad():
        for _ in range(num_samples):
            outputs = model(**inputs)
            # Softmax turns the logits into probabilities: [batch_size, num_classes]
            pass_probs.append(F.softmax(outputs.logits, dim=-1))

    probs = torch.stack(pass_probs)  # Shape: [num_samples, batch_size, num_classes]
    mean_probs = probs.mean(dim=0)  # Shape: [batch_size, num_classes]
    # The sample standard deviation is undefined (NaN) for a single pass;
    # report zero spread in that case instead.
    if num_samples > 1:
        std_probs = probs.std(dim=0)  # Shape: [batch_size, num_classes]
    else:
        std_probs = torch.zeros_like(mean_probs)

    num_classes = mean_probs.shape[1]
    if class_labels is None:
        class_labels = getattr(getattr(model, "config", None), "id2label", None)
    if class_labels is None:
        class_labels = list(range(num_classes))

    # Convert to plain Python values in bulk rather than element by element.
    mean_rows = mean_probs.tolist()
    std_rows = std_probs.tolist()
    # Top class index / probability of every pass, transposed to [batch_size, num_samples].
    sample_class_ids = probs.argmax(dim=-1).t().tolist()
    mc_max_probs = probs.max(dim=-1).values.t().tolist()

    # Build labeled output - one dict of class -> {mean, std} per item in the batch.
    results = [
        {
            class_labels[j]: {"mean": mean_row[j], "std": std_row[j]}
            for j in range(num_classes)
        }
        for mean_row, std_row in zip(mean_rows, std_rows)
    ]
    mc_classes = [
        [class_labels[class_id] for class_id in item_ids]
        for item_ids in sample_class_ids
    ]

    return results, mc_classes, mc_max_probs


# 2. Implement Monte-Carlo Dropout to estimate uncertainty

In [None]:
# Ensure your dataset is loaded
# classified_data = pd.read_json("final_classified_data.jsonl", lines=True)

# Note. This implementation uses the models' default dropout rate of 0.1.

# (1) Define fine-tuned BERTs
# NOTE: the spelling "clasify" in the last repository id is kept on purpose -
# the recorded run fetched and ran the model under exactly this name.
model_names = [
    "carowagner/classify-questions-3A",
    "carowagner/classify-questions-2C",
    "carowagner/classify-questions-2B",
    "carowagner/classify-questions-2A",
    "carowagner/classify-questions-1B",
    "carowagner/classify-questions-1A",
    "carowagner/clasify-questions-4A"
]

# (2) Device setup
device = "cuda" if torch.cuda.is_available() else "cpu"

# (3) Settings and inputs shared by all models (built once, not once per model)
batch_size = 32
num_mc_samples = 100  # stochastic forward passes per prompt
prompts = classified_data["user_prompt"].tolist()
# adding the utterance_id just to be able to check later on that the rows were properly handled.
utterance_ids = classified_data["utterance_id"].tolist()

# (4) Loop through each model
model = None
for model_name in model_names:
    print(f"Running MC Dropout for model: {model_name}")

    # Release the previous model before loading the next one, so that two
    # models are never held on the device at the same time.
    model = None
    if device == "cuda":
        torch.cuda.empty_cache()

    # (a) Load model + tokenizer
    model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # (b) enable dropout during evaluation
    model.eval()
    enable_dropout(model)

    # (c) Get class labels
    class_labels = model.config.id2label

    # (d) Extract model ID for the column name (e.g., '1A' from '...questions-1A')
    model_suffix = model_name.split("-")[-1]
    uncertainty_col = f"mc_uncertainty_{model_suffix}"

    # (e) Run MC dropout in batches
    all_results = []
    all_mc_classes = []
    all_mc_probs = []

    for start in tqdm(range(0, len(prompts), batch_size)):
        batch_prompts = prompts[start:start + batch_size]
        batch_utterance_ids = utterance_ids[start:start + batch_size]

        batch_results, mc_class_outputs, mc_prob_outputs = mc_dropout_predict(
            model=model,
            tokenizer=tokenizer,
            texts=batch_prompts,
            num_samples=num_mc_samples,
            class_labels=class_labels
        )

        # (f) Attach utterance_id to each result
        for uid, result in zip(batch_utterance_ids, batch_results):
            all_results.append({"utterance_id": uid, **result})
        all_mc_classes.extend(mc_class_outputs)
        all_mc_probs.extend(mc_prob_outputs)

    # (g) Save results into new columns (one entry per row of classified_data)
    classified_data[uncertainty_col] = all_results
    classified_data[f"mc_data_class_{model_suffix}"] = all_mc_classes
    classified_data[f"mc_data_prob_{model_suffix}"] = all_mc_probs

# (5) save the output as a .jsonl document and download it
classified_data.to_json("final_data_with_uncertainty.jsonl", lines=True, orient="records")
files.download("final_data_with_uncertainty.jsonl")

Running MC Dropout for model: carowagner/classify-questions-3A


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
100%|██████████| 251/251 [09:32<00:00,  2.28s/it]


Running MC Dropout for model: carowagner/classify-questions-2C


100%|██████████| 251/251 [09:32<00:00,  2.28s/it]

Running MC Dropout for model: carowagner/classify-questions-2B





config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

100%|██████████| 251/251 [09:31<00:00,  2.28s/it]

Running MC Dropout for model: carowagner/classify-questions-2A





config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

100%|██████████| 251/251 [09:31<00:00,  2.28s/it]

Running MC Dropout for model: carowagner/classify-questions-1B





config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

100%|██████████| 251/251 [09:31<00:00,  2.28s/it]

Running MC Dropout for model: carowagner/classify-questions-1A





config.json:   0%|          | 0.00/851 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

100%|██████████| 251/251 [09:31<00:00,  2.28s/it]


Running MC Dropout for model: carowagner/clasify-questions-4A


config.json:   0%|          | 0.00/931 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

100%|██████████| 251/251 [09:32<00:00,  2.28s/it]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [4]:
# Sanity check: show the MC-dropout summary stored for the first row (model 1A).
classified_data[["mc_uncertainty_1A"]].iloc[0]


Unnamed: 0,0
mc_uncertainty_1A,"{'utterance_id': 'ut0', 'n': {'mean': 0.012388..."
