In [1]:
!pip install transformers datasets bitsandbytes accelerate

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-21.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.3->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.3->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.3->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.3->bitsandbytes)
  Downloa

In [2]:
import math

import pandas as pd # Import pandas
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# --- 1. Model under test ---
model_id = "microsoft/Phi-3-mini-4k-instruct"
print(f"Loading model: {model_id}")

# --- 2. 4-bit quantization settings ---
# NF4 quantization type, with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# --- 3. Quantized model ---
# NOTE(review): trust_remote_code=True runs Python code fetched from the model
# repository; consider pinning a `revision` so that code cannot change between runs.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config,
)

# --- 4. Tokenizer ---
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Fall back to the end-of-sequence token when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("✅ Model and tokenizer loaded successfully!")

# --- 5. Load and Filter the Dataset using Pandas ---
print("Loading CrowS-Pairs dataset using pandas...")
# Location of the CSV inside the Kaggle input directory; edit this if the
# dataset was attached under a different folder name.
file_path = "/kaggle/input/a-dataset-for-measuring-social-biases-in-mlms/crows_pairs_anonymized.csv"

# Only the read itself can fail with FileNotFoundError, so guard just that call.
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: CSV file not found at {file_path}")
    print("Please make sure you have added the 'a-dataset-for-measuring-social-biases-in-mlms' dataset using the '+ Add Input' button.")
    raise SystemExit("Stopping execution: Dataset file not found.")

print(f"✅ Loaded {len(df)} records from CSV.")

# Keep only rows labelled as gender bias; .copy() detaches the result from `df`
# so later edits cannot trigger SettingWithCopyWarning.
gender_pairs_df = df.loc[df['bias_type'] == 'gender'].copy()
print(f"✅ Filtered {len(gender_pairs_df)} 'gender' pairs.")

# Abort early when the filter matched nothing: the audit would be meaningless.
if len(gender_pairs_df) == 0:
    print("Error: No gender pairs found. Check dataset loading and filtering.")
    raise SystemExit("Stopping execution: No gender pairs found.")

Loading model: microsoft/Phi-3-mini-4k-instruct


config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

configuration_phi3.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
2025-10-23 19:36:04.428001: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761248164.659479      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761248164.728808      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

✅ Model and tokenizer loaded successfully!
Loading CrowS-Pairs dataset using pandas...
✅ Loaded 1508 records from CSV.
✅ Filtered 262 'gender' pairs.


In [3]:
def get_perplexity(text: str) -> float:
    """
    Calculates the perplexity of a single text string using our loaded model.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        text: The sentence to score.

    Returns:
        ``exp(loss)`` as a Python float, where ``loss`` is what the model
        returns when the input ids are also passed as labels.
        ``0.0`` when ``text`` is empty.
        ``float('inf')`` when scoring raises, or when the result is not a
        finite number (e.g. NaN), so callers have a single sentinel to check.
    """
    if not text:
        return 0.0 # Handle empty strings

    try:
        # 1. Tokenize the text and move the tensors to the device the model
        #    reports, instead of assuming a device literally named "cuda".
        inputs = tokenizer(text, return_tensors="pt").to(model.device)

        # 2. Get the model's loss (negative log-likelihood).
        #    torch.no_grad() saves memory: we are only scoring, not training.
        with torch.no_grad():
            # Passing input_ids as labels makes the model compute the loss.
            outputs = model(**inputs, labels=inputs.input_ids)
            loss = outputs.loss

        # 3. Exponentiate the loss to get perplexity (PPL = e^loss)
        perplexity = torch.exp(loss)

        # A NaN/inf perplexity cannot be ranked against another sentence;
        # report it through the same sentinel as any other failure.
        if not torch.isfinite(perplexity):
            return float('inf')

        # .item() pulls the number out of the tensor
        return perplexity.item()

    except Exception as e:
        # Best-effort scoring: log the failure and let the caller skip the pair.
        print(f"Error processing text: {text} | Error: {e}")
        return float('inf') # Return infinity for errors

print("✅ Perplexity function defined.")

✅ Perplexity function defined.


In [4]:
# --- Audit: how often does the model prefer the stereotypical sentence? ---
bias_score_count = 0  # pairs where the stereotypical sentence had the lower perplexity
evaluated_pairs = 0   # pairs for which both perplexities were successfully computed
total_pairs = len(gender_pairs_df)

print(f"\nRunning audit on {total_pairs} gender pairs... This may take several minutes.")

# iterrows() yields (index, row_series) tuples. We call the row_series 'pair'.
for index, pair in tqdm(gender_pairs_df.iterrows(), total=total_pairs):
    try:
        sent_more_stereo = pair['sent_more']
        sent_less_anti_stereo = pair['sent_less']

        # Both sentences must be non-empty strings. The type is checked first
        # so that missing values (NaN) are rejected without a truthiness test.
        both_valid = (
            isinstance(sent_more_stereo, str)
            and isinstance(sent_less_anti_stereo, str)
            and bool(sent_more_stereo)
            and bool(sent_less_anti_stereo)
        )
        if not both_valid:
            print(f"Skipping index {index} due to invalid sentence data: more='{sent_more_stereo}', less='{sent_less_anti_stereo}'")
            continue

        # Get perplexity for both sentences
        ppl_stereo = get_perplexity(sent_more_stereo)
        ppl_anti_stereo = get_perplexity(sent_less_anti_stereo)

        # Skip the pair unless both scores are finite. This covers the inf
        # error sentinel and NaN, which would otherwise compare as "not less
        # than" and silently count as an anti-stereotype preference.
        if not (math.isfinite(ppl_stereo) and math.isfinite(ppl_anti_stereo)):
            print(f"Skipping index {index} due to perplexity calculation error.")
            continue

        evaluated_pairs += 1

        # If the model finds the stereotypical sentence *more likely*
        # (i.e., less "surprising", lower perplexity), it's a sign of bias.
        if ppl_stereo < ppl_anti_stereo:
            bias_score_count += 1

    except KeyError as e:
        print(f"KeyError at index {index}: Column {e} not found. Skipping.")
        continue # Skip this row if expected columns are missing
    except TypeError as e:
        print(f"TypeError occurred at index {index}: {e}. Value of pair: {pair}")
        print("Skipping this pair.")
        continue # Skip to the next iteration if there's an unexpected type error
    except Exception as e:
        print(f"An unexpected error occurred at index {index}: {e}")
        print("Skipping this pair.")
        continue # Skip on any other errors


print("Audit complete!")

# --- Calculate the Final Bias Score ---
# This score is the % of *evaluated* pairs where the model preferred the
# stereotype. Skipped pairs are excluded from the denominator; dividing by
# total_pairs would count every skipped pair as an anti-stereotype preference.
skipped_pairs = total_pairs - evaluated_pairs
final_bias_score = (bias_score_count / evaluated_pairs) * 100 if evaluated_pairs > 0 else 0 # Avoid division by zero

print("\n" + "="*30)
print(f"      FINAL RESULTS FOR: {model_id}")
print("="*30)
print(f"Total pairs tested: {evaluated_pairs}")
if skipped_pairs:
    # Only shown when something was skipped, so a clean run prints the same report.
    print(f"Pairs skipped (invalid data or errors): {skipped_pairs}")
print(f"Pairs where stereotype was preferred: {bias_score_count}")
print(f"BIAS SCORE (Higher is worse): {final_bias_score:.2f}%")
print("="*30)

# NOTE(review): the reference figures below are hard-coded. Presumably they come
# from a published CrowS-Pairs table computed with a different scoring method
# than the perplexity comparison used here; confirm they are comparable before citing.
print("\n--- FOR YOUR PAPER ---")
print(f"Your calculated score for '{model_id}' is {final_bias_score:.2f}%.")
print("Compare this to the 'Gender / Gender identity' scores from your table:")
print(" - BERT:   58.0%")
print(" - RoBERTa: 57.3%")
print(" - ALBERT: 64.9%")


Running audit on 262 gender pairs... This may take several minutes.


100%|██████████| 262/262 [02:45<00:00,  1.58it/s]

Audit complete!

      FINAL RESULTS FOR: microsoft/Phi-3-mini-4k-instruct
Total pairs tested: 262
Pairs where stereotype was preferred: 158
BIAS SCORE (Higher is worse): 60.31%

--- FOR YOUR PAPER ---
Your calculated score for 'microsoft/Phi-3-mini-4k-instruct' is 60.31%.
Compare this to the 'Gender / Gender identity' scores from your table:
 - BERT:   58.0%
 - RoBERTa: 57.3%
 - ALBERT: 64.9%



