In [1]:
!pip install transformers torch accelerate bitsandbytes pandas tqdm sentencepiece --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m86.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m64.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from kaggle_secrets import UserSecretsClient
import pandas as pd
from tqdm import tqdm
import time
import numpy as np
import warnings

In [3]:
# === 2. HIDE THE TOKENIZER WARNING ===
# TOKENIZERS_PARALLELISM=false silences the tokenizer parallelism warning.
# Note: filterwarnings("ignore") additionally hides every other Python warning.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings("ignore")
print("Tokenizer parallelism warning suppressed.")

# === 3. CONFIGURE AND LOAD THE MODEL ===
# This is the new model ID for LaMini-GPT (1.5B version)
model_id = "MBZUAI/LaMini-GPT-1.5B"
# Both handles stay None when loading fails; the audit cell checks
# `model is not None` before doing any work.
model = None
tokenizer = None

try:
    # Config for 4-bit quantization (NF4, double quantization, bfloat16 compute)
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    # This is a standard model, so no token or remote code is needed
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    print(f"Loading model: {model_id}...")
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",
    )

    # device_map="auto" chooses the placement, so report where the model
    # actually ended up instead of assuming a GPU.
    if model.device.type == "cuda":
        print("Model loaded successfully on GPU.")
    else:
        print(f"Model loaded successfully on {model.device} (no GPU in use).")

except Exception as e:
    # Best-effort load: report the problem and keep the notebook running.
    # Reset both handles so a tokenizer that loaded before the failure is not
    # left behind next to a missing model.
    model = None
    tokenizer = None
    print(f"Error loading model: {e}")
    print("This might be a Kaggle GPU memory issue or a network problem. Try restarting the session.")


# === 4. DEFINE THE PERPLEXITY FUNCTION ===
def get_perplexity(text, model, tokenizer):
    """Return the perplexity that ``model`` assigns to ``text``.

    The text is tokenized as a single sequence and passed to the model with the
    input IDs doubling as labels; the perplexity is ``exp`` of the returned loss.

    Args:
        text: Sentence to score. Anything that is not a non-empty ``str`` is
            rejected.
        model: Callable exposing ``.device`` and returning an object with a
            ``.loss`` tensor when invoked as ``model(input_ids, labels=...)``.
        tokenizer: Callable returning a batch that supports ``.to(device)`` and
            has ``.input_ids``. Its ``pad_token`` is set to ``eos_token`` when
            it has none.

    Returns:
        The perplexity as a Python ``float``, or ``float('inf')`` as a sentinel
        when the text is invalid, scoring raises, or the result is not finite.
    """
    if not text or not isinstance(text, str):
        return float('inf')
    try:
        # Add a padding token if the tokenizer doesn't have one. Done before
        # tokenizing so the fallback is already in place for the call below.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        inputs = tokenizer(text, return_tensors="pt").to(model.device)
        input_ids = inputs.input_ids

        with torch.no_grad():
            outputs = model(input_ids, labels=input_ids)
        perplexity = torch.exp(outputs.loss)

        # A NaN loss would otherwise be returned as NaN, which callers that
        # test `== float('inf')` do not recognise as a failure.
        if not torch.isfinite(perplexity).item():
            return float('inf')
        return perplexity.item()
    except Exception as e:
        # Best-effort scoring: surface the reason instead of hiding it, then
        # return the sentinel so the caller can skip this text.
        print(f"get_perplexity failed ({type(e).__name__}): {e}")
        return float('inf')

# Only announce a finished setup when both handles were actually loaded; a
# failed load leaves them as None. globals().get() also covers the case where
# this cell is run before the names exist.
if globals().get("model") is not None and globals().get("tokenizer") is not None:
    print("\n--- Setup Complete ---")
else:
    print("\n--- Setup Incomplete: model or tokenizer failed to load ---")



tokenizer_config.json:   0%|          | 0.00/788 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

Loading model: MBZUAI/LaMini-GPT-1.5B...


config.json:   0%|          | 0.00/984 [00:00<?, ?B/s]

2025-10-25 15:27:39.791773: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761406060.001855      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761406060.060362      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/6.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.28G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Model loaded successfully on GPU.

--- Setup Complete ---


In [4]:
# === 5. LOAD DATA AND RUN AUDIT ===
# Scores every 'gender' pair of the CrowS-Pairs CSV with get_perplexity and
# reports how often the 'sent_more' sentence gets the lower perplexity.
# Check if the model from Cell 1 was loaded successfully
if 'model' in locals() and model is not None:

    # --- Load and Filter Dataset ---
    print("Loading CrowS-Pairs dataset using pandas...")
    # This path comes from adding the dataset via "+ Add Input"
    file_path = "/kaggle/input/a-dataset-for-measuring-social-biases-in-mlms/crows_pairs_anonymized.csv"
    # Sentence columns read inside the audit loop.
    sentence_columns = ['sent_more', 'sent_less']

    try:
        df = pd.read_csv(file_path)
        print(f"✅ Loaded {len(df)} records from CSV.")

        missing_sentence_columns = [col for col in sentence_columns if col not in df.columns]
        if 'bias_type' not in df.columns:
            print("Error: Column 'bias_type' not found in CSV.")
            gender_pairs_df = None
        elif missing_sentence_columns:
            # Fail once here instead of raising a KeyError on every row below.
            print(f"Error: Column(s) {missing_sentence_columns} not found in CSV.")
            gender_pairs_df = None
        else:
            gender_pairs_df = df[df['bias_type'] == 'gender'].copy()
            print(f"✅ Filtered {len(gender_pairs_df)} 'gender' pairs.")
            if len(gender_pairs_df) == 0:
                print("Stopping execution: No gender pairs found after filtering.")
                gender_pairs_df = None

    except FileNotFoundError:
        print(f"Error: CSV file not found at {file_path}")
        print("Please use the '+ Add Input' button in the Kaggle sidebar to add the dataset.")
        gender_pairs_df = None
    except Exception as e:
        print(f"An error occurred loading or filtering data: {e}")
        gender_pairs_df = None

    # --- Run Audit (Only if data loading was successful) ---
    if gender_pairs_df is not None:
        bias_score_count = 0   # pairs where 'sent_more' had the lower perplexity
        processed_pairs = 0    # pairs for which both sentences were scored
        total_pairs_to_process = len(gender_pairs_df)

        print(f"\nRunning audit on {total_pairs_to_process} gender pairs for {model_id}...")
        start_time = time.time()

        for index, pair in tqdm(gender_pairs_df.iterrows(), total=total_pairs_to_process):
            try:
                sent_more_stereo = pair['sent_more']
                sent_less_anti_stereo = pair['sent_less']

                # Skip rows with missing or empty sentences (e.g. NaN cells).
                if not isinstance(sent_more_stereo, str) or not isinstance(sent_less_anti_stereo, str) or not sent_more_stereo or not sent_less_anti_stereo:
                    continue

                ppl_stereo = get_perplexity(sent_more_stereo, model, tokenizer)
                ppl_anti_stereo = get_perplexity(sent_less_anti_stereo, model, tokenizer)

                # Skip pairs where either score is unusable. get_perplexity
                # signals failure with inf; NaN is rejected as well because any
                # comparison with NaN is False and would silently be counted as
                # a processed, non-stereotyped pair.
                if not (np.isfinite(ppl_stereo) and np.isfinite(ppl_anti_stereo)):
                    continue

                processed_pairs += 1

                # Lower perplexity means the model finds the sentence more likely.
                if ppl_stereo < ppl_anti_stereo:
                    bias_score_count += 1

            except Exception as e:
                print(f"Loop error processing index {index}: {e}")
                continue

        end_time = time.time()
        print("Audit complete!")

        # Wall-clock time of the scoring loop, in seconds.
        run_duration = end_time - start_time

        # --- Calculate and Print Final Score ---
        # Percentage of processed pairs where the stereotype was preferred;
        # falls back to 0 when no pair could be processed.
        final_bias_score = (bias_score_count / processed_pairs) * 100 if processed_pairs > 0 else 0

        print("\n" + "="*30)
        print(f"      FINAL RESULTS FOR: {model_id}")
        print("="*30)
        print(f"Total pairs attempted: {total_pairs_to_process}")
        print(f"Pairs successfully processed: {processed_pairs}")
        print(f"Pairs where stereotype was preferred: {bias_score_count}")
        print(f"Audit duration: {run_duration:.2f} seconds ({run_duration/60:.2f} minutes)")
        print(f"BIAS SCORE (Higher is worse): {final_bias_score:.2f}%")
        print("="*30)

else:
    print("Model not loaded from Cell 1. Please run Cell 1 successfully before running Cell 2.")

Loading CrowS-Pairs dataset using pandas...
✅ Loaded 1508 records from CSV.
✅ Filtered 262 'gender' pairs.

Running audit on 262 gender pairs for MBZUAI/LaMini-GPT-1.5B...



  0%|          | 0/262 [00:00<?, ?it/s][A`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.

  0%|          | 1/262 [00:01<07:22,  1.70s/it][A
  1%|          | 2/262 [00:02<05:36,  1.29s/it][A
  1%|          | 3/262 [00:03<04:01,  1.07it/s][A
  2%|▏         | 4/262 [00:03<03:03,  1.41it/s][A
  2%|▏         | 5/262 [00:03<02:29,  1.72it/s][A
  2%|▏         | 6/262 [00:04<02:10,  1.96it/s][A
  3%|▎         | 7/262 [00:04<01:56,  2.19it/s][A
  3%|▎         | 8/262 [00:04<01:46,  2.38it/s][A
  3%|▎         | 9/262 [00:05<01:38,  2.56it/s][A
  4%|▍         | 10/262 [00:05<01:32,  2.73it/s][A
  4%|▍         | 11/262 [00:05<01:27,  2.86it/s][A
  5%|▍         | 12/262 [00:06<01:25,  2.94it/s][A
  5%|▍         | 13/262 [00:06<01:22,  3.01it/s][A
  5%|▌         | 14/262 [00:06<01:20,  3.07it/s][A
  6%|▌         | 15/262 [00:07<01:19,  3.12it/s][A
  6%|▌         | 16/262 [00:07<01:18,  3.15it/s][A
  6%|▋         | 17/262 [00:

Audit complete!

      FINAL RESULTS FOR: MBZUAI/LaMini-GPT-1.5B
Total pairs attempted: 262
Pairs successfully processed: 262
Pairs where stereotype was preferred: 153
Audit duration: 85.53 seconds (1.43 minutes)
BIAS SCORE (Higher is worse): 58.40%



