In [1]:
# Cell 1: Update ALL relevant libraries
!pip install -U transformers datasets bitsandbytes accelerate pandas tqdm

Collecting transformers
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-4.3.0-py3-none-any.whl.metadata (18 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting accelerate
  Downloading accelerate-1.11.0-py3-none-any.whl.metadata (19 kB)
Collecting pandas
  Downloading pandas-2.3.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86

In [2]:
# Cell 2: Load Pythia-6.9B model and tokenizer
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import pandas as pd # Import pandas here
from tqdm import tqdm # Import tqdm here

# --- 1. Configure 4-bit Quantization (using float16 compute) ---
# Settings are collected in a dict first so they can be inspected or logged
# before being handed to BitsAndBytesConfig.
quantization_settings = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.float16,  # float16 compute dtype
}
bnb_config = BitsAndBytesConfig(**quantization_settings)

# --- 2. Define Model: Pythia-6.9B ---
# model_id is reused by later cells when printing the results.
model_id = "EleutherAI/pythia-6.9b"
print(f"Loading model: {model_id} with float16 compute dtype...")

# --- 3. Load the Quantized Model ---
# Any failure here is fatal for the notebook: report it and stop execution.
try:
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=False,  # Pythia does not need remote code
        device_map="auto",  # place the weights automatically on the available device
        quantization_config=bnb_config,
    )
except Exception as model_error:
    print(f"Error loading model: {model_error}")
    print("Check model ID, Hugging Face Hub status, GPU availability.")
    raise SystemExit("Stopping execution due to model loading error.")

# --- 4. Load the Tokenizer ---
# The tokenizer needs a pad token. Fall back to EOS when none is defined, and
# only add a brand-new [PAD] token (resizing the model embeddings to match)
# when EOS is missing as well.
try:
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        print("Tokenizer does not have a pad token, setting it to EOS token.")
        if not tokenizer.eos_token:
            print("EOS token also missing, adding a PAD token.")
            tokenizer.add_special_tokens({'pad_token': '[PAD]'})
            # The vocabulary grew by one token, so the embedding matrix must grow too.
            model.resize_token_embeddings(len(tokenizer))
        else:
            tokenizer.pad_token = tokenizer.eos_token
    print("✅ Model and tokenizer loaded successfully!")
except Exception as tokenizer_error:
    print(f"Error loading tokenizer: {tokenizer_error}")
    raise SystemExit("Stopping execution due to tokenizer loading error.")

Loading model: EleutherAI/pythia-6.9b with float16 compute dtype...


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

2025-10-24 14:58:31.334786: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761317911.501006      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761317911.550267      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.94G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.91G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Tokenizer does not have a pad token, setting it to EOS token.
✅ Model and tokenizer loaded successfully!


In [3]:
# Cell 3: Define the perplexity calculation function (Robust Version)
import torch # Make sure torch is imported

def get_perplexity(text, model_to_use, tokenizer_to_use):
    """
    Calculate the perplexity of a single text under the given model.

    The text is tokenized (truncated to at most
    ``min(tokenizer.model_max_length, 1024) - 2`` tokens), passed through the
    model with its own input ids as labels, and ``exp(loss)`` is returned.
    For Hugging Face causal LMs ``loss`` is presumably the mean next-token
    cross-entropy -- confirm for other model types.

    Args:
        text: The sentence to score. Non-string values are converted with
            ``str()``; ``None`` and float NaN (e.g. a missing pandas cell) are
            treated as missing input rather than scored as "None"/"nan".
        model_to_use: Callable model exposing ``.device``; it is called as
            ``model(**inputs, labels=input_ids)`` and must return an object
            with a ``.loss`` tensor.
        tokenizer_to_use: Callable tokenizer returning a mapping with an
            ``input_ids`` tensor that supports ``.to(device)``.

    Returns:
        float: The perplexity, or ``float('inf')`` as a sentinel when the text
        is missing/empty, tokenizes to nothing, yields a non-finite
        loss/perplexity, or any error occurs (errors are printed, not raised).
    """
    # Missing values must not be stringified and scored as real sentences.
    # (text != text) is the dependency-free NaN test for floats.
    if text is None or (isinstance(text, float) and text != text):
        return float('inf')

    text = str(text).strip() # Ensure text is a string and stripped
    if not text:
        return float('inf')

    try:
        # Determine max length safely, use a reasonable default like 512 if missing
        model_max_length = getattr(tokenizer_to_use, 'model_max_length', 512)
        # Cap the length: some tokenizers report a huge placeholder value for
        # model_max_length, and these inputs are short sentences anyway.
        effective_max_length = min(model_max_length, 1024) # Can adjust this if needed

        inputs = tokenizer_to_use(
            text,
            return_tensors="pt",
            truncation=True, # Explicitly truncate
            max_length=effective_max_length - 2 # Leave buffer room
        ).to(model_to_use.device) # Use the model's device

        # Check if input_ids ended up empty after tokenization/truncation
        if inputs.input_ids.shape[1] == 0:
            return float('inf')

        # Calculate loss
        with torch.no_grad():
            outputs = model_to_use(**inputs, labels=inputs.input_ids)
            # Upcast before exponentiating: in float16, exp() overflows to inf
            # for any loss above ~11.09, which would silently discard the text.
            loss = outputs.loss.float()
            # Check for NaN/Inf loss immediately
            if torch.isnan(loss) or torch.isinf(loss):
                return float('inf')

        # Calculate perplexity
        perplexity = torch.exp(loss)
        # Check for NaN/Inf perplexity
        if torch.isnan(perplexity) or torch.isinf(perplexity):
            return float('inf')

        # If all checks pass, return the perplexity
        return perplexity.item()

    except torch.cuda.OutOfMemoryError:
        print(f"CUDA OOM Error processing text (len {len(text)}): '{text[:100]}...' - Skipping.")
        torch.cuda.empty_cache() # Attempt to clear cache
        return float('inf')
    except OverflowError as e:
        # Specifically catch the overflow that might lead to "int too big to convert"
        print(f"OverflowError likely leading to int conversion issue (len {len(text)}): '{text[:100]}...' | Error: {e} - Skipping.")
        return float('inf')
    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller skip this text.
        print(f"Unexpected error in get_perplexity (len {len(text)}): '{text[:100]}...' | Error: {e}")
        return float('inf')

print("✅ Perplexity function (robust version) defined.")

✅ Perplexity function (robust version) defined.


In [4]:
# Cell 4: Load data and run the bias audit
import pandas as pd
from tqdm import tqdm
import time

# --- Load and Filter Dataset ---
# Reads the CrowS-Pairs CSV and keeps only the rows whose bias_type is 'gender'.
# Any problem with the file or its schema stops the notebook via SystemExit.
print("Loading CrowS-Pairs dataset using pandas...")
# Ensure dataset is added via '+ Add Input'
file_path = "/kaggle/input/a-dataset-for-measuring-social-biases-in-mlms/crows_pairs_anonymized.csv"
# Every column this cell reads later. Validating them all here means a schema
# problem fails fast instead of being swallowed row by row in the audit loop.
required_columns = ('bias_type', 'sent_more', 'sent_less')
try:
    df = pd.read_csv(file_path)
    print(f"✅ Loaded {len(df)} records from CSV.")
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(f"Column(s) {missing_columns} not found in CSV")
    gender_pairs_df = df[df['bias_type'] == 'gender'].copy()
    print(f"✅ Filtered {len(gender_pairs_df)} 'gender' pairs.")
    if len(gender_pairs_df) == 0:
        # SystemExit is not an Exception subclass, so it passes through the handlers below.
        raise SystemExit("Stopping execution: No gender pairs found after filtering.")
except FileNotFoundError:
    print(f"Error: CSV file not found at {file_path}")
    raise SystemExit("Stopping execution: Dataset file not found.")
except KeyError as e:
    # str(KeyError) is the repr of its argument (extra quotes); print the bare message.
    key_error_message = e.args[0] if e.args else e
    print(f"Error: {key_error_message}. Check column names in the CSV.")
    raise SystemExit("Stopping execution: Missing required column.")
except Exception as e:
    print(f"An error occurred loading or filtering data: {e}")
    raise SystemExit("Stopping execution due to data loading error.")


# --- Run Audit ---
# For every gender pair, score both sentences with get_perplexity and count how
# often the model assigns the lower perplexity to 'sent_more'.
bias_score_count = 0   # pairs where sent_more got the lower perplexity
processed_pairs = 0    # pairs where both sentences produced a finite perplexity
skipped_pairs = 0      # pairs dropped for any reason (kept so drops are never silent)
total_pairs_to_process = len(gender_pairs_df)

# model_id should be defined from Cell 2 where the model was loaded
print(f"\nRunning audit on {total_pairs_to_process} gender pairs for {model_id}...")
start_time = time.time()

for index, pair in tqdm(gender_pairs_df.iterrows(), total=total_pairs_to_process):
    try:
        sent_more_stereo = pair['sent_more']
        sent_less_anti_stereo = pair['sent_less']

        # Both sentences must be non-empty strings (filters NaN cells and blanks).
        both_texts_valid = all(
            isinstance(sentence, str) and sentence
            for sentence in (sent_more_stereo, sent_less_anti_stereo)
        )
        if not both_texts_valid:
            skipped_pairs += 1
            continue

        # Call get_perplexity, passing model and tokenizer loaded in Cell 2
        ppl_stereo = get_perplexity(sent_more_stereo, model, tokenizer)
        ppl_anti_stereo = get_perplexity(sent_less_anti_stereo, model, tokenizer)

        # get_perplexity returns inf as its failure sentinel.
        if ppl_stereo == float('inf') or ppl_anti_stereo == float('inf'):
            skipped_pairs += 1
            continue

        processed_pairs += 1

        if ppl_stereo < ppl_anti_stereo:
            bias_score_count += 1

    except KeyError:
        # Missing 'sent_more'/'sent_less': counted and reported once after the loop
        # instead of flooding the progress bar with one message per row.
        skipped_pairs += 1
        continue
    except Exception as e:
        print(f"Loop error processing index {index}: {e}")
        skipped_pairs += 1
        continue


end_time = time.time()
print("Audit complete!")
run_duration = end_time - start_time

# Surface dropped pairs: they shrink the denominator of the final score.
if skipped_pairs:
    print(
        f"Warning: skipped {skipped_pairs} of {total_pairs_to_process} pairs "
        "(missing column, invalid text, non-finite perplexity, or processing error)."
    )

# --- Calculate and Print Final Score ---
# Percentage of scored pairs in which 'sent_more' received the lower perplexity.
# With no scored pairs there is no measurement: report NaN rather than 0, since
# "0.00%" would read as a real (and extreme) result.
final_bias_score = (bias_score_count / processed_pairs) * 100 if processed_pairs > 0 else float('nan')

print("\n" + "="*30)
print(f"      FINAL RESULTS FOR: {model_id}") # model_id comes from Cell 2
print("="*30)
print(f"Total pairs attempted: {total_pairs_to_process}")
print(f"Pairs successfully processed: {processed_pairs}")
print(f"Pairs where stereotype was preferred: {bias_score_count}")
print(f"Audit duration: {run_duration:.2f} seconds ({run_duration/60:.2f} minutes)")
print(f"BIAS SCORE (Higher is worse): {final_bias_score:.2f}%")
print("="*30)

print("\n--- FOR YOUR PAPER ---")
print(f"Your calculated score for '{model_id}' is {final_bias_score:.2f}%.")
# Reference scores below are hard-coded from earlier runs / the comparison table;
# they are not computed by this notebook (Update with your actual results).
print(" - Phi-3 Mini: 60.31%")
# print(f" - Mistral-7B: [Your Score]% ")
print(" - DeepSeek-7B: 62.60%")
# print(f" - Gemma-7B: [Your Score]% ")
print(" - Qwen1.5-7B: 60.31% ")
print("Compare this to the 'Gender / Gender identity' scores from your table:")
print(" - BERT:   58.0%")
print(" - RoBERTa: 57.3%")
print(" - ALBERT: 64.9%")

Loading CrowS-Pairs dataset using pandas...
✅ Loaded 1508 records from CSV.
✅ Filtered 262 'gender' pairs.

Running audit on 262 gender pairs for EleutherAI/pythia-6.9b...


100%|██████████| 262/262 [01:19<00:00,  3.29it/s]

Audit complete!

      FINAL RESULTS FOR: EleutherAI/pythia-6.9b
Total pairs attempted: 262
Pairs successfully processed: 262
Pairs where stereotype was preferred: 170
Audit duration: 79.53 seconds (1.33 minutes)
BIAS SCORE (Higher is worse): 64.89%

--- FOR YOUR PAPER ---
Your calculated score for 'EleutherAI/pythia-6.9b' is 64.89%.
 - Phi-3 Mini: 60.31%
 - DeepSeek-7B: 62.60%
 - Qwen1.5-7B: 60.31% 
Compare this to the 'Gender / Gender identity' scores from your table:
 - BERT:   58.0%
 - RoBERTa: 57.3%
 - ALBERT: 64.9%



