In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_name = "mistralai/Mistral-7B-Instruct-v0.1"

# Load tokenizer and model (uses your logged-in credentials)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # Half precision halves the weight footprint compared with float32.
    # NOTE(review): the fp16 checkpoint shards total roughly 14.5 GB
    # (9.94G + 4.54G), which is more than a 6GB card can hold, so
    # device_map="auto" presumably offloads part of the model — confirm with
    # the placement report printed below.
    torch_dtype=torch.float16,
    device_map="auto"
)

print("✅ Model loaded on:", next(model.parameters()).device)

# The first parameter's device only tells us where the first layer lives.
# When a device map is used, the model records the per-module placement in
# `hf_device_map`; report it so CPU/disk offload is visible.
_placements = sorted({str(dev) for dev in (getattr(model, "hf_device_map", None) or {}).values()})
if _placements:
    print("ℹ️ Module placement:", ", ".join(_placements))
    if any(dev in ("cpu", "disk") for dev in _placements):
        print("⚠️ Part of the model is offloaded from the GPU; generation will be slow.")


tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]



✅ Model loaded on: cuda:0


In [62]:
import os
import re
import time

import pandas as pd
from tqdm import tqdm

def classify_stereotype_with_mistral_v5_silent(line):
    """Classify one script line into a gender-stereotype category.

    Wraps ``line`` in a fixed instruction prompt, generates up to 10 new
    tokens with the module-level ``model`` / ``tokenizer``, and reads the
    label from the generated continuation only. Produces no console output.

    Args:
        line: Text of the script line. ``None``, NaN (how pandas represents
            an empty CSV cell) and blank strings are not sent to the model.

    Returns:
        One of ``"occupation_gap"``, ``"agency_gap"``, ``"appearance_focus"``,
        ``"relationship_only"`` or ``"none"``. ``"none"`` is also the fallback
        when the input is missing/blank or the model's answer contains no
        recognised label.
    """
    valid_labels = {"occupation_gap", "agency_gap", "appearance_focus", "relationship_only", "none"}

    # Missing or blank input: skip the (slow) model call instead of asking
    # the model to classify the literal text "nan" / "None".
    is_missing = line is None or (isinstance(line, float) and line != line)
    if is_missing or not str(line).strip():
        return "none"

    prompt = f"""You are a language model trained to detect gender stereotypes in lines from movie scripts.
Your task is to classify each line into exactly one of the following five categories, based on the presence of stereotypical portrayal—especially of women.

Categories:

occupation_gap → The character is not working or lacks any mention of professional role or ambition.
agency_gap → The character lacks independence, decision-making ability, or is portrayed as passive/submissive.
appearance_focus → The character is described primarily by looks, beauty, body, or clothing.
relationship_only → The character is introduced only in terms of their relationship to another person (e.g., someone's wife, daughter).
none → The line is neutral or empowering, without any clear stereotype or bias.

Assign only one label per line. Read carefully to detect subtle stereotypes.  Return only the  label for each line — no extra words or explanations.

Line: "{line}"
Label:"""

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]
    outputs = model.generate(
        **inputs,
        max_new_tokens=10,
        pad_token_id=tokenizer.eos_token_id,
    )

    # The generated sequence starts with the prompt tokens. Decode only what
    # comes after them so that text inside the prompt (including the script
    # line itself) can never be mistaken for the model's answer.
    completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Take the FIRST recognised label: it is the answer to our "Label:" cue.
    # Anything after it is the model continuing with made-up lines/labels.
    for word in re.findall(r"\w+", completion.lower()):
        if word in valid_labels:
            return word
    return "none"


In [None]:
# Load your full CSV
df = pd.read_csv("cleaned_script_data.csv")
chunk_size = 250  # Experiment with 200–300 per chunk

# Register `progress_apply` on pandas objects once, not on every iteration.
tqdm.pandas()

for start in range(0, len(df), chunk_size):
    end = min(start + chunk_size, len(df))
    chunk_path = f"chunk_{start}_{end}.csv"

    # Resume support: each chunk takes a long time, so a chunk that was
    # already saved by an earlier (interrupted) run is not recomputed.
    # Delete the chunk files first if the prompt or the input data changed.
    if os.path.exists(chunk_path):
        print(f"⏭️ Skipping lines {start} to {end} (already saved: {chunk_path})")
        continue

    chunk = df.iloc[start:end].copy()

    print(f"🧠 Processing lines {start} to {end}")
    chunk["stereotype_type"] = chunk["line"].progress_apply(classify_stereotype_with_mistral_v5_silent)

    # Write to a temporary name and rename afterwards, so an interruption
    # during the write never leaves a truncated file under the final name
    # (which the resume check above and the later merge would trust).
    tmp_path = chunk_path + ".tmp"
    chunk.to_csv(tmp_path, index=False)
    os.replace(tmp_path, chunk_path)
    print(f"✅ Saved: {chunk_path}")

    # Free GPU memory
    torch.cuda.empty_cache()
    time.sleep(2)

🧠 Processing lines 0 to 250


100%|██████████| 250/250 [34:17<00:00,  8.23s/it]


✅ Saved: chunk_0_250.csv
🧠 Processing lines 250 to 500


100%|██████████| 250/250 [35:03<00:00,  8.41s/it]


✅ Saved: chunk_250_500.csv
🧠 Processing lines 500 to 750


100%|██████████| 250/250 [32:35<00:00,  7.82s/it]


✅ Saved: chunk_500_750.csv
🧠 Processing lines 750 to 1000


100%|██████████| 250/250 [29:44<00:00,  7.14s/it]


✅ Saved: chunk_750_1000.csv
🧠 Processing lines 1000 to 1250


100%|██████████| 250/250 [36:29<00:00,  8.76s/it]


✅ Saved: chunk_1000_1250.csv
🧠 Processing lines 1250 to 1500


100%|██████████| 250/250 [33:40<00:00,  8.08s/it]


✅ Saved: chunk_1250_1500.csv
🧠 Processing lines 1500 to 1750


 22%|██▏       | 55/250 [07:56<29:19,  9.02s/it]

In [None]:
import glob

def _chunk_sort_key(path):
    """Order chunk files by the numeric start offset in their file name.

    Files that do not follow ``chunk_<start>_<end>.csv`` are kept but placed
    after the numbered chunks, in name order.
    """
    found = re.search(r"chunk_(\d+)_\d+\.csv$", path)
    if found:
        return (0, int(found.group(1)), path)
    return (1, 0, path)


# A plain string sort puts "chunk_1000_1250.csv" before "chunk_250_500.csv",
# which would scramble the row order of the merged file, so sort numerically.
files = sorted(glob.glob("chunk_*.csv"), key=_chunk_sort_key)
if not files:
    raise FileNotFoundError("No chunk_*.csv files found in the working directory; nothing to merge.")

df_all = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)
df_all.to_csv("ai_stereotype_annotated_final.csv", index=False)
print("✅ All chunks merged and saved.")
