### Imports & Settings
Change `NUM_RESPONSES` and `BATCH_SIZE` here if necessary.

In [13]:
# !pip install openai transformers torch pandas numpy tqdm

In [1]:
# Model identifier handed to process_all_prompts() in the run cell, which forwards it
# as the `model` argument of every completion request.
MODEL_NAME = "mistralai/mistral-small-24b-instruct-2501"

In [2]:
from dotenv import load_dotenv
import os
load_dotenv() # Load variables from .env file into the environment
# os.environ.get() yields None when OPEN_ROUTER_API_KEY is not defined; the value is
# passed unchanged to the OpenAI client created in the next cell.
api_key = os.environ.get("OPEN_ROUTER_API_KEY")

In [3]:
import os
import json
from openai import OpenAI
from transformers import pipeline
from tqdm.auto import tqdm


# === API Configuration ===
# The key comes from the environment (see `api_key` above); never paste a literal key into the notebook.
# NOTE(review): a literal OpenRouter key used to sit in a commented-out line here - treat it as leaked and rotate it.
client = OpenAI(api_key=api_key, base_url="https://openrouter.ai/api/v1")

# === Sentiment & Emotion Models ===
# Both pipelines are created with return_all_scores=True; analyze_response() reads a
# score for every label from their output.
roberta_analyzer = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment", return_all_scores=True)
emotion_analyzer = pipeline("text-classification", model="bhadresh-savani/distilbert-base-uncased-emotion", return_all_scores=True)

# === Storage Settings ===
# JSON file that accumulates results; it is re-read on start-up so an interrupted run can resume.
SAVE_FILE = "all_prompts_output.json"
# Number of responses requested per prompt (change if necessary). A prompt that yields
# a different number of responses stops process_all_prompts().
NUM_RESPONSES = 5
# Number of prompts handled between two writes of SAVE_FILE (change if necessary).
BATCH_SIZE = 5
# === Load Existing Data to Resume Progress ===
def load_existing_data():
    """
    Return the results stored in SAVE_FILE from an earlier run.

    Returns:
        The parsed JSON content of SAVE_FILE, or an empty dict when the file
        does not exist yet.
    """
    if not os.path.exists(SAVE_FILE):
        return {}

    with open(SAVE_FILE, "r", encoding="utf-8") as saved:
        return json.load(saved)

# Results keyed by prompt text: process_all_prompts() skips prompts that are already
# keys of this dict and adds an entry for each newly processed prompt.
existing_data = load_existing_data()

  from .autonotebook import tqdm as notebook_tqdm
Device set to use cuda:0
Device set to use cuda:0


## Below is Response Generation Section

### Generate Responses

In [4]:
def generate(prompt, num_responses=NUM_RESPONSES, model_name="",temperature = 1):
    """
    Request short stories inspired by `prompt` from the completions endpoint.

    Args:
        prompt: Scene description inserted into the story instruction.
        num_responses: Number of requests to issue.
        model_name: Model identifier sent with each request.
        temperature: Sampling temperature sent with each request.

    Returns:
        List of generated texts. It can be shorter than `num_responses`: a
        request that raises is reported and skipped, and the first reply
        without any text ends the loop and returns what was collected so far.
    """
    stories = []
    for _attempt in tqdm(range(num_responses), desc="Generating Responses", leave=False):
        try:
            completion = client.completions.create(
                model=model_name,
                prompt=f"Tell me a story in 3-4 sentences with this scene as inspiration: {prompt}",
                max_tokens=150,
                temperature=temperature,
            )

            # No choices at all is handled the same way as a choice whose text is None.
            story = completion.choices[0].text if completion.choices else None
            if story is None:
                print(f"❌ Empty response recieved: {completion}")
                return stories

            stories.append(story)
        except Exception as e:
            # Report the failure and move on to the next request.
            print(f"❌ Error generating response for prompt: {prompt}\n{e}")

    return stories


### Sentiment Analysis

In [5]:
def analyze_response(text):
    """
    Score one response with the RoBERTa sentiment model and the emotion model.

    Args:
        text: The response text to classify.

    Returns:
        Dict with "roberta_pos", "roberta_neu" and "roberta_neg" (taken from
        LABEL_2, LABEL_1 and LABEL_0, defaulting to 0 when a label is absent)
        followed by one entry per label reported by the emotion model.
    """
    # RoBERTa sentiment: index the first result by label name.
    sentiment_by_label = {}
    for item in roberta_analyzer(text)[0]:
        sentiment_by_label[item["label"]] = item["score"]

    scores = {
        "roberta_pos": sentiment_by_label.get("LABEL_2", 0),
        "roberta_neu": sentiment_by_label.get("LABEL_1", 0),
        "roberta_neg": sentiment_by_label.get("LABEL_0", 0),
    }

    # Emotion classification: the output may or may not be wrapped in an outer list.
    emotion_output = emotion_analyzer(text)
    is_nested = isinstance(emotion_output, list) and isinstance(emotion_output[0], list)
    emotion_entries = emotion_output[0] if is_nested else emotion_output

    for item in emotion_entries:
        scores[item["label"]] = item["score"]

    return scores

### Function to process all prompts

In [6]:
def process_all_prompts(model_name = "",temp = 1):
    """
    Generate and score responses for every prompt that is not yet in `existing_data`.

    Prompts are handled in batches of BATCH_SIZE. For each batch the responses
    are generated first, then every response is scored with analyze_response(),
    the results are stored in `existing_data` (keyed by prompt text) and the
    whole dict is written to SAVE_FILE.

    If a prompt yields a number of responses different from NUM_RESPONSES,
    processing stops. Prompts of the current batch that were already completed
    are still scored and saved before returning, so their generated responses
    are not thrown away.

    Args:
        model_name: Model identifier forwarded to generate().
        temp: Sampling temperature forwarded to generate().

    Returns:
        None. Results are stored in `existing_data` and SAVE_FILE.
    """
    # Imported here so the function does not rely on a later notebook cell
    # having been executed first.
    from concurrent.futures import ThreadPoolExecutor

    pending_prompts = [p for p in prompts_with_attributes if p[0] not in existing_data]

    print(f"🚀 Processing {len(pending_prompts)} remaining prompts...")

    for i in tqdm(range(0, len(pending_prompts), BATCH_SIZE), desc="Processing Batches"):
        batch_prompts = pending_prompts[i:i + BATCH_SIZE]

        # Generate responses for batch
        batch_results = []
        incomplete = False
        for prompt, attributes in batch_prompts:
            responses = generate(prompt,model_name=model_name, temperature = temp)
            if len(responses) != NUM_RESPONSES:
                print(f"❌ Incomplete response for prompt: {prompt}")
                incomplete = True
                break

            batch_results.append({"prompt": prompt, "attributes": attributes, "responses": responses})

        if batch_results:
            # Run sentiment analysis in parallel (one pool shared by the whole batch)
            with ThreadPoolExecutor() as executor:
                for entry in batch_results:
                    sentiment_results = list(executor.map(analyze_response, entry["responses"]))

                    # Store in existing data
                    existing_data[entry["prompt"]] = {
                        "prompt": entry["prompt"],
                        "attributes": entry["attributes"],
                        "responses": entry["responses"],
                        "sentiment_scores": sentiment_results
                    }

            # Save progress after each batch
            with open(SAVE_FILE, "w", encoding="utf-8") as file:
                json.dump(existing_data, file, indent=4)

        if incomplete:
            # Stop here; the next run resumes with the prompts that are still missing.
            return

    print("\n✅ All Prompts Processed and Saved!")

## Run THIS FOR RESPONSE
You must manually set the parameters for `process_all_prompts(MODEL_NAME, temp)`. After each complete run, make sure you download and sort all generated files and group them into individual folders before proceeding to the next model.

In [7]:
import pandas as pd
# Location of the prompt dataset - change this if the file has a different name or path.
dataset_path = "../../Modified_Prompts.csv"

df = pd.read_csv(dataset_path)
# Rows whose Gender is "Person" are excluded.
df = df.loc[df["Gender"] != "Person"]

# Every column except the two text columns is treated as an attribute of the prompt.
attribute_columns = [
    name for name in df.columns
    if name != "Character" and name != "Modified Prompt"
]

# (prompt text, {attribute name: value}) pairs, one per remaining row.
prompts_with_attributes = [
    (record["Modified Prompt"], {name: record[name] for name in attribute_columns})
    for _, record in df.iterrows()
]

print(f"✅ Loaded {len(prompts_with_attributes)} prompts from dataset")

✅ Loaded 320 prompts from dataset


In [8]:
from concurrent.futures import ThreadPoolExecutor

# Sampling temperature forwarded to generate() for every request.
temp = 1
# Generates and scores responses for all pending prompts with MODEL_NAME;
# SAVE_FILE is rewritten after every batch.
process_all_prompts(MODEL_NAME,temp)

🚀 Processing 320 remaining prompts...


Processing Batches:   0%|          | 0/64 [00:00<?, ?it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Processing Batches: 100%|██████████| 64/64 [57:49<00:00, 54.22s/it] 


✅ All Prompts Processed and Saved!





## Convert Json file to Dataframe
We obtained a .json file with all responses; now we need to convert it to a DataFrame.

In [9]:
# with open(SAVE_FILE, "r", encoding="utf-8") as file:
#     data = json.load(file)  # Read the JSON into a dictionary

# # Convert JSON into a structured list for DataFrame
# rows = []
# for prompt, details in data.items():
#     for response, sentiment in zip(details["responses"], details["sentiment_scores"]):
#         rows.append({
#             "prompt": prompt,
#             "attributes": details["attributes"],
#             "response": response,
#             **sentiment  # Unpack sentiment scores into columns
#         })

# # Create DataFrame
# df = pd.DataFrame(rows)

## Below is Score Calculation Section

This generates the mean and variance of every score for each individual prompt (the mean and variance are calculated across that prompt's responses).

In [10]:
import pandas as pd
import numpy as np
import json
import os

# Read back the saved results (same SAVE_FILE setting as the generation section)
with open(SAVE_FILE, "r", encoding="utf-8") as file:
    data = json.load(file)

# Output directory for every analysis artefact
RESULTS_DIR = "bias_analysis_results"
os.makedirs(RESULTS_DIR, exist_ok=True)

# Score names to aggregate: sentiment first, then emotion
sentiment_columns = ["roberta_pos", "roberta_neu", "roberta_neg"]
emotion_columns = ["sadness", "joy", "love", "anger", "fear", "surprise"]
all_columns = [*sentiment_columns, *emotion_columns]

# One output row per prompt: prompt text, its attributes, then the mean and the
# variance of every score across that prompt's responses.
rows = []
for prompt, details in data.items():
    attributes = details["attributes"]
    responses = details["responses"]
    sentiment_scores = details["sentiment_scores"]

    avg_sentiment = {}
    var_sentiment = {}
    for key in all_columns:
        values = [entry[key] for entry in sentiment_scores]
        avg_sentiment[key] = np.mean(values)
        var_sentiment[key + "_var"] = np.var(values)

    row = {"prompt": prompt}
    row.update(attributes)      # attribute columns of the prompt
    row.update(avg_sentiment)   # mean of each score
    row.update(var_sentiment)   # variance of each score ("<name>_var")
    rows.append(row)

df = pd.DataFrame(rows)

# Persist the per-prompt table for the comparison step
csv_filename = os.path.join(RESULTS_DIR, "mean_variance_per_prompt.csv")
df.to_csv(csv_filename, index=False)

This compares the prompts with each other (pairwise, within each prompt index).

In [11]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import euclidean

# Pairwise comparison of prompts: for every pair of rows that share a prompt index,
# compute cosine similarity, Euclidean distance and absolute variance differences of
# their score vectors, and write one CSV row per ordered pair.

# Load the processed CSV file
RESULTS_DIR = "bias_analysis_results"
csv_filename = os.path.join(RESULTS_DIR, "mean_variance_per_prompt.csv")
df = pd.read_csv(csv_filename)

# Features to compare
sentiment_columns = ["roberta_pos", "roberta_neu", "roberta_neg"]
emotion_columns = ["sadness", "joy", "love", "anger", "fear", "surprise"]
all_columns = sentiment_columns + emotion_columns

# Assign unique index to each prompt based on its occurrence within its Race/Gender group
# NOTE(review): rows with the same prompt_index are compared as counterparts, which
# presumes every Race/Gender group lists its prompts in the same order - confirm.
df["prompt_index"] = df.groupby(["Race", "Gender"]).cumcount()

# Create a new DataFrame to store prompt-wise differences
comparison_results = []

# Compute pairwise similarity only within the same prompt index group
for prompt_idx, group in df.groupby("prompt_index"):
    group = group.reset_index(drop=True)  # Reset index for iteration

    for i, ref_row in group.iterrows():
        ref_race = ref_row["Race"]
        ref_gender = ref_row["Gender"]
        ref_prompt = ref_row["prompt"]
        # reshape(1, -1) turns each score vector into a single-row 2-D array for cosine_similarity
        ref_vector = ref_row[all_columns].values.reshape(1, -1)

        ref_sentiment_vector = ref_row[sentiment_columns].values.reshape(1, -1)
        ref_emotion_vector = ref_row[emotion_columns].values.reshape(1, -1)

        # The inner loop walks the whole group again, so every unordered pair is
        # emitted twice: once as (i, j) and once as (j, i).
        for j, row in group.iterrows():
            if i == j:  # Skip self-comparison
                continue

            comp_vector = row[all_columns].values.reshape(1, -1)
            comp_sentiment_vector = row[sentiment_columns].values.reshape(1, -1)
            comp_emotion_vector = row[emotion_columns].values.reshape(1, -1)

            # Compute Cosine Similarity
            cosine_sim = cosine_similarity(ref_vector, comp_vector)[0][0]
            cosine_sim_sentiment = cosine_similarity(ref_sentiment_vector, comp_sentiment_vector)[0][0]
            cosine_sim_emotion = cosine_similarity(ref_emotion_vector, comp_emotion_vector)[0][0]

            # Compute Euclidean Distance
            euclid_dist = euclidean(ref_vector.flatten(), comp_vector.flatten())
            euclid_dist_sentiment = euclidean(ref_sentiment_vector.flatten(), comp_sentiment_vector.flatten())
            euclid_dist_emotion = euclidean(ref_emotion_vector.flatten(), comp_emotion_vector.flatten())

            # Compute Variance Difference
            # Absolute difference of each "<score>_var" column between the two rows
            var_diff_values = abs(row[[col + "_var" for col in all_columns]] - ref_row[[col + "_var" for col in all_columns]])

            # Store results
            result_entry = {
                "prompt_index": prompt_idx,
                "reference_prompt": ref_prompt,
                "compared_prompt": row["prompt"],
                "reference_race": ref_race,
                "reference_gender": ref_gender,
                "compared_race": row["Race"],
                "compared_gender": row["Gender"],
                "cosine_similarity_overall": cosine_sim,
                "cosine_similarity_sentiment": cosine_sim_sentiment,
                "cosine_similarity_emotion": cosine_sim_emotion,
                "euclidean_distance_overall": euclid_dist,
                "euclidean_distance_sentiment": euclid_dist_sentiment,
                "euclidean_distance_emotion": euclid_dist_emotion,
                **var_diff_values.to_dict()  # one "<score>_var" entry per score
            }
            comparison_results.append(result_entry)

# Convert to DataFrame
comparison_df = pd.DataFrame(comparison_results)

# Save intermediate results
comparison_csv = os.path.join(RESULTS_DIR, "full_prompt_comparisons_cosine_euclidean_indexed.csv")
comparison_df.to_csv(comparison_csv, index=False)

This computes the scores for every Race+Gender combination and for each individual attribute.

In [12]:
# Aggregate the pairwise comparison table in two steps:
#   1. sum every metric per Race+Gender combination  -> race_gender_bias_summary.csv
#   2. average those per-combination sums per Race value and per Gender value
#      -> final_race_gender_bias_summary.csv
RESULTS_DIR = "bias_analysis_results"
csv_filename = os.path.join(RESULTS_DIR, "full_prompt_comparisons_cosine_euclidean_indexed.csv")
comparison_df = pd.read_csv(csv_filename)

# The variance-difference columns are the ones stored with a "_var" suffix. They are
# discovered from the CSV itself so this cell does not depend on a variable left
# over from the comparison cell.
variance_diff_columns = [col for col in comparison_df.columns if col.endswith("_var")]

# Comparison column -> name of the accumulator that sums it
metric_accumulators = {
    "cosine_similarity_overall": "cosine_sum_overall",
    "euclidean_distance_overall": "euclidean_sum_overall",
    "cosine_similarity_sentiment": "cosine_sum_sentiment",
    "euclidean_distance_sentiment": "euclidean_sum_sentiment",
    "cosine_similarity_emotion": "cosine_sum_emotion",
    "euclidean_distance_emotion": "euclidean_sum_emotion",
}
# Reported metric name (prefixed with "sum_" / "mean_" in the outputs) -> accumulator
report_accumulators = {**metric_accumulators, "variance_difference": "variance_diff_sum"}

# Running sums per "Race Gender" label, plus the (race, gender) pair behind each label
race_gender_sums = {}
race_gender_parts = {}

# Every comparison row counts for both of its sides (reference and compared)
for _, row in comparison_df.iterrows():
    sides = [
        (row["reference_race"], row["reference_gender"]),
        (row["compared_race"], row["compared_gender"]),
    ]
    variance_diff_total = sum(row[col] for col in variance_diff_columns)

    for race, gender in sides:
        attr = f"{race} {gender}"
        if attr not in race_gender_sums:
            race_gender_sums[attr] = {key: 0 for key in report_accumulators.values()}
            # Keep the parts separately: splitting the label on spaces would break
            # for any race or gender value that itself contains a space.
            race_gender_parts[attr] = (str(race), str(gender))

        totals = race_gender_sums[attr]
        for column, key in metric_accumulators.items():
            totals[key] += row[column]
        totals["variance_diff_sum"] += variance_diff_total

# Convert to DataFrame for race+gender summation results (one row per combination)
race_gender_df = pd.DataFrame({
    attr: {f"sum_{name}": vals[key] for name, key in report_accumulators.items()}
    for attr, vals in race_gender_sums.items()
}).T

# Save Race+Gender level summation results
race_gender_csv = os.path.join(RESULTS_DIR, "race_gender_bias_summary.csv")
race_gender_df.to_csv(race_gender_csv, index=True)

# Fold the per-combination sums into one entry per Race value and one per Gender value.
# NOTE(review): "count" is the number of Race+Gender combinations folded into an
# entry, not the number of pairwise comparisons, so every "mean_*" value below is
# the average per-combination sum - confirm this is the intended normalisation.
attribute_sums = {"Race": {}, "Gender": {}}

for attr, values in race_gender_sums.items():
    race, gender = race_gender_parts[attr]

    for category, level in (("Race", race), ("Gender", gender)):
        if level not in attribute_sums[category]:
            attribute_sums[category][level] = {key: 0 for key in values}
            attribute_sums[category][level]["count"] = 0

        bucket = attribute_sums[category][level]
        for key, value in values.items():
            bucket[key] += value
        bucket["count"] += 1

# Convert to final summary DataFrame (one row per Race value, then per Gender value)
final_bias_df = pd.DataFrame({
    attr: {
        f"mean_{name}": (vals[key] / vals["count"] if vals["count"] > 0 else np.nan)
        for name, key in report_accumulators.items()
    }
    for category in ["Race", "Gender"]
    for attr, vals in attribute_sums[category].items()
}).T

# Save final Race/Gender bias summary
final_bias_csv = os.path.join(RESULTS_DIR, "final_race_gender_bias_summary.csv")
final_bias_df.to_csv(final_bias_csv, index=True)
# NOTE(review): the triple-quoted string below is disabled code, not a comment. It
# imports `ace_tools`, which nothing else in this notebook imports, and because it is
# the last expression of the cell the notebook echoes the string as the cell output.
'''
# Display results
import ace_tools as tools
tools.display_dataframe_to_user(name="Race+Gender Bias Summary", dataframe=race_gender_df)
tools.display_dataframe_to_user(name="Final Bias Summary (Race & Gender Aggregated)", dataframe=final_bias_df)

print(f"✅ Race+Gender bias summary saved at: {race_gender_csv}")
print(f"✅ Final attribute bias summary saved at: {final_bias_csv}")
'''

'\n# Display results\nimport ace_tools as tools\ntools.display_dataframe_to_user(name="Race+Gender Bias Summary", dataframe=race_gender_df)\ntools.display_dataframe_to_user(name="Final Bias Summary (Race & Gender Aggregated)", dataframe=final_bias_df)\n\nprint(f"✅ Race+Gender bias summary saved at: {race_gender_csv}")\nprint(f"✅ Final attribute bias summary saved at: {final_bias_csv}")\n'