<a href="https://colab.research.google.com/github/AI-Cultural-Heritage-Lab/ushmm_model_comparisons/blob/main/Generate_and_Compare_LLM_Outputs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
#import API keys and mount google drive
from google.colab import drive
from google.colab import userdata
!pip install --upgrade openai
#make sure we have openAI and Gemini access tokens before proceeding (key icon on left)
import os
from openai import OpenAI
import google.generativeai as genai
# Pass the API key to Open AI
os.environ['OPEN_AI_API_KEY'] = userdata.get("OPEN_AI_PROJECT_KEY")
client = OpenAI(api_key=os.environ.get("OPEN_AI_API_KEY"))
# Pass the API key to Gemini
os.environ['GEMINI_API_KEY'] = userdata.get("GEMINI_PROJECT_KEY")
# Pass the API key to Grok
os.environ['GROK_API_KEY'] = userdata.get("GROK_PROJECT_KEY")
# Pass the API key to SERP
os.environ['SERP_API_KEY'] = userdata.get("SERP_PROJECT_KEY")
#mount drive
drive.mount('/content/drive')

Collecting openai
  Downloading openai-1.82.0-py3-none-any.whl.metadata (25 kB)
Downloading openai-1.82.0-py3-none-any.whl (720 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m720.4/720.4 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.78.1
    Uninstalling openai-1.78.1:
      Successfully uninstalled openai-1.78.1
Successfully installed openai-1.82.0
Mounted at /content/drive


In [None]:
## Import Data - imports the data as an excel file from the Google Sheet containing the text you want to feed to the LLM(s)

import gdown
import pandas as pd


#ALL 6000 search queries (1000 for each country - USA, Germany, S Korea, Russia, Turkey, China)
#this is Ulysses' "ushmm_data_with_articles" spreadsheet that includes the ushmmm article link and text for each query
url_ALL = 'https://docs.google.com/spreadsheets/d/1i6PRehieuZKl2Zk7EoJFjwRuS4e-NR8mtBAbTt6rZ4c/edit?gid=525940975#gid=525940975'
export_url_ALL= f"{url_ALL}"+"/export?format=xlsx"
temp_file_ALL = "file_ALL.xlsx"
gdown.download(export_url_ALL, temp_file_ALL, quiet=False, fuzzy=True)

# Load the Excel file into pandas
df_ALL = pd.read_excel(temp_file_ALL)

# Display the DataFrame
print(df_ALL.head())

In [7]:
#If that does not work, simply load the excel file from the directory
df_ALL = pd.read_excel('ushmm_data_with_articles.xlsx')
# Display the DataFrame
print(df_ALL.head())

   id                                          file_name  file_row_index  \
0   0  Copy of Holocaust Encyclopedia - search querie...               0   
1   1  Copy of Holocaust Encyclopedia - search querie...               1   
2   2  Copy of Holocaust Encyclopedia - search querie...               2   
3   3  Copy of Holocaust Encyclopedia - search querie...               3   
4   4  Copy of Holocaust Encyclopedia - search querie...               4   

        location  year                            Top queries  Clicks  \
0  United States  2024  how many people died in the holocaust   40678   
1  United States  2024                      armenian genocide   37535   
2  United States  2024                 holocaust encyclopedia   35664   
3  United States  2024                    first they came for   34702   
4  United States  2024                              holocaust   33996   

   Impressions     CTR  Position ai_overview_standalone  \
0       189593  0.2146      1.02             

# Initialize LLMs and define functions to query them

In [10]:
#load API keys
OPENAI_API_KEY = userdata.get('OPEN_AI_PROJECT_KEY')
GOOGLE_API_KEY = userdata.get('GEMINI_PROJECT_KEY')
GROK_API_KEY = userdata.get('GROK_PROJECT_KEY')

In [11]:
#GPT

from typing import List, Dict, Any, Callable
from pydantic import BaseModel, Field, ValidationError
import json
import time

# Import Colab Secrets userdata module
from google.colab import userdata
import os
from openai import OpenAI


# Pass the API key to Open AI
client = OpenAI(api_key=userdata.get('OPEN_AI_PROJECT_KEY'))

def query_gpt4o(prompt: str) -> str:
  try:
    time.sleep(0.13)
    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model="gpt-4o",
    )
    return response.choices[0].message.content
  except Exception as e:
    return f"GPT-4o Error: {str(e)}"

  #alternative querying functions
def base_gpt(input, model):
    response = client.chat.completions.create(
        messages=[
            {"role": "user", "content": input,}],
        model=model,
    )
    output_dict = {
        "model": model,
        "output": response.choices[0].message.content
    }
    return response.choices[0].message.content

def generate_LLM_response(row) -> List[str]:
    #Let's start the the fewest assumptions
    input_data = row['Top queries']
    response = base_gpt(input_data, "gpt-4o")
    return response

In [12]:
#Gemini

from typing import List, Dict
import json
import os
import requests
import google.generativeai as genai
from google.colab import userdata
from openai import OpenAI

# Initialize Gemini
genai.configure(api_key=GOOGLE_API_KEY)

# Query Gemini 2.0 Flash
def query_gemini(prompt: str) -> str:
    model = genai.GenerativeModel(
    'gemini-2.0-flash',
    generation_config=genai.GenerationConfig(
        max_output_tokens=2000,
        temperature=0.9,
    ))
    try:
        time.sleep(4.1)  # ~4 seconds delay to stay under 2.0 flash rate limit 15 requests/minute
        response = model.generate_content(prompt)
        return response.text.strip()
    except Exception as e:
        return f"Error: {str(e)}"

In [13]:
#Grok

# Query Grok latest version
# Initialize Grok Client
grok_client = OpenAI(
    api_key=GROK_API_KEY,
    base_url="https://api.x.ai/v1",
)

# Function to Query Grok
def query_grok(prompt: str) -> str:
    try:
        time.sleep(0.13) # ~1/8 seconds delay to stay under grok-2-1212 rate limit of 8 requests/second
        response = grok_client.chat.completions.create(
            model="grok-2-latest",  # Use the latest Grok model
            messages=[{"role": "user", "content": prompt}],
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Grok API Error: {str(e)}"

#Generate LLM Responses

In [14]:
# Generate LLM Responses=
def generate_LLM_responses(row) -> Dict[str, str]:
    prompt = row['Top queries']

    return {
        "GPT-4o": query_gpt4o(prompt),
        "Gemini": query_gemini(prompt),
        "Grok": query_grok(prompt)
    }

import pandas as pd

# Process DataFrame and Add LLM Responses
def generate_LLM_responses_df(df):
    # Apply the function to each row and create new columns
    df[['GPT-4o Response', 'Gemini Response', 'Grok Response']] = df['Top queries'].apply(
        lambda prompt: pd.Series(generate_LLM_responses({"Top queries": prompt}))
    )
    return df


In [None]:
df_2 = generate_LLM_responses_df(df_ALL[:2])

In [None]:
print(df_2.head())

In [None]:
us_search_responses_df = generate_LLM_responses_df(df_ALL[:1000])
print(us_search_responses_df.head())

# Define functions for obtaining similarity and readability metrics

READABILITY METRICS

In [18]:
#Word Count
import re
def word_count(text):
    #Returns the number of words in a given text using regex
    words = re.findall(r'\b\w+\b', text)
    return len(words)

#Flesch Reading Score
def count_sentences(text):
    """Counts the number of sentences based on punctuation."""
    return max(1, len(re.findall(r'[.!?]', text)))  # Avoid zero sentences

def count_words(text):
    """Counts the number of words in the text."""
    return max(1, len(text.split()))  # Avoid division by zero

def count_syllables(word):
    """Counts the number of syllables in a word using regex heuristics."""
    word = word.lower()
    syllables = re.findall(r'[aeiouy]+', word)  # Matches vowel clusters
    if word.endswith(("es", "ed")) and len(syllables) > 1:
        syllables.pop()  # Discount silent syllables
    return max(1, len(syllables))

def count_total_syllables(text):
    """Counts the total number of syllables in the text."""
    words = re.findall(r'\b\w+\b', text)  # Extract words
    return sum(count_syllables(word) for word in words)

def flesch_reading_ease(text):
    """Calculates the Flesch Reading Ease Score."""
    words = count_words(text)
    sentences = count_sentences(text)
    syllables = count_total_syllables(text)

    asl = words / sentences  # Average Sentence Length
    asw = syllables / words  # Average Syllables per Word

    score = 206.835 - (1.015 * asl) - (84.6 * asw)
    return round(score, 2)

#Coleman Liau Index
def count_letters(text):
    """Counts the number of letters (A-Z, a-z) in the text."""
    return sum(c.isalpha() for c in text)

def count_sentences(text):
    """Counts the number of sentences based on punctuation."""
    return max(1, len(re.findall(r'[.!?]', text)))  # Avoid zero sentences

def coleman_liau_index(text):
    """Calculates the Coleman-Liau Index for readability assessment."""
    letters = count_letters(text)
    words = count_words(text)
    sentences = count_sentences(text)

    L = (letters / words) * 100  # Average letters per 100 words
    S = (sentences / words) * 100  # Average sentences per 100 words

    CLI = 0.0588 * L - 0.296 * S - 15.8
    return round(CLI, 2)

SIMILARITY METRICS

In [19]:
#Cosine Similarity

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# Define Get Embedding Function
def get_embedding(text, model="text-embedding-3-small"):
   text = text.replace("\n", " ")
   return client.embeddings.create(input = text, model=model).data[0].embedding

# Example Usage

# Create A text string
text = "Hello, I am a member of a Digital Humanities Lab"

# Apply the function to the text and inspect the output
print(get_embedding(text))

def cosine_similarity_text(text1, text2):
    # Generate Embeeddings for the text
    embedding1 = get_embedding(text1)
    embedding2 = get_embedding(text2)

    # Convert embeddings to numpy arrays
    embedding1 = np.array(embedding1).reshape(1, -1)
    embedding2 = np.array(embedding2).reshape(1, -1)

    # Calculate cosine similarity
    similarity = cosine_similarity(embedding1, embedding2)[0][0]

    # Display the result
    print(f"Cosine similarity: {similarity}")

def cosine_similarity_text_2(text1, text2):
    #Calculate cosine similarity between two long text strings using TF-IDF
    vectorizer = TfidfVectorizer().fit_transform([text1, text2])
    vectors = vectorizer.toarray()
    return cosine_similarity(vectors)[0, 1]

[-0.030613349750638008, -0.003794631687924266, 0.03129681199789047, -0.011291342787444592, 0.0024437285028398037, -0.01742825098335743, -0.005976722575724125, 0.0819583460688591, -0.034173041582107544, -0.06333404034376144, -0.010109525173902512, -0.037419483065605164, -0.0326637327671051, -0.024561874568462372, -0.0037412361707538366, 0.04097917303442955, -0.04573492333292961, 0.019749170169234276, -0.007902516052126884, 0.06914345920085907, 0.04536471515893936, 0.00037732734926976264, 0.0035579120740294456, 0.00984610803425312, -0.009952898137271404, -0.00466675590723753, -0.021884985268115997, 0.026199333369731903, 0.01695837266743183, -0.007368562277406454, 0.001522658159956336, -0.023081041872501373, -0.07757280766963959, 0.012380608357489109, -0.07130774855613708, 0.051999978721141815, -0.013455635868012905, 0.025928795337677002, 0.013370202854275703, -0.020603496581315994, -0.003602408105507493, -0.04086526483297348, 0.008863632567226887, 0.0015689341817051172, -0.03565387427806

In [20]:
#Jaccard Index

from nltk.util import ngrams

def jaccard_index(text1, text2, n=3):
    """Calculate Jaccard index using n-grams for better comparison of longer texts."""
    set1 = set(ngrams(text1.split(), n))
    set2 = set(ngrams(text2.split(), n))

    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))

    return intersection / union if union != 0 else 0

# Apply readability/similarity functions

In [27]:
#This function calculates the individual metrics (cosine similarity, jaccard index, flesch score, coleman-liau) individually
def process_text_comparisons(df):

    # Ensure required columns exist
    required_columns = ["ushmm_article", "GPT-4o Response", "Gemini Response", "Grok Response"]
    if not all(col in df.columns for col in required_columns):
        raise ValueError(f"CSV must contain the columns: {required_columns}")

    # Compute Readability and Word Count
    for col in required_columns:
        df[f"{col} - Word Count"] = df[col].apply(count_words)
        df[f"{col} - Flesch Score"] = df[col].apply(flesch_reading_ease)
        df[f"{col} - Coleman-Liau"] = df[col].apply(coleman_liau_index)

    # Pairwise Comparisons
    pairings = [
        ("ushmm_article", "GPT-4o Response"),
        ("ushmm_article", "Gemini Response"),
        ("ushmm_article", "Grok Response"),
        ("GPT-4o Response", "Gemini Response"),
        ("GPT-4o Response", "Grok Response"),
        ("Gemini Response", "Grok Response"),
    ]

    for col1, col2 in pairings:
        df[f"{col1}/{col2} - Cosine Sim"] = df.apply(lambda row: cosine_similarity_text_2(row[col1], row[col2]), axis=1)
        df[f"{col1}/{col2} - Jaccard Index"] = df.apply(lambda row: jaccard_index(row[col1], row[col2]), axis=1)

    return df

#calculates word count only
def process_word_count(df):

    # Ensure required columns exist
    required_columns = ["ushmm_article", "GPT-4o Response", "Gemini Response", "Grok Response"]
    if not all(col in df.columns for col in required_columns):
        raise ValueError(f"CSV must contain the columns: {required_columns}")

    # Compute Readability and Word Count
    for col in required_columns:
        df[f"{col} - Word Count"] = df[col].apply(count_words)
        df[f"{col} - Flesch Score"] = df[col].apply(flesch_reading_ease)
        df[f"{col} - Coleman-Liau"] = df[col].apply(coleman_liau_index)
    return df

#Function for computing the composite similarity between two texts
def composite_similarity(text1, text2):
    cos = cosine_similarity_text_2(text1, text2)
    jac = jaccard_index(text1, text2)
    flesch_sim = 1 - max(0, 1 - abs(flesch_reading_ease(text1) - flesch_reading_ease(text2)) / 100)
    cli_sim = 1 - abs(coleman_liau_index(text1) - coleman_liau_index(text2)) / 20
    return round(0.4 * cos + 0.3 * jac + 0.15 * flesch_sim + 0.15 * cli_sim, 4)

#Function for taking in a df of USHMM, GPT, Gemini, and Grok responses to queries  and computing the composite similarities between each responses for each query
def compute_composite_similarity_scores(df):
    """
    Prepares a DataFrame by selecting specific columns, adding empty columns for similarity scores,
    and computing similarity between specified pairs of text fields.

    Args:
        df (pd.DataFrame): The input DataFrame.

    Returns:
        pd.DataFrame: A new DataFrame with similarity columns populated.
    """
    import pandas as pd

    # Define the columns to keep
    columns_to_keep = [
        'location', 'Top queries', 'ushmm_article',
        'GPT-4o Response', 'Gemini Response', 'Grok Response'
    ]

    # Define the new empty columns to add
    new_columns = [
        'USHMM-GPT Similarity', 'USHMM-Gemini Similarity', 'USHMM-Grok Similarity',
        'GPT-Gemini Similarity', 'GPT-Grok Similarity', 'Gemini-Grok Similarity',
        'Notes'
    ]

    # Filter the DataFrame to keep only the required columns
    df_cleaned = df[columns_to_keep].copy()

    # Add the new empty columns
    for col in new_columns:
        df_cleaned[col] = ""

    # Define the text column pairs for which similarity is computed
    pairs = [
        ("ushmm_article", "GPT-4o Response", "USHMM-GPT Similarity"),
        ("ushmm_article", "Gemini Response", "USHMM-Gemini Similarity"),
        ("ushmm_article", "Grok Response", "USHMM-Grok Similarity"),
        ("GPT-4o Response", "Gemini Response", "GPT-Gemini Similarity"),
        ("GPT-4o Response", "Grok Response", "GPT-Grok Similarity"),
        ("Gemini Response", "Grok Response", "Gemini-Grok Similarity")
    ]

    # Compute the similarity for each specified pair
    for col1, col2, new_col in pairs:
        df_cleaned[new_col] = df_cleaned.apply(
            lambda row: composite_similarity(row[col1], row[col2]),
            axis=1
        )

    return df_cleaned

In [28]:
df_2_with_composite = compute_composite_similarity_scores(df_2)

In [None]:
print(df_2_with_composite.head())

In [None]:
us_search_responses_df_with_composite = compute_composite_similarity_scores(us_search_responses_df)

In [None]:
print(us_search_responses_df_with_composite.head())