In [None]:
# --- 0. Core Libraries & Installations ---
# Install core libraries and ensure NumPy compatibility for MoverScore early.
# --force-reinstall is crucial to ensure numpy is downgraded.
!pip install pandas nltk spacy transformers huggingface_hub tensorflow
!pip install 'numpy<2' --force-reinstall

Collecting numpy<2
  Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-contrib-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
opencv-python-headless 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
pytensor 2.35.1 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.
shap 0.50.0 requires numpy>=2, but you have numpy 1.26.4 which 

In [None]:
# Install required dependencies for moverscore and general libraries.
!pip install pyemd
!pip install pytorch_pretrained_bert
!pip install moverscore bert-score

Collecting pyemd
  Using cached pyemd-1.0.0.tar.gz (87 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: pyemd
  Building wheel for pyemd (pyproject.toml) ... [?25l[?25hdone
  Created wheel for pyemd: filename=pyemd-1.0.0-cp312-cp312-linux_x86_64.whl size=742847 sha256=35889350bc6383e1755d1a8689433639c5edd96db5320ebd0fcc87023c93aa49
  Stored in directory: /root/.cache/pip/wheels/bf/7d/e0/84ae1a3c2e45898a01b400c288b56a601c03fd36f2a4d060bf
Successfully built pyemd
Installing collected packages: pyemd
Successfully installed pyemd-1.0.0
Collecting pytorch_pretrained_bert
  Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl.metadata (86 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.7/86.7 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Collecting boto3 (from pytorch_pretrained_bert)
  Downloading

In [None]:
import pandas as pd
import numpy as np
import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
import string
from collections import defaultdict
import json
import os

In [None]:
# Install spaCy model
!python -m spacy download en_core_web_sm

# Clone BLEURT repository for local installation (required for bleurt.score import structure)
# This is primarily to make the 'bleurt' package available for import, even if
# we end up using transformers for model loading.
if not os.path.exists('bleurt'):
    print("Cloning BLEURT repository...")
    !git clone https://github.com/google-research/bleurt.git
else:
    print("BLEURT repository already cloned.")

# Install BLEURT library locally from the cloned repository.
# This makes the bleurt.score module available, although its scorer is not directly used for the HF model.
current_dir = os.getcwd()
%cd bleurt
print("Installing TensorFlow and BLEURT library from cloned repo...")
!pip install .
%cd {current_dir}

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m141.1 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Cloning BLEURT repository...
Cloning into 'bleurt'...
remote: Enumerating objects: 134, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (18/18), done.[K
remote: Total 134 (delta 0), reused 17 (delta 0), pack-reused 116 (from 1)[K
Receiving objects: 100% (134/134), 31.28 MiB | 15.52 MiB/s,

In [None]:
# --- 1. Imports (moved here for clarity after installations) ---
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Optional metric back-ends. Every import failure is reported and the imported
# name is bound to None, so later code can test for availability instead of
# hitting a NameError.
try:
    from bert_score import score as bert_score_calc
except ImportError:
    print("bert_score not found. Please ensure it's installed.")
    bert_score_calc = None

try:
    # The original bleurt library is still installed, but its scorer won't be used directly
    # if the HF model is loaded instead. We still need to import it for the `BleurtScorer` class definition.
    from bleurt.score import BleurtScorer
except ImportError:
    print("bleurt not found. Please ensure it's installed.")
    # Keep the name defined, consistent with the other optional imports.
    BleurtScorer = None

try:
    from moverscore import get_idf_dict, word_mover_score
    print("moverscore functions imported successfully.")
except ImportError:
    print("moverscore not found or failed to import. Please run 'pip install moverscore' and its dependencies.")
    get_idf_dict = None
    word_mover_score = None
except Exception as e:
    # Importing moverscore runs code at import time (the recorded output shows
    # it downloading a model archive), so failures other than ImportError are possible.
    print(f"An unexpected error occurred during moverscore import: {e}")
    get_idf_dict = None
    word_mover_score = None


# --- 2. One-time NLTK Downloads ---
nltk.download('punkt', quiet=True)
# Recent NLTK releases load the 'punkt_tab' resource for word_tokenize instead
# of the pickled 'punkt' models, so fetch both to work on either version.
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)

# --- 3. Global Model Loadings ---
# Load spaCy model; `nlp` stays None when the model package is not installed so
# that get_syntactic_complexity can report the problem explicitly.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    print("Spacy model 'en_core_web_sm' not found. Please run: python -m spacy download en_core_web_sm")
    nlp = None

# Define BLEURT model details (Hugging Face)
BLEURT_MODEL_NAME = "Elron/bleurt-base-512"
LOCAL_BLEURT_CHECKPOINT_DIR = os.path.abspath("bleurt-base-512") # Use absolute path for robustness

# Download Hugging Face BLEURT checkpoint (if not already downloaded)
from huggingface_hub import snapshot_download

os.makedirs(LOCAL_BLEURT_CHECKPOINT_DIR, exist_ok=True)
print(f"Downloading BLEURT checkpoint '{BLEURT_MODEL_NAME}' from Hugging Face...")
try:
    # `local_dir_use_symlinks` is deliberately not passed: recent huggingface_hub
    # releases always place real files in `local_dir` and have deprecated the flag.
    snapshot_download(
        repo_id=BLEURT_MODEL_NAME,
        local_dir=LOCAL_BLEURT_CHECKPOINT_DIR,
    )
    print(f"Successfully downloaded BLEURT checkpoint to {LOCAL_BLEURT_CHECKPOINT_DIR}")
except Exception as e:
    # Best effort: report the failure and still try to load below, in case the
    # directory already holds a checkpoint from an earlier run.
    print(f"Error downloading BLEURT checkpoint: {e}")
    print("Please manually verify the model name and available files on Hugging Face for BLEURT.")

# Load BLEURT model using Hugging Face Transformers.
# Both names stay None when loading fails; get_bleurt checks for that.
transformers_bleurt_tokenizer = None
transformers_bleurt_model = None

try:
    # Load tokenizer and model from the local directory where snapshot_download saved them.
    # Direct path is used, without local_files_only=True, as it caused issues previously.
    print(f"Attempting to load BLEURT tokenizer and model from transformers library using {LOCAL_BLEURT_CHECKPOINT_DIR}")
    transformers_bleurt_tokenizer = AutoTokenizer.from_pretrained(LOCAL_BLEURT_CHECKPOINT_DIR)
    transformers_bleurt_model = AutoModelForSequenceClassification.from_pretrained(LOCAL_BLEURT_CHECKPOINT_DIR)
    # Ensure model is in evaluation mode
    transformers_bleurt_model.eval()
    print("BLEURT model (transformers) loaded successfully.")
except Exception as e:
    print(f"Error loading Hugging Face BLEURT model from {LOCAL_BLEURT_CHECKPOINT_DIR}: {e}")
    print("Please verify the downloaded checkpoint is a valid Hugging Face Transformers model.")

# Set bleurt_scorer to None, as it won't be used due to incompatibility
# with the downloaded checkpoint. The get_bleurt function will now use
# the transformers_bleurt_model.
bleurt_scorer = None

Downloading https://github.com/AIPHES/emnlp19-moverscore/releases/download/0.6/MNLI_BERT.zip to /root/.moverscore/MNLI_BERT.zip
[--------------------------------------------------]
moverscore functions imported successfully.
Downloading BLEURT checkpoint 'Elron/bleurt-base-512' from Hugging Face...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/321 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

README.md:   0%|          | 0.00/999 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Successfully downloaded BLEURT checkpoint to /content/bleurt-base-512
Attempting to load BLEURT tokenizer and model from transformers library using /content/bleurt-base-512
BLEURT model (transformers) loaded successfully.


In [None]:
# --- 4. Helper Functions ---
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return the NLTK English stopword list as a frozenset, building it only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text_for_lexical(text: str) -> list:
    """
    Lowercase, remove punctuation, remove stopwords, and tokenize.

    Args:
        text: Raw text. Anything that is not a ``str`` (e.g. ``None`` or a NaN
            cell from a DataFrame) is treated as empty text.

    Returns:
        The purely alphabetic, non-stopword tokens of ``text`` in their
        original order; an empty list for empty or non-string input.
    """
    if not isinstance(text, str):
        return []

    # The stopword set is cached so it is not rebuilt for every text.
    stop_words = _english_stopwords()

    # Only the ASCII characters in string.punctuation are stripped here.
    # NOTE(review): stripping happens before the stopword filter, so a
    # contraction such as "don't" becomes "dont" — confirm this is intended.
    cleaned = text.lower().translate(str.maketrans('', '', string.punctuation))
    return [word for word in word_tokenize(cleaned) if word not in stop_words and word.isalpha()]

def get_syntactic_complexity(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """
    Calculates syntactic complexity features using spaCy.

    This provides proxies for MLC (Mean Length of Clause) and
    CN/C (Complex Nominals per Clause).

    Metrics:
    - mean_sentence_length: Avg. # of non-punctuation tokens per sentence.
    - mean_noun_chunk_length: Avg. # of tokens per noun chunk (proxy for nominal elaboration).
    - sub_clauses_per_sentence: Avg. # of tokens with the 'mark' dependency
      per sentence (proxy for clausal complexity).

    Args:
        df: Frame holding the texts.
        col: Name of the text column. Cells that are not ``str`` (e.g. NaN or
            None) are analysed as empty text instead of crashing spaCy.

    Returns:
        A DataFrame with one row per input row, in input order, on a fresh
        0..n-1 index. Every metric is 0 when its denominator is 0.

    Raises:
        EnvironmentError: If the global spaCy pipeline ``nlp`` is not loaded.
    """
    if nlp is None:
        raise EnvironmentError("spaCy model is not loaded. Please check setup.")

    metric_names = ['mean_sentence_length', 'mean_noun_chunk_length', 'sub_clauses_per_sentence']
    texts = [text if isinstance(text, str) else '' for text in df[col]]

    results = []
    # nlp.pipe processes the texts as a stream, which is spaCy's recommended
    # way to run many documents and yields the same Doc objects as nlp(text).
    for doc in nlp.pipe(texts):
        num_sentences = sum(1 for _ in doc.sents)
        num_tokens = sum(1 for token in doc if not token.is_punct)
        num_sub_clauses = sum(1 for token in doc if token.dep_ == 'mark')
        chunk_lengths = [len(chunk) for chunk in doc.noun_chunks]

        results.append({
            'mean_sentence_length': num_tokens / num_sentences if num_sentences > 0 else 0,
            'mean_noun_chunk_length': sum(chunk_lengths) / len(chunk_lengths) if chunk_lengths else 0,
            'sub_clauses_per_sentence': num_sub_clauses / num_sentences if num_sentences > 0 else 0,
        })

    # Explicit columns keep the schema stable even for an empty input frame.
    return pd.DataFrame(results, columns=metric_names)

def get_lexical_richness(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """
    Calculates lexical richness features: TTR and Hapax Rate.

    Tokens are produced by preprocess_text_for_lexical.

    Metrics:
    - ttr (Type-Token Ratio): Unique tokens / Total tokens.
    - hapax_rate (Hapax Legomena Rate): Tokens appearing only once / Total tokens.

    Args:
        df: Frame holding the texts.
        col: Name of the text column. Cells that are not ``str`` (e.g. NaN or
            None) are treated as having no tokens.

    Returns:
        A DataFrame with columns ``ttr`` and ``hapax_rate``, one row per input
        row in input order, on a fresh 0..n-1 index. Both metrics are 0 for
        rows without tokens.
    """
    results = []
    for text in df[col]:
        tokens = preprocess_text_for_lexical(text) if isinstance(text, str) else []
        total_tokens = len(tokens)

        if total_tokens == 0:
            results.append({'ttr': 0, 'hapax_rate': 0})
            continue

        # One frequency table serves both metrics: its size is the number of
        # distinct tokens and hapaxes() lists the tokens seen exactly once.
        freq_dist = FreqDist(tokens)
        results.append({
            'ttr': len(freq_dist) / total_tokens,
            'hapax_rate': len(freq_dist.hapaxes()) / total_tokens,
        })

    # Explicit columns keep the schema stable even for an empty input frame.
    return pd.DataFrame(results, columns=['ttr', 'hapax_rate'])

def get_bertscore(df: pd.DataFrame, col: str, reference_text: str) -> pd.DataFrame:
    """
    Score every text in ``df[col]`` against one shared reference with BERTScore.

    Args:
        df: Frame holding the candidate texts.
        col: Name of the candidate text column.
        reference_text: The single reference every candidate is compared to.

    Returns:
        A DataFrame with columns ``bertscore_precision``, ``bertscore_recall``
        and ``bertscore_f1``, one row per candidate in input order.

    Raises:
        EnvironmentError: If ``bert_score_calc`` is missing or None.
    """
    scorer = globals().get('bert_score_calc')
    if scorer is None:
        raise EnvironmentError("bert_score library not loaded or function not found.")

    hypotheses = df[col].tolist()
    # The same reference is paired with every candidate.
    targets = [reference_text for _ in hypotheses]

    # Device selection is left to bert-score itself.
    precision, recall, f1 = scorer(hypotheses, targets, lang='en', model_type='bert-base-uncased')

    column_names = ('bertscore_precision', 'bertscore_recall', 'bertscore_f1')
    return pd.DataFrame({
        name: tensor.numpy()
        for name, tensor in zip(column_names, (precision, recall, f1))
    })

def get_bleurt(df: pd.DataFrame, col: str, reference_text: str, batch_size: int = 32) -> pd.DataFrame:
    """
    Calculates the BLEURT score against a reference text using the transformers library.

    Each (reference, candidate) pair is scored by the globally loaded
    ``transformers_bleurt_model``; the score is the model's single output logit.

    Args:
        df: Frame holding the candidate texts.
        col: Name of the candidate text column.
        reference_text: The single reference every candidate is compared to.
        batch_size: Number of pairs tokenised and scored per forward pass, so
            memory use does not grow with the number of rows.

    Returns:
        A DataFrame with one float column ``bleurt_score``, one row per
        candidate in input order (empty when ``df`` has no rows).

    Raises:
        EnvironmentError: If the BLEURT model or tokenizer is not loaded.
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if transformers_bleurt_model is None or transformers_bleurt_tokenizer is None:
        raise EnvironmentError("Hugging Face BLEURT model or tokenizer not loaded. Please check model loading in setup steps.")
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1.")

    candidates = df[col].tolist()
    scores = []

    if candidates:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        transformers_bleurt_model.to(device)

        with torch.no_grad():
            for start in range(0, len(candidates), batch_size):
                batch = candidates[start:start + batch_size]
                inputs = transformers_bleurt_tokenizer(
                    [reference_text] * len(batch), batch,
                    padding=True, truncation=True, return_tensors="pt"
                )
                inputs = {k: v.to(device) for k, v in inputs.items()}
                logits = transformers_bleurt_model(**inputs).logits
                # reshape(-1) always gives a 1-D tensor. A bare squeeze() turns
                # a single-row batch into a scalar, which pandas cannot use as
                # a column.
                scores.extend(logits.reshape(-1).tolist())

    return pd.DataFrame({'bleurt_score': pd.Series(scores, dtype='float64')})

def get_moverscore(df: pd.DataFrame, col: str, reference_text: str) -> pd.DataFrame:
    """
    Score every text in ``df[col]`` against one shared reference with MoverScore.

    Args:
        df: Frame holding the candidate texts.
        col: Name of the candidate text column.
        reference_text: The single reference every candidate is compared to.

    Returns:
        A DataFrame with one column ``moverscore`` (one row per candidate), or
        an empty DataFrame when the moverscore functions could not be imported.
    """
    # Without the moverscore functions there is nothing to compute: report it
    # and hand back an empty frame instead of raising.
    if any(fn is None for fn in (get_idf_dict, word_mover_score)):
        print("MoverScore functionality is not available due to import issues.")
        return pd.DataFrame()

    hypotheses = df[col].tolist()
    targets = [reference_text for _ in hypotheses]

    # One IDF table is built from candidates plus references and used for both
    # sides of the comparison.
    # NOTE(review): the reference is repeated once per candidate in this
    # corpus, which presumably weights its words heavily in the IDF — confirm
    # that this is intended.
    shared_idf = get_idf_dict(hypotheses + targets)

    # stop_words=[] keeps every word in the comparison; batch_size may need
    # tuning to the available GPU/CPU memory.
    mover_scores = word_mover_score(
        targets,
        hypotheses,
        shared_idf,
        shared_idf,
        stop_words=[],
        n_gram=1,
        remove_subwords=True,
        batch_size=48
    )

    return pd.DataFrame({'moverscore': mover_scores})

# --- 5. Sample Data and Test ---
# Smoke test: run each reference-based metric on four short candidates. Any
# failure is reported as a message instead of stopping the cell.
reference_text = 'The cat is on the mat.'
sample_data = pd.DataFrame({
    'candidate_text': [
        'The cat sat on the mat.',
        'A feline was resting on the rug.',
        'The dog barked loudly.',
        'This is a completely different sentence.'
    ]
})

print("\n--- Testing BERTScore ---")
try:
    bert_scores_df = get_bertscore(sample_data, 'candidate_text', reference_text)
    print("BERTScore scores calculated successfully:")
    print(bert_scores_df)
except Exception as e:
    print(f"Error testing get_bertscore function: {e}")

print("\n--- Testing BLEURT ---")
print(f"Using Hugging Face BLEURT model downloaded to: {LOCAL_BLEURT_CHECKPOINT_DIR}")
try:
    bleurt_scores_df = get_bleurt(sample_data, 'candidate_text', reference_text)
    print("BLEURT scores calculated successfully:")
    print(bleurt_scores_df)
except Exception as e:
    print(f"Error testing get_bleurt function: {e}")

print("\n--- Testing MoverScore ---")
try:
    moverscore_df = get_moverscore(sample_data, 'candidate_text', reference_text)
    # get_moverscore signals "not available" with an empty frame rather than an exception.
    if moverscore_df.empty:
        print("MoverScore returned an empty DataFrame, indicating an issue or disabled functionality.")
    else:
        print("MoverScore scores calculated successfully:")
        print(moverscore_df)
except Exception as e:
    print(f"Error testing get_moverscore function: {e}")


--- Testing BERTScore ---


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BERTScore scores calculated successfully:
   bertscore_precision  bertscore_recall  bertscore_f1
0             0.865730          0.865730      0.865730
1             0.669104          0.705554      0.686846
2             0.600975          0.577775      0.589147
3             0.445026          0.442023      0.443520

--- Testing BLEURT ---
Using Hugging Face BLEURT model downloaded to: /content/bleurt-base-512
BLEURT scores calculated successfully:
   bleurt_score
0      0.556563
1     -0.448874
2     -0.955141
3     -1.634602

--- Testing MoverScore ---
MoverScore scores calculated successfully:
   moverscore
0    0.557910
1    0.207203
2   -0.055930
3   -0.244879


In [None]:
# Define file paths
# Absolute locations of the two JSONL batch-output files that the next cell
# loads, in the order they end up in df_list.
# NOTE(review): these paths presumably require Google Drive to be mounted at
# /content/drive; no mount step is visible in this notebook — confirm.
file_paths = [
    '/content/drive/MyDrive/Clubs/BEA/LLM Project/data/batch_68fdd5ebf65881908d4e99b7198f0a0d_output.jsonl',
    '/content/drive/MyDrive/Clubs/BEA/LLM Project/data/batch_69098e17eef8819086a297f97d255493_output.jsonl'
]

In [None]:
# Flattened DataFrames, one per input file, in the same order as file_paths.
df_list = []

# Iterate through each file path and process the data
for file_path in file_paths:
    # lines=True: every line of the file is parsed as one JSON record.
    df = pd.read_json(file_path, lines=True)
    # Expand nested dict fields into '<parent>_<child>' columns; list-valued
    # fields are kept as single cells.
    df_flat = pd.json_normalize(df.to_dict(orient="records"), sep='_')
    df_list.append(df_flat)
    print(f"Processed DataFrame '{file_path}' with shape: {df_flat.shape}")

# The message names the variable that actually holds the result.
print("\nAll files processed and DataFrames stored in 'df_list'.")

Processed DataFrame '/content/drive/MyDrive/Clubs/BEA/LLM Project/data/batch_68fdd5ebf65881908d4e99b7198f0a0d_output.jsonl' with shape: (2000, 40)
Processed DataFrame '/content/drive/MyDrive/Clubs/BEA/LLM Project/data/batch_69098e17eef8819086a297f97d255493_output.jsonl' with shape: (500, 21)

All files processed and DataFrames stored in 'dataframes_list'.


In [None]:
# Show the column index of every loaded frame.
for frame in df_list:
    print(frame.columns)

Index(['id', 'custom_id', 'error', 'response_status_code',
       'response_request_id', 'response_body_id', 'response_body_object',
       'response_body_created_at', 'response_body_status',
       'response_body_background', 'response_body_billing_payer',
       'response_body_error', 'response_body_incomplete_details',
       'response_body_instructions', 'response_body_max_output_tokens',
       'response_body_max_tool_calls', 'response_body_model',
       'response_body_output', 'response_body_parallel_tool_calls',
       'response_body_previous_response_id', 'response_body_prompt_cache_key',
       'response_body_reasoning_effort', 'response_body_reasoning_summary',
       'response_body_safety_identifier', 'response_body_service_tier',
       'response_body_store', 'response_body_temperature',
       'response_body_text_format_type', 'response_body_text_verbosity',
       'response_body_tool_choice', 'response_body_tools',
       'response_body_top_logprobs', 'response_body_top_

In [None]:
# Print the first row of the first frame, one column value per line.
# The loop variable no longer shadows the builtin `id`, and the row is picked
# by position rather than by index label.
for column_name in df_list[0].columns:
    print(df_list[0][column_name].iloc[0])

batch_req_68fddb23a43c819094e757a75fd8a4cb
price_only_0001
nan
200
1b86e42ce09336037ac4d2e9d28d9db3
resp_040352ce273cd5f60068fdd666438481938eb0df4dadd4ef18
response
1761465958
completed
False
openai
None
None
None
None
None
gpt-5-mini-2025-08-07
[{'id': 'rs_040352ce273cd5f60068fdd6672c10819395ac13d2c17e4396', 'type': 'reasoning', 'summary': []}, {'id': 'msg_040352ce273cd5f60068fdd66a799c81939cf5ab94509235e4', 'type': 'message', 'status': 'completed', 'content': [{'type': 'output_text', 'annotations': [], 'logprobs': [], 'text': 'Round 1 — Choose car. Fuel €54 < train €60, so driving is cheaper. I also value the door-to-door convenience and flexibility for errands.\n\nRound 2 — Choose car. Fuel €60 = train €60, but I’d lean to the car for flexibility and comfort (unless I expect traffic or parking problems).\n\nRound 3 — Choose train. Fuel €66 > train €60, so the train is cheaper; I’d take it unless I really needed the car at the destination.\n\nRound 4 — Choose train. Fuel €72 > €60, s

In [None]:
# Inspect the nested 'response_body_output' value of the row labelled 0.
df_list[0].loc[0, 'response_body_output']

[{'id': 'rs_040352ce273cd5f60068fdd6672c10819395ac13d2c17e4396',
  'type': 'reasoning',
  'summary': []},
 {'id': 'msg_040352ce273cd5f60068fdd66a799c81939cf5ab94509235e4',
  'type': 'message',
  'status': 'completed',
  'content': [{'type': 'output_text',
    'annotations': [],
    'logprobs': [],
    'text': 'Round 1 — Choose car. Fuel €54 < train €60, so driving is cheaper. I also value the door-to-door convenience and flexibility for errands.\n\nRound 2 — Choose car. Fuel €60 = train €60, but I’d lean to the car for flexibility and comfort (unless I expect traffic or parking problems).\n\nRound 3 — Choose train. Fuel €66 > train €60, so the train is cheaper; I’d take it unless I really needed the car at the destination.\n\nRound 4 — Choose train. Fuel €72 > €60, so the train saves money and is less wear-and-tear on the car.\n\nRound 5 — Choose train. Fuel €78 > €60, train is clearly cheaper and I’d avoid the high fuel cost.\n\nRound 6 — Choose train. Fuel €84 > €60; cost and environ

In [None]:
# Print the first row of the second frame, one column value per line.
# The loop variable no longer shadows the builtin `id`, and the row is picked
# by position rather than by index label.
for column_name in df_list[1].columns:
    print(df_list[1][column_name].iloc[0])

batch_req_69098ee7cf948190a47411bf26169206
individual_0001
nan
200
9760d35f09a85067dc80f138595fe8c6
chatcmpl-CY3pTTcFlEDK2LbiHuLFGFPZvFWJc
chat.completion
1762233907
gpt-5-mini-2025-08-07
[{'index': 0, 'message': {'role': 'assistant', 'content': '1. Round 1 — I know A is 90% (EV = 10*0.9 - 3 = 6) and B is 60% (EV = 3). A has higher expected value, so I choose Button A.\n\n2. Round 2 — Same situation: A (90% → EV 6) vs B (60% → EV 3). A is better. Choice: Button A.\n\n3. Round 3 — A: 90% (EV 6). B: 60% (EV 3). I pick the higher EV, so Button A.\n\n4. Round 4 — A (90%, EV 6) dominates B (60%, EV 3). Choice: Button A.\n\n5. Round 5 — Still early: A 90% → EV 6; B 60% → EV 3. I go with Button A.\n\n6. Round 6 — A (90%, EV 6) vs B (60%, EV 3). I choose Button A.\n\n7. Round 7 — A has the better expected return (6 vs 3). Choice: Button A.\n\n8. Round 8 — A: 90% (EV 6). B: 60% (EV 3). I choose Button A.\n\n9. Round 9 — A’s EV is higher (6 > 3). Choice: Button A.\n\n10. Round 10 — A (90%, EV 6)

In [None]:
# Inspect the nested 'response_body_choices' value of the row labelled 0.
df_list[1].loc[0, 'response_body_choices']

[{'index': 0,
  'message': {'role': 'assistant',
   'content': '1. Round 1 — I know A is 90% (EV = 10*0.9 - 3 = 6) and B is 60% (EV = 3). A has higher expected value, so I choose Button A.\n\n2. Round 2 — Same situation: A (90% → EV 6) vs B (60% → EV 3). A is better. Choice: Button A.\n\n3. Round 3 — A: 90% (EV 6). B: 60% (EV 3). I pick the higher EV, so Button A.\n\n4. Round 4 — A (90%, EV 6) dominates B (60%, EV 3). Choice: Button A.\n\n5. Round 5 — Still early: A 90% → EV 6; B 60% → EV 3. I go with Button A.\n\n6. Round 6 — A (90%, EV 6) vs B (60%, EV 3). I choose Button A.\n\n7. Round 7 — A has the better expected return (6 vs 3). Choice: Button A.\n\n8. Round 8 — A: 90% (EV 6). B: 60% (EV 3). I choose Button A.\n\n9. Round 9 — A’s EV is higher (6 > 3). Choice: Button A.\n\n10. Round 10 — A (90%, EV 6) is preferable to B (60%, EV 3). Choice: Button A.\n\n11. Round 11 — A gives EV 6, B gives EV 3. I pick Button A.\n\n12. Round 12 — A (EV 6) beats B (EV 3). Choice: Button A.\n\n13. R

In [None]:
def flatten_list(lst, sep="_"):
    """
    Flatten a list of (possibly nested) dicts into one flat dict.

    Each item is normalised with ``pd.json_normalize`` and its keys are
    prefixed with the item's position, e.g. ``[{'a': {'b': 1}}]`` becomes
    ``{'0_a_b': 1}``.

    Args:
        lst: Iterable of dict-like items. ``None`` and NaN (a missing
            DataFrame cell) are treated as an empty list.
        sep: Separator placed between the position prefix and the key, and
            between nested key levels.

    Returns:
        A flat dict; empty when ``lst`` is empty or missing.
    """
    # A missing cell arrives as None or float NaN (NaN is the only float that
    # differs from itself); neither can be enumerated.
    if lst is None or (isinstance(lst, float) and lst != lst):
        return {}

    out = {}
    for i, item in enumerate(lst):
        records = pd.json_normalize(item, sep=sep).to_dict(orient="records")
        # An item that normalises to zero rows contributes no keys.
        flat = records[0] if records else {}
        out.update({f"{i}{sep}{k}": v for k, v in flat.items()})
    return out

In [None]:
# Expand the nested 'response_body_output' lists of the first frame into flat
# '<position>_<key>' columns. The guard makes the cell safe to re-run: once
# the source column has been replaced there is nothing left to flatten.
if "response_body_output" in df_list[0].columns:
    flattened = pd.DataFrame(
        df_list[0]["response_body_output"].apply(flatten_list).tolist(),
        index=df_list[0].index,  # join() aligns on the index, so keep it identical
    )
    df_list[0] = df_list[0].drop(columns=["response_body_output"]).join(flattened)

In [None]:
# Expand the nested 'response_body_choices' lists of the second frame into
# flat '<position>_<key>' columns. The guard makes the cell safe to re-run:
# once the source column has been replaced there is nothing left to flatten.
if "response_body_choices" in df_list[1].columns:
    flattened = pd.DataFrame(
        df_list[1]["response_body_choices"].apply(flatten_list).tolist(),
        index=df_list[1].index,  # join() aligns on the index, so keep it identical
    )
    df_list[1] = df_list[1].drop(columns=["response_body_choices"]).join(flattened)

In [None]:
# Column indexes of both frames after flattening, shown as a pair.
tuple(frame.columns for frame in df_list[:2])

(Index(['id', 'custom_id', 'error', 'response_status_code',
        'response_request_id', 'response_body_id', 'response_body_object',
        'response_body_created_at', 'response_body_status',
        'response_body_background', 'response_body_billing_payer',
        'response_body_error', 'response_body_incomplete_details',
        'response_body_instructions', 'response_body_max_output_tokens',
        'response_body_max_tool_calls', 'response_body_model',
        'response_body_parallel_tool_calls',
        'response_body_previous_response_id', 'response_body_prompt_cache_key',
        'response_body_reasoning_effort', 'response_body_reasoning_summary',
        'response_body_safety_identifier', 'response_body_service_tier',
        'response_body_store', 'response_body_temperature',
        'response_body_text_format_type', 'response_body_text_verbosity',
        'response_body_tool_choice', 'response_body_tools',
        'response_body_top_logprobs', 'response_body_top_p',
     

In [None]:
# Peek at the first value of each column that carries the response text,
# reporting a missing column instead of raising.
print("First value of df_list[0]['1_content']:")
if '1_content' not in df_list[0].columns:
    print("Column '1_content' not found in df_list[0].")
else:
    display(df_list[0]['1_content'].iloc[0])

print("\nFirst value of df_list[1]['0_message_content']:")
if '0_message_content' not in df_list[1].columns:
    print("Column '0_message_content' not found in df_list[1].")
else:
    display(df_list[1]['0_message_content'].iloc[0])


First value of df_list[0]['1_content']:


[{'type': 'output_text',
  'annotations': [],
  'logprobs': [],
  'text': 'Round 1 — Choose car. Fuel €54 < train €60, so driving is cheaper. I also value the door-to-door convenience and flexibility for errands.\n\nRound 2 — Choose car. Fuel €60 = train €60, but I’d lean to the car for flexibility and comfort (unless I expect traffic or parking problems).\n\nRound 3 — Choose train. Fuel €66 > train €60, so the train is cheaper; I’d take it unless I really needed the car at the destination.\n\nRound 4 — Choose train. Fuel €72 > €60, so the train saves money and is less wear-and-tear on the car.\n\nRound 5 — Choose train. Fuel €78 > €60, train is clearly cheaper and I’d avoid the high fuel cost.\n\nRound 6 — Choose train. Fuel €84 > €60; cost and environmental considerations push me to the train.\n\nRound 7 — Choose train. Fuel €90 > €60; train is both cheaper and often quicker into city centers.\n\nRound 8 — Choose train. Fuel €94 > €60; not worth driving at that fuel price unless I ne


First value of df_list[1]['0_message_content']:


'1. Round 1 — I know A is 90% (EV = 10*0.9 - 3 = 6) and B is 60% (EV = 3). A has higher expected value, so I choose Button A.\n\n2. Round 2 — Same situation: A (90% → EV 6) vs B (60% → EV 3). A is better. Choice: Button A.\n\n3. Round 3 — A: 90% (EV 6). B: 60% (EV 3). I pick the higher EV, so Button A.\n\n4. Round 4 — A (90%, EV 6) dominates B (60%, EV 3). Choice: Button A.\n\n5. Round 5 — Still early: A 90% → EV 6; B 60% → EV 3. I go with Button A.\n\n6. Round 6 — A (90%, EV 6) vs B (60%, EV 3). I choose Button A.\n\n7. Round 7 — A has the better expected return (6 vs 3). Choice: Button A.\n\n8. Round 8 — A: 90% (EV 6). B: 60% (EV 3). I choose Button A.\n\n9. Round 9 — A’s EV is higher (6 > 3). Choice: Button A.\n\n10. Round 10 — A (90%, EV 6) is preferable to B (60%, EV 3). Choice: Button A.\n\n11. Round 11 — A gives EV 6, B gives EV 3. I pick Button A.\n\n12. Round 12 — A (EV 6) beats B (EV 3). Choice: Button A.\n\n13. Round 13 — A 90% → EV 6; B 60% → EV 3. I choose A.\n\n14. Round 

In [None]:
# Extract text from df_list[0]['1_content'] and df_list[1]['0_message_content'],
# then compute syntactic-complexity and lexical-richness metrics for both.

def _first_output_text(content):
    """Return the 'text' of the first element of a content list, or '' if absent."""
    if isinstance(content, list) and len(content) > 0:
        first = content[0]
        if isinstance(first, dict) and 'text' in first:
            return first['text']
    return ''


def _add_text_metrics(text_df):
    """Return (syntactic, lexical, combined) frames for text_df['original_text']."""
    syntactic = get_syntactic_complexity(text_df, 'original_text').add_prefix('syntactic_')
    lexical = get_lexical_richness(text_df, 'original_text').add_prefix('lexical_')
    combined = pd.concat([text_df, syntactic, lexical], axis=1)
    return syntactic, lexical, combined


# For df_list[0]['1_content'] (a list whose first dict carries a 'text' key)
if '1_content' in df_list[0].columns:
    df0_text = df_list[0]['1_content'].apply(_first_output_text)
else:
    df0_text = pd.Series([''] * len(df_list[0]))

# For df_list[1]['0_message_content'] (already a string); rows without content
# become '' so the metric functions always receive text.
if '0_message_content' in df_list[1].columns:
    df1_text = df_list[1]['0_message_content'].fillna('')
else:
    df1_text = pd.Series([''] * len(df_list[1]))

print("--- Applying metrics to df_0_metrics ---")
syntactic_df_0, lexical_df_0, df_0_metrics = _add_text_metrics(
    pd.DataFrame({'original_text': df0_text})
)

print("Syntactic and Lexical Richness for df_0_metrics (first 5 rows):")
display(df_0_metrics.head())

print("\n--- Applying metrics to df_1_metrics ---")
syntactic_df_1, lexical_df_1, df_1_metrics = _add_text_metrics(
    pd.DataFrame({'original_text': df1_text})
)

print("Syntactic and Lexical Richness for df_1_metrics (first 5 rows):")
display(df_1_metrics.head())


--- Applying metrics to df_0_metrics ---
Syntactic and Lexical Richness for df_0_metrics (first 5 rows):


Unnamed: 0,original_text,syntactic_mean_sentence_length,syntactic_mean_noun_chunk_length,syntactic_sub_clauses_per_sentence,lexical_ttr,lexical_hapax_rate
0,"Round 1 — Choose car. Fuel €54 < train €60, so...",11.684211,2.354167,0.157895,0.446602,0.330097
1,Round 1 — Choose car. Fuel for the trip costs ...,10.578947,2.306122,0.105263,0.414894,0.287234
2,Round 1 — Fuel €54 vs Train €60: I choose the ...,15.777778,1.569444,0.111111,0.428571,0.310924
3,Round 1: Choose car (Fuel €54 vs Train €60). ...,16.444444,1.585366,0.111111,0.434211,0.322368
4,Round 1 — Car (fuel €54 vs train €60)\nI’d dri...,10.44,1.544118,0.04,0.452381,0.333333



--- Applying metrics to df_1_metrics ---
Syntactic and Lexical Richness for df_1_metrics (first 5 rows):


Unnamed: 0,original_text,syntactic_mean_sentence_length,syntactic_mean_noun_chunk_length,syntactic_sub_clauses_per_sentence,lexical_ttr,lexical_hapax_rate
0,1. Round 1 — I know A is 90% (EV = 10*0.9 - 3 ...,8.080645,1.633197,0.0,0.080838,0.035928
1,Quick summary of the math I’m using to decide:...,9.157143,1.862155,0.064286,0.152778,0.069444
2,Sorry—I can’t share my step‑by‑step internal m...,30.166667,2.488889,0.333333,0.605634,0.394366
3,Round 1: I pick Button A. With A at 90% succes...,15.404494,2.196481,0.089888,0.103448,0.04023
4,Quick note on how I'm thinking: expected value...,7.423077,1.465909,0.019231,0.112903,0.054435


In [None]:
output_dir = '/content/drive/MyDrive/Clubs/BEA/LLM Project/data_results'
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

# Define output filenames
filename_df0 = os.path.join(output_dir, 'batch_68fdd5ebf65881908d4e99b7198f0a0d_output_syntactic_lexical_results.csv')
filename_df1 = os.path.join(output_dir, 'batch_69098e17eef8819086a297f97d255493_output_syntactic_lexical_results.csv')

# Save df_0_metrics to CSV (index=False: the row index is not written)
df_0_metrics.to_csv(filename_df0, index=False)
print(f"Saved df_0_metrics to: {filename_df0}")

# Save df_1_metrics to CSV
df_1_metrics.to_csv(filename_df1, index=False)
print(f"Saved df_1_metrics to: {filename_df1}")

Saved df_0_metrics to: /content/drive/MyDrive/Clubs/BEA/LLM Project/data_results/batch_68fdd5ebf65881908d4e99b7198f0a0d_output_syntactic_lexical_results.csv
Saved df_1_metrics to: /content/drive/MyDrive/Clubs/BEA/LLM Project/data_results/batch_69098e17eef8819086a297f97d255493_output_syntactic_lexical_results.csv
