In [1]:
import os
import json
import time
import requests
import pandas as pd
from pathlib import Path
from tqdm import tqdm

# === Grok API setup ===
# Prefer the XAI_API_KEY environment variable so a real secret never has to be
# written into this file; the literal placeholder remains the fallback value.
api_key = os.environ.get("XAI_API_KEY", "api-key")
api_url = "https://api.x.ai/v1/chat/completions"
model_name = "grok-3-mini"

# HTTP headers sent with every request to the chat-completions endpoint.
headers = {
    "Authorization": f"Bearer {api_key}",
    "Content-Type": "application/json",
}

# === Paths ===
# Every input and output of this cell lives under ~/Desktop/Benchmark.
base = Path.home().joinpath("Desktop", "Benchmark")
subs_dir = base.joinpath("json_benchmark")
summaries_dir = base.joinpath("summaries")
output_dir = base.joinpath("scored_good_vs_evil_grok")
prompts_dir = base.joinpath("prompts_good_vs_evil_grok")

# Create both output folders up front; existing folders are left untouched.
for _out_dir in (output_dir, prompts_dir):
    _out_dir.mkdir(parents=True, exist_ok=True)

# Table of films to score; the scoring loop reads its "subtitle_filename" column.
matches_df = pd.read_csv(base.joinpath("matches_benchmark.csv"))

# === Prompt template ===
def create_good_vs_evil_prompt(summary, chunk):
    """Build the Good-vs-Evil classification prompt for one dialogue chunk.

    Args:
        summary: Plot summary text, inserted under "Film Summary:".
        chunk: Dialogue excerpt, inserted under "Dialogue:".

    Returns:
        The complete prompt with leading and trailing whitespace removed.
    """
    # Doubled braces render as literal braces around the JSON answer skeleton.
    prompt_text = f"""
You are analyzing how a film portrays its central moral conflict based on its plot summary and dialogue.

Your task is to classify the film’s depiction of **Good vs Evil** into one of the following three categories:

- **Clear**: There is a strong moral binary. Good and evil are clearly distinguished. One side is righteous, the other is corrupt or villainous.
- **Neutral**: The film does not emphasize good vs evil. There may be conflict, but it is not framed in strong moral terms.
- **Complicated**: The line between good and evil is blurred. Moral ambiguity is central, and characters or institutions cannot be easily classified as good or evil.

Please:
1. Choose the best fitting label: "clear", "neutral", or "complicated"
2. Give a brief justification (1–2 sentences)
3. Provide a confidence score between 0.0 and 1.0

Return your answer in this exact JSON format:

{{
  "good_vs_evil": "...",
  "confidence": ...,
  "explanation": "..."
}}

Film Summary:
\"\"\"{summary}\"\"\"

Dialogue:
\"\"\"{chunk}\"\"\"
"""
    return prompt_text.strip()

# === API call to Grok ===
def call_grok(prompt, max_retries=5, retry_wait=60, timeout=120):
    """Send one prompt to the Grok chat-completions endpoint and parse the reply.

    Args:
        prompt: Text sent as the user message.
        max_retries: How many times to retry after an HTTP 429 before giving up.
        retry_wait: Seconds to sleep between rate-limited attempts.
        timeout: Seconds to wait for the HTTP request before aborting it.

    Returns:
        The value decoded from the JSON text in the first choice's message, or
        None when the request fails, the rate limit persists, the status is
        not 200, or the reply cannot be parsed.
    """
    payload = {
        "model": model_name,
        "temperature": 0,
        "stream": False,
        "messages": [
            {
                "role": "system",
                "content": "You are a careful film analyst. Always return exactly and only the required JSON."
            },
            {
                "role": "user",
                "content": prompt
            }
        ]
    }

    # Bounded loop instead of self-recursion: a persistent 429 can neither
    # recurse without limit nor block the run forever.
    for attempt in range(max(0, max_retries) + 1):
        try:
            response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)
        except requests.RequestException as exc:
            # Network failures and timeouts are reported like any other failed call.
            print(f"❌ Request failed: {exc}")
            return None

        if response.status_code != 429:
            break
        if attempt < max_retries:
            print("⚠️ Rate limit hit. Waiting...")
            time.sleep(retry_wait)
    else:
        # Every attempt, including the last, was answered with 429.
        print(f"❌ Rate limit persisted after {max_retries} retries. Giving up.")
        return None

    if response.status_code != 200:
        print(f"❌ API Error: {response.status_code}")
        print(response.text)
        return None

    try:
        content = response.json()["choices"][0]["message"]["content"]
        return json.loads(content)
    except (ValueError, KeyError, IndexError, TypeError):
        # ValueError covers invalid JSON in the body or in the message content;
        # the others cover a response whose structure differs from the expected one.
        print(f"⚠️ JSON parsing error:\n{response.text}")
        return None

# === Chunk subtitles ===
def chunk_dialogue(subs, chunk_size=5000, overlap=200):
    """Join subtitle texts and split the result into overlapping character chunks.

    Args:
        subs: Iterable of dict-like subtitle entries; entries whose "text"
            value is missing or falsy are skipped.
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters each chunk shares with the previous one.

    Returns:
        List of chunk strings in order; empty when there is no text.

    Raises:
        ValueError: If chunk_size is not positive, or overlap is negative or
            not smaller than chunk_size (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    text_blocks = [line.get("text", "").strip() for line in subs if line.get("text")]
    full_text = " ".join(text_blocks)

    chunks = []
    step = chunk_size - overlap
    for start in range(0, len(full_text), step):
        end = start + chunk_size
        chunks.append(full_text[start:end])
        # Stop once a chunk reaches the end of the text; another window would
        # contain only the overlap tail that this chunk already covers.
        if end >= len(full_text):
            break
    return chunks

# === Scoring loop ===
# Score every film listed in matches_df, one API call per dialogue chunk.
for _, row in tqdm(matches_df.iterrows(), total=len(matches_df), desc="Scoring Good vs Evil (Grok)"):
    filename = row["subtitle_filename"]
    json_path = subs_dir / f"{filename}.json"
    summary_path = summaries_dir / f"{filename}.srt_summary.txt"

    # Both the subtitle JSON and the plot summary are required for a film.
    if not (json_path.exists() and summary_path.exists()):
        print(f"⚠️ Missing files for: {filename}")
        continue

    subs = json.loads(json_path.read_text(encoding="utf-8"))
    summary = summary_path.read_text(encoding="utf-8").strip()

    # Chunk numbers in file names and log lines are 1-based.
    for chunk_no, chunk in enumerate(chunk_dialogue(subs), start=1):
        output_path = output_dir / f"{filename}_chunk{chunk_no}_good_vs_evil.json"
        prompt_path = prompts_dir / f"{filename}_chunk{chunk_no}_good_vs_evil_prompt.txt"

        # An existing result file means this chunk was scored in an earlier run.
        if output_path.exists():
            print(f"🟡 Already scored: {filename} chunk {chunk_no}")
            continue

        prompt = create_good_vs_evil_prompt(summary, chunk)

        # Keep a copy of the exact prompt next to the results.
        prompt_path.write_text(prompt, encoding="utf-8")

        result = call_grok(prompt)
        if not result:
            print(f"❌ Failed: {filename} chunk {chunk_no}")
        else:
            with output_path.open("w", encoding="utf-8") as out_file:
                json.dump(result, out_file, indent=2)
            print(f"✅ Scored: {filename} chunk {chunk_no}")

        # Fixed pause after every API call.
        time.sleep(1.5)


Scoring Good vs Evil (Grok):   0%|                       | 0/20 [00:00<?, ?it/s]

✅ Scored: 2006Blood.Diamond chunk 1
✅ Scored: 2006Blood.Diamond chunk 2
✅ Scored: 2006Blood.Diamond chunk 3
✅ Scored: 2006Blood.Diamond chunk 4
✅ Scored: 2006Blood.Diamond chunk 5
✅ Scored: 2006Blood.Diamond chunk 6
✅ Scored: 2006Blood.Diamond chunk 7
✅ Scored: 2006Blood.Diamond chunk 8
✅ Scored: 2006Blood.Diamond chunk 9
✅ Scored: 2006Blood.Diamond chunk 10
✅ Scored: 2006Blood.Diamond chunk 11


Scoring Good vs Evil (Grok):   5%|▊              | 1/20 [01:00<19:18, 60.98s/it]

✅ Scored: 2005The.Constant.Gardener chunk 1
✅ Scored: 2005The.Constant.Gardener chunk 2
✅ Scored: 2005The.Constant.Gardener chunk 3
✅ Scored: 2005The.Constant.Gardener chunk 4
✅ Scored: 2005The.Constant.Gardener chunk 5
✅ Scored: 2005The.Constant.Gardener chunk 6
✅ Scored: 2005The.Constant.Gardener chunk 7
✅ Scored: 2005The.Constant.Gardener chunk 8
✅ Scored: 2005The.Constant.Gardener chunk 9
✅ Scored: 2005The.Constant.Gardener chunk 10
✅ Scored: 2005The.Constant.Gardener chunk 11
✅ Scored: 2005The.Constant.Gardener chunk 12
✅ Scored: 2005The.Constant.Gardener chunk 13


Scoring Good vs Evil (Grok):  10%|█▌             | 2/20 [02:14<20:25, 68.06s/it]

✅ Scored: 1981Indiana.Jones.And.The.Raiders.Of.The.Lost.Ark chunk 1
✅ Scored: 1981Indiana.Jones.And.The.Raiders.Of.The.Lost.Ark chunk 2
✅ Scored: 1981Indiana.Jones.And.The.Raiders.Of.The.Lost.Ark chunk 3
✅ Scored: 1981Indiana.Jones.And.The.Raiders.Of.The.Lost.Ark chunk 4
✅ Scored: 1981Indiana.Jones.And.The.Raiders.Of.The.Lost.Ark chunk 5
✅ Scored: 1981Indiana.Jones.And.The.Raiders.Of.The.Lost.Ark chunk 6


Scoring Good vs Evil (Grok):  15%|██▎            | 3/20 [02:48<14:53, 52.56s/it]

✅ Scored: 2009Avatar chunk 1
✅ Scored: 2009Avatar chunk 2
✅ Scored: 2009Avatar chunk 3
✅ Scored: 2009Avatar chunk 4
✅ Scored: 2009Avatar chunk 5
✅ Scored: 2009Avatar chunk 6
✅ Scored: 2009Avatar chunk 7
✅ Scored: 2009Avatar chunk 8
✅ Scored: 2009Avatar chunk 9
✅ Scored: 2009Avatar chunk 10
✅ Scored: 2009Avatar chunk 11


Scoring Good vs Evil (Grok):  20%|███            | 4/20 [03:47<14:45, 55.33s/it]

✅ Scored: 2012The.Hunger.Games chunk 1
✅ Scored: 2012The.Hunger.Games chunk 2
✅ Scored: 2012The.Hunger.Games chunk 3
✅ Scored: 2012The.Hunger.Games chunk 4
✅ Scored: 2012The.Hunger.Games chunk 5
✅ Scored: 2012The.Hunger.Games chunk 6
✅ Scored: 2012The.Hunger.Games chunk 7
✅ Scored: 2012The.Hunger.Games chunk 8


Scoring Good vs Evil (Grok):  25%|███▊           | 5/20 [04:31<12:47, 51.15s/it]

✅ Scored: 1984Ghostbusters chunk 1
✅ Scored: 1984Ghostbusters chunk 2
✅ Scored: 1984Ghostbusters chunk 3
✅ Scored: 1984Ghostbusters chunk 4
✅ Scored: 1984Ghostbusters chunk 5
✅ Scored: 1984Ghostbusters chunk 6
✅ Scored: 1984Ghostbusters chunk 7
✅ Scored: 1984Ghostbusters chunk 8
✅ Scored: 1984Ghostbusters chunk 9
✅ Scored: 1984Ghostbusters chunk 10


Scoring Good vs Evil (Grok):  30%|████▌          | 6/20 [05:24<12:07, 51.96s/it]

✅ Scored: 1978Superman chunk 1
✅ Scored: 1978Superman chunk 2
✅ Scored: 1978Superman chunk 3
✅ Scored: 1978Superman chunk 4
✅ Scored: 1978Superman chunk 5
✅ Scored: 1978Superman chunk 6
✅ Scored: 1978Superman chunk 7
✅ Scored: 1978Superman chunk 8
✅ Scored: 1978Superman chunk 9
✅ Scored: 1978Superman chunk 10
✅ Scored: 1978Superman chunk 11


Scoring Good vs Evil (Grok):  35%|█████▎         | 7/20 [06:31<12:18, 56.80s/it]

✅ Scored: 2008The.Hurt.Locker chunk 1
✅ Scored: 2008The.Hurt.Locker chunk 2
✅ Scored: 2008The.Hurt.Locker chunk 3
✅ Scored: 2008The.Hurt.Locker chunk 4
✅ Scored: 2008The.Hurt.Locker chunk 5
✅ Scored: 2008The.Hurt.Locker chunk 6
✅ Scored: 2008The.Hurt.Locker chunk 7
✅ Scored: 2008The.Hurt.Locker chunk 8
✅ Scored: 2008The.Hurt.Locker chunk 9


Scoring Good vs Evil (Grok):  40%|██████         | 8/20 [07:20<10:52, 54.37s/it]

✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 1
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 2
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 3
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 4
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 5
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 6
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 7
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 8
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 9
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 10
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 11
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 12


Scoring Good vs Evil (Grok):  45%|██████▊        | 9/20 [08:24<10:29, 57.24s/it]

✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 1
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 2
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 3
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 4
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 5
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 6
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 7
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 8
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 9
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 10


Scoring Good vs Evil (Grok):  50%|███████       | 10/20 [09:18<09:22, 56.23s/it]

✅ Scored: 2018Black.Panther chunk 1
✅ Scored: 2018Black.Panther chunk 2
✅ Scored: 2018Black.Panther chunk 3
✅ Scored: 2018Black.Panther chunk 4
✅ Scored: 2018Black.Panther chunk 5
✅ Scored: 2018Black.Panther chunk 6
✅ Scored: 2018Black.Panther chunk 7
✅ Scored: 2018Black.Panther chunk 8
✅ Scored: 2018Black.Panther chunk 9
✅ Scored: 2018Black.Panther chunk 10


Scoring Good vs Evil (Grok):  55%|███████▋      | 11/20 [10:14<08:24, 56.06s/it]

✅ Scored: 2021Dont.Look.Up chunk 1
✅ Scored: 2021Dont.Look.Up chunk 2
✅ Scored: 2021Dont.Look.Up chunk 3
✅ Scored: 2021Dont.Look.Up chunk 4
✅ Scored: 2021Dont.Look.Up chunk 5
✅ Scored: 2021Dont.Look.Up chunk 6
✅ Scored: 2021Dont.Look.Up chunk 7
✅ Scored: 2021Dont.Look.Up chunk 8
✅ Scored: 2021Dont.Look.Up chunk 9
✅ Scored: 2021Dont.Look.Up chunk 10
✅ Scored: 2021Dont.Look.Up chunk 11
✅ Scored: 2021Dont.Look.Up chunk 12
✅ Scored: 2021Dont.Look.Up chunk 13
✅ Scored: 2021Dont.Look.Up chunk 14
✅ Scored: 2021Dont.Look.Up chunk 15
✅ Scored: 2021Dont.Look.Up chunk 16
✅ Scored: 2021Dont.Look.Up chunk 17


Scoring Good vs Evil (Grok):  60%|████████▍     | 12/20 [11:45<08:53, 66.73s/it]

✅ Scored: 1982First.Blood chunk 1
✅ Scored: 1982First.Blood chunk 2
✅ Scored: 1982First.Blood chunk 3
✅ Scored: 1982First.Blood chunk 4
✅ Scored: 1982First.Blood chunk 5
✅ Scored: 1982First.Blood chunk 6
✅ Scored: 1982First.Blood chunk 7


Scoring Good vs Evil (Grok):  65%|█████████     | 13/20 [12:24<06:48, 58.31s/it]

✅ Scored: 2019Joker chunk 1
✅ Scored: 2019Joker chunk 2
✅ Scored: 2019Joker chunk 3
✅ Scored: 2019Joker chunk 4
✅ Scored: 2019Joker chunk 5
✅ Scored: 2019Joker chunk 6
✅ Scored: 2019Joker chunk 7
✅ Scored: 2019Joker chunk 8
✅ Scored: 2019Joker chunk 9


Scoring Good vs Evil (Grok):  70%|█████████▊    | 14/20 [13:24<05:52, 58.83s/it]

✅ Scored: 2006Night.at.the.Museum chunk 1
✅ Scored: 2006Night.at.the.Museum chunk 2
✅ Scored: 2006Night.at.the.Museum chunk 3
✅ Scored: 2006Night.at.the.Museum chunk 4
✅ Scored: 2006Night.at.the.Museum chunk 5
✅ Scored: 2006Night.at.the.Museum chunk 6
✅ Scored: 2006Night.at.the.Museum chunk 7
✅ Scored: 2006Night.at.the.Museum chunk 8
✅ Scored: 2006Night.at.the.Museum chunk 9
✅ Scored: 2006Night.at.the.Museum chunk 10


Scoring Good vs Evil (Grok):  75%|██████████▌   | 15/20 [14:20<04:50, 58.11s/it]

✅ Scored: 1976Rocky.I chunk 1
✅ Scored: 1976Rocky.I chunk 2
✅ Scored: 1976Rocky.I chunk 3
✅ Scored: 1976Rocky.I chunk 4
✅ Scored: 1976Rocky.I chunk 5
✅ Scored: 1976Rocky.I chunk 6
✅ Scored: 1976Rocky.I chunk 7
✅ Scored: 1976Rocky.I chunk 8
✅ Scored: 1976Rocky.I chunk 9
✅ Scored: 1976Rocky.I chunk 10
✅ Scored: 1976Rocky.I chunk 11
✅ Scored: 1976Rocky.I chunk 12
✅ Scored: 1976Rocky.I chunk 13


Scoring Good vs Evil (Grok):  80%|███████████▏  | 16/20 [15:34<04:11, 62.84s/it]

✅ Scored: 2005V.for.Vendetta chunk 1
✅ Scored: 2005V.for.Vendetta chunk 2
✅ Scored: 2005V.for.Vendetta chunk 3
✅ Scored: 2005V.for.Vendetta chunk 4
✅ Scored: 2005V.for.Vendetta chunk 5
✅ Scored: 2005V.for.Vendetta chunk 6
✅ Scored: 2005V.for.Vendetta chunk 7
✅ Scored: 2005V.for.Vendetta chunk 8
✅ Scored: 2005V.for.Vendetta chunk 9
✅ Scored: 2005V.for.Vendetta chunk 10
✅ Scored: 2005V.for.Vendetta chunk 11
✅ Scored: 2005V.for.Vendetta chunk 12
✅ Scored: 2005V.for.Vendetta chunk 13
✅ Scored: 2005V.for.Vendetta chunk 14
✅ Scored: 2005V.for.Vendetta chunk 15
✅ Scored: 2005V.for.Vendetta chunk 16


Scoring Good vs Evil (Grok):  85%|███████████▉  | 17/20 [17:36<04:02, 80.75s/it]

✅ Scored: 2017Paddington.2 chunk 1
✅ Scored: 2017Paddington.2 chunk 2
✅ Scored: 2017Paddington.2 chunk 3
✅ Scored: 2017Paddington.2 chunk 4
✅ Scored: 2017Paddington.2 chunk 5
✅ Scored: 2017Paddington.2 chunk 6
✅ Scored: 2017Paddington.2 chunk 7
✅ Scored: 2017Paddington.2 chunk 8
✅ Scored: 2017Paddington.2 chunk 9
✅ Scored: 2017Paddington.2 chunk 10
✅ Scored: 2017Paddington.2 chunk 11
✅ Scored: 2017Paddington.2 chunk 12


Scoring Good vs Evil (Grok):  90%|████████████▌ | 18/20 [18:42<02:32, 76.14s/it]

✅ Scored: 1985Back.To.The.Future chunk 1
✅ Scored: 1985Back.To.The.Future chunk 2
✅ Scored: 1985Back.To.The.Future chunk 3
✅ Scored: 1985Back.To.The.Future chunk 4
✅ Scored: 1985Back.To.The.Future chunk 5
✅ Scored: 1985Back.To.The.Future chunk 6
✅ Scored: 1985Back.To.The.Future chunk 7
✅ Scored: 1985Back.To.The.Future chunk 8
✅ Scored: 1985Back.To.The.Future chunk 9
✅ Scored: 1985Back.To.The.Future chunk 10
✅ Scored: 1985Back.To.The.Future chunk 11


Scoring Good vs Evil (Grok):  95%|█████████████▎| 19/20 [19:46<01:12, 72.49s/it]

✅ Scored: 2013The.Purge chunk 1
✅ Scored: 2013The.Purge chunk 2
✅ Scored: 2013The.Purge chunk 3
✅ Scored: 2013The.Purge chunk 4
✅ Scored: 2013The.Purge chunk 5
✅ Scored: 2013The.Purge chunk 6
✅ Scored: 2013The.Purge chunk 7


Scoring Good vs Evil (Grok): 100%|██████████████| 20/20 [20:23<00:00, 61.20s/it]


In [3]:
import os
import json
import pandas as pd
from pathlib import Path
from collections import defaultdict, Counter

# === Paths ===
base_path = Path.home() / "Desktop" / "Benchmark"
scored_dir = base_path / "scored_good_vs_evil_grok"
output_csv_path = base_path / "model_good_vs_evil_output_grok.csv"


def chunk_number(file_name):
    """Return the chunk index encoded in a scored-chunk file name.

    Args:
        file_name: Name such as "<movie>_chunk12_good_vs_evil.json".

    Returns:
        The integer following the last "_chunk" marker, or 0 when none can be parsed.
    """
    tail = file_name.rsplit("_chunk", 1)[-1]
    try:
        return int(tail.split("_", 1)[0])
    except ValueError:
        return 0


def read_chunk_record(file):
    """Load one scored-chunk JSON file.

    Args:
        file: Path of the JSON file to read.

    Returns:
        A (label, confidence, explanation) tuple with the label lower-cased and
        stripped, or None (after printing a warning) when the file cannot be
        read or parsed, or carries no label.
    """
    try:
        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)
        label = data.get("good_vs_evil", "").strip().lower()
        confidence = float(data.get("confidence", 0.0))
        explanation = data.get("explanation", "")
    except (OSError, ValueError, TypeError, AttributeError) as e:
        # ValueError: invalid JSON or non-numeric confidence;
        # TypeError/AttributeError: fields or payload of an unexpected type.
        print(f"⚠️ Error in file {file.name}: {e}")
        return None

    if not label:
        # An empty label must not take part in the vote as its own category.
        print(f"⚠️ Error in file {file.name}: missing 'good_vs_evil' label")
        return None
    return label, confidence, explanation


def aggregate_movie(movie_id, records):
    """Combine the per-chunk results of one movie into a single result row.

    Args:
        movie_id: Identifier stored in the "subtitle_filename" field.
        records: Iterable of (label, confidence, explanation) tuples, in chunk order.

    Returns:
        Dict with the label counts, the average confidence per label (rounded
        to 3 decimals), the weighted vote and the list of explanations. The
        weighted vote is the label with the highest summed confidence (the
        first such label seen wins ties), or None when there are no records.
    """
    label_counts = Counter()
    confidence_sums = defaultdict(float)
    explanations = []

    for label, confidence, explanation in records:
        label_counts[label] += 1
        confidence_sums[label] += confidence
        explanations.append(f"{label} ({confidence:.2f}): {explanation}")

    # Compute average confidence per label
    avg_confidences = {
        label: round(confidence_sums[label] / count, 3)
        for label, count in label_counts.items()
    }

    # Weighted vote = label with highest total confidence
    weighted_vote = max(confidence_sums, key=confidence_sums.get) if confidence_sums else None

    return {
        "subtitle_filename": movie_id,
        "label_counts": dict(label_counts),
        "avg_confidences": avg_confidences,
        "weighted_vote": weighted_vote,
        "explanations": explanations
    }


# === Aggregate Results ===
# Group chunk files by movie. Splitting on the LAST "_chunk" keeps movie names
# that themselves contain "_chunk" intact.
movie_files = defaultdict(list)
for file in scored_dir.glob("*_chunk*_good_vs_evil.json"):
    movie_files[file.name.rsplit("_chunk", 1)[0]].append(file)

# Process movies and their chunks in a fixed order so the output (row order,
# explanation order, tie-breaking of the vote) does not depend on glob order.
aggregated_results = []
for movie_id in sorted(movie_files):
    ordered_files = sorted(movie_files[movie_id], key=lambda path: chunk_number(path.name))
    records = [rec for rec in map(read_chunk_record, ordered_files) if rec is not None]
    aggregated_results.append(aggregate_movie(movie_id, records))

# === Save to CSV ===
# Name the columns explicitly so the file gets a header row even when nothing
# was aggregated; pd.read_csv raises EmptyDataError on a file without columns.
result_columns = [
    "subtitle_filename",
    "label_counts",
    "avg_confidences",
    "weighted_vote",
    "explanations",
]
if not aggregated_results:
    print(f"⚠️ No scored chunk files found in: {scored_dir}")
df = pd.DataFrame(aggregated_results, columns=result_columns)
df.to_csv(output_csv_path, index=False)
print(f"✅ Saved aggregated results to: {output_csv_path}")


✅ Saved aggregated results to: /Users/cedricroetheli/Desktop/Benchmark/model_good_vs_evil_output_grok.csv


In [5]:
import pandas as pd
from pathlib import Path

# === Paths ===
base = Path.home().joinpath("Desktop", "Benchmark")
truth_path = base.joinpath("benchmark_final.csv")
model_path = base.joinpath("model_good_vs_evil_output_grok.csv")
output_path = base.joinpath("good_vs_evil_evaluation_grok.csv")

# === Load Data ===
# Benchmark labels first, then the aggregated model output.
truth_df, model_df = (pd.read_csv(csv_path) for csv_path in (truth_path, model_path))

# === Normalize filenames ===
# Strip surrounding whitespace from the column the two tables are merged on.
for _frame in (truth_df, model_df):
    _frame["subtitle_filename"] = _frame["subtitle_filename"].str.strip()

# === Label Mapping for Consistency ===
label_map = {
    "clear": "clear",
    "complicated": "complicated",
    "complex": "complicated",  # ✅ map 'complex' to 'complicated'
    "neutral": "neutral"
}

def normalize_label(label):
    """Return the canonical lower-case form of one label.

    Args:
        label: Raw label value; may be missing (None/NaN) or a non-string.

    Returns:
        "" for a missing value; otherwise the lower-cased, stripped label,
        translated through label_map when it has an entry there.
    """
    if pd.isna(label):
        return ""
    # str() keeps a non-string cell value from crashing on .lower().
    label = str(label).lower().strip()
    return label_map.get(label, label)

def normalize_set(label_str):
    """Split a "|"-separated label string into a set of canonical labels.

    Args:
        label_str: Raw cell value such as "clear|complex"; may be missing.

    Returns:
        Set of normalized labels; empty for a missing or blank value.
    """
    if pd.isna(label_str):
        return set()
    labels = {normalize_label(part) for part in str(label_str).split("|")}
    # Blank parts (e.g. from "clear|") must not become the label "": a missing
    # prediction also normalizes to "" and would otherwise count as a match.
    labels.discard("")
    return labels

# Normalize both sides: the benchmark may list several accepted labels per film,
# the model contributes a single vote.
truth_df["good_vs_evil_set"] = truth_df["good_vs_evil"].apply(normalize_set)
model_df["normalized_vote"] = model_df["weighted_vote"].apply(normalize_label)

# The inner merge below drops films present in only one table; report them so
# they do not vanish from the accuracy figure unnoticed.
model_names = set(model_df["subtitle_filename"].dropna())
truth_names = set(truth_df["subtitle_filename"].dropna())
for description, names in (
    ("Model results without a benchmark entry", model_names - truth_names),
    ("Benchmark entries without a model result", truth_names - model_names),
):
    if names:
        print(f"⚠️ {description}: {sorted(names)}")

# === Merge and Evaluate ===
merged_df = pd.merge(model_df, truth_df, on="subtitle_filename", how="inner")
# A plain list is used instead of DataFrame.apply(axis=1): on an empty merge,
# apply does not return a Series and the column assignment fails.
merged_df["is_correct"] = [
    vote in accepted
    for vote, accepted in zip(merged_df["normalized_vote"], merged_df["good_vs_evil_set"])
]

# === Output CSV ===
evaluation_df = merged_df[[
    "subtitle_filename", "good_vs_evil", "normalized_vote", "is_correct"
]].copy()
evaluation_df.columns = ["movie", "benchmark_good_vs_evil", "model_good_vs_evil", "is_correct"]
evaluation_df.to_csv(output_path, index=False)

# === Summary ===
total = len(evaluation_df)
correct = int(evaluation_df["is_correct"].sum())
# Guard against division by zero when no film matched.
accuracy = correct / total if total else 0

print("🎯 Evaluation complete:")
print(f"✅ Correct: {correct}/{total}")
print(f"📊 Accuracy: {accuracy:.2%}")
print(f"📁 Saved to: {output_path}")


🎯 Evaluation complete:
✅ Correct: 16/20
📊 Accuracy: 80.00%
📁 Saved to: /Users/cedricroetheli/Desktop/Benchmark/good_vs_evil_evaluation_grok.csv
