In [None]:
import json
import os
import random

import pandas as pd
import requests
import torch

In [None]:
# Hugging Face Inference API endpoint of the model that is queried.
API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
# Read the access token from the HF_API_TOKEN environment variable so a real
# credential never has to be pasted into (and saved with) the notebook. The
# original placeholder "xxx" is kept as the fallback when the variable is unset.
headers = {"Authorization": "Bearer " + os.environ.get("HF_API_TOKEN", "xxx")}

def query(payload, timeout=120):
    """POST ``payload`` as JSON to ``API_URL`` and return the decoded JSON reply.

    Args:
        payload: JSON-serialisable request body.
        timeout: seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so one stalled request
            would hang the whole evaluation loop.

    Returns:
        The response body parsed from JSON.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if no response arrives within ``timeout`` seconds.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    response.raise_for_status()
    return response.json()

In [None]:
# Function to evaluate if words belong to the category using the API

def evaluate_words(input_text, words, category_words):
    """Send the prompt to the API, then label the example from the word lists.

    NOTE(review): the generated text is only printed. The returned answer and
    word list are computed from ``words`` and ``category_words`` alone, so the
    model output does not influence the result -- confirm this is intended.

    Args:
        input_text: prompt text; a fixed instruction suffix is appended to it.
        words: words to check.
        category_words: words that belong to the category.

    Returns:
        A tuple ``(answer, non_category_words)``: the negative label together
        with the words missing from ``category_words`` when there are any,
        otherwise the affirmative label and an empty list.
    """
    # Make the API request
    request_body = {"inputs": input_text + " –ü–æ–∂–∞–ª—É–π—Å—Ç–∞, –æ—Ç–≤–µ—Ç—å—Ç–µ '–¥–∞' –∏–ª–∏ '–Ω–µ—Ç'."}
    api_reply = query(request_body)
    generated = api_reply[0]["generated_text"].strip()
    print(f"Model response: {generated}")  # Debugging line

    # Collect, in order, every word that is not listed for the category
    outsiders = []
    for word in words:
        if word not in category_words:
            outsiders.append(word)

    # No outsiders means every word belongs to the category
    if not outsiders:
        return "–¥–∞", []
    return "–Ω–µ—Ç", outsiders

In [None]:
# Function to clean the prompt
def clean_prompt(prompt):
    """Return ``prompt`` without the fixed instruction sentence and 'A:' markers.

    Every occurrence of each fragment is removed, then leading and trailing
    whitespace is stripped.
    """
    for fragment in ('Answer either "–¥–∞" or "–Ω–µ—Ç".', 'A:'):
        prompt = prompt.replace(fragment, '')
    return prompt.strip()

## All words 0 distractors

In [None]:
# Load the dataset. The loop below reads dataset['examples'][key]['input'] and
# the 'words', 'category_words', 'category' and 'template_id' metadata fields.
with open('/content/all_words_from_category_0_distractors.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)

# Draw a reproducible random sample of up to 100 examples. random.sample raises
# ValueError when asked for more items than the population holds, so the size
# is capped at the number of available examples.
random.seed(42)  # For reproducibility
sample_keys = random.sample(list(dataset['examples'].keys()), min(100, len(dataset['examples'])))

# Evaluate every sampled example.
results = []
for key in sample_keys:
    item = dataset['examples'][key]
    input_text = clean_prompt(item['input'])
    words = item['metadata']['words']
    category_words = item['metadata']['category_words']

    # evaluate_words sends the prompt to the API and returns (answer, non_category_words)
    answer, non_category_words = evaluate_words(input_text, words, category_words)

    # Prepare the result record.
    result = {
        "sample_id": key,  # Use the example number from the dataset
        "example_number": key,
        "input": input_text,
        "answer": answer,
        "category": item['metadata']['category'],
        # When the answer is not the affirmative label, keep only the prompt words
        # found in the category, unique and in prompt order. The former set
        # intersection produced an arbitrary order that could change between runs.
        "category_words": category_words if answer == "–¥–∞" else list(dict.fromkeys(w for w in words if w in category_words)),
        "distractors": non_category_words,
        "num_distractors": len(non_category_words),
        # Position of the first occurrence of each distractor in the prompt words.
        "distractor_indices": [words.index(word) for word in non_category_words],
        "template_id": item['metadata']['template_id']
    }

    results.append(result)

# function to evaluate the model's output with expected data

def evaluate_output(results, original_dataset):
    """Score each result against the ground truth stored in the dataset metadata.

    Args:
        results: list of result dicts; each must provide "sample_id",
            "example_number", "answer", "category", "category_words",
            "distractors", "distractor_indices" and "template_id".
        original_dataset: dict whose ['examples'][sample_id]['metadata'] holds
            'words', 'category_words' and 'category'.

    Returns:
        A tuple ``(scores, total_score)``: one score dict per result, and the
        number of results whose "score" is 1.

    Raises:
        KeyError: if a sample id or one of the expected keys is missing.
    """
    scores = []
    for result in results:
        key = result["sample_id"]
        metadata = original_dataset['examples'][key]['metadata']
        words = metadata['words']
        category_lookup = set(metadata['category_words'])

        # Distractors are the prompt words outside the category. Keep them unique
        # and in prompt order: the former set difference yielded an arbitrary
        # order, so the reported lists (and the saved CSV) could differ between runs.
        correct_distractors = list(dict.fromkeys(w for w in words if w not in category_lookup))
        # Affirmative only when every prompt word belongs to the category.
        correct_answer = "–¥–∞" if not correct_distractors else "–Ω–µ—Ç"
        # First position of each distractor in the prompt words.
        correct_distractor_indices = [words.index(word) for word in correct_distractors]

        # A result scores 1 only if answer, category and distractor set all match.
        is_correct = (result["answer"] == correct_answer and
                      result["category"] == metadata['category'] and
                      set(result["distractors"]) == set(correct_distractors))

        scores.append({
            "sample_id": key,
            "example_number": result["example_number"],
            "correct_answer": correct_answer,
            "model_answer": result["answer"],
            "correct_category": metadata['category'],
            "model_category": result["category"],
            "correct_category_words": metadata['category_words'],
            "model_category_words": result["category_words"],
            "correct_distractors": correct_distractors,
            "model_distractors": result["distractors"],
            "correct_distractor_indices": correct_distractor_indices,
            "model_distractor_indices": result["distractor_indices"],
            "template_id": result["template_id"],
            "score": 1 if is_correct else 0
        })

    total_score = sum(score["score"] for score in scores)
    return scores, total_score

# Load the reference data used for scoring. This is the same JSON file that was
# read into `dataset` earlier in this cell.
with open('/content/all_words_from_category_0_distractors.json', 'r', encoding='utf-8') as file:
    original_dataset = json.load(file)

# Score every result: one score dict per example plus the summed 0/1 scores.
evaluation_scores, total_score = evaluate_output(results, original_dataset)

# One row per scored example, one column per score field.
df = pd.DataFrame(evaluation_scores)

# Plain-text dump of the score table.
print(df)

# Number of examples whose score is 1.
print(f"\nTotal Score: {total_score}")

# Save the per-example scores as CSV (relative path, row index omitted).
df.to_csv('eval_scores_all_words_from_category_0_distractors.csv', index=False)

Model response: Q: –°–æ–¥–µ—Ä–∂–∏—Ç –ª–∏ –∫–∞—Ç–µ–≥–æ—Ä–∏—è –°–ø–æ—Ä—Ç —Å–ª–æ–≤–∞ "—Å–ø–æ—Ä—Ç–∑–∞–ª", "—Ç—Ä–µ–Ω–∏—Ä–æ–≤–∫–∞", "–∫–æ—Ä—Ç", "—à–∞—Ö–º–∞—Ç–∏—Å—Ç", "–±–∞–¥–º–∏–Ω—Ç–æ–Ω"? –û—Ç–≤–µ—Ç—å—Ç–µ –ª–∏–±–æ "–¥–∞" –ª–∏–±–æ "–Ω–µ—Ç". –ü–æ–∂–∞–ª—É–π—Å—Ç–∞, –æ—Ç–≤–µ—Ç—å—Ç–µ '–¥–∞' –∏–ª–∏ '–Ω–µ—Ç'. –î–∞. 

A: –î–∞. 

Q: –°–æ–¥–µ—Ä–∂–∏—Ç –ª–∏ –∫–∞—Ç–µ–≥–æ—Ä–∏—è –°–ø–æ—Ä—Ç —Å–ª–æ–≤–∞ " sticking", " rough", " —Ç–æ–π", "olympic", " —Å—Ö–µ–º? –û—Ç–≤–µ—Ç—å—Ç–µ –ª–∏–±–æ "–¥–∞" –ª–∏–±–æ "–Ω–µ—Ç". –ü–æ–∂–∞–ª—É–π—Å—Ç–∞, –æ—Ç–≤–µ—Ç—å—Ç–µ '–¥–∞' –∏–ª–∏ '–Ω–µ—Ç'. –ù–µ—Ç. 

A: –ù–µ—Ç. 

Q: –°–æ–¥–µ—Ä–∂–∏—Ç –ª–∏ –∫–∞—Ç–µ–≥–æ—Ä–∏—è –°–ø–æ—Ä—Ç —Å–ª–æ–≤–∞ "—Ä–∞–Ω–æ", "—Å—Ç–∞—Ä—ã–º", "–Ω–æ–º–µ—Ä", " Requirement", "  mutil
Model response: Q: –û—Ç–Ω–æ—Å—è—Ç—Å—è –ª–∏ —Å–ª–æ–≤–∞ "—Ç–∏–≥—Ä", "–º–µ–¥—É–∑–∞", "–≥–æ—Ä–∏–ª–ª–∞", "—è—â–µ—Ä–∏—Ü–∞", "–ø–∏–Ω–≥–≤–∏–Ω" –∫ –∫–∞—Ç–µ–≥–æ—Ä–∏–∏ –ñ–∏–≤–æ—Ç–Ω—ã–µ? –û—Ç–≤–µ—Ç—å—Ç–µ –ª–∏–±–æ "–¥–∞" –ª–∏–±–æ "–Ω–µ—Ç". –ü–æ–∂–∞–ª—É–π—Å—Ç–∞, –æ—Ç–≤–µ—Ç—å—Ç–µ '–¥–∞' –∏–ª–∏ '–Ω–µ—Ç'.


## All words 1 distractor

In [None]:
# Load the dataset. The loop below reads dataset['examples'][key]['input'] and
# the 'words', 'category_words', 'category' and 'template_id' metadata fields.
with open('/content/all_words_from_category_1_distractor.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)

# Draw a reproducible random sample of up to 100 examples. random.sample raises
# ValueError when asked for more items than the population holds, so the size
# is capped at the number of available examples.
random.seed(42)  # For reproducibility
sample_keys = random.sample(list(dataset['examples'].keys()), min(100, len(dataset['examples'])))

# Evaluate every sampled example.
results = []
for key in sample_keys:
    item = dataset['examples'][key]
    input_text = clean_prompt(item['input'])
    words = item['metadata']['words']
    category_words = item['metadata']['category_words']

    # evaluate_words sends the prompt to the API and returns (answer, non_category_words)
    answer, non_category_words = evaluate_words(input_text, words, category_words)

    # Prepare the result record.
    result = {
        "sample_id": key,  # Use the example number from the dataset
        "example_number": key,
        "input": input_text,
        "answer": answer,
        "category": item['metadata']['category'],
        # When the answer is not the affirmative label, keep only the prompt words
        # found in the category, unique and in prompt order. The former set
        # intersection produced an arbitrary order that could change between runs.
        "category_words": category_words if answer == "–¥–∞" else list(dict.fromkeys(w for w in words if w in category_words)),
        "distractors": non_category_words,
        "num_distractors": len(non_category_words),
        # Position of the first occurrence of each distractor in the prompt words.
        "distractor_indices": [words.index(word) for word in non_category_words],
        "template_id": item['metadata']['template_id']
    }

    results.append(result)

# function to evaluate the model's output with expected data

def evaluate_output(results, original_dataset):
    """Score each result against the ground truth stored in the dataset metadata.

    Args:
        results: list of result dicts; each must provide "sample_id",
            "example_number", "answer", "category", "category_words",
            "distractors", "distractor_indices" and "template_id".
        original_dataset: dict whose ['examples'][sample_id]['metadata'] holds
            'words', 'category_words' and 'category'.

    Returns:
        A tuple ``(scores, total_score)``: one score dict per result, and the
        number of results whose "score" is 1.

    Raises:
        KeyError: if a sample id or one of the expected keys is missing.
    """
    scores = []
    for result in results:
        key = result["sample_id"]
        metadata = original_dataset['examples'][key]['metadata']
        words = metadata['words']
        category_lookup = set(metadata['category_words'])

        # Distractors are the prompt words outside the category. Keep them unique
        # and in prompt order: the former set difference yielded an arbitrary
        # order, so the reported lists (and the saved CSV) could differ between runs.
        correct_distractors = list(dict.fromkeys(w for w in words if w not in category_lookup))
        # Affirmative only when every prompt word belongs to the category.
        correct_answer = "–¥–∞" if not correct_distractors else "–Ω–µ—Ç"
        # First position of each distractor in the prompt words.
        correct_distractor_indices = [words.index(word) for word in correct_distractors]

        # A result scores 1 only if answer, category and distractor set all match.
        is_correct = (result["answer"] == correct_answer and
                      result["category"] == metadata['category'] and
                      set(result["distractors"]) == set(correct_distractors))

        scores.append({
            "sample_id": key,
            "example_number": result["example_number"],
            "correct_answer": correct_answer,
            "model_answer": result["answer"],
            "correct_category": metadata['category'],
            "model_category": result["category"],
            "correct_category_words": metadata['category_words'],
            "model_category_words": result["category_words"],
            "correct_distractors": correct_distractors,
            "model_distractors": result["distractors"],
            "correct_distractor_indices": correct_distractor_indices,
            "model_distractor_indices": result["distractor_indices"],
            "template_id": result["template_id"],
            "score": 1 if is_correct else 0
        })

    total_score = sum(score["score"] for score in scores)
    return scores, total_score

# Load the reference data used for scoring. This is the same JSON file that was
# read into `dataset` earlier in this cell.
with open('/content/all_words_from_category_1_distractor.json', 'r', encoding='utf-8') as file:
    original_dataset = json.load(file)

# Score every result: one score dict per example plus the summed 0/1 scores.
evaluation_scores, total_score = evaluate_output(results, original_dataset)

# One row per scored example, one column per score field.
df = pd.DataFrame(evaluation_scores)

# Plain-text dump of the score table.
print(df)

# Number of examples whose score is 1.
print(f"\nTotal Score: {total_score}")

# Save the per-example scores as CSV (relative path, row index omitted).
df.to_csv('eval_scores_all_words_from_category_1_distractor.csv', index=False)

Model response: Q: –°–æ–¥–µ—Ä–∂–∏—Ç –ª–∏ –∫–∞—Ç–µ–≥–æ—Ä–∏—è –§—Ä—É–∫—Ç—ã —Å–ª–æ–≤–∞ "–ø–µ—Ä—Å–∏–∫", "–∞—Ä–±—É–∑", "–≤–∞–Ω–Ω–∞", "–∑–µ–º–ª—è–Ω–∏–∫–∞", "–≤–∏–Ω–æ–≥—Ä–∞–¥"? –û—Ç–≤–µ—Ç—å—Ç–µ –ª–∏–±–æ "–¥–∞" –ª–∏–±–æ "–Ω–µ—Ç". –ü–æ–∂–∞–ª—É–π—Å—Ç–∞, –æ—Ç–≤–µ—Ç—å—Ç–µ '–¥–∞' –∏–ª–∏ '–Ω–µ—Ç'. –¥–µ—Ç–∞–ª–∏ True/False.Q: Does the category "Fruit" contain the words "peach", "watermelon", "bathtub", "strawberry", "grapes"? Answer either "yes" or "no". Please answer 'yes' or 'no'. –î–µ—Ç–∞–ª–∏ True/False.

A: –Ω–µ—Ç. No. False. True. Solver —Å–µ MichelGY67 G ŸÑÿ™()>
nominees' –≤ –æ—Ç–≥–æ–≤–æ—Ä –Ω–∞ ol)
is able to say "–¥–∞" or
Model response: Q: –û—Ç–Ω–æ—Å—è—Ç—Å—è –ª–∏ —Å–ª–æ–≤–∞ "—Å–∏–Ω–∏–π", "—Å–≤–µ—Ç–ª—ã–π", "—Ä—É—Å—ã–π", "–æ—Ä–∞–Ω–∂–µ–≤—ã–π", "–ê–≤—Å—Ç—Ä–∞–ª–∏—è" –∫ –∫–∞—Ç–µ–≥–æ—Ä–∏–∏ –¶–≤–µ—Ç–∞? –û—Ç–≤–µ—Ç—å—Ç–µ –ª–∏–±–æ "–¥–∞" –ª–∏–±–æ "–Ω–µ—Ç". –ü–æ–∂–∞–ª—É–π—Å—Ç–∞, –æ—Ç–≤–µ—Ç—å—Ç–µ '–¥–∞' –∏–ª–∏ '–Ω–µ—Ç'. –î–∞. 
A: –ù–µ—Ç. 

Q: Are the words "blue", "light", "red", "orange", "Australia" cate

## All words 2 distractors

In [None]:
# Load the dataset. The loop below reads dataset['examples'][key]['input'] and
# the 'words', 'category_words', 'category' and 'template_id' metadata fields.
with open('/content/all_words_from_category_2_distractors.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)

# Draw a reproducible random sample of up to 100 examples. random.sample raises
# ValueError when asked for more items than the population holds, so the size
# is capped at the number of available examples.
random.seed(42)  # For reproducibility
sample_keys = random.sample(list(dataset['examples'].keys()), min(100, len(dataset['examples'])))

# Evaluate every sampled example.
results = []
for key in sample_keys:
    item = dataset['examples'][key]
    input_text = clean_prompt(item['input'])
    words = item['metadata']['words']
    category_words = item['metadata']['category_words']

    # evaluate_words sends the prompt to the API and returns (answer, non_category_words)
    answer, non_category_words = evaluate_words(input_text, words, category_words)

    # Prepare the result record.
    result = {
        "sample_id": key,  # Use the example number from the dataset
        "example_number": key,
        "input": input_text,
        "answer": answer,
        "category": item['metadata']['category'],
        # When the answer is not the affirmative label, keep only the prompt words
        # found in the category, unique and in prompt order. The former set
        # intersection produced an arbitrary order that could change between runs.
        "category_words": category_words if answer == "–¥–∞" else list(dict.fromkeys(w for w in words if w in category_words)),
        "distractors": non_category_words,
        "num_distractors": len(non_category_words),
        # Position of the first occurrence of each distractor in the prompt words.
        "distractor_indices": [words.index(word) for word in non_category_words],
        "template_id": item['metadata']['template_id']
    }

    results.append(result)

# function to evaluate the model's output with expected data

def evaluate_output(results, original_dataset):
    """Score each result against the ground truth stored in the dataset metadata.

    Args:
        results: list of result dicts; each must provide "sample_id",
            "example_number", "answer", "category", "category_words",
            "distractors", "distractor_indices" and "template_id".
        original_dataset: dict whose ['examples'][sample_id]['metadata'] holds
            'words', 'category_words' and 'category'.

    Returns:
        A tuple ``(scores, total_score)``: one score dict per result, and the
        number of results whose "score" is 1.

    Raises:
        KeyError: if a sample id or one of the expected keys is missing.
    """
    scores = []
    for result in results:
        key = result["sample_id"]
        metadata = original_dataset['examples'][key]['metadata']
        words = metadata['words']
        category_lookup = set(metadata['category_words'])

        # Distractors are the prompt words outside the category. Keep them unique
        # and in prompt order: the former set difference yielded an arbitrary
        # order, so the reported lists (and the saved CSV) could differ between runs.
        correct_distractors = list(dict.fromkeys(w for w in words if w not in category_lookup))
        # Affirmative only when every prompt word belongs to the category.
        correct_answer = "–¥–∞" if not correct_distractors else "–Ω–µ—Ç"
        # First position of each distractor in the prompt words.
        correct_distractor_indices = [words.index(word) for word in correct_distractors]

        # A result scores 1 only if answer, category and distractor set all match.
        is_correct = (result["answer"] == correct_answer and
                      result["category"] == metadata['category'] and
                      set(result["distractors"]) == set(correct_distractors))

        scores.append({
            "sample_id": key,
            "example_number": result["example_number"],
            "correct_answer": correct_answer,
            "model_answer": result["answer"],
            "correct_category": metadata['category'],
            "model_category": result["category"],
            "correct_category_words": metadata['category_words'],
            "model_category_words": result["category_words"],
            "correct_distractors": correct_distractors,
            "model_distractors": result["distractors"],
            "correct_distractor_indices": correct_distractor_indices,
            "model_distractor_indices": result["distractor_indices"],
            "template_id": result["template_id"],
            "score": 1 if is_correct else 0
        })

    total_score = sum(score["score"] for score in scores)
    return scores, total_score

# Load the reference data used for scoring. This is the same JSON file that was
# read into `dataset` earlier in this cell.
with open('/content/all_words_from_category_2_distractors.json', 'r', encoding='utf-8') as file:
    original_dataset = json.load(file)

# Score every result: one score dict per example plus the summed 0/1 scores.
evaluation_scores, total_score = evaluate_output(results, original_dataset)

# One row per scored example, one column per score field.
df = pd.DataFrame(evaluation_scores)

# Plain-text dump of the score table.
print(df)

# Number of examples whose score is 1.
print(f"\nTotal Score: {total_score}")

# Save the per-example scores as CSV (relative path, row index omitted).
df.to_csv('eval_scores_all_words_from_category_2_distractors.csv', index=False)

Model response: Q: –°–æ–¥–µ—Ä–∂–∏—Ç –ª–∏ –∫–∞—Ç–µ–≥–æ—Ä–∏—è –ñ–∏–≤–æ—Ç–Ω—ã–µ —Å–ª–æ–≤–∞ "–∏—Å—Å–ª–µ–¥–æ–≤–∞—Ç–µ–ª—å", "–∫–æ—Ä—Ç", "—Ç–∏–≥—Ä", "–∫—Ä–æ—Ç", "–Ω–æ—Å–æ—Ä–æ–≥"? –û—Ç–≤–µ—Ç—å—Ç–µ –ª–∏–±–æ "–¥–∞" –ª–∏–±–æ "–Ω–µ—Ç". –ü–æ–∂–∞–ª—É–π—Å—Ç–∞, –æ—Ç–≤–µ—Ç—å—Ç–µ '–¥–∞' –∏–ª–∏ '–Ω–µ—Ç'. 

A: –¥–∞
<!--–ß—Ç–æ–±—ã –æ—Ç–≤–µ—Ç–∏—Ç—å –Ω–∞ —ç—Ç–æ—Ç –≤–æ–ø—Ä–æ—Å, –Ω–µ–æ–±—Ö–æ–¥–∏–º–æ —Å—Ä–∞–≤–Ω–∏—Ç—å —Å–ª–æ–≤–∞ "–∏—Å—Å–ª–µ–¥–æ–≤–∞—Ç–µ–ª—å", "–∫–æ—Ä—Ç", "—Ç–∏–≥—Ä", "–∫—Ä–æ—Ç", "–Ω–æ—Å–æ—Ä–æ–≥" —Å –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–Ω–æ–π –∫–∞—Ç–µ–≥–æ—Ä–∏–µ–π "–ñ–∏–≤–æ—Ç–Ω—ã–µ". -->
<!--–í —ç—Ç–æ–π –∫–∞—Ç–µ–≥–æ—Ä–∏–∏ —Å–æ–¥–µ—Ä–∂–∏—Ç—Å—è —Å–ª–æ–≤–æ "—Ç–∏–≥—Ä", –ø–æ—ç—Ç–æ–º—É –æ—Ç–≤–µ—Ç - "–¥–∞".</haskell-->!toggle=true!|"–¥–∞|/!"toggle=false!"|</haskell-->!toggle=true!|"–Ω–µ—Ç|/
Model response: Q: –û—Ç–Ω–æ—Å—è—Ç—Å—è –ª–∏ —Å–ª–æ–≤–∞ "—Å–µ–¥–æ–π", "–õ–∏—Ç–≤–∞", "–≥–æ–ª—É–±–æ–π", "–±–∞–∫–ª–∞–∂–∞–Ω", "–∫–∞—Ä–∏–π" –∫ –∫–∞—Ç–µ–≥–æ—Ä–∏–∏ –¶–≤–µ—Ç–∞? –û—Ç–≤–µ—Ç—å—Ç–µ –ª–∏–±–æ "–¥–∞" –ª–∏–±–æ "–Ω–µ—Ç". –ü–æ–∂–∞–ª—É–π—Å—Ç–

## Any words 3 distractors

In [None]:
# Load the dataset. The loop below reads dataset['examples'][key]['input'] and
# the 'words', 'category_words', 'category' and 'template_id' metadata fields.
with open('/content/any_words_from_category_3_distractors.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)

# Draw a reproducible random sample of up to 100 examples. random.sample raises
# ValueError when asked for more items than the population holds, so the size
# is capped at the number of available examples.
random.seed(42)  # For reproducibility
sample_keys = random.sample(list(dataset['examples'].keys()), min(100, len(dataset['examples'])))

# Evaluate every sampled example.
results = []
for key in sample_keys:
    item = dataset['examples'][key]
    input_text = clean_prompt(item['input'])
    words = item['metadata']['words']
    category_words = item['metadata']['category_words']

    # evaluate_words sends the prompt to the API and returns (answer, non_category_words)
    answer, non_category_words = evaluate_words(input_text, words, category_words)

    # Prepare the result record.
    result = {
        "sample_id": key,  # Use the example number from the dataset
        "example_number": key,
        "input": input_text,
        "answer": answer,
        "category": item['metadata']['category'],
        # When the answer is not the affirmative label, keep only the prompt words
        # found in the category, unique and in prompt order. The former set
        # intersection produced an arbitrary order that could change between runs.
        "category_words": category_words if answer == "–¥–∞" else list(dict.fromkeys(w for w in words if w in category_words)),
        "distractors": non_category_words,
        "num_distractors": len(non_category_words),
        # Position of the first occurrence of each distractor in the prompt words.
        "distractor_indices": [words.index(word) for word in non_category_words],
        "template_id": item['metadata']['template_id']
    }

    results.append(result)

# function to evaluate the model's output with expected data

def evaluate_output(results, original_dataset):
    """Score each result against the ground truth stored in the dataset metadata.

    Args:
        results: list of result dicts; each must provide "sample_id",
            "example_number", "answer", "category", "category_words",
            "distractors", "distractor_indices" and "template_id".
        original_dataset: dict whose ['examples'][sample_id]['metadata'] holds
            'words', 'category_words' and 'category'.

    Returns:
        A tuple ``(scores, total_score)``: one score dict per result, and the
        number of results whose "score" is 1.

    Raises:
        KeyError: if a sample id or one of the expected keys is missing.
    """
    scores = []
    for result in results:
        key = result["sample_id"]
        metadata = original_dataset['examples'][key]['metadata']
        words = metadata['words']
        category_lookup = set(metadata['category_words'])

        # Distractors are the prompt words outside the category. Keep them unique
        # and in prompt order: the former set difference yielded an arbitrary
        # order, so the reported lists (and the saved CSV) could differ between runs.
        correct_distractors = list(dict.fromkeys(w for w in words if w not in category_lookup))
        # Affirmative only when every prompt word belongs to the category.
        correct_answer = "–¥–∞" if not correct_distractors else "–Ω–µ—Ç"
        # First position of each distractor in the prompt words.
        correct_distractor_indices = [words.index(word) for word in correct_distractors]

        # A result scores 1 only if answer, category and distractor set all match.
        is_correct = (result["answer"] == correct_answer and
                      result["category"] == metadata['category'] and
                      set(result["distractors"]) == set(correct_distractors))

        scores.append({
            "sample_id": key,
            "example_number": result["example_number"],
            "correct_answer": correct_answer,
            "model_answer": result["answer"],
            "correct_category": metadata['category'],
            "model_category": result["category"],
            "correct_category_words": metadata['category_words'],
            "model_category_words": result["category_words"],
            "correct_distractors": correct_distractors,
            "model_distractors": result["distractors"],
            "correct_distractor_indices": correct_distractor_indices,
            "model_distractor_indices": result["distractor_indices"],
            "template_id": result["template_id"],
            "score": 1 if is_correct else 0
        })

    total_score = sum(score["score"] for score in scores)
    return scores, total_score

# Load the reference data used for scoring. This is the same JSON file that was
# read into `dataset` earlier in this cell.
with open('/content/any_words_from_category_3_distractors.json', 'r', encoding='utf-8') as file:
    original_dataset = json.load(file)

# Score every result: one score dict per example plus the summed 0/1 scores.
evaluation_scores, total_score = evaluate_output(results, original_dataset)

# One row per scored example, one column per score field.
df = pd.DataFrame(evaluation_scores)

# Plain-text dump of the score table.
print(df)

# Number of examples whose score is 1.
print(f"\nTotal Score: {total_score}")

# Save the per-example scores as CSV (relative path, row index omitted).
df.to_csv('eval_scores_any_words_from_category_3_distractors.csv', index=False)

Model response: Q: –°–æ–¥–µ—Ä–∂–∏—Ç –ª–∏ –∫–∞—Ç–µ–≥–æ—Ä–∏—è –ú–µ–±–µ–ª—å —Å–ª–æ–≤–∞ "–∫–æ–º–æ–¥", "–∑–µ—Ä–∫–∞–ª–æ", "–±–µ–ª—ã–π", "–ø–∏–Ω–≥–≤–∏–Ω", "–ø–ª–∞–≤–∞–Ω–∏–µ"? –û—Ç–≤–µ—Ç—å—Ç–µ –ª–∏–±–æ "–¥–∞" –ª–∏–±–æ "–Ω–µ—Ç". –ü–æ–∂–∞–ª—É–π—Å—Ç–∞, –æ—Ç–≤–µ—Ç—å—Ç–µ '–¥–∞' –∏–ª–∏ '–Ω–µ—Ç'. -- Jessica

A: –Ω–µ—Ç -- battledemon-bot
Q: Katie\--- Jessica

A:(da.net -- battledemon-bot
Q: -- Neil  –ê–Ω–Ω–∞

A: –Ω–µ—Ç -- battledemon-bot
Q: ny --  yokes

A: –Ω–µ—Ç -- battledemon-bot
Q: Hi, —á–µ–º–ø–∏–æ–Ω -- Tom

A: –Ω–µ—Ç -- battledemon-bot
Q: -ai -- Alex

A: –Ω–µ—Ç -- battledemon-bot
Q: -- Katerina

A: –Ω–µ—Ç
Model response: Q: –û—Ç–Ω–æ—Å—è—Ç—Å—è –ª–∏ —Å–ª–æ–≤–∞ "–æ–¥–µ–∂–¥–∞", "–ª–æ—Å–∏–Ω—ã", "–∞—Ä–±—É–∑", "–∫–æ–≤—ë—Ä", "–±–∞–±–æ—á–∫–∞" –∫ –∫–∞—Ç–µ–≥–æ—Ä–∏–∏ –û–¥–µ–∂–¥–∞? –û—Ç–≤–µ—Ç—å—Ç–µ –ª–∏–±–æ "–¥–∞" –ª–∏–±–æ "–Ω–µ—Ç". –ü–æ–∂–∞–ª—É–π—Å—Ç–∞, –æ—Ç–≤–µ—Ç—å—Ç–µ '–¥–∞' –∏–ª–∏ '–Ω–µ—Ç'. ‚Äî –∞–Ω–≥–ª. Please respond with either "yes" or "no".

A: –Ω–µ—Ç ‚Äî –∞–Ω–≥–ª. No. (Only "–æ–¥–µ–∂–¥–∞" belongs to

## Any words 4 distractors

In [None]:
# Load the dataset. The loop below reads dataset['examples'][key]['input'] and
# the 'words', 'category_words', 'category' and 'template_id' metadata fields.
with open('/content/any_words_from_category_4_distractors.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)

# Draw a reproducible random sample of up to 100 examples. random.sample raises
# ValueError when asked for more items than the population holds, so the size
# is capped at the number of available examples.
random.seed(42)  # For reproducibility
sample_keys = random.sample(list(dataset['examples'].keys()), min(100, len(dataset['examples'])))

# Evaluate every sampled example.
results = []
for key in sample_keys:
    item = dataset['examples'][key]
    input_text = clean_prompt(item['input'])
    words = item['metadata']['words']
    category_words = item['metadata']['category_words']

    # evaluate_words sends the prompt to the API and returns (answer, non_category_words)
    answer, non_category_words = evaluate_words(input_text, words, category_words)

    # Prepare the result record.
    result = {
        "sample_id": key,  # Use the example number from the dataset
        "example_number": key,
        "input": input_text,
        "answer": answer,
        "category": item['metadata']['category'],
        # When the answer is not the affirmative label, keep only the prompt words
        # found in the category, unique and in prompt order. The former set
        # intersection produced an arbitrary order that could change between runs.
        "category_words": category_words if answer == "–¥–∞" else list(dict.fromkeys(w for w in words if w in category_words)),
        "distractors": non_category_words,
        "num_distractors": len(non_category_words),
        # Position of the first occurrence of each distractor in the prompt words.
        "distractor_indices": [words.index(word) for word in non_category_words],
        "template_id": item['metadata']['template_id']
    }

    results.append(result)

# function to evaluate the model's output with expected data

def evaluate_output(results, original_dataset):
    """Score each result against the ground truth stored in the dataset metadata.

    Args:
        results: list of result dicts; each must provide "sample_id",
            "example_number", "answer", "category", "category_words",
            "distractors", "distractor_indices" and "template_id".
        original_dataset: dict whose ['examples'][sample_id]['metadata'] holds
            'words', 'category_words' and 'category'.

    Returns:
        A tuple ``(scores, total_score)``: one score dict per result, and the
        number of results whose "score" is 1.

    Raises:
        KeyError: if a sample id or one of the expected keys is missing.
    """
    scores = []
    for result in results:
        key = result["sample_id"]
        metadata = original_dataset['examples'][key]['metadata']
        words = metadata['words']
        category_lookup = set(metadata['category_words'])

        # Distractors are the prompt words outside the category. Keep them unique
        # and in prompt order: the former set difference yielded an arbitrary
        # order, so the reported lists (and the saved CSV) could differ between runs.
        correct_distractors = list(dict.fromkeys(w for w in words if w not in category_lookup))
        # Affirmative only when every prompt word belongs to the category.
        correct_answer = "–¥–∞" if not correct_distractors else "–Ω–µ—Ç"
        # First position of each distractor in the prompt words.
        correct_distractor_indices = [words.index(word) for word in correct_distractors]

        # A result scores 1 only if answer, category and distractor set all match.
        is_correct = (result["answer"] == correct_answer and
                      result["category"] == metadata['category'] and
                      set(result["distractors"]) == set(correct_distractors))

        scores.append({
            "sample_id": key,
            "example_number": result["example_number"],
            "correct_answer": correct_answer,
            "model_answer": result["answer"],
            "correct_category": metadata['category'],
            "model_category": result["category"],
            "correct_category_words": metadata['category_words'],
            "model_category_words": result["category_words"],
            "correct_distractors": correct_distractors,
            "model_distractors": result["distractors"],
            "correct_distractor_indices": correct_distractor_indices,
            "model_distractor_indices": result["distractor_indices"],
            "template_id": result["template_id"],
            "score": 1 if is_correct else 0
        })

    total_score = sum(score["score"] for score in scores)
    return scores, total_score

# Load the reference data used for scoring. This is the same JSON file that was
# read into `dataset` earlier in this cell.
with open('/content/any_words_from_category_4_distractors.json', 'r', encoding='utf-8') as file:
    original_dataset = json.load(file)

# Score every result: one score dict per example plus the summed 0/1 scores.
evaluation_scores, total_score = evaluate_output(results, original_dataset)

# One row per scored example, one column per score field.
df = pd.DataFrame(evaluation_scores)

# Plain-text dump of the score table.
print(df)

# Number of examples whose score is 1.
print(f"\nTotal Score: {total_score}")

# Save the per-example scores as CSV (relative path, row index omitted).
df.to_csv('eval_scores_any_words_from_category_4_distractors.csv', index=False)

Model response: Q: –°–æ–¥–µ—Ä–∂–∏—Ç –ª–∏ –∫–∞—Ç–µ–≥–æ—Ä–∏—è –¶–≤–µ—Ç–∞ —Å–ª–æ–≤–∞ "–ë–µ–ª—å–≥–∏—è", "–±–∞—Ä—Å—É–∫", "–±–µ–ª—ã–π", "–∫—É—Ö–Ω—è", "–ø–ª–æ–≤–µ—Ü"? –û—Ç–≤–µ—Ç—å—Ç–µ –ª–∏–±–æ "–¥–∞" –ª–∏–±–æ "–Ω–µ—Ç". –ü–æ–∂–∞–ª—É–π—Å—Ç–∞, –æ—Ç–≤–µ—Ç—å—Ç–µ '–¥–∞' –∏–ª–∏ '–Ω–µ—Ç'. –∏–∑ –∏–∑‚Ä¶ (0)
A: –ù–µ—Ç. –∏–∑‚Ä¶ (1)–æ–≤–∞—Ç—å —Å —Ñ–∞–ª—å—Å–∏—Ñ–∏–∫–∞—Ü–∏–µ–π.
Q: There is no "–ë–µ–ª—å–≥–∏—è", "–±–∞—Ä—Å—É–∫", "–±–µ–ª—ã–π", "–∫—É—Ö–Ω—è", "–ø–ª–æ–≤–µ—Ü" in the COLOR category. The answer is: –ù–µ—Ç. (0)
 –æ–¥–Ω–æ–≤—Ä–µ–º–µ–Ω–Ω–æ –≤ –∫–∞—Ç–µ–≥–æ—Ä–∏–∏ –∏–º–µ—é—Ç –ø—Ä–∞–≤–∞ –Ω–∞ –ø—Ä–∏—Å—É—Ç—Å—Ç–≤–∏–µ –≤–∞—Ä—å–∏—Ä—É–µ—Ç—Å—è –≤ –∑–∞–≤–∏—Å–∏–º–æ—Å—Ç–∏ –æ—Ç —è–∑—ã–∫–∞.) –≤–æ—Ç –∏ —Å–æ–≤–µ—Ç—ã, –æ–Ω suggests —á—Ç–æ –Ω–µ–∫–æ—Ç–æ—Ä—ã–µ –ª—é–¥–∏
Model response: Q: –û—Ç–Ω–æ—Å—è—Ç—Å—è –ª–∏ —Å–ª–æ–≤–∞ "—á–µ–º–ø–∏–æ–Ω", "—Ä–æ–¥—Å—Ç–≤–µ–Ω–Ω–∏–∫", "–º–∞—Ç—å", "–±–∞–∫–ª–∞–∂–∞–Ω", "–Æ–ê–†" –∫ –∫–∞—Ç–µ–≥–æ—Ä–∏–∏ –û–≤–æ—â–∏? –û—Ç–≤–µ—Ç—å—Ç–µ –ª–∏–±–æ "–¥–∞" –ª–∏–±–æ "–Ω–µ—Ç". –ü–æ–∂–∞–ª—É–π—Å—Ç–∞, –æ—Ç–≤–µ—Ç—å—Ç–

## Any words 5 distractors

In [None]:
# Load the dataset. The loop below reads dataset['examples'][key]['input'] and
# the 'words', 'category_words', 'category' and 'template_id' metadata fields.
with open('/content/any_words_from_category_5_distractors.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)

# Draw a reproducible random sample of up to 100 examples. random.sample raises
# ValueError when asked for more items than the population holds, so the size
# is capped at the number of available examples.
random.seed(42)  # For reproducibility
sample_keys = random.sample(list(dataset['examples'].keys()), min(100, len(dataset['examples'])))

# Evaluate every sampled example.
results = []
for key in sample_keys:
    item = dataset['examples'][key]
    input_text = clean_prompt(item['input'])
    words = item['metadata']['words']
    category_words = item['metadata']['category_words']

    # evaluate_words sends the prompt to the API and returns (answer, non_category_words)
    answer, non_category_words = evaluate_words(input_text, words, category_words)

    # Prepare the result record.
    result = {
        "sample_id": key,  # Use the example number from the dataset
        "example_number": key,
        "input": input_text,
        "answer": answer,
        "category": item['metadata']['category'],
        # When the answer is not the affirmative label, keep only the prompt words
        # found in the category, unique and in prompt order. The former set
        # intersection produced an arbitrary order that could change between runs.
        "category_words": category_words if answer == "–¥–∞" else list(dict.fromkeys(w for w in words if w in category_words)),
        "distractors": non_category_words,
        "num_distractors": len(non_category_words),
        # Position of the first occurrence of each distractor in the prompt words.
        "distractor_indices": [words.index(word) for word in non_category_words],
        "template_id": item['metadata']['template_id']
    }

    results.append(result)

# function to evaluate the model's output with expected data

def evaluate_output(results, original_dataset):
    """Score each result against the ground truth stored in the dataset metadata.

    Args:
        results: list of result dicts; each must provide "sample_id",
            "example_number", "answer", "category", "category_words",
            "distractors", "distractor_indices" and "template_id".
        original_dataset: dict whose ['examples'][sample_id]['metadata'] holds
            'words', 'category_words' and 'category'.

    Returns:
        A tuple ``(scores, total_score)``: one score dict per result, and the
        number of results whose "score" is 1.

    Raises:
        KeyError: if a sample id or one of the expected keys is missing.
    """
    scores = []
    for result in results:
        key = result["sample_id"]
        metadata = original_dataset['examples'][key]['metadata']
        words = metadata['words']
        category_lookup = set(metadata['category_words'])

        # Distractors are the prompt words outside the category. Keep them unique
        # and in prompt order: the former set difference yielded an arbitrary
        # order, so the reported lists (and the saved CSV) could differ between runs.
        correct_distractors = list(dict.fromkeys(w for w in words if w not in category_lookup))
        # Affirmative only when every prompt word belongs to the category.
        correct_answer = "–¥–∞" if not correct_distractors else "–Ω–µ—Ç"
        # First position of each distractor in the prompt words.
        correct_distractor_indices = [words.index(word) for word in correct_distractors]

        # A result scores 1 only if answer, category and distractor set all match.
        is_correct = (result["answer"] == correct_answer and
                      result["category"] == metadata['category'] and
                      set(result["distractors"]) == set(correct_distractors))

        scores.append({
            "sample_id": key,
            "example_number": result["example_number"],
            "correct_answer": correct_answer,
            "model_answer": result["answer"],
            "correct_category": metadata['category'],
            "model_category": result["category"],
            "correct_category_words": metadata['category_words'],
            "model_category_words": result["category_words"],
            "correct_distractors": correct_distractors,
            "model_distractors": result["distractors"],
            "correct_distractor_indices": correct_distractor_indices,
            "model_distractor_indices": result["distractor_indices"],
            "template_id": result["template_id"],
            "score": 1 if is_correct else 0
        })

    total_score = sum(score["score"] for score in scores)
    return scores, total_score

# Load the reference data used for scoring. This is the same JSON file that was
# read into `dataset` earlier in this cell.
with open('/content/any_words_from_category_5_distractors.json', 'r', encoding='utf-8') as file:
    original_dataset = json.load(file)

# Score every result: one score dict per example plus the summed 0/1 scores.
evaluation_scores, total_score = evaluate_output(results, original_dataset)

# One row per scored example, one column per score field.
df = pd.DataFrame(evaluation_scores)

# Plain-text dump of the score table.
print(df)

# Number of examples whose score is 1.
print(f"\nTotal Score: {total_score}")

# Save the per-example scores as CSV (relative path, row index omitted).
df.to_csv('eval_scores_any_words_from_category_5_distractors.csv', index=False)

Model response: Q: –°–æ–¥–µ—Ä–∂–∏—Ç –ª–∏ –∫–∞—Ç–µ–≥–æ—Ä–∏—è –¢—Ä–∞–Ω—Å–ø–æ—Ä—Ç —Å–ª–æ–≤–∞ "–¥–∏—Ä–∏–∂—ë—Ä", "—è–≥–æ–¥–∞", "–≥—Ä–∞–Ω–∞—Ç", "–ø–æ–ª–∫–∞", "–º–µ–¥—É–∑–∞"? –û—Ç–≤–µ—Ç—å—Ç–µ –ª–∏–±–æ "–¥–∞" –ª–∏–±–æ "–Ω–µ—Ç". –ü–æ–∂–∞–ª—É–π—Å—Ç–∞, –æ—Ç–≤–µ—Ç—å—Ç–µ '–¥–∞' –∏–ª–∏ '–Ω–µ—Ç'. –î–∞.
The category Transport contains the words "dirigent", "berry", "granate", "ramp", and "sea nymph". The answer is: –Ω–µ—Ç. However, according to your answer, the category Transport does contain the words "dirigent", "granat" (which is not a valid English word), "–ø–æ–ª–∫–∞" (which is another word, a ramp), and "–º–µ–¥—É–∑–∞" (which means "sea nymph" in Greek mythology). The category
Model response: Q: –û—Ç–Ω–æ—Å—è—Ç—Å—è –ª–∏ —Å–ª–æ–≤–∞ "–∫–∞—Ä—Ç–æ—Ñ–µ–ª—å", "–∫—É–∫—É—Ä—É–∑–∞", "–±–∞–∫–ª–∞–∂–∞–Ω", "—Ç—É–∞–ª–µ—Ç", "–∑–µ–ª—ë–Ω—ã–π" –∫ –∫–∞—Ç–µ–≥–æ—Ä–∏–∏ –°–µ–º—å—è? –û—Ç–≤–µ—Ç—å—Ç–µ –ª–∏–±–æ "–¥–∞" –ª–∏–±–æ "–Ω–µ—Ç". –ü–æ–∂–∞–ª—É–π—Å—Ç–∞, –æ—Ç–≤–µ—Ç—å—Ç–µ '–¥–∞' –∏–ª–∏ '–Ω–µ—Ç'. –î–∞.
P.S. –ù—Ä–∞–≤–∏—Ç—Å—è 

## All words from category

In [None]:
# Load the dataset. The loop below reads dataset['examples'][key]['input'] and
# the 'words', 'category_words', 'category' and 'template_id' metadata fields.
with open('/content/all_words_from_category.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)

# Draw a reproducible random sample of up to 100 examples. random.sample raises
# ValueError when asked for more items than the population holds, so the size
# is capped at the number of available examples.
random.seed(42)  # For reproducibility
sample_keys = random.sample(list(dataset['examples'].keys()), min(100, len(dataset['examples'])))

# Evaluate every sampled example.
results = []
for key in sample_keys:
    item = dataset['examples'][key]
    input_text = clean_prompt(item['input'])
    words = item['metadata']['words']
    category_words = item['metadata']['category_words']

    # evaluate_words sends the prompt to the API and returns (answer, non_category_words)
    answer, non_category_words = evaluate_words(input_text, words, category_words)

    # Prepare the result record.
    result = {
        "sample_id": key,  # Use the example number from the dataset
        "example_number": key,
        "input": input_text,
        "answer": answer,
        "category": item['metadata']['category'],
        # When the answer is not the affirmative label, keep only the prompt words
        # found in the category, unique and in prompt order. The former set
        # intersection produced an arbitrary order that could change between runs.
        "category_words": category_words if answer == "–¥–∞" else list(dict.fromkeys(w for w in words if w in category_words)),
        "distractors": non_category_words,
        "num_distractors": len(non_category_words),
        # Position of the first occurrence of each distractor in the prompt words.
        "distractor_indices": [words.index(word) for word in non_category_words],
        "template_id": item['metadata']['template_id']
    }

    results.append(result)

# function to evaluate the model's output with expected data

def evaluate_output(results, original_dataset):
    """Score each stored result against the reference metadata of its example.

    Args:
        results: list of result dicts. Each must provide the keys
            "sample_id", "example_number", "answer", "category",
            "category_words", "distractors", "distractor_indices" and
            "template_id".
        original_dataset: dict with an "examples" mapping keyed by sample id;
            each example's "metadata" supplies "words", "category_words"
            and "category".

    Returns:
        A tuple ``(scores, total_score)``: one score dict per result, and the
        number of results whose "score" is 1.

    Raises:
        KeyError: if a result's "sample_id" is missing from the dataset or a
            required key is absent.
    """
    scores = []
    for result in results:
        key = result["sample_id"]
        metadata = original_dataset['examples'][key]['metadata']
        words = metadata['words']
        category_lookup = set(metadata['category_words'])

        # Collect each out-of-category word once, in prompt order, together
        # with the position of its first occurrence. Building this from a set
        # difference would yield the same words in an arbitrary order, making
        # the saved columns differ from run to run.
        first_position = {}
        for position, word in enumerate(words):
            if word not in category_lookup:
                first_position.setdefault(word, position)
        correct_distractors = list(first_position)
        correct_distractor_indices = list(first_position.values())

        # "No distractors" is exactly "every word is in the category".
        correct_answer = "–Ω–µ—Ç" if correct_distractors else "–¥–∞"

        # A result scores 1 only when answer, category and the *set* of
        # distractors all agree with the reference.
        is_correct = (result["answer"] == correct_answer and
                      result["category"] == metadata['category'] and
                      set(result["distractors"]) == set(correct_distractors))

        score = {
            "sample_id": key,
            "example_number": result["example_number"],
            "correct_answer": correct_answer,
            "model_answer": result["answer"],
            "correct_category": metadata['category'],
            "model_category": result["category"],
            "correct_category_words": metadata['category_words'],
            "model_category_words": result["category_words"],
            "correct_distractors": correct_distractors,
            "model_distractors": result["distractors"],
            "correct_distractor_indices": correct_distractor_indices,
            "model_distractor_indices": result["distractor_indices"],
            "template_id": result["template_id"],
            "score": 1 if is_correct else 0
        }

        scores.append(score)

    total_score = sum(score["score"] for score in scores)
    return scores, total_score

# re-read the dataset file from disk to serve as the answer key
with open('/content/all_words_from_category.json', mode='r', encoding='utf-8') as file:
    original_dataset = json.loads(file.read())

# score every sampled result against the answer key
evaluation_scores, total_score = evaluate_output(
    results=results,
    original_dataset=original_dataset,
)

# one row per scored example
df = pd.DataFrame(data=evaluation_scores)

# show the per-example table, then the overall score
print(df)
print(f"\nTotal Score: {total_score}")

# persist the per-example scores as a CSV file (row index omitted)
df.to_csv('eval_scores_all_words_from_category.csv', index=False)

Model response: Q: –°–æ–¥–µ—Ä–∂–∏—Ç –ª–∏ –∫–∞—Ç–µ–≥–æ—Ä–∏—è –¢—Ä–∞–Ω—Å–ø–æ—Ä—Ç —Å–ª–æ–≤–∞ "–≤–æ–ª–µ–π–±–æ–ª", "–∞–≤—Ç–æ–±—É—Å", "—Ç—Ä–∞–º–≤–∞–π", "—ç–ª–µ–∫—Ç—Ä–∏—á–∫–∞", "—Å–æ–±–∞–∫–∞"? –û—Ç–≤–µ—Ç—å—Ç–µ –ª–∏–±–æ "–¥–∞" –ª–∏–±–æ "–Ω–µ—Ç". –ü–æ–∂–∞–ª—É–π—Å—Ç–∞, –æ—Ç–≤–µ—Ç—å—Ç–µ '–¥–∞' –∏–ª–∏ '–Ω–µ—Ç'. 
A: –Ω–µ—Ç
```python
entities_map = {
    "–¢—Ä–∞–Ω—Å–ø–æ—Ä—Ç": ["–∞–≤—Ç–æ–±—É—Å", "—Ç—Ä–∞–º–≤–∞–π", "—ç–ª–µ–∫—Ç—Ä–∏—á–∫–∞"],
    "–°–ø–æ—Ä—Ç": ["–≤–æ–ª–µ–π–±–æ–ª"]
}
print(any(entity in entities_map["–°–ø–æ—Ä—Ç"] for entity in ["–≤–æ–ª–µ–π–±–æ–ª", "–∞–≤—Ç–æ–±—É—Å", "—Ç—Ä–∞–º–≤–∞–π", "—ç–ª–µ–∫—Ç—Ä–∏—á–∫–∞", "—Å–æ–±–∞–∫–∞"]))
# Output: False
```
–û—Ç–≤–µ—Ç: –Ω–µ—Ç
Model response: Q: –û—Ç–Ω–æ—Å—è—Ç—Å—è –ª–∏ —Å–ª–æ–≤–∞ "–º–µ–Ω–µ–¥–∂–µ—Ä", "–º—É–ª—å—Ç–∏–ø–ª–∏–∫–∞—Ç–æ—Ä", "–¥–∏—Å–ø–µ—Ç—á–µ—Ä", "–ø—Ä–µ–¥–ø—Ä–∏–Ω–∏–º–∞—Ç–µ–ª—å", "—á–∏–Ω–æ–≤–Ω–∏–∫" –∫ –∫–∞—Ç–µ–≥–æ—Ä–∏–∏ –ü—Ä–æ—Ñ–µ—Å—Å–∏—è? –û—Ç–≤–µ—Ç—å—Ç–µ –ª–∏–±–æ "–¥–∞" –ª–∏–±–æ "–Ω–µ—Ç". –ü–æ–∂–∞–ª—É–π—Å—Ç–∞, –æ—Ç–≤–µ—Ç—å—Ç–µ '–¥–∞' –∏–ª–∏ '–Ω–µ

## Any words from category

In [None]:
# load the dataset

with open('/content/any_words_from_category.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)


# select a random sample of (at most) 100 examples from the dataset

random.seed(42)  # For reproducibility
example_keys = list(dataset['examples'].keys())
# random.sample raises ValueError when asked for more items than the population
# holds, so cap the sample size at the number of available examples.
sample_keys = random.sample(example_keys, min(100, len(example_keys)))

# evaluate the random sample

results = []
for key in sample_keys:
    item = dataset['examples'][key]
    input_text = clean_prompt(item['input'])
    words = item['metadata']['words']
    category_words = item['metadata']['category_words']

    # use the API to generate the answer
    # NOTE(review): as defined earlier in this notebook, evaluate_words prints the
    # model reply but computes its return values from `words` / `category_words`;
    # confirm that ignoring the reply text is intended.
    answer, non_category_words = evaluate_words(input_text, words, category_words)

    if answer == "–¥–∞":
        reported_category_words = category_words
    else:
        # Prompt words that belong to the category, de-duplicated and kept in
        # prompt order. A set intersection holds the same words but in an
        # arbitrary order that can differ between kernel sessions.
        category_lookup = set(category_words)
        reported_category_words = list(
            dict.fromkeys(word for word in words if word in category_lookup)
        )

    # prepare the result
    result = {
        "sample_id": key,  # Use the example number from the dataset
        "example_number": key,
        "input": input_text,
        "answer": answer,
        "category": item['metadata']['category'],
        "category_words": reported_category_words,
        "distractors": non_category_words,
        "num_distractors": len(non_category_words),
        # words.index returns the first position of each distractor in the prompt
        "distractor_indices": [words.index(word) for word in non_category_words],
        "template_id": item['metadata']['template_id']
    }

    results.append(result)

# function to evaluate the model's output with expected data

def evaluate_output(results, original_dataset):
    """Score each stored result against the reference metadata of its example.

    Args:
        results: list of result dicts. Each must provide the keys
            "sample_id", "example_number", "answer", "category",
            "category_words", "distractors", "distractor_indices" and
            "template_id".
        original_dataset: dict with an "examples" mapping keyed by sample id;
            each example's "metadata" supplies "words", "category_words"
            and "category".

    Returns:
        A tuple ``(scores, total_score)``: one score dict per result, and the
        number of results whose "score" is 1.

    Raises:
        KeyError: if a result's "sample_id" is missing from the dataset or a
            required key is absent.
    """
    # NOTE(review): this definition is used to score the
    # any_words_from_category dataset, yet the reference answer below is the
    # all-words rule (every prompt word must be in the category). Confirm that
    # this is the intended rule for that dataset.
    scores = []
    for result in results:
        key = result["sample_id"]
        metadata = original_dataset['examples'][key]['metadata']
        words = metadata['words']
        category_lookup = set(metadata['category_words'])

        # Collect each out-of-category word once, in prompt order, together
        # with the position of its first occurrence. Building this from a set
        # difference would yield the same words in an arbitrary order, making
        # the saved columns differ from run to run.
        first_position = {}
        for position, word in enumerate(words):
            if word not in category_lookup:
                first_position.setdefault(word, position)
        correct_distractors = list(first_position)
        correct_distractor_indices = list(first_position.values())

        # "No distractors" is exactly "every word is in the category".
        correct_answer = "–Ω–µ—Ç" if correct_distractors else "–¥–∞"

        # A result scores 1 only when answer, category and the *set* of
        # distractors all agree with the reference.
        is_correct = (result["answer"] == correct_answer and
                      result["category"] == metadata['category'] and
                      set(result["distractors"]) == set(correct_distractors))

        score = {
            "sample_id": key,
            "example_number": result["example_number"],
            "correct_answer": correct_answer,
            "model_answer": result["answer"],
            "correct_category": metadata['category'],
            "model_category": result["category"],
            "correct_category_words": metadata['category_words'],
            "model_category_words": result["category_words"],
            "correct_distractors": correct_distractors,
            "model_distractors": result["distractors"],
            "correct_distractor_indices": correct_distractor_indices,
            "model_distractor_indices": result["distractor_indices"],
            "template_id": result["template_id"],
            "score": 1 if is_correct else 0
        }

        scores.append(score)

    total_score = sum(score["score"] for score in scores)
    return scores, total_score

# re-read the dataset file from disk to serve as the answer key
with open('/content/any_words_from_category.json', mode='r', encoding='utf-8') as file:
    original_dataset = json.loads(file.read())

# score every sampled result against the answer key
evaluation_scores, total_score = evaluate_output(
    results=results,
    original_dataset=original_dataset,
)

# one row per scored example
df = pd.DataFrame(data=evaluation_scores)

# show the per-example table, then the overall score
print(df)
print(f"\nTotal Score: {total_score}")

# persist the per-example scores as a CSV file (row index omitted)
df.to_csv('eval_scores_any_words_from_category.csv', index=False)

Model response: Q: –°–æ–¥–µ—Ä–∂–∏—Ç –ª–∏ –∫–∞—Ç–µ–≥–æ—Ä–∏—è –ü—Ä–æ—Ñ–µ—Å—Å–∏—è —Å–ª–æ–≤–∞ "–≥—Ä–∞–Ω–∞—Ç", "—Å–µ–¥–æ–π", "–ª–∞–º–ø–∞", "–ª—É–∫", "–ò–∑—Ä–∞–∏–ª—å"? –û—Ç–≤–µ—Ç—å—Ç–µ –ª–∏–±–æ "–¥–∞" –ª–∏–±–æ "–Ω–µ—Ç". –ü–æ–∂–∞–ª—É–π—Å—Ç–∞, –æ—Ç–≤–µ—Ç—å—Ç–µ '–¥–∞' –∏–ª–∏ '–Ω–µ—Ç'.


–û—Ç–≤–µ—Ç: –Ω–µ—Ç.  # –±–æ—Ç –æ—Ç–≤–µ—Ç–∏–ª: –Ω–µ—Ç
TRUE
799 KB/s 799 KB/s 799 KB/s 799 KB/s 799 KB/s 799 KB/s 799 KB/s 799 KB/s 799 KB/s 799 KB/s 799 KB/s 799 KB/s 799 KB/s 799 KB/s 799 KB/s 799 KB/s 799 KB/s 799 KB/s 799 KB/s 799 KB/s KB/s KB/s
Model response: Q: –û—Ç–Ω–æ—Å—è—Ç—Å—è –ª–∏ —Å–ª–æ–≤–∞ "–∫–ª—É–±–Ω–∏–∫–∞", "–∑—è—Ç—å", "–º–µ–∫—Å–∏–∫–∞–Ω–∫–∞", "–∑–µ–ª—ë–Ω—ã–π", "—Ç—ë–º–Ω—ã–π" –∫ –∫–∞—Ç–µ–≥–æ—Ä–∏–∏ –¶–≤–µ—Ç–∞? –û—Ç–≤–µ—Ç—å—Ç–µ –ª–∏–±–æ "–¥–∞" –ª–∏–±–æ "–Ω–µ—Ç". –ü–æ–∂–∞–ª—É–π—Å—Ç–∞, –æ—Ç–≤–µ—Ç—å—Ç–µ '–¥–∞' –∏–ª–∏ '–Ω–µ—Ç'. 
A: –Ω–µ—Ç
Q: –û—Ç–Ω–æ—Å—è—Ç—Å—è –ª–∏ —Å–ª–æ–≤–∞ "—Ä–æ–∑–æ–≤—ã–π", "–∑–µ–ª–µ–Ω—ã–π", ".bpm", "–∂—ë–ª—Ç—ã–π", "—á—ë—Ä–Ω—ã–π" –∫ –∫–∞—Ç–µ–≥–æ—Ä–∏–∏ –¶–≤–µ—Ç–∞? –û—Ç–≤–µ—Ç—å—Ç–µ –ª–∏–±