In [None]:
import json
import os
import random
import re

import pandas as pd
import requests
import torch

In [None]:
API_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"
# Take the token from the HF_TOKEN environment variable so a real credential
# never has to be written into the notebook; "xxx" is the original placeholder.
headers = {"Authorization": f"Bearer {os.environ.get('HF_TOKEN', 'xxx')}"}

def query(payload, timeout=120):
    """POST `payload` as JSON to API_URL and return the decoded JSON response.

    Args:
        payload: JSON-serialisable request body.
        timeout: seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection.

    Returns:
        The response body decoded from JSON.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if no response arrives within `timeout` seconds.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    response.raise_for_status()
    return response.json()

In [None]:
# function to evaluate if words belong to the category using the API

def evaluate_words(input_text, words, category_words):
    """Ask the model a yes/no category question and parse its verdict.

    Args:
        input_text: the question prompt to send.
        words: the words listed in the prompt.
        category_words: the words that belong to the category.

    Returns:
        A tuple (answer, non_category_words).
        `answer` is the model's verdict: "да", "нет", or "" when the
        completion contains no standalone yes/no token.
        `non_category_words` lists the entries of `words` that are missing
        from `category_words`, in their original order. The prompt never asks
        the model to name them, so they are derived from the arguments.
    """
    instruction = "Пожалуйста, ответьте 'да' или 'нет'."
    payload = {"inputs": input_text + " " + instruction}
    response = query(payload)
    response_text = response[0]["generated_text"].strip()
    print(f"Model response: {response_text}")  # debugging line

    # The generated text can begin with an echo of the prompt, and the prompt
    # itself contains both "да" and "нет". Only look at what follows the
    # appended instruction so the echo is never mistaken for the answer.
    _, echoed, completion = response_text.partition(instruction)
    if not echoed:
        completion = response_text

    # First standalone yes/no token (Russian or English) is taken as the verdict.
    verdict = re.search(r"\b(да|нет|yes|no)\b", completion, flags=re.IGNORECASE)
    if verdict is None:
        answer = ""
    elif verdict.group(1).lower() in ("да", "yes"):
        answer = "да"
    else:
        answer = "нет"

    category_set = set(category_words)
    non_category_words = [word for word in words if word not in category_set]
    return answer, non_category_words

In [None]:
# to clean the prompt
def clean_prompt(prompt):
    """Return `prompt` without the English answer instruction and 'A:' markers.

    Each fragment is removed wherever it occurs, then surrounding whitespace
    is trimmed.
    """
    for fragment in ('Answer either "да" or "нет".', 'A:'):
        prompt = prompt.replace(fragment, '')
    return prompt.strip()

## All words 0 distractors

In [None]:
# load the dataset

with open('/content/all_words_from_category_0_distractors.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)


# select a random sample of up to 100 examples from the dataset

random.seed(42)  # For reproducibility
example_keys = list(dataset['examples'].keys())
# Clamp the sample size: random.sample raises ValueError when asked for more
# items than the population holds.
sample_keys = random.sample(example_keys, min(100, len(example_keys)))

# evaluate the random sample

results = []
for key in sample_keys:
    item = dataset['examples'][key]
    input_text = clean_prompt(item['input'])
    words = item['metadata']['words']
    category_words = item['metadata']['category_words']

    # to generate the answer
    answer, non_category_words = evaluate_words(input_text, words, category_words)

    # Prompt words that belong to the category, kept in prompt order. Building
    # this through a set would give an order that changes between runs and so
    # a CSV that is not reproducible.
    category_set = set(category_words)
    matched_words = [word for word in dict.fromkeys(words) if word in category_set]

    # prepare the result
    result = {
        "sample_id": key,  # Use the example number from the dataset
        "example_number": key,
        "input": input_text,
        "answer": answer,
        "category": item['metadata']['category'],
        "category_words": category_words if answer == "да" else matched_words,
        "distractors": non_category_words,
        "num_distractors": len(non_category_words),
        "distractor_indices": [words.index(word) for word in non_category_words],
        "template_id": item['metadata']['template_id']
    }

    results.append(result)

# function to evaluate the model's output with expected data

def evaluate_output(results, original_dataset):
    """Score each result against the ground truth stored in the dataset.

    Args:
        results: list of result dicts carrying the keys "sample_id",
            "example_number", "answer", "category", "category_words",
            "distractors", "distractor_indices" and "template_id".
        original_dataset: dict whose 'examples' entry maps a sample id to an
            item with a 'metadata' dict holding 'words', 'category_words'
            and 'category'.

    Returns:
        (scores, total_score): one score dict per result, and the sum of
        their "score" values. A result scores 1 only when its answer,
        category and set of distractors all match the ground truth.
    """
    scores = []
    for result in results:
        key = result["sample_id"]
        item = original_dataset['examples'][key]
        words = item['metadata']['words']
        category_set = set(item['metadata']['category_words'])

        # Keep distractors in prompt order (dict.fromkeys drops duplicates
        # without reordering). A set difference would give an order that
        # changes between runs, making the exported columns unstable.
        correct_distractors = [word for word in dict.fromkeys(words) if word not in category_set]
        correct_distractor_indices = [words.index(word) for word in correct_distractors]
        # Every word is in the category exactly when there are no distractors.
        correct_answer = "нет" if correct_distractors else "да"

        score = {
            "sample_id": key,
            "example_number": result["example_number"],
            "correct_answer": correct_answer,
            "model_answer": result["answer"],
            "correct_category": item['metadata']['category'],
            "model_category": result["category"],
            "correct_category_words": item['metadata']['category_words'],
            "model_category_words": result["category_words"],
            "correct_distractors": correct_distractors,
            "model_distractors": result["distractors"],
            "correct_distractor_indices": correct_distractor_indices,
            "model_distractor_indices": result["distractor_indices"],
            "template_id": result["template_id"],
            "score": 1 if (result["answer"] == correct_answer and
                           result["category"] == item['metadata']['category'] and
                           set(result["distractors"]) == set(correct_distractors)) else 0
        }

        scores.append(score)

    total_score = sum(score["score"] for score in scores)
    return scores, total_score

# Load the ground-truth dataset used for scoring. This reads the same JSON
# file that was loaded into `dataset` at the top of this cell.
with open('/content/all_words_from_category_0_distractors.json', 'r', encoding='utf-8') as file:
    original_dataset = json.load(file)

# Score every sampled result: evaluate_output returns the per-example score
# dicts together with the sum of their "score" values.
evaluation_scores, total_score = evaluate_output(results, original_dataset)

# One row per evaluated example, one column per key of the score dicts.
df = pd.DataFrame(evaluation_scores)

# display the output
print(df)

# print summary of the evaluation
print(f"\nTotal Score: {total_score}")

# Save the per-example scores to a single CSV file, without the row index.
df.to_csv('Mistral_eval_scores_all_words_from_category_0_distractors.csv', index=False)

Model response: Q: Содержит ли категория Спорт слова "спортзал", "тренировка", "корт", "шахматист", "бадминтон"? Ответьте либо "да" либо "нет". Пожалуйста, ответьте 'да' или 'нет'. The category "Sport" contains the words "sport hall", "training", "court", "chess player", "badminton". Answer: 'да'.
Model response: Q: Относятся ли слова "тигр", "медуза", "горилла", "ящерица", "пингвин" к категории Животные? Ответьте либо "да" либо "нет". Пожалуйста, ответьте 'да' или 'нет'. Yes, all of those words refer to the 'Animalia' category.
Model response: Q: Относятся ли слова "Украина", "Литва", "Россия", "Япония", "Швеция" к категории Страны? Ответьте либо "да" либо "нет". Пожалуйста, ответьте 'да' или 'нет'. Yes, all of those words refer to countries.
Model response: Q: Входят ли слова "огурец", "кукуруза", "фасоль", "баклажан", "картофель" в категорию Овощи? Ответьте либо "да" либо "нет". Пожалуйста, ответьте 'да' или 'нет'. Are "огурец" (cucumber), "кукуруза" ( corn), "фасоль" (bean), "бакла

## All words 1 distractor

In [None]:
# load the dataset

with open('/content/all_words_from_category_1_distractor.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)


# select a random sample of up to 100 examples from the dataset

random.seed(42)  # For reproducibility
example_keys = list(dataset['examples'].keys())
# Clamp the sample size: random.sample raises ValueError when asked for more
# items than the population holds.
sample_keys = random.sample(example_keys, min(100, len(example_keys)))

# evaluate the random sample

results = []
for key in sample_keys:
    item = dataset['examples'][key]
    input_text = clean_prompt(item['input'])
    words = item['metadata']['words']
    category_words = item['metadata']['category_words']

    # to generate the answer
    answer, non_category_words = evaluate_words(input_text, words, category_words)

    # Prompt words that belong to the category, kept in prompt order. Building
    # this through a set would give an order that changes between runs and so
    # a CSV that is not reproducible.
    category_set = set(category_words)
    matched_words = [word for word in dict.fromkeys(words) if word in category_set]

    # prepare the result
    result = {
        "sample_id": key,  # Use the example number from the dataset
        "example_number": key,
        "input": input_text,
        "answer": answer,
        "category": item['metadata']['category'],
        "category_words": category_words if answer == "да" else matched_words,
        "distractors": non_category_words,
        "num_distractors": len(non_category_words),
        "distractor_indices": [words.index(word) for word in non_category_words],
        "template_id": item['metadata']['template_id']
    }

    results.append(result)

# function to evaluate the model's output with expected data

def evaluate_output(results, original_dataset):
    """Score each result against the ground truth stored in the dataset.

    Args:
        results: list of result dicts carrying the keys "sample_id",
            "example_number", "answer", "category", "category_words",
            "distractors", "distractor_indices" and "template_id".
        original_dataset: dict whose 'examples' entry maps a sample id to an
            item with a 'metadata' dict holding 'words', 'category_words'
            and 'category'.

    Returns:
        (scores, total_score): one score dict per result, and the sum of
        their "score" values. A result scores 1 only when its answer,
        category and set of distractors all match the ground truth.
    """
    scores = []
    for result in results:
        key = result["sample_id"]
        item = original_dataset['examples'][key]
        words = item['metadata']['words']
        category_set = set(item['metadata']['category_words'])

        # Keep distractors in prompt order (dict.fromkeys drops duplicates
        # without reordering). A set difference would give an order that
        # changes between runs, making the exported columns unstable.
        correct_distractors = [word for word in dict.fromkeys(words) if word not in category_set]
        correct_distractor_indices = [words.index(word) for word in correct_distractors]
        # Every word is in the category exactly when there are no distractors.
        correct_answer = "нет" if correct_distractors else "да"

        score = {
            "sample_id": key,
            "example_number": result["example_number"],
            "correct_answer": correct_answer,
            "model_answer": result["answer"],
            "correct_category": item['metadata']['category'],
            "model_category": result["category"],
            "correct_category_words": item['metadata']['category_words'],
            "model_category_words": result["category_words"],
            "correct_distractors": correct_distractors,
            "model_distractors": result["distractors"],
            "correct_distractor_indices": correct_distractor_indices,
            "model_distractor_indices": result["distractor_indices"],
            "template_id": result["template_id"],
            "score": 1 if (result["answer"] == correct_answer and
                           result["category"] == item['metadata']['category'] and
                           set(result["distractors"]) == set(correct_distractors)) else 0
        }

        scores.append(score)

    total_score = sum(score["score"] for score in scores)
    return scores, total_score

# Load the ground-truth dataset used for scoring. This reads the same JSON
# file that was loaded into `dataset` at the top of this cell.
with open('/content/all_words_from_category_1_distractor.json', 'r', encoding='utf-8') as file:
    original_dataset = json.load(file)

# Score every sampled result: evaluate_output returns the per-example score
# dicts together with the sum of their "score" values.
evaluation_scores, total_score = evaluate_output(results, original_dataset)

# One row per evaluated example, one column per key of the score dicts.
df = pd.DataFrame(evaluation_scores)

# display the output
print(df)

# print summary of the evaluation
print(f"\nTotal Score: {total_score}")

# Save the per-example scores to a single CSV file, without the row index.
df.to_csv('Mistral_eval_scores_all_words_from_category_1_distractor.csv', index=False)

Model response: Q: Содержит ли категория Фрукты слова "персик", "арбуз", "ванна", "земляника", "виноград"? Ответьте либо "да" либо "нет". Пожалуйста, ответьте 'да' или 'нет'. The category 'Fruits' contains the words "peach", "watermelon", "bathtub", "raspberry", "grape". Answer: 'да'. The word 'peach' and all the other listed words belong to the 'Fruits' category.
Model response: Q: Относятся ли слова "синий", "светлый", "русый", "оранжевый", "Австралия" к категории Цвета? Ответьте либо "да" либо "нет". Пожалуйста, ответьте 'да' или 'нет'. These words belong to the color category: 'yes'. The words 'синий' (blue), 'светлый' (light), 'оранжевый' (orange), belong to the color category. 'Русый' (Russian) does not belong to the color category. 'Австралия' is a country and does not belong to the color category.
Model response: Q: Относятся ли слова "комментатор", "лётчик", "художница", "карий", "домохозяйка" к категории Профессия? Ответьте либо "да" либо "нет". Пожалуйста, ответьте 'да' или 

## All words 2 distractors

In [None]:
# load the dataset

with open('/content/all_words_from_category_2_distractors.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)


# select a random sample of up to 100 examples from the dataset

random.seed(42)  # For reproducibility
example_keys = list(dataset['examples'].keys())
# Clamp the sample size: random.sample raises ValueError when asked for more
# items than the population holds.
sample_keys = random.sample(example_keys, min(100, len(example_keys)))

# evaluate the random sample

results = []
for key in sample_keys:
    item = dataset['examples'][key]
    input_text = clean_prompt(item['input'])
    words = item['metadata']['words']
    category_words = item['metadata']['category_words']

    # use the API to generate the answer
    answer, non_category_words = evaluate_words(input_text, words, category_words)

    # Prompt words that belong to the category, kept in prompt order. Building
    # this through a set would give an order that changes between runs and so
    # a CSV that is not reproducible.
    category_set = set(category_words)
    matched_words = [word for word in dict.fromkeys(words) if word in category_set]

    # prepare the result
    result = {
        "sample_id": key,  # Use the example number from the dataset
        "example_number": key,
        "input": input_text,
        "answer": answer,
        "category": item['metadata']['category'],
        "category_words": category_words if answer == "да" else matched_words,
        "distractors": non_category_words,
        "num_distractors": len(non_category_words),
        "distractor_indices": [words.index(word) for word in non_category_words],
        "template_id": item['metadata']['template_id']
    }

    results.append(result)

# function to evaluate the model's output with expected data

def evaluate_output(results, original_dataset):
    """Score each result against the ground truth stored in the dataset.

    Args:
        results: list of result dicts carrying the keys "sample_id",
            "example_number", "answer", "category", "category_words",
            "distractors", "distractor_indices" and "template_id".
        original_dataset: dict whose 'examples' entry maps a sample id to an
            item with a 'metadata' dict holding 'words', 'category_words'
            and 'category'.

    Returns:
        (scores, total_score): one score dict per result, and the sum of
        their "score" values. A result scores 1 only when its answer,
        category and set of distractors all match the ground truth.
    """
    scores = []
    for result in results:
        key = result["sample_id"]
        item = original_dataset['examples'][key]
        words = item['metadata']['words']
        category_set = set(item['metadata']['category_words'])

        # Keep distractors in prompt order (dict.fromkeys drops duplicates
        # without reordering). A set difference would give an order that
        # changes between runs, making the exported columns unstable.
        correct_distractors = [word for word in dict.fromkeys(words) if word not in category_set]
        correct_distractor_indices = [words.index(word) for word in correct_distractors]
        # Every word is in the category exactly when there are no distractors.
        correct_answer = "нет" if correct_distractors else "да"

        score = {
            "sample_id": key,
            "example_number": result["example_number"],
            "correct_answer": correct_answer,
            "model_answer": result["answer"],
            "correct_category": item['metadata']['category'],
            "model_category": result["category"],
            "correct_category_words": item['metadata']['category_words'],
            "model_category_words": result["category_words"],
            "correct_distractors": correct_distractors,
            "model_distractors": result["distractors"],
            "correct_distractor_indices": correct_distractor_indices,
            "model_distractor_indices": result["distractor_indices"],
            "template_id": result["template_id"],
            "score": 1 if (result["answer"] == correct_answer and
                           result["category"] == item['metadata']['category'] and
                           set(result["distractors"]) == set(correct_distractors)) else 0
        }

        scores.append(score)

    total_score = sum(score["score"] for score in scores)
    return scores, total_score

# Load the ground-truth dataset used for scoring. This reads the same JSON
# file that was loaded into `dataset` at the top of this cell.
with open('/content/all_words_from_category_2_distractors.json', 'r', encoding='utf-8') as file:
    original_dataset = json.load(file)

# Score every sampled result: evaluate_output returns the per-example score
# dicts together with the sum of their "score" values.
evaluation_scores, total_score = evaluate_output(results, original_dataset)

# One row per evaluated example, one column per key of the score dicts.
df = pd.DataFrame(evaluation_scores)

# display the output
print(df)

# print summary of the evaluation
print(f"\nTotal Score: {total_score}")

# Save the per-example scores to a single CSV file, without the row index.
df.to_csv('Mistral_eval_scores_all_words_from_category_2_distractors.csv', index=False)

Model response: Q: Содержит ли категория Животные слова "исследователь", "корт", "тигр", "крот", "носорог"? Ответьте либо "да" либо "нет". Пожалуйста, ответьте 'да' или 'нет'. The category "Animals" contains the words "researcher", "court", "tiger", "mole", "rhinoceros". Yes, all of those words are animals.
Model response: Q: Относятся ли слова "седой", "Литва", "голубой", "баклажан", "карий" к категории Цвета? Ответьте либо "да" либо "нет". Пожалуйста, ответьте 'да' или 'нет'. Yes, all of those words relate to the color category. The words "sedoy" (grey), "gosudarstvo Litvy" (the Republic of Lithuania), "goluboy" (blue), "batlazhan" (eggplant), and "kariy" (yellowish-brown) all describe colors or things associated with colors.
Model response: Q: Относятся ли слова "преподаватель", "Болгария", "столовая", "прокурор", "ювелир" к категории Профессия? Ответьте либо "да" либо "нет". Пожалуйста, ответьте 'да' или 'нет'. Yes, all of those words (преподаватель, Болгария, столовая, прокурор, ю

## Any words 3 distractors

In [None]:
# load the dataset

with open('/content/any_words_from_category_3_distractors.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)

# select a random sample of up to 100 examples from the dataset

random.seed(42)  # For reproducibility
example_keys = list(dataset['examples'].keys())
# Clamp the sample size: random.sample raises ValueError when asked for more
# items than the population holds.
sample_keys = random.sample(example_keys, min(100, len(example_keys)))

# evaluate the random sample

results = []
for key in sample_keys:
    item = dataset['examples'][key]
    input_text = clean_prompt(item['input'])
    words = item['metadata']['words']
    category_words = item['metadata']['category_words']

    # use the API to generate the answer
    answer, non_category_words = evaluate_words(input_text, words, category_words)

    # Prompt words that belong to the category, kept in prompt order. Building
    # this through a set would give an order that changes between runs and so
    # a CSV that is not reproducible.
    category_set = set(category_words)
    matched_words = [word for word in dict.fromkeys(words) if word in category_set]

    # prepare the result
    result = {
        "sample_id": key,  # Use the example number from the dataset
        "example_number": key,
        "input": input_text,
        "answer": answer,
        "category": item['metadata']['category'],
        "category_words": category_words if answer == "да" else matched_words,
        "distractors": non_category_words,
        "num_distractors": len(non_category_words),
        "distractor_indices": [words.index(word) for word in non_category_words],
        "template_id": item['metadata']['template_id']
    }

    results.append(result)

# function to evaluate the model's output with expected data

def evaluate_output(results, original_dataset):
    """Score each result against the ground truth stored in the dataset.

    Args:
        results: list of result dicts carrying the keys "sample_id",
            "example_number", "answer", "category", "category_words",
            "distractors", "distractor_indices" and "template_id".
        original_dataset: dict whose 'examples' entry maps a sample id to an
            item with a 'metadata' dict holding 'words', 'category_words'
            and 'category'.

    Returns:
        (scores, total_score): one score dict per result, and the sum of
        their "score" values. A result scores 1 only when its answer,
        category and set of distractors all match the ground truth.
    """
    scores = []
    for result in results:
        key = result["sample_id"]
        item = original_dataset['examples'][key]
        words = item['metadata']['words']
        category_set = set(item['metadata']['category_words'])

        # Keep distractors in prompt order (dict.fromkeys drops duplicates
        # without reordering). A set difference would give an order that
        # changes between runs, making the exported columns unstable.
        correct_distractors = [word for word in dict.fromkeys(words) if word not in category_set]
        correct_distractor_indices = [words.index(word) for word in correct_distractors]
        # Every word is in the category exactly when there are no distractors.
        correct_answer = "нет" if correct_distractors else "да"

        score = {
            "sample_id": key,
            "example_number": result["example_number"],
            "correct_answer": correct_answer,
            "model_answer": result["answer"],
            "correct_category": item['metadata']['category'],
            "model_category": result["category"],
            "correct_category_words": item['metadata']['category_words'],
            "model_category_words": result["category_words"],
            "correct_distractors": correct_distractors,
            "model_distractors": result["distractors"],
            "correct_distractor_indices": correct_distractor_indices,
            "model_distractor_indices": result["distractor_indices"],
            "template_id": result["template_id"],
            "score": 1 if (result["answer"] == correct_answer and
                           result["category"] == item['metadata']['category'] and
                           set(result["distractors"]) == set(correct_distractors)) else 0
        }

        scores.append(score)

    total_score = sum(score["score"] for score in scores)
    return scores, total_score

# Load the ground-truth dataset used for scoring. This reads the same JSON
# file that was loaded into `dataset` at the top of this cell.
with open('/content/any_words_from_category_3_distractors.json', 'r', encoding='utf-8') as file:
    original_dataset = json.load(file)

# Score every sampled result: evaluate_output returns the per-example score
# dicts together with the sum of their "score" values.
evaluation_scores, total_score = evaluate_output(results, original_dataset)

# One row per evaluated example, one column per key of the score dicts.
df = pd.DataFrame(evaluation_scores)

# display the output
print(df)

# print summary of the evaluation
print(f"\nTotal Score: {total_score}")

# Save the per-example scores to a single CSV file, without the row index.
df.to_csv('Mistral_eval_scores_any_words_from_category_3_distractors.csv', index=False)

Model response: Q: Содержит ли категория Мебель слова "комод", "зеркало", "белый", "пингвин", "плавание"? Ответьте либо "да" либо "нет". Пожалуйста, ответьте 'да' или 'нет'. The category of Furniture contains the words 'komod', 'zirka', 'white', 'pingvin', 'swimming'? No. These words are not present in the Russian word for furniture, which is 'мебель' (mebel'). In the English language Furniture also does not include the words 'komod', 'zirka', 'white', 'pingvin', and 'swimming'. These words have different meanings and are
Model response: Q: Относятся ли слова "одежда", "лосины", "арбуз", "ковёр", "бабочка" к категории Одежда? Ответьте либо "да" либо "нет". Пожалуйста, ответьте 'да' или 'нет'.

A: Номер один: "Одежда" - да.
Номер два: "Лосины" - нет.
Третий номер: "Арбуз" - нет.
Четвертый номер: "Ковёр" - нет.
Пятый номер: "Бабочка" - нет.

И только первое слово - оде
Model response: Q: Относятся ли слова "марокканка", "австралийка", "малина", "врач", "мантия" к категории Национальность

## Any words 4 distractors

In [None]:
# load the dataset

with open('/content/any_words_from_category_4_distractors.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)


# select a random sample of up to 100 examples from the dataset

random.seed(42)  # For reproducibility
example_keys = list(dataset['examples'].keys())
# Clamp the sample size: random.sample raises ValueError when asked for more
# items than the population holds.
sample_keys = random.sample(example_keys, min(100, len(example_keys)))

# evaluate the random sample

results = []
for key in sample_keys:
    item = dataset['examples'][key]
    input_text = clean_prompt(item['input'])
    words = item['metadata']['words']
    category_words = item['metadata']['category_words']

    # use the API to generate the answer
    answer, non_category_words = evaluate_words(input_text, words, category_words)

    # Prompt words that belong to the category, kept in prompt order. Building
    # this through a set would give an order that changes between runs and so
    # a CSV that is not reproducible.
    category_set = set(category_words)
    matched_words = [word for word in dict.fromkeys(words) if word in category_set]

    # prepare the result
    result = {
        "sample_id": key,  # Use the example number from the dataset
        "example_number": key,
        "input": input_text,
        "answer": answer,
        "category": item['metadata']['category'],
        "category_words": category_words if answer == "да" else matched_words,
        "distractors": non_category_words,
        "num_distractors": len(non_category_words),
        "distractor_indices": [words.index(word) for word in non_category_words],
        "template_id": item['metadata']['template_id']
    }

    results.append(result)

# function to evaluate the model's output with expected data

def evaluate_output(results, original_dataset):
    """Score each result against the ground truth stored in the dataset.

    Args:
        results: list of result dicts carrying the keys "sample_id",
            "example_number", "answer", "category", "category_words",
            "distractors", "distractor_indices" and "template_id".
        original_dataset: dict whose 'examples' entry maps a sample id to an
            item with a 'metadata' dict holding 'words', 'category_words'
            and 'category'.

    Returns:
        (scores, total_score): one score dict per result, and the sum of
        their "score" values. A result scores 1 only when its answer,
        category and set of distractors all match the ground truth.
    """
    scores = []
    for result in results:
        key = result["sample_id"]
        item = original_dataset['examples'][key]
        words = item['metadata']['words']
        category_set = set(item['metadata']['category_words'])

        # Keep distractors in prompt order (dict.fromkeys drops duplicates
        # without reordering). A set difference would give an order that
        # changes between runs, making the exported columns unstable.
        correct_distractors = [word for word in dict.fromkeys(words) if word not in category_set]
        correct_distractor_indices = [words.index(word) for word in correct_distractors]
        # Every word is in the category exactly when there are no distractors.
        correct_answer = "нет" if correct_distractors else "да"

        score = {
            "sample_id": key,
            "example_number": result["example_number"],
            "correct_answer": correct_answer,
            "model_answer": result["answer"],
            "correct_category": item['metadata']['category'],
            "model_category": result["category"],
            "correct_category_words": item['metadata']['category_words'],
            "model_category_words": result["category_words"],
            "correct_distractors": correct_distractors,
            "model_distractors": result["distractors"],
            "correct_distractor_indices": correct_distractor_indices,
            "model_distractor_indices": result["distractor_indices"],
            "template_id": result["template_id"],
            "score": 1 if (result["answer"] == correct_answer and
                           result["category"] == item['metadata']['category'] and
                           set(result["distractors"]) == set(correct_distractors)) else 0
        }

        scores.append(score)

    total_score = sum(score["score"] for score in scores)
    return scores, total_score

# Load the ground-truth dataset used for scoring. This reads the same JSON
# file that was loaded into `dataset` at the top of this cell.
with open('/content/any_words_from_category_4_distractors.json', 'r', encoding='utf-8') as file:
    original_dataset = json.load(file)

# Score every sampled result: evaluate_output returns the per-example score
# dicts together with the sum of their "score" values.
evaluation_scores, total_score = evaluate_output(results, original_dataset)

# One row per evaluated example, one column per key of the score dicts.
df = pd.DataFrame(evaluation_scores)

# display the output
print(df)

# print summary of the evaluation
print(f"\nTotal Score: {total_score}")

# Save the per-example scores to a single CSV file, without the row index.
df.to_csv('Mistral_eval_scores_any_words_from_category_4_distractors.csv', index=False)

Model response: Q: Содержит ли категория Цвета слова "Бельгия", "барсук", "белый", "кухня", "пловец"? Ответьте либо "да" либо "нет". Пожалуйста, ответьте 'да' или 'нет'. The category of Colour contains the words "Belgium", "badger", "white", "kitchen", "swimmer". Answer: 'нет'. The Category of Colour typically contains colours or shades. The words listed do not fall under this category as they describe objects, animals, kitchen utensils or a activity.
Model response: Q: Относятся ли слова "чемпион", "родственник", "мать", "баклажан", "ЮАР" к категории Овощи? Ответьте либо "да" либо "нет". Пожалуйста, ответьте 'да' или 'нет'. These words do not belong to the category 'fruits and vegetables'. Ответ: нет. Пояснения: The words "чемпион", "родственник", "мать", "баклажан", and "ЮАР" are not items of food that fall under the category 'fruits and vegetables'. The word 'баклажан' is a vegetable, but when asked to identify it as a fruit or a vegetable in this question
Model response: Q: Относят

## Any words 5 distractors

In [None]:
# load the dataset

with open('/content/any_words_from_category_5_distractors.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)


# select a random sample of up to 100 examples from the dataset

random.seed(42)  # For reproducibility
example_keys = list(dataset['examples'].keys())
# Clamp the sample size: random.sample raises ValueError when asked for more
# items than the population holds.
sample_keys = random.sample(example_keys, min(100, len(example_keys)))

# evaluate the random sample

results = []
for key in sample_keys:
    item = dataset['examples'][key]
    input_text = clean_prompt(item['input'])
    words = item['metadata']['words']
    category_words = item['metadata']['category_words']

    # use the API to generate the answer
    answer, non_category_words = evaluate_words(input_text, words, category_words)

    # Prompt words that belong to the category, kept in prompt order. Building
    # this through a set would give an order that changes between runs and so
    # a CSV that is not reproducible.
    category_set = set(category_words)
    matched_words = [word for word in dict.fromkeys(words) if word in category_set]

    # prepare the result
    result = {
        "sample_id": key,  # Use the example number from the dataset
        "example_number": key,
        "input": input_text,
        "answer": answer,
        "category": item['metadata']['category'],
        "category_words": category_words if answer == "да" else matched_words,
        "distractors": non_category_words,
        "num_distractors": len(non_category_words),
        "distractor_indices": [words.index(word) for word in non_category_words],
        "template_id": item['metadata']['template_id']
    }

    results.append(result)

# function to evaluate the model's output with expected data

def evaluate_output(results, original_dataset):
    """Score each result against the ground truth stored in the dataset.

    Args:
        results: list of result dicts carrying the keys "sample_id",
            "example_number", "answer", "category", "category_words",
            "distractors", "distractor_indices" and "template_id".
        original_dataset: dict whose 'examples' entry maps a sample id to an
            item with a 'metadata' dict holding 'words', 'category_words'
            and 'category'.

    Returns:
        (scores, total_score): one score dict per result, and the sum of
        their "score" values. A result scores 1 only when its answer,
        category and set of distractors all match the ground truth.
    """
    scores = []
    for result in results:
        key = result["sample_id"]
        item = original_dataset['examples'][key]
        words = item['metadata']['words']
        category_set = set(item['metadata']['category_words'])

        # Keep distractors in prompt order (dict.fromkeys drops duplicates
        # without reordering). A set difference would give an order that
        # changes between runs, making the exported columns unstable.
        correct_distractors = [word for word in dict.fromkeys(words) if word not in category_set]
        correct_distractor_indices = [words.index(word) for word in correct_distractors]
        # Every word is in the category exactly when there are no distractors.
        correct_answer = "нет" if correct_distractors else "да"

        score = {
            "sample_id": key,
            "example_number": result["example_number"],
            "correct_answer": correct_answer,
            "model_answer": result["answer"],
            "correct_category": item['metadata']['category'],
            "model_category": result["category"],
            "correct_category_words": item['metadata']['category_words'],
            "model_category_words": result["category_words"],
            "correct_distractors": correct_distractors,
            "model_distractors": result["distractors"],
            "correct_distractor_indices": correct_distractor_indices,
            "model_distractor_indices": result["distractor_indices"],
            "template_id": result["template_id"],
            "score": 1 if (result["answer"] == correct_answer and
                           result["category"] == item['metadata']['category'] and
                           set(result["distractors"]) == set(correct_distractors)) else 0
        }

        scores.append(score)

    total_score = sum(score["score"] for score in scores)
    return scores, total_score

# Load the ground-truth dataset used for scoring. This reads the same JSON
# file that was loaded into `dataset` at the top of this cell.
with open('/content/any_words_from_category_5_distractors.json', 'r', encoding='utf-8') as file:
    original_dataset = json.load(file)

# Score every sampled result: evaluate_output returns the per-example score
# dicts together with the sum of their "score" values.
evaluation_scores, total_score = evaluate_output(results, original_dataset)

# One row per evaluated example, one column per key of the score dicts.
df = pd.DataFrame(evaluation_scores)

# display the output
print(df)

# print summary of the evaluation
print(f"\nTotal Score: {total_score}")

# Save the per-example scores to a single CSV file, without the row index.
df.to_csv('Mistral_eval_scores_any_words_from_category_5_distractors.csv', index=False)

Model response: Q: Содержит ли категория Транспорт слова "дирижёр", "ягода", "гранат", "полка", "медуза"? Ответьте либо "да" либо "нет". Пожалуйста, ответьте 'да' или 'нет'. The category "Transport" contains the words "director", "berry", "pomegranate", "shelf", "jellyfish" ? Yes or No please.

A: No. The words "director", "berry", "pomegranate", "shelf", and "jellyfish" do not belong to the category "Transport".
Model response: Q: Относятся ли слова "картофель", "кукуруза", "баклажан", "туалет", "зелёный" к категории Семья? Ответьте либо "да" либо "нет". Пожалуйста, ответьте 'да' или 'нет'. Да, эти слова относятся к различным категориям лексического классификатора, но категория "Семья" не включает в себя слова 'картофель', 'кукуруза', 'баклажан', 'туалет', 'зелёный'. Семья - категория лексического классификатора, которая охватывает слова,
Model response: Q: Относятся ли слова "Австрия", "футболка", "жук", "ананас", "родственник" к категории Транспорт? Ответьте либо "да" либо "нет". По

## All words

In [None]:
# load the dataset

with open('/content/all_words_from_category.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)

# select a random sample of up to 100 examples from the dataset

random.seed(42)  # For reproducibility
example_keys = list(dataset['examples'].keys())
# Clamp the sample size: random.sample raises ValueError when asked for more
# items than the population holds.
sample_keys = random.sample(example_keys, min(100, len(example_keys)))

# evaluate the random sample

results = []
for key in sample_keys:
    item = dataset['examples'][key]
    input_text = clean_prompt(item['input'])
    words = item['metadata']['words']
    category_words = item['metadata']['category_words']

    # use the API to generate the answer
    answer, non_category_words = evaluate_words(input_text, words, category_words)

    # Prompt words that belong to the category, kept in prompt order. Building
    # this through a set would give an order that changes between runs and so
    # a CSV that is not reproducible.
    category_set = set(category_words)
    matched_words = [word for word in dict.fromkeys(words) if word in category_set]

    # prepare the result
    result = {
        "sample_id": key,  # Use the example number from the dataset
        "example_number": key,
        "input": input_text,
        "answer": answer,
        "category": item['metadata']['category'],
        "category_words": category_words if answer == "да" else matched_words,
        "distractors": non_category_words,
        "num_distractors": len(non_category_words),
        "distractor_indices": [words.index(word) for word in non_category_words],
        "template_id": item['metadata']['template_id']
    }

    results.append(result)

# function to evaluate the model's output with expected data

def evaluate_output(results, original_dataset):
    """Score each result against the ground truth derived from the dataset metadata.

    Args:
        results: list of result dicts. Each must provide the keys "sample_id",
            "example_number", "answer", "category", "category_words",
            "distractors", "distractor_indices" and "template_id".
        original_dataset: dict with an "examples" mapping from sample id to an
            item whose "metadata" holds "words", "category_words" and "category".

    Returns:
        A ``(scores, total_score)`` tuple: ``scores`` is a list with one dict per
        result (expected vs. reported values plus a 0/1 "score"), and
        ``total_score`` is the sum of those 0/1 scores.

    Raises:
        KeyError: if a result references a sample id missing from the dataset
            or one of the required keys is absent.
    """
    scores = []
    for result in results:
        key = result["sample_id"]
        metadata = original_dataset['examples'][key]['metadata']
        words = metadata['words']
        category_set = set(metadata['category_words'])

        # Walk the (deduplicated) words in prompt order instead of iterating a
        # set difference: set order for strings changes between interpreter
        # runs, which made the saved distractor columns non-reproducible.
        correct_distractors = [word for word in dict.fromkeys(words) if word not in category_set]
        # every word is a category word exactly when nothing is left over
        correct_answer = "нет" if correct_distractors else "да"
        # index of the first occurrence of each distractor in the prompt words
        correct_distractor_indices = [words.index(word) for word in correct_distractors]

        is_correct = (
            result["answer"] == correct_answer
            and result["category"] == metadata['category']
            # compared as sets, so the order of the reported distractors is irrelevant
            and set(result["distractors"]) == set(correct_distractors)
        )

        score = {
            "sample_id": key,
            "example_number": result["example_number"],
            "correct_answer": correct_answer,
            "model_answer": result["answer"],
            "correct_category": metadata['category'],
            "model_category": result["category"],
            "correct_category_words": metadata['category_words'],
            "model_category_words": result["category_words"],
            "correct_distractors": correct_distractors,
            "model_distractors": result["distractors"],
            "correct_distractor_indices": correct_distractor_indices,
            "model_distractor_indices": result["distractor_indices"],
            "template_id": result["template_id"],
            "score": 1 if is_correct else 0
        }

        scores.append(score)

    total_score = sum(score["score"] for score in scores)
    return scores, total_score

# re-read the dataset file; its metadata serves as the ground truth
with open('/content/all_words_from_category.json', 'r', encoding='utf-8') as file:
    original_dataset = json.loads(file.read())

# score the sampled results against that ground truth
evaluation_scores, total_score = evaluate_output(
    results=results,
    original_dataset=original_dataset,
)

# one row per evaluated example
df = pd.DataFrame(data=evaluation_scores)

# show the per-example table
print(df)

# report how many examples were scored as correct
print(f"\nTotal Score: {total_score}")

# write the per-example scores to a CSV file, omitting the index column
df.to_csv(path_or_buf='Mistral_eval_scores_all_words_from_category.csv', index=False)

Model response: Q: Содержит ли категория Транспорт слова "волейбол", "автобус", "трамвай", "электричка", "собака"? Ответьте либо "да" либо "нет". Пожалуйста, ответьте 'да' или 'нет'. The category 'Transport' contains the words 'volleyball', 'autobus', 'tramway', 'electric train', and 'dog'. Answer: No. The category 'Transport' does not contain the words 'volleyball' and 'dog'. However, it does contain the words 'autobus', 'tramway', 'electric train', which are all types of transportation.
Model response: Q: Относятся ли слова "менеджер", "мультипликатор", "диспетчер", "предприниматель", "чиновник" к категории Профессия? Ответьте либо "да" либо "нет". Пожалуйста, ответьте 'да' или 'нет'. Да. Уточните, пожалуйста, чем эти слова связаны с категорией "Профессия"? Менеджер, мультипликатор, диспетчер, предприниматель и чиновник являются значимыми терминами в области профессий. Менеджер - человек, отвечающий за координацию и руководство другими людьми
Model response: Q: Относятся ли слова "тр

## Any words

In [None]:
# load the dataset

with open('/content/any_words_from_category.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)


# select a random sample of up to 100 examples from the dataset

random.seed(42)  # For reproducibility
example_keys = list(dataset['examples'].keys())
# cap the sample size: random.sample raises ValueError when asked for more
# items than the population holds
sample_keys = random.sample(example_keys, min(100, len(example_keys)))

# evaluate the random sample

results = []
for key in sample_keys:
    item = dataset['examples'][key]
    input_text = clean_prompt(item['input'])
    words = item['metadata']['words']
    category_words = item['metadata']['category_words']

    # evaluate_words sends the prompt to the API and returns the answer
    # together with the words that are not listed in category_words
    answer, non_category_words = evaluate_words(input_text, words, category_words)

    # prompt words that are also category words, deduplicated and kept in
    # prompt order so the stored value does not depend on set iteration order
    category_set = set(category_words)
    matched_words = [word for word in dict.fromkeys(words) if word in category_set]

    # prepare the result
    result = {
        "sample_id": key,  # Use the example number from the dataset
        "example_number": key,
        "input": input_text,
        "answer": answer,
        "category": item['metadata']['category'],
        "category_words": category_words if answer == "да" else matched_words,
        "distractors": non_category_words,
        "num_distractors": len(non_category_words),
        # position of the first occurrence of each distractor in the prompt words
        "distractor_indices": [words.index(word) for word in non_category_words],
        "template_id": item['metadata']['template_id']
    }

    results.append(result)

# function to evaluate the model's output with expected data

def evaluate_output(results, original_dataset):
    """Score each result against the ground truth derived from the dataset metadata.

    Args:
        results: list of result dicts. Each must provide the keys "sample_id",
            "example_number", "answer", "category", "category_words",
            "distractors", "distractor_indices" and "template_id".
        original_dataset: dict with an "examples" mapping from sample id to an
            item whose "metadata" holds "words", "category_words" and "category".

    Returns:
        A ``(scores, total_score)`` tuple: ``scores`` is a list with one dict per
        result (expected vs. reported values plus a 0/1 "score"), and
        ``total_score`` is the sum of those 0/1 scores.

    Raises:
        KeyError: if a result references a sample id missing from the dataset
            or one of the required keys is absent.
    """
    # NOTE(review): this definition is used to score the
    # 'any_words_from_category' file, yet the expected answer below is "да"
    # only when ALL words are category words — confirm this matches the
    # intended labels of that dataset.
    scores = []
    for result in results:
        key = result["sample_id"]
        metadata = original_dataset['examples'][key]['metadata']
        words = metadata['words']
        category_set = set(metadata['category_words'])

        # Walk the (deduplicated) words in prompt order instead of iterating a
        # set difference: set order for strings changes between interpreter
        # runs, which made the saved distractor columns non-reproducible.
        correct_distractors = [word for word in dict.fromkeys(words) if word not in category_set]
        # every word is a category word exactly when nothing is left over
        correct_answer = "нет" if correct_distractors else "да"
        # index of the first occurrence of each distractor in the prompt words
        correct_distractor_indices = [words.index(word) for word in correct_distractors]

        is_correct = (
            result["answer"] == correct_answer
            and result["category"] == metadata['category']
            # compared as sets, so the order of the reported distractors is irrelevant
            and set(result["distractors"]) == set(correct_distractors)
        )

        score = {
            "sample_id": key,
            "example_number": result["example_number"],
            "correct_answer": correct_answer,
            "model_answer": result["answer"],
            "correct_category": metadata['category'],
            "model_category": result["category"],
            "correct_category_words": metadata['category_words'],
            "model_category_words": result["category_words"],
            "correct_distractors": correct_distractors,
            "model_distractors": result["distractors"],
            "correct_distractor_indices": correct_distractor_indices,
            "model_distractor_indices": result["distractor_indices"],
            "template_id": result["template_id"],
            "score": 1 if is_correct else 0
        }

        scores.append(score)

    total_score = sum(score["score"] for score in scores)
    return scores, total_score

# re-read the dataset file; its metadata serves as the ground truth
with open('/content/any_words_from_category.json', 'r', encoding='utf-8') as file:
    original_dataset = json.loads(file.read())

# score the sampled results against that ground truth
evaluation_scores, total_score = evaluate_output(
    results=results,
    original_dataset=original_dataset,
)

# one row per evaluated example
df = pd.DataFrame(data=evaluation_scores)

# show the per-example table
print(df)

# report how many examples were scored as correct
print(f"\nTotal Score: {total_score}")

# write the per-example scores to a CSV file, omitting the index column
df.to_csv(path_or_buf='Mistral_eval_scores_any_words_from_category.csv', index=False)

Model response: Q: Содержит ли категория Профессия слова "гранат", "седой", "лампа", "лук", "Израиль"? Ответьте либо "да" либо "нет". Пожалуйста, ответьте 'да' или 'нет'. The 'Profession' category does not contain the words "granate", "sedoy", "lampa", "luk", or "Israel". There is no profession associated with those words in the English language.
Model response: Q: Относятся ли слова "клубника", "зять", "мексиканка", "зелёный", "тёмный" к категории Цвета? Ответьте либо "да" либо "нет". Пожалуйста, ответьте 'да' или 'нет'. Are the words "clubberry", "stepson", "Mexican", "green", "dark" in the color category? Please answer 'yes' or 'no'. A: No, the words "clubberry", "stepson", "Mexican", "green", and "dark" do not belong to the color category. Only colors or color-related words should be placed in this category.
Model response: Q: Относятся ли слова "самолёт", "дыня", "тролейбус", "кресло", "мебель" к категории Транспорт? Ответьте либо "да" либо "нет". Пожалуйста, ответьте 'да' или 'не