In [1]:
import json
import pandas as pd
import random
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

In [2]:
# to define function to check if words belong to the category

def evaluate_words(input_text, words, category_words):
    inputs = tokenizer.encode(input_text, return_tensors="pt")
    outputs = model.generate(inputs, max_new_tokens=50)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

    # to extract answer and distractor words from the response
    # to compare each word with the category
    non_category_words = [word for word in words if word not in category_words]

    # to determine the answer based on the presence of non-category words
    if non_category_words:
        return "нет", non_category_words
    else:
        return "да", []

In [None]:
# Load the model
tokenizer = AutoTokenizer.from_pretrained("sberbank-ai/rugpt3large_based_on_gpt2")
model = AutoModelForCausalLM.from_pretrained("sberbank-ai/rugpt3large_based_on_gpt2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/574 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/622 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.14G [00:00<?, ?B/s]

In [None]:
# to clean the prompt

def clean_prompt(prompt):
    cleaned_prompt = prompt.replace('Answer either "да" or "нет".', '').replace('A:', '').strip()
    return cleaned_prompt

## All words 0 Distractors

In [None]:
# load the dataset

with open('/content/all_words_from_category_0_distractors.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)


# select a random sample of 100 examples from the dataset

random.seed(42)  # for reproducibility
sample_keys = random.sample(list(dataset['examples'].keys()), 100)

# evaluate the random sample

results = []
for key in sample_keys:
    item = dataset['examples'][key]
    input_text = clean_prompt(item['input'])
    words = item['metadata']['words']
    category_words = item['metadata']['category_words']

    # to generate the answer
    answer, non_category_words = evaluate_words(input_text, words, category_words)

    # prepare the result
    result = {
        "sample_id": key,  # to use the example number from the dataset
        "example_number": key,
        "input": input_text,
        "answer": answer,
        "category": item['metadata']['category'],
        "category_words": category_words if answer == "да" else list(set(words).intersection(set(category_words))),
        "distractors": non_category_words,
        "num_distractors": len(non_category_words),
        "distractor_indices": [words.index(word) for word in non_category_words],
        "template_id": item['metadata']['template_id']
    }

    results.append(result)

# function to evaluate the model's output with expected data

def evaluate_output(results, original_dataset):
    scores = []
    for result in results:
        key = result["sample_id"]
        item = original_dataset['examples'][key]
        correct_answer = "да" if set(item['metadata']['words']).issubset(set(item['metadata']['category_words'])) else "нет"
        correct_distractors = list(set(item['metadata']['words']) - set(item['metadata']['category_words']))
        correct_distractor_indices = [item['metadata']['words'].index(word) for word in correct_distractors]

        score = {
            "sample_id": key,
            "example_number": result["example_number"],
            "correct_answer": correct_answer,
            "model_answer": result["answer"],
            "correct_category": item['metadata']['category'],
            "model_category": result["category"],
            "correct_category_words": item['metadata']['category_words'],
            "model_category_words": result["category_words"],
            "correct_distractors": correct_distractors,
            "model_distractors": result["distractors"],
            "correct_distractor_indices": correct_distractor_indices,
            "model_distractor_indices": result["distractor_indices"],
            "template_id": result["template_id"],
            "score": 1 if (result["answer"] == correct_answer and
                           result["category"] == item['metadata']['category'] and
                           set(result["distractors"]) == set(correct_distractors)) else 0
        }

        scores.append(score)

    total_score = sum([score["score"] for score in scores])
    return scores, total_score

# load the original dataset with correct answers
with open('/content/all_words_from_category_0_distractors.json', 'r', encoding='utf-8') as file:
    original_dataset = json.load(file)

# to get scores
evaluation_scores, total_score = evaluate_output(results, original_dataset)

# to create a DataFrame
df = pd.DataFrame(evaluation_scores)

# to display output data
print(df)

# to print summary of the evaluation
print(f"\nTotal Score: {total_score}")

# save the DataFrame and evaluation scores to CSV files
df.to_csv('eval_scores_all_words_from_category_0_distractors.csv', index=False)


   sample_id example_number correct_answer model_answer correct_category  \
0       2620           2620             да           да            Спорт   
1        457            457             да           да         Животные   
2        103            103             да           да           Страны   
3       1127           1127             да           да            Овощи   
4       1004           1004             да           да         Животные   
..       ...            ...            ...          ...              ...   
95      2622           2622             да           да           Страны   
96      2819           2819             да           да            Семья   
97      2282           2282             да           да            Спорт   
98       900            900             да           да         Животные   
99      2805           2805             да           да            Спорт   

   model_category                             correct_category_words  \
0           Спо

## 1 distractor

In [None]:
# load the dataset

with open('/content/all_words_from_category_1_distractor.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)


# select a random sample of 100 examples from the dataset

random.seed(42)  # For reproducibility
sample_keys = random.sample(list(dataset['examples'].keys()), 100)

# evaluate the random sample

results = []
for key in sample_keys:
    item = dataset['examples'][key]
    input_text = clean_prompt(item['input'])
    words = item['metadata']['words']
    category_words = item['metadata']['category_words']

    # to generate the answer
    answer, non_category_words = evaluate_words(input_text, words, category_words)

    # prepare the result
    result = {
        "sample_id": key,
        "example_number": key,
        "input": input_text,
        "answer": answer,
        "category": item['metadata']['category'],
        "category_words": category_words if answer == "да" else list(set(words).intersection(set(category_words))),
        "distractors": non_category_words,
        "num_distractors": len(non_category_words),
        "distractor_indices": [words.index(word) for word in non_category_words],
        "template_id": item['metadata']['template_id']
    }

    results.append(result)


def evaluate_output(results, original_dataset):
    scores = []
    for result in results:
        key = result["sample_id"]
        item = original_dataset['examples'][key]
        correct_answer = "да" if set(item['metadata']['words']).issubset(set(item['metadata']['category_words'])) else "нет"
        correct_distractors = list(set(item['metadata']['words']) - set(item['metadata']['category_words']))
        correct_distractor_indices = [item['metadata']['words'].index(word) for word in correct_distractors]

        score = {
            "sample_id": key,
            "example_number": result["example_number"],
            "correct_answer": correct_answer,
            "model_answer": result["answer"],
            "correct_category": item['metadata']['category'],
            "model_category": result["category"],
            "correct_category_words": item['metadata']['category_words'],
            "model_category_words": result["category_words"],
            "correct_distractors": correct_distractors,
            "model_distractors": result["distractors"],
            "correct_distractor_indices": correct_distractor_indices,
            "model_distractor_indices": result["distractor_indices"],
            "template_id": result["template_id"],
            "score": 1 if (result["answer"] == correct_answer and
                           result["category"] == item['metadata']['category'] and
                           set(result["distractors"]) == set(correct_distractors)) else 0
        }

        scores.append(score)

    total_score = sum([score["score"] for score in scores])
    return scores, total_score
# load the original dataset with correct answers
with open('/content/all_words_from_category_1_distractor.json', 'r', encoding='utf-8') as file:
    original_dataset = json.load(file)

# to get scores
evaluation_scores, total_score = evaluate_output(results, original_dataset)

# to create a DataFrame
df = pd.DataFrame(evaluation_scores)

# to display output data
print(df)

# to print summary of the evaluation
print(f"\nTotal Score: {total_score}")

# to save the DataFrame and evaluation scores to CSV files
df.to_csv('eval_scores_all_words_from_category_1_distractor.csv', index=False)

   sample_id example_number correct_answer model_answer correct_category  \
0       2620           2620            нет          нет           Фрукты   
1        457            457            нет          нет            Цвета   
2        103            103            нет          нет        Профессия   
3       1127           1127            нет          нет            Спорт   
4       1004           1004            нет          нет           Фрукты   
..       ...            ...            ...          ...              ...   
95      2622           2622            нет          нет            Овощи   
96      2819           2819            нет          нет           Одежда   
97      2282           2282            нет          нет   Национальность   
98       900            900            нет          нет           Фрукты   
99      2805           2805            нет          нет            Спорт   

    model_category                         correct_category_words  \
0           Фрукты

## 2 distractors

In [None]:
# load dataset

with open('/content/all_words_from_category_2_distractors.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)


# select a random sample of 100 examples from the dataset

random.seed(42)  # For reproducibility
sample_keys = random.sample(list(dataset['examples'].keys()), 100)

# evaluate the random sample

results = []
for key in sample_keys:
    item = dataset['examples'][key]
    input_text = clean_prompt(item['input'])
    words = item['metadata']['words']
    category_words = item['metadata']['category_words']

    # to generate the answer
    answer, non_category_words = evaluate_words(input_text, words, category_words)

    # prepare the result
    result = {
        "sample_id": key,
        "example_number": key,
        "input": input_text,
        "answer": answer,
        "category": item['metadata']['category'],
        "category_words": category_words if answer == "да" else list(set(words).intersection(set(category_words))),
        "distractors": non_category_words,
        "num_distractors": len(non_category_words),
        "distractor_indices": [words.index(word) for word in non_category_words],
        "template_id": item['metadata']['template_id']
    }

    results.append(result)

# function to evaluate the model's output with expected data

def evaluate_output(results, original_dataset):
    scores = []
    for result in results:
        key = result["sample_id"]
        item = original_dataset['examples'][key]
        correct_answer = "да" if set(item['metadata']['words']).issubset(set(item['metadata']['category_words'])) else "нет"
        correct_distractors = list(set(item['metadata']['words']) - set(item['metadata']['category_words']))
        correct_distractor_indices = [item['metadata']['words'].index(word) for word in correct_distractors]

        score = {
            "sample_id": key,
            "example_number": result["example_number"],
            "correct_answer": correct_answer,
            "model_answer": result["answer"],
            "correct_category": item['metadata']['category'],
            "model_category": result["category"],
            "correct_category_words": item['metadata']['category_words'],
            "model_category_words": result["category_words"],
            "correct_distractors": correct_distractors,
            "model_distractors": result["distractors"],
            "correct_distractor_indices": correct_distractor_indices,
            "model_distractor_indices": result["distractor_indices"],
            "template_id": result["template_id"],
            "score": 1 if (result["answer"] == correct_answer and
                           result["category"] == item['metadata']['category'] and
                           set(result["distractors"]) == set(correct_distractors)) else 0
        }

        scores.append(score)

    total_score = sum([score["score"] for score in scores])
    return scores, total_score

# load the original dataset with correct answers
with open('/content/all_words_from_category_2_distractors.json', 'r', encoding='utf-8') as file:
    original_dataset = json.load(file)

# to get scores
evaluation_scores, total_score = evaluate_output(results, original_dataset)

# to create a DataFrame
df = pd.DataFrame(evaluation_scores)

# to display output data
print(df)

# to print summary of the evaluation
print(f"\nTotal Score: {total_score}")

# to save the DataFrame and evaluation scores to CSV files
df.to_csv('eval_scores_all_words_from_category_2_distractors.csv', index=False)

   sample_id example_number correct_answer model_answer correct_category  \
0       2620           2620            нет          нет         Животные   
1        457            457            нет          нет            Цвета   
2        103            103            нет          нет        Профессия   
3       1127           1127            нет          нет              Дом   
4       1004           1004            нет          нет        Транспорт   
..       ...            ...            ...          ...              ...   
95      2622           2622            нет          нет            Семья   
96      2819           2819            нет          нет        Транспорт   
97      2282           2282            нет          нет           Мебель   
98       900            900            нет          нет            Овощи   
99      2805           2805            нет          нет           Одежда   

   model_category             correct_category_words  \
0        Животные              

## 3 distractors

In [None]:
# load dataset

with open('/content/any_words_from_category_3_distractors.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)


# select a random sample of 100 examples from the dataset

random.seed(42)  # for reproducibility
sample_keys = random.sample(list(dataset['examples'].keys()), 100)

# evaluate the random sample

results = []
for key in sample_keys:
    item = dataset['examples'][key]
    input_text = clean_prompt(item['input'])
    words = item['metadata']['words']
    category_words = item['metadata']['category_words']

    # to generate the answer
    answer, non_category_words = evaluate_words(input_text, words, category_words)

    # prepare the result
    result = {
        "sample_id": key,  # to use the example number from the dataset
        "example_number": key,
        "input": input_text,
        "answer": answer,
        "category": item['metadata']['category'],
        "category_words": category_words if answer == "да" else list(set(words).intersection(set(category_words))),
        "distractors": non_category_words,
        "num_distractors": len(non_category_words),
        "distractor_indices": [words.index(word) for word in non_category_words],
        "template_id": item['metadata']['template_id']
    }

    results.append(result)

# function to evaluate the model's output with expected data

def evaluate_output(results, original_dataset):
    scores = []
    for result in results:
        key = result["sample_id"]
        item = original_dataset['examples'][key]
        correct_answer = "да" if set(item['metadata']['words']).issubset(set(item['metadata']['category_words'])) else "нет"
        correct_distractors = list(set(item['metadata']['words']) - set(item['metadata']['category_words']))
        correct_distractor_indices = [item['metadata']['words'].index(word) for word in correct_distractors]

        score = {
            "sample_id": key,
            "example_number": result["example_number"],
            "correct_answer": correct_answer,
            "model_answer": result["answer"],
            "correct_category": item['metadata']['category'],
            "model_category": result["category"],
            "correct_category_words": item['metadata']['category_words'],
            "model_category_words": result["category_words"],
            "correct_distractors": correct_distractors,
            "model_distractors": result["distractors"],
            "correct_distractor_indices": correct_distractor_indices,
            "model_distractor_indices": result["distractor_indices"],
            "template_id": result["template_id"],
            "score": 1 if (result["answer"] == correct_answer and
                           result["category"] == item['metadata']['category'] and
                           set(result["distractors"]) == set(correct_distractors)) else 0
        }

        scores.append(score)

    total_score = sum([score["score"] for score in scores])
    return scores, total_score

# load the original dataset with correct answers
with open('/content/any_words_from_category_3_distractors.json', 'r', encoding='utf-8') as file:
    original_dataset = json.load(file)

# to get scores
evaluation_scores, total_score = evaluate_output(results, original_dataset)

# to create a DataFrame
df = pd.DataFrame(evaluation_scores)

# to display output data
print(df)

# to print summary of the evaluation
print(f"\nTotal Score: {total_score}")

# to save the DataFrame and evaluation scores to CSV files
df.to_csv('eval_scores_any_words_from_category_3_distractors.csv', index=False)

   sample_id example_number correct_answer model_answer correct_category  \
0       2620           2620            нет          нет           Мебель   
1        457            457            нет          нет           Одежда   
2        103            103            нет          нет   Национальность   
3       1127           1127            нет          нет           Мебель   
4       1004           1004            нет          нет           Мебель   
..       ...            ...            ...          ...              ...   
95      2622           2622            нет          нет           Одежда   
96      2819           2819            нет          нет           Мебель   
97      2282           2282            нет          нет   Национальность   
98       900            900            нет          нет              Дом   
99      2805           2805            нет          нет           Одежда   

    model_category     correct_category_words       model_category_words  \
0          

## 4 distractors

In [None]:
# load the dataset

with open('/content/any_words_from_category_4_distractors.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)


# select a random sample of 100 examples from the dataset

random.seed(42)  # For reproducibility
sample_keys = random.sample(list(dataset['examples'].keys()), 100)

# evaluate the random sample

results = []
for key in sample_keys:
    item = dataset['examples'][key]
    input_text = clean_prompt(item['input'])
    words = item['metadata']['words']
    category_words = item['metadata']['category_words']

    #  to generate the answer
    answer, non_category_words = evaluate_words(input_text, words, category_words)

    # prepare the result
    result = {
        "sample_id": key,  # to use the example number from the dataset
        "example_number": key,
        "input": input_text,
        "answer": answer,
        "category": item['metadata']['category'],
        "category_words": category_words if answer == "да" else list(set(words).intersection(set(category_words))),
        "distractors": non_category_words,
        "num_distractors": len(non_category_words),
        "distractor_indices": [words.index(word) for word in non_category_words],
        "template_id": item['metadata']['template_id']
    }

    results.append(result)

# function to evaluate the model's output with expected data

def evaluate_output(results, original_dataset):
    scores = []
    for result in results:
        key = result["sample_id"]
        item = original_dataset['examples'][key]
        correct_answer = "да" if set(item['metadata']['words']).issubset(set(item['metadata']['category_words'])) else "нет"
        correct_distractors = list(set(item['metadata']['words']) - set(item['metadata']['category_words']))
        correct_distractor_indices = [item['metadata']['words'].index(word) for word in correct_distractors]

        score = {
            "sample_id": key,
            "example_number": result["example_number"],
            "correct_answer": correct_answer,
            "model_answer": result["answer"],
            "correct_category": item['metadata']['category'],
            "model_category": result["category"],
            "correct_category_words": item['metadata']['category_words'],
            "model_category_words": result["category_words"],
            "correct_distractors": correct_distractors,
            "model_distractors": result["distractors"],
            "correct_distractor_indices": correct_distractor_indices,
            "model_distractor_indices": result["distractor_indices"],
            "template_id": result["template_id"],
            "score": 1 if (result["answer"] == correct_answer and
                           result["category"] == item['metadata']['category'] and
                           set(result["distractors"]) == set(correct_distractors)) else 0
        }

        scores.append(score)

    total_score = sum([score["score"] for score in scores])
    return scores, total_score

# load the original dataset with correct answers
with open('/content/any_words_from_category_4_distractors.json', 'r', encoding='utf-8') as file:
    original_dataset = json.load(file)

# to get scores
evaluation_scores, total_score = evaluate_output(results, original_dataset)

# to create a DataFrame
df = pd.DataFrame(evaluation_scores)

# tp display the output data
print(df)

# to print summary of the evaluation
print(f"\nTotal Score: {total_score}")

# to save the DataFrame and evaluation scores to CSV files
df.to_csv('eval_scores_any_words_from_category_4_distractors.csv', index=False)

   sample_id example_number correct_answer model_answer correct_category  \
0       2620           2620            нет          нет            Цвета   
1        457            457            нет          нет            Овощи   
2        103            103            нет          нет           Мебель   
3       1127           1127            нет          нет            Цвета   
4       1004           1004            нет          нет            Спорт   
..       ...            ...            ...          ...              ...   
95      2622           2622            нет          нет            Семья   
96      2819           2819            нет          нет           Мебель   
97      2282           2282            нет          нет           Фрукты   
98       900            900            нет          нет           Фрукты   
99      2805           2805            нет          нет            Овощи   

   model_category correct_category_words model_category_words  \
0           Цвета     

## 5 distractors

In [None]:
# load the  dataset

with open('/content/any_words_from_category_5_distractors.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)


# select a random sample of 100 examples from the dataset

random.seed(42)
sample_keys = random.sample(list(dataset['examples'].keys()), 100)

# evaluate the random sample

results = []
for key in sample_keys:
    item = dataset['examples'][key]
    input_text = clean_prompt(item['input'])
    words = item['metadata']['words']
    category_words = item['metadata']['category_words']

    # to generate the answer
    answer, non_category_words = evaluate_words(input_text, words, category_words)

    # prepare the result
    result = {
        "sample_id": key,
        "example_number": key,
        "input": input_text,
        "answer": answer,
        "category": item['metadata']['category'],
        "category_words": category_words if answer == "да" else list(set(words).intersection(set(category_words))),
        "distractors": non_category_words,
        "num_distractors": len(non_category_words),
        "distractor_indices": [words.index(word) for word in non_category_words],
        "template_id": item['metadata']['template_id']
    }

    results.append(result)

# function to evaluate the model's output with expected data

def evaluate_output(results, original_dataset):
    scores = []
    for result in results:
        key = result["sample_id"]
        item = original_dataset['examples'][key]
        correct_answer = "да" if set(item['metadata']['words']).issubset(set(item['metadata']['category_words'])) else "нет"
        correct_distractors = list(set(item['metadata']['words']) - set(item['metadata']['category_words']))
        correct_distractor_indices = [item['metadata']['words'].index(word) for word in correct_distractors]

        score = {
            "sample_id": key,
            "example_number": result["example_number"],
            "correct_answer": correct_answer,
            "model_answer": result["answer"],
            "correct_category": item['metadata']['category'],
            "model_category": result["category"],
            "correct_category_words": item['metadata']['category_words'],
            "model_category_words": result["category_words"],
            "correct_distractors": correct_distractors,
            "model_distractors": result["distractors"],
            "correct_distractor_indices": correct_distractor_indices,
            "model_distractor_indices": result["distractor_indices"],
            "template_id": result["template_id"],
            "score": 1 if (result["answer"] == correct_answer and
                           result["category"] == item['metadata']['category'] and
                           set(result["distractors"]) == set(correct_distractors)) else 0
        }

        scores.append(score)

    total_score = sum([score["score"] for score in scores])
    return scores, total_score

# load the original dataset with correct answers
with open('/content/any_words_from_category_5_distractors.json', 'r', encoding='utf-8') as file:
    original_dataset = json.load(file)

# to get  scores
evaluation_scores, total_score = evaluate_output(results, original_dataset)

# to create a DataFrame
df = pd.DataFrame(evaluation_scores)

# to display the output
print(df)

# P=to print summary of the evaluation
print(f"\nTotal Score: {total_score}")

# to save the DataFrame and evaluation scores to CSV files
df.to_csv('eval_scores_any_words_from_category_5_distractors.csv', index=False)

   sample_id example_number correct_answer model_answer correct_category  \
0       2620           2620            нет          нет        Транспорт   
1        457            457            нет          нет            Семья   
2        103            103            нет          нет        Транспорт   
3       1127           1127            нет          нет         Животные   
4       1004           1004            нет          нет              Дом   
..       ...            ...            ...          ...              ...   
95      2622           2622            нет          нет            Семья   
96      2819           2819            нет          нет            Цвета   
97      2282           2282            нет          нет           Мебель   
98       900            900            нет          нет           Фрукты   
99      2805           2805            нет          нет         Животные   

   model_category correct_category_words model_category_words  \
0       Транспорт     

## All words

In [None]:
# load the  dataset

with open('/content/all_words_from_category.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)


# select a random sample of 100 examples from the dataset

random.seed(42)
sample_keys = random.sample(list(dataset['examples'].keys()), 100)

# evaluate the random sample

results = []
for key in sample_keys:
    item = dataset['examples'][key]
    input_text = clean_prompt(item['input'])
    words = item['metadata']['words']
    category_words = item['metadata']['category_words']

    # to generate the answer
    answer, non_category_words = evaluate_words(input_text, words, category_words)

    # prepare the result
    result = {
        "sample_id": key,
        "example_number": key,
        "input": input_text,
        "answer": answer,
        "category": item['metadata']['category'],
        "category_words": category_words if answer == "да" else list(set(words).intersection(set(category_words))),
        "distractors": non_category_words,
        "num_distractors": len(non_category_words),
        "distractor_indices": [words.index(word) for word in non_category_words],
        "template_id": item['metadata']['template_id']
    }

    results.append(result)

# function to evaluate the model's output with expected data

def evaluate_output(results, original_dataset):
    scores = []
    for result in results:
        key = result["sample_id"]
        item = original_dataset['examples'][key]
        correct_answer = "да" if set(item['metadata']['words']).issubset(set(item['metadata']['category_words'])) else "нет"
        correct_distractors = list(set(item['metadata']['words']) - set(item['metadata']['category_words']))
        correct_distractor_indices = [item['metadata']['words'].index(word) for word in correct_distractors]

        score = {
            "sample_id": key,
            "example_number": result["example_number"],
            "correct_answer": correct_answer,
            "model_answer": result["answer"],
            "correct_category": item['metadata']['category'],
            "model_category": result["category"],
            "correct_category_words": item['metadata']['category_words'],
            "model_category_words": result["category_words"],
            "correct_distractors": correct_distractors,
            "model_distractors": result["distractors"],
            "correct_distractor_indices": correct_distractor_indices,
            "model_distractor_indices": result["distractor_indices"],
            "template_id": result["template_id"],
            "score": 1 if (result["answer"] == correct_answer and
                           result["category"] == item['metadata']['category'] and
                           set(result["distractors"]) == set(correct_distractors)) else 0
        }

        scores.append(score)

    total_score = sum([score["score"] for score in scores])
    return scores, total_score

# load the original dataset with correct answers
with open('/content/all_words_from_category.json', 'r', encoding='utf-8') as file:
    original_dataset = json.load(file)

# to get scores
evaluation_scores, total_score = evaluate_output(results, original_dataset)

# to create a DataFrame
df = pd.DataFrame(evaluation_scores)

# to display the output data
print(df)

# to print summary of the evaluation
print(f"\nTotal Score: {total_score}")

# to save the DataFrame and evaluation scores to CSV files
df.to_csv('eval_scores_all_words_from_category.csv', index=False)

   sample_id example_number correct_answer model_answer correct_category  \
0       2620           2620            нет          нет        Транспорт   
1        457            457             да           да        Профессия   
2        103            103            нет          нет        Профессия   
3       1127           1127            нет          нет         Животные   
4       1004           1004            нет          нет        Профессия   
..       ...            ...            ...          ...              ...   
95      2622           2622            нет          нет         Животные   
96      2819           2819            нет          нет           Одежда   
97      2282           2282            нет          нет            Овощи   
98       900            900            нет          нет            Семья   
99      2805           2805            нет          нет            Спорт   

   model_category                             correct_category_words  \
0       Транспо

## Any words

In [None]:
# load the dataset

with open('/content/any_words_from_category.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)


# select a random sample of 100 examples from the dataset

random.seed(42)
sample_keys = random.sample(list(dataset['examples'].keys()), 100)

# evaluate the random sample

results = []
for key in sample_keys:
    item = dataset['examples'][key]
    input_text = clean_prompt(item['input'])
    words = item['metadata']['words']
    category_words = item['metadata']['category_words']

    # to generate the answer
    answer, non_category_words = evaluate_words(input_text, words, category_words)

    # prepare the result
    result = {
        "sample_id": key,
        "example_number": key,
        "input": input_text,
        "answer": answer,
        "category": item['metadata']['category'],
        "category_words": category_words if answer == "да" else list(set(words).intersection(set(category_words))),
        "distractors": non_category_words,
        "num_distractors": len(non_category_words),
        "distractor_indices": [words.index(word) for word in non_category_words],
        "template_id": item['metadata']['template_id']
    }

    results.append(result)

# function to evaluate the model's output with expected data

def evaluate_output(results, original_dataset):
    scores = []
    for result in results:
        key = result["sample_id"]
        item = original_dataset['examples'][key]
        correct_answer = "да" if set(item['metadata']['words']).issubset(set(item['metadata']['category_words'])) else "нет"
        correct_distractors = list(set(item['metadata']['words']) - set(item['metadata']['category_words']))
        correct_distractor_indices = [item['metadata']['words'].index(word) for word in correct_distractors]

        score = {
            "sample_id": key,
            "example_number": result["example_number"],
            "correct_answer": correct_answer,
            "model_answer": result["answer"],
            "correct_category": item['metadata']['category'],
            "model_category": result["category"],
            "correct_category_words": item['metadata']['category_words'],
            "model_category_words": result["category_words"],
            "correct_distractors": correct_distractors,
            "model_distractors": result["distractors"],
            "correct_distractor_indices": correct_distractor_indices,
            "model_distractor_indices": result["distractor_indices"],
            "template_id": result["template_id"],
            "score": 1 if (result["answer"] == correct_answer and
                           result["category"] == item['metadata']['category'] and
                           set(result["distractors"]) == set(correct_distractors)) else 0
        }

        scores.append(score)

    total_score = sum([score["score"] for score in scores])
    return scores, total_score

# load the original dataset with correct answers
with open('/content/any_words_from_category.json', 'r', encoding='utf-8') as file:
    original_dataset = json.load(file)

# to get scores
evaluation_scores, total_score = evaluate_output(results, original_dataset)

# to create a DataFrame
df = pd.DataFrame(evaluation_scores)

# to dissplay output data
print(df)

# to print summary of the evaluation
print(f"\nTotal Score: {total_score}")

# to save the DataFrame and evaluation scores to CSV files
df.to_csv('eval_scores_any_words_from_category.csv', index=False)

   sample_id example_number correct_answer model_answer correct_category  \
0       2620           2620            нет          нет        Профессия   
1        457            457            нет          нет            Цвета   
2        103            103            нет          нет        Транспорт   
3       1127           1127            нет          нет         Животные   
4       1004           1004            нет          нет            Овощи   
..       ...            ...            ...          ...              ...   
95      2622           2622            нет          нет            Спорт   
96      2819           2819            нет          нет           Мебель   
97      2282           2282            нет          нет        Профессия   
98       900            900            нет          нет           Мебель   
99      2805           2805            нет          нет           Страны   

   model_category correct_category_words  model_category_words  \
0       Профессия    