In [1]:
from dao.attribute import DAOAttributePL

from models.attribute import AttributePLInDB

from typing import List

# Polish words that create_word_dict_with_counts leaves out of the word statistics.
# NOTE(review): presumably these are anonymisation placeholders found in the texts — confirm.
PLACEHOLDER_WORDS = [
    "imię",      # first name
    "nazwisko",  # surname
    "adres",     # address
    "e-mail",
]

In [2]:
# Data-access object for the stored attributes.
# NOTE(review): the string argument is presumably the collection/dataset name — confirm
# against DAOAttributePL.
dao_attribute: DAOAttributePL = DAOAttributePL('attributes-24-12-16-recalc-24-12-22.1-pgryka')


# Fetch the attributes in two groups, split on the `is_generated` flag:
# `generated` holds records with is_generated=True, `real` those with is_generated=False.
generated: List[AttributePLInDB] = dao_attribute.find_many_by_query({"is_generated": True})
real: List[AttributePLInDB] = dao_attribute.find_many_by_query({"is_generated": False})

In [3]:
def create_word_dict_with_counts(data: List[AttributePLInDB], normalize_output: bool = False,
                                 placeholder_words=None) -> dict:
    """Aggregate per-attribute relative word frequencies into one dictionary.

    For every attribute, the words in ``attribute.sample_word_counts`` are filtered
    (placeholder words and words of 3 characters or fewer are dropped) and each
    remaining count is divided by the sum of the remaining counts of that attribute.
    These relative frequencies are then summed per word across all attributes.

    Args:
        data: Attributes to analyse; each must expose ``sample_word_counts``,
            a mapping of word -> count.
        normalize_output: When True, every summed frequency is divided by
            ``len(data)``, i.e. the result is the mean relative frequency per attribute.
        placeholder_words: Optional iterable of words to exclude. Defaults to the
            module-level ``PLACEHOLDER_WORDS`` when omitted.

    Returns:
        A dict of word -> aggregated frequency, ordered by descending frequency.
        Words with equal frequency keep the order in which they were first seen.
    """
    # A set gives constant-time membership checks inside the per-word filter.
    excluded = set(PLACEHOLDER_WORDS if placeholder_words is None else placeholder_words)

    word_dict = {}
    for attribute in data:
        counts = attribute.sample_word_counts
        keys_to_analyze = [key for key in counts if key not in excluded and len(key) > 3]
        sample_word_sum = sum(counts[key] for key in keys_to_analyze)
        if not sample_word_sum:
            # Nothing to analyse, or every retained count is zero: the attribute
            # contributes no frequency mass. Skipping it avoids a ZeroDivisionError.
            continue
        for word in keys_to_analyze:
            word_dict[word] = word_dict.get(word, 0) + counts[word] / sample_word_sum

    if normalize_output:
        # Skipped attributes still count in the denominator: they contributed zero.
        # For empty `data` word_dict is empty too, so no division takes place.
        attribute_count = len(data)
        word_dict = {word: value / attribute_count for word, value in word_dict.items()}

    return dict(sorted(word_dict.items(), key=lambda item: item[1], reverse=True))

In [4]:
# Build one frequency table per group. normalize_output=True divides by the group size,
# so the two tables stay comparable even when the groups differ in size.
generated_word_dict = create_word_dict_with_counts(generated, normalize_output=True)
real_word_dict = create_word_dict_with_counts(real, normalize_output=True)

In [5]:
def compare_word_dicts(generated_words_dict: dict, real_words_dict: dict) -> dict:
    """Compute per-word differences between two word-frequency dictionaries.

    Args:
        generated_words_dict: Mapping of word -> frequency for the generated group.
        real_words_dict: Mapping of word -> frequency for the real group.

    Returns:
        A dict of word -> (generated value - real value), where a word missing from
        one input counts as 0 there. Entries are ordered by descending absolute
        difference; the sign is kept, so positive means more frequent in
        ``generated_words_dict``. Entries with equal absolute difference keep
        first-seen order (keys of ``generated_words_dict`` first, then the
        remaining keys of ``real_words_dict``).
    """
    # Collect the union of keys in first-seen order. A set would iterate in an
    # order that depends on string hash randomisation, making tie order (and any
    # file exported from the result) differ between runs.
    all_keys = dict.fromkeys([*generated_words_dict, *real_words_dict])

    # Calculate differences, assuming 0 for missing keys
    diff_dict = {key: generated_words_dict.get(key, 0) - real_words_dict.get(key, 0) for key in all_keys}

    # Sort by descending absolute difference; sorted() is stable, so ties keep key order
    return dict(sorted(diff_dict.items(), key=lambda item: abs(item[1]), reverse=True))

In [6]:
# Positive values: word is relatively more frequent in the generated group; negative: in the real group.
diff_dict = compare_word_dicts(generated_word_dict, real_word_dict)

In [7]:
# Show the differences, ordered by descending absolute difference.
diff_dict

{'który': 0.011427862513394815,
 'kluczowy': 0.008377149494496932,
 'analiza': 0.007649711489376467,
 'oraz': 0.007607655982521884,
 'kontekst': 0.006817289467591725,
 'istotny': 0.006379752091441083,
 'rysunek': -0.005264208821153907,
 'zadanie': -0.0050851522466241125,
 'dokument': 0.00466305176897688,
 'różny': 0.004318211648892186,
 'dane': 0.003861273808162512,
 'dotyczyć': 0.003553130855989228,
 'proces': 0.003433979093361774,
 'taki': 0.0032649076865535996,
 'wskazywać': 0.0029930308683931615,
 'także': 0.0027860269345843834,
 'związać': 0.002785888280567498,
 'element': 0.002750447587512185,
 'przez': -0.0027097988454725624,
 'zrozumienie': 0.0026752910431059765,
 'mieć': 0.0025353787061488282,
 'czas': -0.0025333844562765166,
 'wpływać': 0.0025102364901381426,
 'znaczenie': 0.00250955700226754,
 'również': 0.0024676176067461413,
 'pozwalać': 0.0024347608317414576,
 'działanie': 0.0023143413236289385,
 'jakość': 0.0022852622558036492,
 'wykres': -0.002269623897229821,
 'router'

In [8]:
import json
import os

# Output file name; it is a relative path, so the file lands in the current working directory.
file_name = "diff_dict.json"
# Print the working directory so it is clear where the file will be written
current_directory = os.getcwd()
print(current_directory)

# Write the dictionary to a JSON file.
# ensure_ascii=False keeps the Polish characters readable instead of \u escapes;
# the UTF-8 encoding is set explicitly to match.
with open(file_name, "w", encoding="utf-8") as json_file:
    json.dump(diff_dict, json_file, ensure_ascii=False, indent=4)

/mnt/d/Dev/Github/anti-gpt-checker/notebooks
