# Parsing and processing IDEA files

In [45]:
import ClauseWizard
import yaml

In [52]:
# Input file parsed by this notebook. Other files from the same directory
# that can be swapped in instead:
#   00_basic_ideas.txt, 00_country_ideas.txt, 00_flogi_ideas.txt
TEST_PATH = (
    '../data/raw/gypsy/common/ideas/'
    'zz_group_ideas.txt'
)

In [53]:
# Read the whole input file first; the handle is only needed for the read.
with open(TEST_PATH, mode='r') as source_file:
    raw_text = source_file.read()

# Parse the text with ClauseWizard, then format the parse result into `obj`.
tokens = ClauseWizard.cwparse(raw_text)
obj = ClauseWizard.cwformat(tokens)

In [54]:
# Replace defaultdicts with dicts

def dictify(d):
    """Recursively rebuild ``d`` using plain built-in containers.

    Any ``dict`` instance (including subclasses such as ``defaultdict``)
    becomes a plain ``dict``, any ``list`` instance a plain ``list`` and any
    ``set`` instance a plain ``set``; their contents are converted the same
    way. Every other value is returned unchanged (same object).
    """
    if isinstance(d, dict):
        plain = {}
        for key, value in d.items():
            plain[key] = dictify(value)
        return plain
    if isinstance(d, list):
        return list(map(dictify, d))
    if isinstance(d, set):
        return set(map(dictify, d))
    return d

# Rebind obj to the converted copy so the following cells operate on plain dicts.
obj = dictify(obj)

In [55]:
# Remove keys: ai_will_do, trigger, important, free

# Drop each listed key, when present, from every top-level entry of obj
# (entries are modified in place).
for entry in obj.values():
    for unwanted in ('ai_will_do', 'trigger', 'important', 'free'):
        if unwanted in entry:
            del entry[unwanted]


In [56]:
# Quality check
# Problems are reported with print(); nothing is raised.
# - every object has exactly 9 keys
# - every object has to have keys: category or start, bonus
# - category value is a str: ADM, DIP, MIL
# - eight key values are of type dict
#
# NOTE: this cell also modifies obj in place: a string category is
# upper-cased, and every non-category entry whose value is not a dict is
# deleted after being reported.

IDEA_CATEGORY_VALUES = ['ADM', 'DIP', 'MIL']
for group_name, group in obj.items():
    # The key count is taken before any entry is deleted further down.
    if len(group) != 9:
        print(f'Object {group_name} has {len(group)} keys')

    if 'category' in group:
        category = group['category']
        # Only a string can be normalized; any other type is left as-is and
        # reported below instead of crashing on .upper().
        if isinstance(category, str):
            category = category.upper()
            group['category'] = category
        if category not in IDEA_CATEGORY_VALUES:
            print(f'Object {group_name} has category {category}')
    elif 'start' not in group:
        print(f'Object {group_name} has no category or start key')

    # The rules above require a bonus key for every object.
    if 'bonus' not in group:
        print(f'Object {group_name} has no bonus key')

    # Snapshot the keys first: entries are deleted while walking them.
    idea_keys = [key for key in group if key != 'category']
    for key in idea_keys:
        if not isinstance(group[key], dict):
            print(f'Object {group_name} has no dict value for key {key}')
            del group[key]
    
    

Object cossack_ideas has no dict value for key zaz_steppe_riders
Object rajput_ideas has no dict value for key marwari_horses
Object bengali_ideas has no dict value for key jute_production
Object horde_ideas has no dict value for key life_of_steppe_warrior
Object horde_ideas has no dict value for key tradition_of_conquest
Object horde_ideas has no dict value for key logistics_of_khan
Object horde_ideas has no dict value for key glory_of_conquest
Object anatolian_beyliks_ideas has no dict value for key ghazi
Object ruthenian_ideas has no dict value for key mother_of_russian_cities
Object ruthenian_ideas has no dict value for key international_influences
Object ruthenian_ideas has no dict value for key zaporizhian_cossacs
Object ruthenian_ideas has no dict value for key east_and_west
Object ruthenian_ideas has no dict value for key legacy_of_ancient_rus
Object ruthenian_ideas has no dict value for key reuniting_rus
Object ruthenian_ideas has no dict value for key birth_of_russian_orthodo

In [58]:
# Serialize obj to the interim YAML file; default_flow_style=False selects
# block-style output for nested collections.
with open('../data/interim/dev_ideas_processing_parser.yaml', mode='w') as yaml_file:
    yaml.dump(obj, stream=yaml_file, default_flow_style=False)