In [1]:
# imports
import pandas as pd
import numpy as np
from scraping.canto_dict.canto_dict_client import get_fact_from_character
from time import sleep
import json



In [2]:
# Read the local radical table. Lines starting with '#' in the CSV are treated
# as comments, the 'index' column becomes the frame index, 'variants' is read
# as text and 'stroke_count' as an integer.
radicals_raw = pd.read_csv(
    'data/radicals.csv',
    comment='#',
    index_col='index',
    dtype={'variants': str, 'stroke_count': int},
)

# Normalise missing cells: first unify them as NaN, then swap NaN for None.
radicals_df = radicals_raw.fillna(np.nan).replace([np.nan], [None])

In [5]:
# Build one "fact" record per radical: scrape it from cantodict, then overlay
# the fields held in the local CSV (radicals_df). Records are keyed by the
# radical character itself.
radical_fact_dict = {}
character_set_names = ['traditional','simplified']

# NOTE(review): lookups are issued back-to-back with no delay between them;
# confirm whether get_fact_from_character throttles internally.
for index, row in radicals_df.iterrows():
    radical = row['radical']

    # scrape a radical from cantodict
    fact = get_fact_from_character(radical)

    if fact is None:
        # Nothing came back from the scraper: start from an empty skeleton so
        # the CSV fields below still produce a usable record.
        fact = {
            'definitions': [],
            'character': radical,
            'is_radical': True,
            'radicals': [radical],
            'forms': {'traditional': radical, 'simplified': radical},
            'romanizations': {},
            'stroke_counts': {}
        }

    # Rename 'character' -> 'lemma'; fall back to the radical if the scraped
    # record has no 'character' key instead of raising KeyError.
    fact['lemma'] = fact.pop('character', radical)

    # add some information from the local CSV (cantodict's data is a bit lacking)
    english = row['english']
    definitions = fact.setdefault('definitions', [])
    if pd.notna(english):
        # The CSV gloss goes first; a missing gloss is skipped rather than
        # stored as a null definition.
        definitions.insert(0, english)

    fact['pos'] = ['Noun'] # hardcoding for now

    # 'variants' is a string in the CSV; each character of it becomes one
    # list entry. Missing (None/NaN) means no variants.
    variants_field = row['variants']
    fact['variants'] = list(variants_field) if pd.notna(variants_field) else []

    # Use the CSV stroke count rather than a constant 1 for every radical.
    # int() turns a NumPy integer into a plain int that json can serialise.
    fact.setdefault('stroke_counts', {})['simplified'] = int(row['stroke_count'])

    # put into dictionary
    radical_fact_dict[radical] = fact

In [6]:
# Persist the collected radical facts as a single JSON document.
with open('data/radicals.json', 'w') as json_out:
    serialized_facts = json.dumps(radical_fact_dict)
    json_out.write(serialized_facts)