In [1]:
from bs4 import BeautifulSoup
import requests
import json
import pandas as pd
from operator import itemgetter as iget
from itertools import accumulate

In [95]:
def load_json(filename):
    """Read the UTF-8 encoded JSON file at ``filename`` and return the decoded object."""
    with open(filename, encoding='utf-8') as handle:
        return json.load(handle)

# Load the two lookup tables from the kanjivg-radical checkout, in this order.
# Presumably these map kanji -> components and components -> kanji; verify
# against the JSON files themselves.
kanji2element, element2kanji = map(
    load_json,
    (
        "kanjivg-radical-master/data/kanji2element.json",
        "kanjivg-radical-master/data/element2kanji.json",
    ),
)

# Scrape the radical tiles from the listing page, build a
# radical -> meaning mapping and persist it to dump.json.
url = 'https://vacuum.name/japanese/wanikani/radicals'

# A timeout keeps the cell from hanging forever on an unresponsive server.
page = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
page.raise_for_status()
# Name the parser explicitly: otherwise bs4 picks whichever parser happens to
# be installed (and warns), so results could differ between machines.
soup = BeautifulSoup(page.text, 'html.parser')

# Each tile's text is split on whitespace; the first token is used as the
# radical and the second as its meaning.
radicals = [res.text.split() for res in soup.find_all('div', class_='group rounded bg-radical')]
# Tiles with fewer than two tokens are skipped: indexing elem[0]/elem[1]
# on them would raise IndexError.
radicals_to_meaning = {elem[0]: elem[1] for elem in radicals if len(elem) >= 2}
# NOTE(review): this iterates the dict keys, so every entry is the first
# character of a key — confirm that is the intent.
radicals = [elem[0] for elem in radicals_to_meaning]
# The context manager flushes and closes dump.json deterministically.
with open('dump.json', 'w') as dump_file:
    json.dump(radicals_to_meaning, dump_file)

# Load the frequency records and index them: field 0 of each record becomes
# the key and field 2 the value (presumably kanji -> relative frequency;
# verify against data/twitter.json).
# The context manager closes the handle deterministically instead of leaving
# it to the garbage collector.
with open('data/twitter.json', 'r', encoding='utf-8') as freq_file:
    kanji_freq = json.load(freq_file)
kanji_freq = {el[0]: el[2] for el in kanji_freq}

# Fetch the Russian Wikipedia page whose sortable table is parsed below.
url_wiki = 'https://ru.wikipedia.org/wiki/Список_дзёё_кандзи'

# A timeout keeps the cell from hanging forever on an unresponsive server.
page = requests.get(url_wiki, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
page.raise_for_status()
# Name the parser explicitly so the result does not depend on which optional
# parsers are installed.
soup = BeautifulSoup(page.text, 'html.parser')

def get_table_lines(soup):
    """Yield one list of stripped ``<td>`` texts per ``<tr>`` of the target table.

    The table is the first ``<table>`` whose class attribute is exactly
    ``'sortable wikitable'``. Rows without any ``<td>`` cell are yielded as
    empty lists. Nothing is looked up until the generator is first advanced.
    """
    table = soup.find('table', class_='sortable wikitable')
    rows = table.find_all('tr')
    for row in rows:
        cells = row.find_all('td')
        yield [cell.text.strip() for cell in cells]

# Append each kanji's frequency (0 when it is missing from kanji_freq) to its
# table row; rows that came back without any cell are dropped.
lines = [
    row + [kanji_freq.get(row[1], 0)]
    for row in get_table_lines(soup)
    if row
]
# Most frequent first; position 9 is the frequency appended above.
lines.sort(key=iget(9), reverse=True)

kanji = pd.DataFrame(
    lines,
    columns=['id', 'kanji', 'old', 'radical', 'strokes', 'grade',
             'year added', 'meaning', 'readings', 'freq'],
).drop(columns='id')
# Recompute the frequency column straight from the lookup table.
kanji['freq'] = kanji['kanji'].apply(lambda k: kanji_freq.get(k, 0))

# Sum the kanji frequencies per radical and order radicals by that total,
# largest first.
radical_freq = (
    kanji
    .groupby('radical')['freq']
    .agg(freq='sum')
    .sort_values('freq', ascending=False)
)
# Running total down the sorted list, stored next to the per-radical figure.
freq = radical_freq['freq'].tolist()
sum_freq = list(accumulate(freq))
radical_freq['sum'] = sum_freq
# Plain lookup table: radical -> summed frequency.
radical_to_freq = {
    radical: record['freq']
    for radical, record in radical_freq.iterrows()
}

# Read the tab-separated radical table. The first field of every line is
# discarded (presumably a row number — verify against data/variants.txt).
with open('data/variants.txt', 'r', encoding='utf-8') as file:
    lines = [
        # Strip only the newline: slicing off the last character would eat
        # real data when the final line has no trailing newline.
        line.rstrip('\n').split('\t')[1:]
        for line in file
        # Blank lines carry no record and would otherwise become empty rows.
        if line.strip()
    ]

radicals = pd.DataFrame(lines,columns=['radical (variants)','strokes','meaning and reading','freq','jfreq','examples','group'])
# The base radical is the first character of the combined column.
radicals['radical'] = radicals['radical (variants)'].apply(lambda s : s[0])

def get_variants(s: str):
    """Return the variant characters embedded in a ``radical (variants)`` cell.

    A one-character string has no variants and gives ``''``. Otherwise the
    first three characters and the last character are cut off and every comma
    is removed from what remains. Presumably the input looks like
    ``'X (a,b)'`` — verify against the data file.
    """
    if len(s) != 1:
        return s[3:-1].replace(',', '')
    return ''

# Split the variants out of the combined column, then drop that column and
# put the remaining ones in a fixed order.
radicals['variants'] = radicals['radical (variants)'].apply(get_variants)
radicals = radicals.drop(columns='radical (variants)')[
    ['radical', 'variants', 'strokes', 'meaning and reading',
     'freq', 'jfreq', 'examples', 'group']
]
# Keep the figure read from the file as an integer under 'kfreq'
# (commas removed, empty string -> 0) ...
radicals['kfreq'] = radicals['freq'].apply(
    lambda s: 0 if s == '' else int(s.replace(',', ''))
)
# ... then replace 'freq' with the value from radical_to_freq
# (0 for radicals that are not in that table).
radicals['freq'] = radicals['radical'].apply(lambda s: radical_to_freq.get(s, 0))

In [105]:
# Rows 51-100 (by position) of the radicals table ordered by descending 'freq'.
radicals.sort_values(by='freq', ascending=False).iloc[50:100]

Unnamed: 0,radical,variants,strokes,meaning and reading,freq,jfreq,examples,group,kfreq
158,車,,7,"cart, car (くるま, kuruma)",0.005498,,軌 軟 較 軍 載,,361
99,生,,5,"life (うまれる, umareru)",0.00545,,牲 笙 甥,,22
9,儿,,2,"legs, human underneath (にんにょう, ninnyō, 人繞)",0.005377,,兄元,,52
114,禾,,5,"two-branch tree (のぎ, nogi, ノ木)",0.005,,利 私 季 和 科 香 秦 穀,,431
103,疒,,5,"sickness (やまいだれ, yamaidare, 病垂)",0.004969,,病 症 痛 癌 癖,Top 75%,526
23,十,,2,"ten, complete (じゅう, jū, 十)",0.004949,35.0,十千午半博,Top 75%,55
43,尸,,3,"corpse (しかばね, shikabane, 屍)",0.00473,,尺局,Top 75%,148
166,金,釒,8,"metal, gold (かね, kane)",0.004713,,銀 銅 釘 銳 鋞 鋙 鉒 鉍 鉗 鈡 鈠,Top 50%,806
69,方,,4,"way, square, raft (ほう, hō)",0.004663,,方 放 旅 族,,92
188,高,髙,10,"tall, high (たかい, takai)",0.004183,,髚 髛,,34


In [104]:
# Rows 51-100 (by position) of the per-radical frequency table.
radical_freq.iloc[50:100]

Unnamed: 0_level_0,freq,sum
radical,Unnamed: 1_level_1,Unnamed: 2_level_1
丨,0.005725,0.750173
肉,0.005662,0.755836
亅,0.005641,0.761477
干,0.005591,0.767068
車,0.005498,0.772566
生,0.00545,0.778016
儿,0.005377,0.783393
禾,0.005,0.788393
疒,0.004969,0.793362
十,0.004949,0.798311
