In [143]:
from bs4 import BeautifulSoup
import requests
import json
import pandas as pd
from operator import itemgetter as iget
from itertools import accumulate

In [26]:
def load_json(filename):
    """Parse the UTF-8 encoded JSON file at ``filename`` and return the decoded object."""
    with open(filename, encoding='utf-8') as handle:
        return json.load(handle)

# Load both KanjiVG lookup tables in one pass. Judging by the file names they
# presumably map kanji -> elements and elements -> kanji (verify against the data).
kanji2element, element2kanji = (
    load_json(path)
    for path in (
        "kanjivg-radical-master/data/kanji2element.json",
        "kanjivg-radical-master/data/element2kanji.json",
    )
)

In [59]:
url = 'https://vacuum.name/japanese/wanikani/radicals'

# Fetch the radical list. Fail loudly on an HTTP error instead of silently
# parsing an error page, and never hang forever on a stalled connection.
page = requests.get(url, timeout=30)
page.raise_for_status()
# Name the parser explicitly so the parse result does not depend on which
# optional parser libraries happen to be installed.
soup = BeautifulSoup(page.text, 'html.parser')

# Every matching <div> is split on whitespace: the first token is used as the
# radical and the second token as its meaning.
radical_tokens = [
    res.text.split()
    for res in soup.find_all('div', class_='group rounded bg-radical')
]
# Keep only entries that have both tokens; this also skips empty <div>s,
# which would otherwise raise IndexError on tokens[0].
radicals_to_meaning = {
    tokens[0]: tokens[1] for tokens in radical_tokens if len(tokens) >= 2
}
# The radicals are the dictionary keys themselves (in page order), not the
# first character of each key.
radicals = list(radicals_to_meaning)

# Persist the mapping; the context manager guarantees the file is flushed and closed.
with open('dump.json', 'w', encoding='utf-8') as dump_file:
    json.dump(radicals_to_meaning, dump_file)

In [90]:
# Build the lookup used later to attach a frequency to each kanji.
# Reading through load_json ensures the file handle is closed as soon as
# parsing finishes. Records are indexed positionally: field 0 becomes the key
# (looked up by kanji further down) and field 2 becomes the frequency value.
kanji_freq = {el[0]: el[2] for el in load_json('data/twitter.json')}

In [124]:
url_wiki = 'https://ru.wikipedia.org/wiki/Список_дзёё_кандзи'

# Download the Russian Wikipedia "list of jōyō kanji" page. Abort on an HTTP
# error rather than parsing an error page, and bound the wait with a timeout.
page = requests.get(url_wiki, timeout=30)
page.raise_for_status()
# Name the parser explicitly so the parse result does not depend on which
# optional parser libraries happen to be installed.
soup = BeautifulSoup(page.text, 'html.parser')

def get_table_lines(soup):
    """Yield one list of stripped cell texts per ``<tr>`` of the kanji table.

    The table is the first ``<table>`` whose class attribute is exactly
    ``'sortable wikitable'``. Rows without any ``<td>`` cells yield an
    empty list.
    """
    table = soup.find('table', class_='sortable wikitable')
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        yield [cell.text.strip() for cell in cells]

# Append each kanji's frequency (0 when it is missing from kanji_freq) to its
# table row, skipping rows that produced no cells.
lines = [
    row + [kanji_freq.get(row[1], 0)]
    for row in get_table_lines(soup)
    if row
]
# Index 9 is the frequency appended above; most frequent kanji come first.
lines.sort(key=iget(9), reverse=True)

In [137]:
# Turn the scraped rows into a DataFrame and drop the table's 'id' column right away.
kanji = pd.DataFrame(
    lines,
    columns=[
        'id', 'kanji', 'old', 'radical', 'strokes', 'grade',
        'year added', 'meaning', 'readings', 'freq',
    ],
).drop(columns='id')
# Fill 'freq' from the lookup table; kanji absent from it get 0.
kanji['freq'] = kanji['kanji'].apply(lambda k: kanji_freq.get(k, 0))

In [138]:
# Preview the first ten rows; the rows were sorted by descending frequency
# before the frame was built.
kanji.head(10)

Unnamed: 0,kanji,old,radical,strokes,grade,year added,meaning,readings,freq
0,笑,,竹,10,4,,Смех,"ショウ、わら-う、え-む\nshō, wara-u, e-mu",0.029155
1,日,,日,4,1,,День,"ニチ、ジツ、ひ、か\nnichi, jitsu, hi, ka",0.023846
2,今,,人,4,2,,Сейчас,"コン、キン、いま\nkon, kin, ima",0.013454
3,人,,人,2,1,,Человек,"ジン、ニン、ひと\njin, nin, hito",0.011658
4,大,,大,3,1,,Большой,"ダイ、タイ、おお、おお-きい、おお-いに\ndai, tai, oo, oo-kii, oo...",0.009701
5,行,,行,6,2,,Идти,"コウ、ギョウ、（アン）、い-く、ゆ-く、おこな-う\nkō, gyō, (an), i-ku...",0.008892
6,時,,日,10,2,,Время,"ジ、とき\nji, toki",0.008664
7,一,,一,1,1,,один,"イチ、イツ、ひと、ひと-つ\nichi, itsu, hito, hito-tsu",0.008027
8,気,氣,气,6,1,,Дух,"キ、ケ\nki, ke",0.007409
9,市,,巾,5,2,,Город,"シ、いち\nshi, ichi",0.007281


In [154]:
# Total frequency per radical, largest first.
radical_freq = (
    kanji.groupby('radical')['freq']
    .agg(freq='sum')
    .sort_values('freq', ascending=False)
)
# Running total of the sorted frequencies, stored alongside them as 'sum'.
freq = radical_freq['freq'].tolist()
sum_freq = list(accumulate(freq))
radical_freq['sum'] = sum_freq

In [158]:
# Show the 60 radicals with the largest total frequency, together with the
# running total in the 'sum' column.
radical_freq.head(60)

Unnamed: 0_level_0,freq,sum
radical,Unnamed: 1_level_1,Unnamed: 2_level_1
人,0.067618,0.067618
日,0.050016,0.117635
木,0.038822,0.156457
竹,0.032357,0.188814
心,0.029979,0.218793
水,0.027981,0.246774
口,0.02701,0.273784
辵,0.026823,0.300607
糸,0.025103,0.325711
言,0.024528,0.350239
