In [18]:
import json
import time

import requests
from bs4 import BeautifulSoup

In [19]:
# getting some networking issue, reading static file instead
# source: https://www.cantoneseclass101.com/chinese-radicals/
# Read the saved page fully into memory so the file handle is closed right
# away. Holding the raw bytes (instead of an open, one-shot file object) also
# lets any cell that parses `page_content` be re-run without hitting an
# already-exhausted stream.
with open('../data/cantonese/radicals.html', 'rb') as radicals_file:
    page_content = radicals_file.read()

In [20]:
# Parse the saved page and pick the first table whose id is 'radical-table'.
soup = BeautifulSoup(page_content, 'html.parser')
matching_tables = soup.html.find_all('table', id='radical-table')
radical_table = matching_tables[0]

# Split the rows of the table body: the first row is the header,
# every row after it is a data entry.
rows = radical_table.tbody.find_all('tr')
header, entries = rows[0], rows[1:]

In [21]:
# Build one dict per table row describing a radical.
radical_data = []
stroke_count = 0

for entry in entries:
    cells = entry.find_all('td')

    # len(entry) counts the row's child nodes; rows with 13 of them are the
    # ones that carry a leading stroke-count cell. All other rows omit that
    # cell, so they reuse the most recent stroke count and every remaining
    # column sits one position to the left.
    has_stroke_cell = len(entry) == 13
    if has_stroke_cell:
        stroke_count = int(cells[0].find('span').text)
    first = 1 if has_stroke_cell else 0

    radical_text = cells[first].text.strip()
    variant_text = cells[first + 1].text.strip()

    # The first character of the radical cell is the radical itself; any
    # further characters in that cell are treated as variants, listed after
    # the ones from the dedicated variants cell. Whitespace-only characters
    # are dropped.
    variant_chars = list(variant_text) + list(radical_text[1:])

    radical_data.append({
        'stroke_count': stroke_count,
        'cantonese': radical_text[0],
        'variants': [char for char in variant_chars if char.strip() != ''],
        'english': cells[first + 2].text,
        'jyutping': cells[first + 3].text,
    })

In [37]:
# Display the collected entries (a list with one dict per radical).
# NOTE(review): the saved output already contains 'audio_path' / 'strokes_path'
# keys, which are only added by the mp3 and stroke-gif cells further down --
# presumably it was captured after those cells had run; confirm with a fresh
# Restart & Run All.
radical_data

[{'stroke_count': 1,
  'cantonese': '一',
  'variants': [],
  'english': 'one',
  'jyutping': 'jat1',
  'audio_path': '/home/alex/projects/ltt/data/data/cantonese/pronunciationMp3s/jat1.mp3',
  'strokes_path': '../data/cantonese/strokeGifs/一.gif'},
 {'stroke_count': 1,
  'cantonese': '丨',
  'variants': [],
  'english': 'line',
  'jyutping': 'gwan2',
  'audio_path': '/home/alex/projects/ltt/data/data/cantonese/pronunciationMp3s/gwan2.mp3'},
 {'stroke_count': 1,
  'cantonese': '丶',
  'variants': [],
  'english': 'dot',
  'jyutping': 'zyu2',
  'audio_path': '/home/alex/projects/ltt/data/data/cantonese/pronunciationMp3s/zyu2.mp3'},
 {'stroke_count': 1,
  'cantonese': '丿',
  'variants': ['乀', '乁'],
  'english': 'slash',
  'jyutping': 'pit3',
  'audio_path': '/home/alex/projects/ltt/data/data/cantonese/pronunciationMp3s/pit3.mp3'},
 {'stroke_count': 1,
  'cantonese': '乙',
  'variants': ['乚', '乛'],
  'english': 'second',
  'jyutping': 'jyut6',
  'audio_path': '/home/alex/projects/ltt/data/data

In [47]:
# collect mp3 files
# https://www.cantonesetools.org/en/cantonese-text-to-sound
# paths need to be full to work with anki
PATH_TO_MP3_URLS = "/home/alex/projects/ltt/data/cantonese/mp3Urls.json"
PATH_TO_MP3S = "/home/alex/projects/ltt/data/cantonese/pronunciationMp3s"

# Lookup from jyutping to the URL of its mp3. The context manager makes sure
# the JSON file handle is closed once the data is loaded.
with open(PATH_TO_MP3_URLS) as mp3_urls_file:
    mp3_urls_dict = json.load(mp3_urls_file)

# Radicals whose jyutping is not in the lookup; they get no 'audio_path'.
radicals_without_audio = []

for entry in radical_data:
    if entry['jyutping'] in mp3_urls_dict:
        url = mp3_urls_dict[entry["jyutping"]]
        output_path = f'{PATH_TO_MP3S}/{entry["jyutping"]}.mp3'
#         !curl '{url}' --output '{output_path}' && sleep 1
        entry['audio_path'] = output_path
    else:
        radicals_without_audio.append(entry['cantonese'])

# TODO this produces 180 mp3s, not 214?
# NOTE(review): files are named after the jyutping, so radicals that share a
# reading point at the same mp3, and radicals missing from the lookup are
# skipped -- presumably that accounts for the gap; the summary below shows both.
distinct_audio_paths = {
    entry['audio_path'] for entry in radical_data if 'audio_path' in entry
}
print(
    f'{len(distinct_audio_paths)} distinct mp3 paths for {len(radical_data)} radicals; '
    f'{len(radicals_without_audio)} without audio: {radicals_without_audio}'
)

In [40]:
# collect stroke order gifs
# http://www.strokeorder.info/
PATH_TO_STROKE_GIFS = '/home/alex/projects/ltt/data/cantonese/strokeGifs'
url_template = "http://www.strokeorder.info/mandarin.php?q={}"

for entry in radical_data:
    time.sleep(1)  # throttle requests to the site

    # Only network problems are caught here (a bare `except:` would also hide
    # KeyboardInterrupt and genuine bugs). The timeout keeps one unresponsive
    # request from hanging the whole loop.
    try:
        stroke_page = requests.get(
            url_template.format(entry['cantonese']), timeout=10
        ).content
    except requests.RequestException as exc:
        print('failed to get stroke gif for ' + entry['cantonese'] + f' ({exc!r})')
        continue

    # Separate names are used so the `page_content` / `soup` of the radical
    # table parsed earlier in the notebook are not overwritten.
    stroke_soup = BeautifulSoup(stroke_page, 'html.parser')
    gif_images = [
        img for img in stroke_soup.find_all('img')
        if img.has_attr('src') and '.gif' in img['src']
    ]
    if not gif_images:
        # The page contains no gif image for this character.
        print('failed to get stroke gif for ' + entry['cantonese'])
        continue

    url = gif_images[0]['src']
    output_path = f'{PATH_TO_STROKE_GIFS}/{entry["cantonese"]}.gif'
#         !curl '{url}' --output '{output_path}' && sleep 1
    entry['strokes_path'] = output_path

failed to get stroke gif for 丨
failed to get stroke gif for 丶
failed to get stroke gif for 丿
failed to get stroke gif for 亅
failed to get stroke gif for 亠
failed to get stroke gif for 亻
failed to get stroke gif for 冂
failed to get stroke gif for 冖
failed to get stroke gif for 冫
failed to get stroke gif for 凵
failed to get stroke gif for 勹
failed to get stroke gif for 匚
failed to get stroke gif for 匸
failed to get stroke gif for 卩
failed to get stroke gif for 厶
failed to get stroke gif for 囗
failed to get stroke gif for 夂
failed to get stroke gif for 夊
failed to get stroke gif for 宀
failed to get stroke gif for 尢
failed to get stroke gif for 屮
failed to get stroke gif for 巛
failed to get stroke gif for 廴
failed to get stroke gif for 廾
failed to get stroke gif for 彐
failed to get stroke gif for 彡
failed to get stroke gif for 彳
failed to get stroke gif for 攴
failed to get stroke gif for 殳
failed to get stroke gif for 爻
failed to get stroke gif for 爿
failed to get stroke gif for 疋
failed t

In [1]:
def invoke(payload):
    """Send one JSON request to the local AnkiConnect server and return its reply.

    Args:
        payload: JSON-serialisable request body (the notebook passes a dict
            with "action", "version" and "params").

    Returns:
        The JSON-decoded response body.

    Raises:
        urllib.error.URLError: if http://localhost:8765 cannot be reached.
    """
    request_body = json.dumps(payload).encode('utf-8')
    # Supplying a body makes urllib issue a POST request.
    request = urllib.request.Request('http://localhost:8765', request_body)
    # The context manager closes the HTTP response (and its socket) as soon as
    # the body has been decoded, instead of leaving it to garbage collection.
    with urllib.request.urlopen(request) as response:
        return json.load(response)

In [50]:
# https://foosoft.net/projects/anki-connect/#card-actions
import json
import urllib.request
from os.path import basename

# Create one Anki note per radical through AnkiConnect, attaching the
# pronunciation audio and stroke-order image when a path was collected.
for entry in radical_data:
    character = entry['cantonese']
    romanization = entry['jyutping']
    english = entry['english']
    payload = {
        "action": "addNote",
        # AnkiConnect's documented requests pass the API version as an integer.
        "version": 6,
        "params": {
            "note": {
                "deckName": "Chinese::Vocab",
                "modelName": "Chinese",
                "fields": {
                    "character": character,
                    "romanization": romanization,
                    "english": english
                }
            }
        }
    }

    if 'audio_path' in entry:
        audio = entry['audio_path']
        payload['params']['note']['audio'] = [{
            "filename": f"cantonese/audio/{basename(audio)}",
            "path": audio,
            "fields": [
                "audio"
        ]}]

    if 'strokes_path' in entry:
        strokes = entry['strokes_path']
        payload['params']['note']['picture'] = [{
            "filename": f"cantonese/strokes/{basename(strokes)}",
            "path": strokes,
            "fields": [
                "strokes"
        ]}]

    # Report a failed note instead of silently discarding the reply; the loop
    # still continues so one bad note does not stop the rest.
    response = invoke(payload)
    error = response.get('error') if isinstance(response, dict) else None
    if error:
        print(f'failed to add note for {character}: {error}')

    time.sleep(1)  # throttle requests to AnkiConnect