# Summary
This code creates a CSV file of the 6000 most common Japanese words, each with example sentences. Thanks to iKnow for making this data available. The site was hard to scrape, so I had to use Selenium so that Firefox would run the JavaScript.

## To use this, you need:
- Selenium, with Firefox installed (see the Selenium documentation for how to install it on your system)
- tqdm (for the progress bar)
- Python 3

In [1]:
import csv
import re

from selenium.webdriver import Firefox
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from tqdm import tqdm_notebook as tqdm

In [2]:
# Start Firefox without a visible window; the browser is only used to run the
# site's JavaScript so the rendered page can be scraped.
opts = Options()
# Pass the flag directly instead of using the `Options.headless` setter: the
# setter is deprecated in Selenium 4 and was later removed, whereas the
# `-headless` argument is accepted by old and new Selenium versions alike.
opts.add_argument("-headless")
browser = Firefox(options=opts)

## The word lists are divided into pages of 100 words each, so this code scrapes the list of page URLs from the table of contents page.

In [3]:
# Collect the URL of every "Japanese Core N000 ..." course page linked from the
# table of contents, in page order and without duplicates.
browser.get('https://iknow.jp/content/japanese')
url_list = []
url_set = set()  # URLs already added, for O(1) duplicate checks
title_pattern = re.compile(r"Japanese Core \d000.+")

# Scan the course containers of the page exactly once. The previous version
# wrapped this in a loop over every "series_container" element but never used
# that element, so the whole page was rescanned once per series and only the
# url_set check kept the repeats out of url_list.
for course_container in browser.find_elements(By.CLASS_NAME, "course_container"):
    ul = course_container.find_element(By.TAG_NAME, "ul")
    for li in ul.find_elements(By.TAG_NAME, "li"):
        a = li.find_element(By.TAG_NAME, "a")
        title = a.get_property("title")
        if not title_pattern.match(title):
            continue
        url = a.get_attribute("href")
        if url not in url_set:
            url_list.append(url)
            url_set.add(url)

## Should be 60 unique URLs

In [4]:
# Sanity check: removing duplicates must not shrink the list, i.e. every
# collected URL appears exactly once.
unique_urls = set(url_list)
assert len(unique_urls) == len(url_list)
# Last expression of the cell: shows how many URLs were collected.
len(url_list)

60

In [5]:
# Matches "<word> [<reading>]", where both parts are runs of word characters
# other than digits and underscores. Compiled once instead of once per word.
_WORD_RE = re.compile(r"([^\W\d_]+) \[([^\W\d_]+)\]")


def _sentence_at(sents, tl, tn, i):
    """Return the i-th example sentence as a dict of element texts.

    Raises IndexError if any of the three element lists has no item at ``i``.
    """
    return dict(sentence=sents[i].text, transliteration=tl[i].text,
                translation=tn[i].text)


def _parse_entry(li):
    """Build the entry dict (word, reading, translation, sentences) for one <li>."""
    # Find the actual word
    item_details = li.find_element(By.CLASS_NAME, "item-details")
    word_text = item_details.find_element(By.CLASS_NAME, "text").text.strip()
    match = _WORD_RE.match(word_text)
    if match:
        word, pron = match.groups()
    else:
        # No "[reading]" part recognised: keep the raw text and no reading.
        word, pron = word_text, None
    translation = item_details.find_element(By.CLASS_NAME, "response").text

    # Get sentences
    item_sents = li.find_element(By.CLASS_NAME, "item-sentences")
    sents = item_sents.find_elements(By.CLASS_NAME, "text")
    tl = item_sents.find_elements(By.CLASS_NAME, "transliteration")
    tn = item_sents.find_elements(By.CLASS_NAME, "translation")

    # The first sentence is required (IndexError propagates if it is missing);
    # the second one is optional.
    sent1 = _sentence_at(sents, tl, tn, 0)
    try:
        sent2 = _sentence_at(sents, tl, tn, 1)
    except IndexError:
        sent2 = None
    return dict(word=word, pron=pron, translation=translation,
                sent1=sent1, sent2=sent2)


# Visit every course page and collect one entry per listed word.
# find_element(s)(By..., ...) is used because the find_element(s)_by_* helpers
# were removed in Selenium 4; this form also works on older versions.
word_list = []
for url in tqdm(url_list):
    browser.get(url)
    content = browser.find_elements(By.CLASS_NAME, "course-content")
    if not content:
        # Fail with the offending URL rather than a bare IndexError.
        raise RuntimeError("no course-content element found on {}".format(url))
    ul = content[0].find_element(By.TAG_NAME, "ul")
    for li in ul.find_elements(By.TAG_NAME, "li"):
        word_list.append(_parse_entry(li))

HBox(children=(IntProgress(value=0, max=60), HTML(value='')))




In [6]:
# quit() ends the whole WebDriver session and shuts down the driver process;
# close() only closes the current window and can leave the driver running.
browser.quit()

## Should be 6000 words

In [7]:
# Number of scraped entries; shown as the cell output.
len(word_list)

6000

In [8]:
# Preview the first five scraped entries.
word_list[:5]

[{'word': '行く',
  'pron': 'いく',
  'translation': 'go',
  'sent1': {'sentence': '日曜日は図書館に行きます。',
   'transliteration': 'にちようび は としょかん に いきます。',
   'translation': 'I go to the library on Sundays.'},
  'sent2': {'sentence': '私は夏休みにプールに行った。',
   'transliteration': 'わたし は なつやすみ に プール に いった。',
   'translation': 'I went to the pool during summer vacation.'}},
 {'word': '見る',
  'pron': 'みる',
  'translation': 'see, look at',
  'sent1': {'sentence': '私は絵を見るのが好きです。',
   'transliteration': 'わたし は え を みる の が すき です。',
   'translation': 'I like looking at pictures.'},
  'sent2': {'sentence': '仕事のあと、映画を見た。',
   'transliteration': 'しごと の あと、 えいが を みた。',
   'translation': 'I saw a movie after work.'}},
 {'word': '多い',
  'pron': 'おおい',
  'translation': 'a lot of, many',
  'sent1': {'sentence': '京都にはお寺が多い。',
   'transliteration': 'きょうと に は おてら が おおい。',
   'translation': 'There are a lot of temples in Kyoto.'},
  'sent2': {'sentence': 'この道は車が多い。',
   'transliteration': 'この みち は くるま が おおい。',
   'translation

## Export to a csv file

In [9]:
def get_sentence_list(sent_dict):
    """Return [sentence, kana, english] for a sentence dict, or three blanks for None."""
    if sent_dict is None:
        return [""] * 3
    return [sent_dict["sentence"],
            sent_dict["transliteration"],
            sent_dict["translation"]]


# Write one tab-separated row per word.
# - encoding='utf-8': the rows contain Japanese text, which the platform
#   default encoding cannot always represent.
# - newline='': required by the csv module so that it controls line endings
#   itself (otherwise rows end in "\r\r\n" on Windows).
with open('japanese6000.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile, delimiter='\t')
    writer.writerow("word kana english sent1 sent1_kana sent1_english sent2 sent2_kana sent2_english".split())
    for d in word_list:
        word = d["word"]
        # Entries without a separate reading use the word itself as its kana.
        kana = word if d["pron"] is None else d["pron"]
        english = d["translation"]
        row = [word, kana, english] + get_sentence_list(d["sent1"]) + get_sentence_list(d["sent2"])
        writer.writerow(row)

## Export to pkl file

In [10]:
import pickle

In [11]:
# Save the complete word_list to a binary pickle file.
with open('japanese6000.pkl', 'wb') as pkl_file:
    pickle.dump(word_list, pkl_file)