In [None]:
import pandas as pd
import utils

# Location of the accent database CSV, relative to the notebook.
ACCENT_FILEPATH = "data/ACCDB_unicode.csv"

# Columns of the accent database that are kept for analysis.
# The per-column notes are working guesses from inspecting the data, not
# documented facts.
#
# Columns deliberately left out:
#   NID            - unique ID; ignored because the row index can be used instead.
#   NHKexpr        - seems to combine `nhk` and `kanjiexpr`.
#   nopronouncepos - appears to be len(midashigo1).
#   majiri         - literally "mixed"; probably an example phrase.
#   kaisi          - literally "start"; probably where the word starts within `majiri`.
#   bunshou        - literally "sentence"; possibly 1 when `majiri` holds an example sentence.
USEFUL_COLUMNS = [
    # ID unique to word & accent (the same word with a different accent gets a new `ID`).
    'ID',
    # Unclear: seems to be unique per semantic word, but does not distinguish accents.
    'ACT',
    # Literally "title word / keyword / entry word".
    'midashigo',
    # Kanji?
    'nhk',
    # Kanji 2?
    'kanjiexpr',
    # The length of one of the other columns (midashigo?).
    'numberchars',
    # Like `midashigo`, but with extra characters for things like word boundaries.
    'midashigo1',
    # Probably "accent + number": maybe how many accents the word can be pronounced with.
    'akusentosuu',
    # Seems to be a list of mora accents; corresponds to `midashigo1`, not `midashigo`.
    'ac',
]

## Notes on 'ac' column:
- 0 = low pitch
- 1 = high pitch
- 2 = dropping pitch
- Entries that end in a 0 or a 1 mean that the trailing particle attaches high.
- Entries that end in a 2 mean that the trailing particle attaches low
  i.e. there is a pitch drop between the last mora of the word and the
  following one.

If 'ac' begins with a 1, a leading 0 has been omitted.

## Notes on 'nopronouncepos' and 'nasalpos'
It is not yet clear what these columns represent.

# Parse DataFrame:
This is what it looks like:

In [None]:
# Raw database column name -> descriptive name used in the rest of the notebook.
column_renames = {
    'ID': 'word_id',
    'ACT': 'sem_word_id',
    'nhk': 'word',
    'kanjiexpr': 'kanji',
    'numberchars': 'midashigo_alt_len',
    'midashigo1': 'midashigo_alt',
    'akusentosuu': 'n_accents',
    'ac': 'accent',
}

# Load the selected columns via the project helper, then apply the renames.
# A further .reindex(columns=['word', 'kanji', 'midashigo', 'midashigo_alt', 'accent'])
# could narrow the frame to the core columns; it is intentionally not applied.
accent = (
    utils.process_accent_data(ACCENT_FILEPATH, USEFUL_COLUMNS)
    .rename(columns=column_renames)
)

accent

# How to flatten:
- midashigo + midashigo_alt
- word + kanji
 
## midashigo + midashigo_alt
Need to ignore variations with `ー`, but preserve variations in pronunciation

It seems as if there is a separate row for each pronunciation, or way of
writing the pronunciation (filled in the `midashigo` column).

In [None]:
# Show every row whose sem_word_id occurs more than once.
# keep=False marks all members of a repeated group (not only the second and
# later occurrences), so no isin() round-trip is needed.
# To inspect repeated word_id values instead, use `accent.word_id` here.
accent[accent.sem_word_id.duplicated(keep=False)]

- Is there an entry in midashigo_alt that doesn't exist in midashigo?
- Is it reasonable to drop midashigo, and drop any duplicated rows?
  Alternative phonetic spellings shouldn't affect the pitch accent.


In [None]:
# Keep one row per (word_id, midashigo_alt) pair.
# The explicit .copy() makes no_dups an independent frame, so adding the
# 'mora' column below does not trigger pandas' SettingWithCopyWarning.
no_dups = accent.drop_duplicates(['word_id', 'midashigo_alt']).copy()

# A list comprehension is used instead of DataFrame.apply(axis=1) because it
# was measured to be much faster on this data:
# %timeit [utils.mora_split(word, accent) for word, accent in zip(no_dups.midashigo_alt, no_dups.accent)]
# 815 ms ± 104 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# %timeit no_dups.apply(lambda x: utils.mora_split(x.midashigo_alt, x.accent), axis=1)
# 3.72 s ± 339 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

# One utils.mora_split result per row, built from that row's midashigo_alt
# and accent values. The loop variable is called `pitch` so it does not
# shadow the `accent` DataFrame.
no_dups['mora'] = [
    utils.mora_split(word, pitch)
    for word, pitch
    in zip(no_dups.midashigo_alt, no_dups.accent)
]

In [None]:
# Isolate single mora words (rows whose 'mora' entry has length 1) and save
# the selected columns to a pickle file.
single_mora_columns = [
    'word_id',
    'sem_word_id',
    'word',
    'kanji',
    'n_accents',
    'mora',
]

# reset_index(drop=True) renumbers the rows from 0 and discards the old index
# directly, rather than materialising it as an 'index' column and dropping it
# again (which fails if the index ever carries a name).
(
    no_dups.loc[no_dups.mora.str.len() == 1, single_mora_columns]
    .reset_index(drop=True)
    .to_pickle("data/single_mora_words.pickle")
)
