# Analysis Of Words With Single Mora
- These tend to be words of Chinese origin.
- There are two possible pitch accents: `[0]` and `[1]`
- Roughly 70% of these words are expected to have pitch accent `[0]`; the remainder have `[1]`.

Before running this notebook, you should generate `"data/single_mora_words.pickle"`
by running `ACC_DB_parsing.ipynb`.

In [None]:
import pandas as pd
import altair as alt
import utils

# Pickled table of single-mora words; generate it by running
# ACC_DB_parsing.ipynb before executing this notebook.
IN_FILE = "data/single_mora_words.pickle"


In [None]:
# Load the single-mora word table.
# NOTE(review): unpickling can execute arbitrary code -- only load a pickle
# that was generated locally by ACC_DB_parsing.ipynb.
words = pd.read_pickle(IN_FILE)

# Only the first mora of each word is used (these are single-mora words), so
# fetch just that mora's pair instead of building the pair for every mora.
# Presumably get_pair() returns (phoneme, accent) -- confirm against the mora
# class.
first_mora_pairs = [next(iter(word)).get_pair() for word in words.mora]

# Assigning a DataFrame to columns aligns on index labels. Build the helper
# frame on `words.index` so the values land on the right rows even when the
# pickled frame does not carry a default 0..n-1 index.
words[['phoneme', 'accent']] = pd.DataFrame(
    first_mora_pairs,
    columns=['phoneme', 'accent'],
    index=words.index,
)
words = utils.get_last_mora_info(words)

words


In [None]:
# An accent value of 2 on the first mora is equivalent to a pitch accent of
# [1] on the word, so count those rows and report them as a percentage.
accent_one_rows = words[words.accent == 2]
accent_one_percent = (len(accent_one_rows) / len(words)) * 100.0
print(
    "Words that are pitch accent [1]: {:0.1f}% of population".format(
        accent_one_percent
    )
)


In [None]:
# Melt the accent column into long form (one `variable`/`value` pair per
# word), then draw a bar per phoneme, coloured by the accent value.
accent_long = words[['word_id', 'sem_word_id', 'phoneme', 'accent']].melt(
    id_vars=['word_id', 'sem_word_id', 'phoneme'],
)
phoneme_bars = alt.Chart(accent_long).mark_bar()
phoneme_bars.encode(
    x='phoneme:N',
    y='count()',
    color='value:N',
)


In [None]:
def make_chart(df, group):
    """Chart the share of each accent value within every ``group`` value.

    Args:
        df: DataFrame containing ``word_id``, ``accent`` and the column
            named by ``group``.
        group: Name of the column to group by; also used as the x encoding.

    Returns:
        A layered Altair chart: bars of accent density per group, overlaid
        with a text mark showing each group's total word count.
    """
    # Number of words for every (group, accent) combination.
    counts = (
        df.groupby([group, 'accent'])['word_id']
        .count()
        .reset_index(name='word_count')
    )
    # Broadcast each group's total onto its rows, then normalise.
    counts['total'] = counts.groupby(group)['word_count'].transform('sum')
    counts['density'] = counts['word_count'] / counts['total']

    bars = (
        alt.Chart(counts)
        .mark_bar()
        .encode(x=group, y='density:Q', color='accent:N')
        .properties(width=300)
    )

    # One row per group is enough to label the bar with its total.
    group_totals = counts.groupby(group).first().reset_index()
    labels = (
        alt.Chart(group_totals)
        .mark_text(dy=140, color='white')
        .encode(x=group, text=alt.Text('total:Q'))
    )
    return bars + labels

# Accent density by three groupings of the words, shown side by side.
is_vowel_chart = make_chart(words, 'is_vowel')
end_vowel_chart = make_chart(words, 'end_vowel')
onset_chart = make_chart(words, 'onset')
is_vowel_chart | end_vowel_chart | onset_chart
