# Experiment with decomposing rhyme scheme

## General

For all of the following, find maximum domain (number of syllables, segments, features, etc.) and write nulls (`None`) where missing.

## Procedure

1. Create column for each syllable
1. Split syllables into onset, nucleus, coda
1. Split onset and coda into segments
1. Decompose segments into features

## Initialize

In [20]:
import pandas as pd
import regex as re
syllpat = re.compile('^(.*)([aeiouAEIOU].*)()$') # onset, nucleus, coda

## Create sample data and write into df

In [8]:
w1 = ['A', 'Vil'] # pravil (feminine)
w2 = ['Ok'] # mog (closed masculine)
w3 = ['U', 'ka'] # nauka (feminine)
w4 = ['VE', 'tska', 'va'] # sovetskogo (dactyl)
w5 = ['BA'] # sebja (open masculine)
words = list([w1, w2, w3, w4, w5])
df = pd.DataFrame()
df["rzs"] = [item for item in words] # rhyme zone syllables
df

Unnamed: 0,rzs
0,"[A, Vil]"
1,[Ok]
2,"[U, ka]"
3,"[VE, tska, va]"
4,[BA]


## Create column for each syllable

`t0` = tonic, `t1` = first post-tonic, etc.

In [10]:
df["syllcounts"] = df["rzs"].apply(len) 
m = df["syllcounts"].max() # longest word in syllable count; hold on to this for processing later
for i in range(m): # Use max syllable count in rzs to create tonic, posttonic, etc. columns
    df['t' + str(i)] = [x[i] if len(x) > i else None for x in df["rzs"] ]
df

Unnamed: 0,rzs,syllcounts,t0,t1,t2
0,"[A, Vil]",2,A,Vil,
1,[Ok],1,Ok,,
2,"[U, ka]",2,U,ka,
3,"[VE, tska, va]",3,VE,tska,va
4,[BA],1,BA,,


## Split syllables into onset, nucleus, coda

In [21]:
syllcols = df.filter(regex=("^t\d+$"))
for syllcol in syllcols:
    for i in ['o', 'n', 'c']: # onset, nucleus, cola
        df[syllcol + i]

t0o
t0n
t0c
t1o
t1n
t1c
t2o
t2n
t2c


In [3]:
# https://www.kaggle.com/jboysen/quick-tutorial-flatten-nested-json-in-pandas
import json
from pandas.io.json import json_normalize
with open('features.json') as f:
    d = json.load(f)
d

{'segments': [{'p': [{'Syllabic': '0'},
    {'Sonorant': '0'},
    {'Anterior': '1'},
    {'Coronal': '0'},
    {'Palatalized': '0'},
    {'Nasal': '0'},
    {'Voiced': '0'},
    {'Continuant': '0'},
    {'Lateral': '0'},
    {'Delayedrelease': '0'}]},
  {'P': [{'Syllabic': '0'},
    {'Sonorant': '0'},
    {'Anterior': '1'},
    {'Coronal': '0'},
    {'Palatalized': '1'},
    {'Nasal': '0'},
    {'Voiced': '0'},
    {'Continuant': '0'},
    {'Lateral': '0'},
    {'Delayedrelease': '0'}]},
  {'b': [{'Syllabic': '0'},
    {'Sonorant': '0'},
    {'Anterior': '1'},
    {'Coronal': '0'},
    {'Palatalized': '0'},
    {'Nasal': '0'},
    {'Voiced': '1'},
    {'Continuant': '0'},
    {'Lateral': '0'},
    {'Delayedrelease': '0'}]},
  {'B': [{'Syllabic': '0'},
    {'Sonorant': '0'},
    {'Anterior': '1'},
    {'Coronal': '0'},
    {'Palatalized': '1'},
    {'Nasal': '0'},
    {'Voiced': '1'},
    {'Continuant': '0'},
    {'Lateral': '0'},
    {'Delayedrelease': '0'}]},
  {'t': [{'Syllabic': '0

In [65]:
features

Unnamed: 0,classes
0,"{'Consonants': [{'p': [{'Syllabic': '0'}, {'So..."
1,"{'Vowels': [{'i': [{'Syllabic': '1'}, {'Sonora..."
