# Experiment with decomposing rhyme scheme

## General

For all of the following, find maximum domain (number of syllables, segments, features, etc.) and write nulls (`None`) where missing.

## Procedure

1. Create column for each syllable
1. Split syllables into onset, nucleus, coda
1. Split onset, nucleus, and coda into segments
1. Decompose segments into features

## Initialize

In [1]:
import pandas as pd
import regex as re

## Create sample data and write into df

In [2]:
# Rhyme-zone syllables for a handful of sample words. Each inner list holds
# one word's syllables, tonic syllable first; the trailing comment names the
# source word and the rhyme type it illustrates.
words = [
    ['BA'],                # sebja (open masculine)
    ['Ok'],                # mog (closed masculine)
    ['AST'],               # strast' (closed masculine with coda cluster)
    ['Instv'],             # menšynstv (closed masculine with coda cluster)
    ['U', 'ka'],           # nauka (open feminine)
    ['A', 'Vil'],          # pravil (closed feminine)
    ['I', 'graT'],         # vygrat' (closed feminine with post-tonic onset cluster)
    ['Or', 'daST'],        # gordost' (closed feminine with post-tonic coda cluster)
    ['U', 'pnaST'],        # sovokupnost' (closed feminine with post-tonic onset and coda clusters)
    ['I', 'Vi', 'STi'],    # vyvesti (dactyl)
    ['E', 'tska', 'va'],   # sovetskogo (dactyl)
]

# One row per word; the "rzs" (rhyme zone syllables) column keeps each word's
# syllable list as a single cell value.
df = pd.DataFrame({"rzs": words})
df

Unnamed: 0,rzs
0,[BA]
1,[Ok]
2,[AST]
3,[Instv]
4,"[U, ka]"
5,"[A, Vil]"
6,"[I, graT]"
7,"[Or, daST]"
8,"[U, pnaST]"
9,"[I, Vi, STi]"


## Create column for each syllable

`t0` = tonic, `t1` = first post-tonic, etc.

In [3]:
# Number of syllables in each word's rhyme zone.
syllable_lists = df["rzs"]
df["syllcounts"] = syllable_lists.map(len)

# Largest syllable count in the data; kept in `m` for later processing.
m = df["syllcounts"].max()

# One column per syllable position (t0, t1, ...). Words with fewer syllables
# than the position being filled get an empty string there.
for position in range(m):
    df['t' + str(position)] = [
        sylls[position] if position < len(sylls) else ''
        for sylls in syllable_lists
    ]
df

Unnamed: 0,rzs,syllcounts,t0,t1,t2
0,[BA],1,BA,,
1,[Ok],1,Ok,,
2,[AST],1,AST,,
3,[Instv],1,Instv,,
4,"[U, ka]",2,U,ka,
5,"[A, Vil]",2,A,Vil,
6,"[I, graT]",2,I,graT,
7,"[Or, daST]",2,Or,daST,
8,"[U, pnaST]",2,U,pnaST,
9,"[I, Vi, STi]",3,I,Vi,STi


## Split syllables into onset, nucleus, coda

In [4]:
# Pick out the per-syllable columns (t0, t1, ...). The pattern is a raw string
# so that \d reaches the regex engine as written instead of being parsed by
# Python as an (invalid) string escape.
syllcols = df.filter(regex=r"^t\d+$")
for syllcol in syllcols:  # iterating a DataFrame yields its column names
    # Target columns: <syllable>o, <syllable>n, <syllable>c
    # (onset, nucleus, coda).
    headers = [syllcol + i for i in ['o', 'n', 'c']]
    # Three capture groups: everything before the vowel, the vowel itself,
    # everything after it. The first group is greedy, so if a cell held more
    # than one vowel the last one would be taken as the nucleus. Cells that do
    # not match (e.g. the empty string of a missing syllable) yield NaN in all
    # three columns.
    df[headers] = df[syllcol].str.extract(r'^(.*)([aeiouAEIOU])(.*)$')
# Replace Null and NaN with empty string so every cell is a plain str;
# rebinding instead of inplace=True keeps the step explicit.
df = df.fillna(value='')
df

Unnamed: 0,rzs,syllcounts,t0,t1,t2,t0o,t0n,t0c,t1o,t1n,t1c,t2o,t2n,t2c
0,[BA],1,BA,,,B,A,,,,,,,
1,[Ok],1,Ok,,,,O,k,,,,,,
2,[AST],1,AST,,,,A,ST,,,,,,
3,[Instv],1,Instv,,,,I,nstv,,,,,,
4,"[U, ka]",2,U,ka,,,U,,k,a,,,,
5,"[A, Vil]",2,A,Vil,,,A,,V,i,l,,,
6,"[I, graT]",2,I,graT,,,I,,gr,a,T,,,
7,"[Or, daST]",2,Or,daST,,,O,r,d,a,ST,,,
8,"[U, pnaST]",2,U,pnaST,,,U,,pn,a,ST,,,
9,"[I, Vi, STi]",3,I,Vi,STi,,I,,V,i,,ST,i,


## Split onset, nucleus, and coda into segments

In [5]:
# Columns of interest match ^t\d+[onc]$ (onset / nucleus / coda of each
# syllable). Raw string so \d is not parsed as an invalid string escape, and
# \d+ so the pattern agrees with the one used to select the syllable columns.
syllpartcols = df.filter(regex=r"^t\d+[onc]$")
for col in syllpartcols:
    # Length of the longest value in this column, in characters. Stored under
    # its own name so the maximum syllable count held in `m` is not
    # overwritten.
    max_segments = syllpartcols[col].apply(len).max()
    # Emit one 1-based label per character position, e.g. "t0c-1" .. "t0c-4".
    for i in range(max_segments):
        print(col + '-' + str(i+1))

t0o-1
t0n-1
t0c-1
t0c-2
t0c-3
t0c-4
t1o-1
t1o-2
t1o-3
t1n-1
t1c-1
t1c-2
t2o-1
t2o-2
t2n-1


In [6]:
# https://www.kaggle.com/jboysen/quick-tutorial-flatten-nested-json-in-pandas
import json
try:
    # Public location of json_normalize since pandas 1.0.
    from pandas import json_normalize
except ImportError:
    # Older pandas releases only expose it under pandas.io.json.
    from pandas.io.json import json_normalize

# Read explicitly as UTF-8 so decoding does not depend on the platform's
# default encoding.
with open('features.json', encoding='utf-8') as f:
    d = json.load(f)

# Preview the first two entries of the "segments" list.
d["segments"][:2]

[{'p': [{'Syllabic': '0'},
   {'Sonorant': '0'},
   {'Anterior': '1'},
   {'Coronal': '0'},
   {'Palatalized': '0'},
   {'Nasal': '0'},
   {'Voiced': '0'},
   {'Continuant': '0'},
   {'Lateral': '0'},
   {'Delayedrelease': '0'}]},
 {'P': [{'Syllabic': '0'},
   {'Sonorant': '0'},
   {'Anterior': '1'},
   {'Coronal': '0'},
   {'Palatalized': '1'},
   {'Nasal': '0'},
   {'Voiced': '0'},
   {'Continuant': '0'},
   {'Lateral': '0'},
   {'Delayedrelease': '0'}]}]

In [7]:
# Reshape d["segments"] (a list of one-key dicts, each mapping a segment
# symbol to a list of one-item feature dicts) into a single mapping:
#     segment symbol -> {feature name: value}
flattened = {}
for entry in d["segments"]:
    # Each entry must hold exactly one item; the key is the phoneme symbol.
    # Unpacking raises ValueError otherwise.
    [(segment, feature_dicts)] = entry.items()
    merged = {}
    for feature in feature_dicts:
        merged.update(feature)  # fold the one-item dicts into key:value pairs
    flattened[segment] = merged
print(flattened)

{'p': {'Syllabic': '0', 'Sonorant': '0', 'Anterior': '1', 'Coronal': '0', 'Palatalized': '0', 'Nasal': '0', 'Voiced': '0', 'Continuant': '0', 'Lateral': '0', 'Delayedrelease': '0'}, 'P': {'Syllabic': '0', 'Sonorant': '0', 'Anterior': '1', 'Coronal': '0', 'Palatalized': '1', 'Nasal': '0', 'Voiced': '0', 'Continuant': '0', 'Lateral': '0', 'Delayedrelease': '0'}, 'b': {'Syllabic': '0', 'Sonorant': '0', 'Anterior': '1', 'Coronal': '0', 'Palatalized': '0', 'Nasal': '0', 'Voiced': '1', 'Continuant': '0', 'Lateral': '0', 'Delayedrelease': '0'}, 'B': {'Syllabic': '0', 'Sonorant': '0', 'Anterior': '1', 'Coronal': '0', 'Palatalized': '1', 'Nasal': '0', 'Voiced': '1', 'Continuant': '0', 'Lateral': '0', 'Delayedrelease': '0'}, 't': {'Syllabic': '0', 'Sonorant': '0', 'Anterior': '1', 'Coronal': '1', 'Palatalized': '0', 'Nasal': '0', 'Voiced': '0', 'Continuant': '0', 'Lateral': '0', 'Delayedrelease': '0'}, 'T': {'Syllabic': '0', 'Sonorant': '0', 'Anterior': '1', 'Coronal': '1', 'Palatalized': '1', '