# Map segments to distinctive features

Scratchpad for exploring how to map segments to their distinctive features.

## Import packages and configure notebook

In [72]:
import pandas as pd
import json
import regex as re
# Raise the column display limit so the full consonant df is shown without truncation.
pd.set_option("display.max_columns", 50)

## Data to be processed

In [113]:
## Fake data
# A handful of made-up rhyme zones to experiment with.
rzs = [
    "ETi",
    "Inaf",
    "InaJ",
    "AumaMi",
]

## Consonant and vowel features

### Consonant features

In [114]:
## Consonant features
# Load the consonant feature table from consonants.json into a DataFrame and
# display it.
# The encoding is passed explicitly: the segment labels include non-ASCII
# characters (e.g. Š, ɣ, Ǯ), and without it open() falls back to a
# locale-dependent default that can fail or garble them on some platforms.
# NOTE(review): assumes the file is saved as UTF-8 (the standard JSON encoding) — confirm.
with open("consonants.json", encoding="utf-8") as f: # consonant features
    consonants_json = json.load(f)
df_consonants = pd.DataFrame(consonants_json)
df_consonants

Unnamed: 0,Š,š,B,b,c,ɣ,D,d,F,f,G,g,J,K,k,L,l,Č,M,m,N,n,Ǯ,P,p,R,r,ʒ,S,s,T,t,V,v,X,x,Z,z,ž
Anterior,0,0,1,1,1,0,1,1,1,1,0,0,0,0,0,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,0
Consonantal,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Continuant,1,1,0,0,0,1,0,0,1,1,0,0,1,0,0,1,1,1,0,0,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,1,1,1,1
Coronal,1,1,0,0,1,0,1,1,0,0,0,0,1,0,0,1,1,1,0,0,1,1,1,0,0,1,1,1,1,1,1,1,0,0,0,0,1,1,1
Delayedrelease,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
Lateral,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Nasal,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Palatalized,0,0,1,0,0,0,1,0,1,0,1,0,1,1,0,1,0,1,1,0,1,0,1,1,0,1,0,0,1,0,1,0,1,0,1,0,1,0,0
Sonorant,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,1,1,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0
Syllabic,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Vowel features

In [115]:
# Load the vowel feature table from vowels.json into a DataFrame and display it.
# JSON text is UTF-8 by specification, so the encoding is passed explicitly
# rather than relying on open()'s locale-dependent default.
# NOTE(review): assumes the file is saved as UTF-8 — confirm.
with open("vowels.json", encoding="utf-8") as f: # vowel features
    vowels_json = json.load(f)
df_vowels = pd.DataFrame(vowels_json)
df_vowels

Unnamed: 0,A,a,E,e,I,i,O,o,U,u
Back,1,1,0,0,0,0,1,1,1,1
High,0,0,0,0,1,1,0,0,1,1
Low,1,1,0,0,0,0,0,0,0,0
Sonorant,1,1,1,1,1,1,1,1,1,1
Syllabic,1,1,1,1,1,1,1,1,1,1


## Import data into df

In [117]:
# Build the working frame: one row per rhyme zone in `rzs`.
df = pd.DataFrame()
df["RhymeZone"] = rzs

# Split into vowels and consonant clusters.
# Each rhyme zone is matched as: at most one character before the upper-case
# vowel, the upper-case vowel itself, everything up to the next lower-case
# vowel, and then one (lower-case vowel, non-vowel run) pair per remaining
# syllable.
syllpat = r"([aeiou]?)([^aeiou]*)" # VC(C)
# Highest syllable count in rzs (vowels of either case); it fixes how many
# capture groups the pattern has. default=1 keeps the pattern well-formed if
# rzs is empty.
max_syll_count = max((len(re.findall(r"[aeiouAEIOU]", rz)) for rz in rzs), default=1)
rz_pattern = r"(.?)([AEIOU])([^aeiou]*)" + (syllpat * (max_syll_count - 1))
matches = df["RhymeZone"].str.findall(rz_pattern).tolist()

# A rhyme zone without an upper-case vowel cannot match the pattern; report it
# by name instead of failing with a bare IndexError on `[0]` below.
unmatched = [rz for rz, found in zip(rzs, matches) if not found]
if unmatched:
    raise ValueError("No upper-case vowel (A, E, I, O, U) found in rhyme zone(s): {}".format(unmatched))
df["tokenized"] = [found[0] for found in matches] # capture groups of the first (leftmost) match

# Tokenize rz into columns
# Figure out how many columns are needed: the tokens themselves decide. Every
# tuple has one entry per capture group, so take the position of the last entry
# that is non-empty in at least one row. (Counting characters instead goes out
# of range, or drops trailing tokens, whenever a token is empty or longer than
# one character.)
tokenized = df["tokenized"].tolist()
max_len = max((pos for toks in tokenized for pos, tok in enumerate(toks) if tok), default=-1) # index of the last token column
# Now fill them
for i in range(max_len + 1):
    df["token" + str(i)] = [toks[i] for toks in tokenized]
df

Unnamed: 0,RhymeZone,tokenized,token0,token1,token2,token3,token4,token5,token6,token7
0,ETi,"(, E, T, i, , , , , )",,E,T,i,,,,
1,Inaf,"(, I, n, a, f, , , , )",,I,n,a,f,,,
2,InaJ,"(, I, n, a, J, , , , )",,I,n,a,J,,,
3,AumaMi,"(, A, , u, m, a, M, i, )",,A,,u,m,a,M,i
