In [296]:
# https://stackoverflow.com/questions/30312566/python-how-to-get-values-from-a-dictionary-from-pandas-series
# https://towardsdatascience.com/apply-and-lambda-usage-in-pandas-b13a1ea037f7
# https://spapas.github.io/2016/04/27/python-nested-list-comprehensions/
# https://stackoverflow.com/questions/38793713/numpy-std-calculation-typeerror-cannot-perform-reduce-with-flexible-type

In [297]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [298]:
import pandas as pd
import numpy as np
import json
pd.set_option('display.max_colwidth', -1) # show all text in cell, without truncation; default is 50
pd.set_option('display.max_columns', None) # show all columns; default is 20

In [299]:
with open('features.json') as json_file:
    feature_matrix = json.load(json_file)
feature_names = {feature for segment,features in feature_matrix.items() for feature in features}
print(feature_matrix['n'])
print(feature_matrix['a']['Syllabic'])
print(feature_names)

{'Syllabic': '0', 'Sonorant': '1', 'Anterior': '1', 'Coronal': '1', 'Palatalized': '0', 'Nasal': '1', 'Voiced': '1', 'Continuant': '0', 'Lateral': '0', 'Delayedrelease': '0'}
1
{'Sonorant', 'Back', 'Palatalized', 'Nasal', 'Voiced', 'Syllabic', 'Anterior', 'Continuant', 'High', 'Lateral', 'Low', 'Delayedrelease', 'Coronal'}


In [300]:
df = pd.DataFrame()
df['token1'] = ['a', 'e', 'i']
df['token2'] = ['t', 'Dn', 'n']
df['token3'] = ['Q', 'Q', 'Q']
df

Unnamed: 0,token1,token2,token3
0,a,t,Q
1,e,Dn,Q
2,i,n,Q


Series cannot be used directly as dictionary keys because they are not hashable. This method first uses `map()` to select the dictionary that corresponds to the segment from **feature_matrix** and then uses `apply()` with a lambda function to get the feature of interest. We specify a default value to avoid a key error with features not specified for the given segment (e.g., consonantal features and vowels).

**TODO**: Create different features depending on whether the column is consonantal or vocalic, that is, 10 features for consonantal columns and only 5 for vocalic.

In [301]:
for column_label in ['token1','token2','token3']:
    for feature in feature_names:
#         df[column_label + '_' + feature] = df[column_label].map(feature_matrix).apply(lambda x: x.get(feature, 'missing'))
        print(column_label, feature)
        print(df[column_label].apply(lambda x: np.nanmean(np.asarray([feature_matrix.get(char, dict()).get(feature, np.nan) for char in list(x)]).astype(np.float32))))
        df[column_label + '_' + feature] = df[column_label].apply(lambda x: np.nanmean(np.asarray([feature_matrix.get(char, dict()).get(feature, np.nan) for char in list(x)]).astype(np.float32)))
df

token1 Sonorant
0    1.0
1    1.0
2    1.0
Name: token1, dtype: float64
token1 Back
0    1.0
1    0.0
2    0.0
Name: token1, dtype: float64
token1 Palatalized
0   NaN
1   NaN
2   NaN
Name: token1, dtype: float64
token1 Nasal
0   NaN
1   NaN
2   NaN
Name: token1, dtype: float64
token1 Voiced
0   NaN
1   NaN
2   NaN
Name: token1, dtype: float64
token1 Syllabic
0    1.0
1    1.0
2    1.0
Name: token1, dtype: float64
token1 Anterior
0   NaN
1   NaN
2   NaN
Name: token1, dtype: float64
token1 Continuant
0   NaN
1   NaN
2   NaN
Name: token1, dtype: float64
token1 High
0    0.0
1    0.0
2    1.0
Name: token1, dtype: float64
token1 Lateral
0   NaN
1   NaN
2   NaN
Name: token1, dtype: float64
token1 Low
0    1.0
1    0.0
2    0.0
Name: token1, dtype: float64
token1 Delayedrelease
0   NaN
1   NaN
2   NaN
Name: token1, dtype: float64
token1 Coronal
0   NaN
1   NaN
2   NaN
Name: token1, dtype: float64
token2 Sonorant
0    0.0
1    0.5
2    1.0
Name: token2, dtype: float64
token2 Back
0   NaN
1   N

  """
  


Unnamed: 0,token1,token2,token3,token1_Sonorant,token1_Back,token1_Palatalized,token1_Nasal,token1_Voiced,token1_Syllabic,token1_Anterior,token1_Continuant,token1_High,token1_Lateral,token1_Low,token1_Delayedrelease,token1_Coronal,token2_Sonorant,token2_Back,token2_Palatalized,token2_Nasal,token2_Voiced,token2_Syllabic,token2_Anterior,token2_Continuant,token2_High,token2_Lateral,token2_Low,token2_Delayedrelease,token2_Coronal,token3_Sonorant,token3_Back,token3_Palatalized,token3_Nasal,token3_Voiced,token3_Syllabic,token3_Anterior,token3_Continuant,token3_High,token3_Lateral,token3_Low,token3_Delayedrelease,token3_Coronal
0,a,t,Q,1.0,1.0,,,,1.0,,,0.0,,1.0,,,0.0,,0.0,0.0,0.0,0.0,1.0,0.0,,0.0,,0.0,1.0,,,,,,,,,,,,,
1,e,Dn,Q,1.0,0.0,,,,1.0,,,0.0,,0.0,,,0.5,,0.5,0.5,1.0,0.0,1.0,0.0,,0.0,,0.0,1.0,,,,,,,,,,,,,
2,i,n,Q,1.0,0.0,,,,1.0,,,1.0,,0.0,,,1.0,,0.0,1.0,1.0,0.0,1.0,0.0,,0.0,,0.0,1.0,,,,,,,,,,,,,
