In [221]:
# https://stackoverflow.com/questions/30312566/python-how-to-get-values-from-a-dictionary-from-pandas-series
# https://towardsdatascience.com/apply-and-lambda-usage-in-pandas-b13a1ea037f7
# https://spapas.github.io/2016/04/27/python-nested-list-comprehensions/
import pandas as pd
import numpy as np
import json
pd.set_option('display.max_colwidth', -1) # show all text in cell, without truncation; default is 50
pd.set_option('display.max_columns', None) # show all columns; default is 20

In [222]:
with open('features.json') as json_file:
    feature_matrix = json.load(json_file)
feature_names = {feature for segment,features in feature_matrix.items() for feature in features}
print(feature_matrix['n'])
print(feature_matrix['a']['Syllabic'])
print(feature_names)

{'Syllabic': '0', 'Sonorant': '1', 'Anterior': '1', 'Coronal': '1', 'Palatalized': '0', 'Nasal': '1', 'Voiced': '1', 'Continuant': '0', 'Lateral': '0', 'Delayedrelease': '0'}
1
{'Sonorant', 'Back', 'Palatalized', 'Nasal', 'Voiced', 'Syllabic', 'Anterior', 'Continuant', 'High', 'Lateral', 'Low', 'Delayedrelease', 'Coronal'}


In [223]:
df = pd.DataFrame()
# df['token1'] = ['a', 'e', 'i']
df['token2'] = ['t', 'dn', 'n']

Series cannot be used directly as dictionary keys because they are not hashable. This method first uses `map()` to select the dictionary that corresponds to the segment from **feature_matrix** and then uses `apply()` with a lambda function to get the feature of interest. We specify a default value to avoid a key error with features not specified for the given segment (e.g., consonantal features and vowels).

**TODO**: Create different features depending on whether the column is consonantal or vocalic, that is, 10 features for consonantal columns and only 5 for vocalic.

In [266]:
for column_label in ['token2']:
    for feature in feature_names:
#         df[column_label + '_' + feature] = df[column_label].map(feature_matrix).apply(lambda x: x.get(feature, 'missing'))
        print(feature)
        print(df[column_label].apply(lambda x: np.nanmean(np.asarray([feature_matrix[char].get(feature, np.nan) for char in list(x)]).astype(np.float32))))
#
# for char in list(string)
# for feature in feature_names
# feature_matrix[char][feature]
# np.asarray(values).astype(np.float32)

Sonorant
0    0.0
1    0.5
2    1.0
Name: token2, dtype: float64
Back
0   NaN
1   NaN
2   NaN
Name: token2, dtype: float64
Palatalized
0    0.0
1    0.0
2    0.0
Name: token2, dtype: float64
Nasal
0    0.0
1    0.5
2    1.0
Name: token2, dtype: float64
Voiced
0    0.0
1    1.0
2    1.0
Name: token2, dtype: float64
Syllabic
0    0.0
1    0.0
2    0.0
Name: token2, dtype: float64
Anterior
0    1.0
1    1.0
2    1.0
Name: token2, dtype: float64
Continuant
0    0.0
1    0.0
2    0.0
Name: token2, dtype: float64
High
0   NaN
1   NaN
2   NaN
Name: token2, dtype: float64
Lateral
0    0.0
1    0.0
2    0.0
Name: token2, dtype: float64
Low
0   NaN
1   NaN
2   NaN
Name: token2, dtype: float64
Delayedrelease
0    0.0
1    0.0
2    0.0
Name: token2, dtype: float64
Coronal
0    1.0
1    1.0
2    1.0
Name: token2, dtype: float64


  """
