# Load the Dataframe

Much code used from https://github.com/zyocum/phoible-notebook/blob/main/notebook.ipynb 

In [2]:
# Import pandas
# ! pip install pandas

import pandas as pd
from collections import defaultdict

In [3]:
# Download the dataframe

df = pd.read_csv('https://raw.githubusercontent.com/phoible/dev/master/data/phoible.csv', low_memory=False)

In [4]:
df

Unnamed: 0,InventoryID,Glottocode,ISO6393,LanguageName,SpecificDialect,GlyphID,Phoneme,Allophones,Marginal,SegmentClass,...,retractedTongueRoot,advancedTongueRoot,periodicGlottalSource,epilaryngealSource,spreadGlottis,constrictedGlottis,fortis,raisedLarynxEjective,loweredLarynxImplosive,click
0,1,kore1280,kor,Korean,,0068,h,ç h ɦ,,consonant,...,-,-,-,-,+,-,-,-,-,-
1,1,kore1280,kor,Korean,,006A,j,j,,consonant,...,-,-,+,-,-,-,-,-,-,-
2,1,kore1280,kor,Korean,,006B,k,k̚ ɡ k,,consonant,...,-,-,-,-,-,-,-,-,-,-
3,1,kore1280,kor,Korean,,006B+02B0,kʰ,kʰ,,consonant,...,-,-,-,-,+,-,-,-,-,-
4,1,kore1280,kor,Korean,,006B+02C0,kˀ,kˀ,,consonant,...,-,-,-,-,-,+,-,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105479,3020,lamu1254,lby,Tableland Lamalama,,0294,ʔ,,False,consonant,...,-,-,-,-,-,+,-,-,-,-
105480,3020,lamu1254,lby,Tableland Lamalama,,03B8,θ,,False,consonant,...,-,-,-,-,-,-,-,-,-,-
105481,3020,lamu1254,lby,Tableland Lamalama,,0061,a,,False,vowel,...,-,-,+,-,-,-,0,-,-,0
105482,3020,lamu1254,lby,Tableland Lamalama,,0069,i,,False,vowel,...,-,-,+,-,-,-,0,-,-,0


# Set up the analysis

In [5]:
# Map the index of the dataframe to the language name

index_to_language = dict(df[['InventoryID', 'LanguageName']].values)

In [6]:
print(index_to_language[3020])

len(index_to_language)

Tableland Lamalama


3020

In [7]:
# Find all inventories with at least one NaNs in the Allophones column

nan_inventories = set(df[df['Allophones'].isna()]['InventoryID'].values)
print(len(nan_inventories))

1688


In [28]:
# Find inventories with only non-NaNs

defined_inventories = set(df.groupby('InventoryID').filter(lambda x: x['Allophones'].isna().sum() != len(x))['InventoryID'].values)
print(len(defined_inventories))

1332


In [52]:
# Find inventories where each phoneme in an inventory maps to exactly one allophone 387
meaningful_allophone_inventories = set()
for inventory, phonemes in df.groupby('InventoryID'):
    if inventory in defined_inventories: # Only go through if it's defined
        for index, row in phonemes.iterrows():
            if row['Phoneme'] != row['Allophones']:
                meaningful_allophone_inventories.add(inventory)
defined_no_allophone_inventories = {ID for ID in defined_inventories if ID not in meaningful_allophone_inventories}
print(f'There are {len(defined_no_allophone_inventories)} inventories with no meaningful allophones')
print(f'These are inventories {sorted(defined_no_allophone_inventories)}')

There are 387 inventories with no meaningful allophones
These are inventories [3, 26, 41, 173, 649, 650, 651, 655, 656, 659, 660, 663, 664, 666, 667, 668, 669, 670, 671, 672, 675, 676, 677, 679, 680, 681, 682, 683, 684, 686, 687, 688, 692, 693, 695, 696, 697, 698, 699, 700, 701, 702, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 718, 719, 720, 724, 725, 727, 729, 730, 732, 733, 735, 737, 738, 739, 740, 741, 743, 744, 746, 748, 749, 751, 752, 753, 754, 756, 759, 762, 763, 764, 765, 766, 768, 769, 770, 771, 772, 776, 777, 779, 782, 783, 784, 787, 788, 789, 792, 793, 794, 795, 796, 797, 800, 801, 805, 806, 807, 809, 810, 811, 812, 813, 814, 816, 818, 819, 820, 822, 826, 827, 829, 830, 831, 832, 833, 834, 835, 836, 837, 842, 843, 844, 846, 847, 848, 849, 850, 852, 853, 859, 860, 863, 864, 868, 869, 870, 872, 873, 874, 877, 878, 879, 902, 903, 912, 916, 924, 926, 931, 933, 936, 942, 946, 951, 952, 960, 963, 964, 986, 989, 991, 993, 999, 1023, 1033, 1041, 1047, 1049, 1063,

The total number of inventories is equal to the number of NaN-only columns and mixed columns. This means that every inventory either is all-NaNs or all non-nans

In [9]:
# Define a function which returns true if a phoneme is in the list of phonemes

def is_defined(phoneme):
    return phoneme in df['Phoneme'].values

In [56]:
# Check to see which phonemes don't have allophones

allophone_counts = defaultdict(int)
undefined_allophones = set()

# For each inventory with meaningful allophones
for inventory in sorted(meaningful_allophone_inventories):
    # For each phoneme in that inventory
    for _, phoneme in df[df['InventoryID'] == inventory].iterrows():
        # For each allophone of that phoneme
        for allophone in phoneme['Allophones'].split():
            allophone_counts[allophone] += 1
            if not is_defined(allophone):
                undefined_allophones.add(allophone)
    if inventory % 5 == 0:
        print(inventory)


all_allophones = allophone_counts.keys()

5
10
15
20
25
30
35
40
45
50
55
60
65
70
75
80
85
90
95
100
105
110
115
120
125
130
135
140
145
150
155
160
165
170
175
180
185
190
195
665
685
690
745
750
755
760
775
780
785
790
815
825
840
845
855
865
875
880
885
890
895
900
905
910
915
920
925
930
935
940
945
950
955
965
970
975
980
985
990
995
1000
1005
1010
1015
1020
1025
1030
1035
1040
1045
1050
1055
1060
1065
1070
1075
1080
1085
1090
1095
1100
1105
1110
1115
1120
1125
1130
1135
1140
1145
1155
1160
1165
1170
1175
1185
1190
1210
1215
1220
1225
1230
1255
1260
1265
1285
1290
1295
1300
1315
1320
1325
1330
1340
1350
1355
1365
1375
1380
1385
1390
1395
1400
1405
1410
1415
1435
1440
1445
1455
1470
1475
1480
1500
1510
1520
1535
1550
1565
1570
1575
1585
1590
1600
1605
1610
1625
1630
1635
1650
1655
1660
1665
1670
1675
1680
1685
1690
2160
2165
2170
2175
2180
2185
2190
2195
2200
2205
2210
2215
2220
2225
2230
2235


# Get data on the inventories

In [57]:
# Of the inventories with allophones, how many phonemes and allophones do they have?
# Only considers inventories with defined allophones
id_to_phoneme_count = {} # Maps an ID number to the number of phonemes
id_to_allophone_count = {} # Maps an ID number to the number of phonemes
id_to_allophone_phoneme_ratio = {} # Maps an ID number to the ratio of allophones to phonemes
for inventory, phonemes in df.groupby('InventoryID'):
    if inventory in defined_inventories: # Only go through if it's defined
        num_allophones = 0
        for allophones in phonemes:
            num_allophones += len(allophones.split() )
        id_to_phoneme_count[inventory] = len(phonemes)
        id_to_allophone_count[inventory] = num_allophones
        id_to_allophone_phoneme_ratio[inventory] = num_allophones / len(phonemes)

In [70]:
average_allophone_phoneme_ratio = sum(id_to_allophone_phoneme_ratio.values() ) / len(id_to_allophone_phoneme_ratio)
print(f'The average ratio of allophones to phonemes is {average_allophone_phoneme_ratio}')
print(f'There are {average_allophone_phoneme_ratio - 1} more allophones than phonemes')

The average ratio of allophones to phonemes is 1.365849237498223
There are 0.36584923749822296 more allophones than phonemes


In [66]:
# How many of the allophones are already in the inventory?

# Get all the phonemes in an inventory
inventory_phonemes = defaultdict(set)
# For each inventory with meaningful allophones
for inventory in sorted(meaningful_allophone_inventories):
    # For each phoneme in that inventory
    for _, phoneme in df[df['InventoryID'] == inventory].iterrows():
        inventory_phonemes[inventory].add(phoneme['Phoneme'])

# Get all the allophones in an inventory
inventory_allophones = defaultdict(set)
# For each inventory with meaningful allophones
for inventory in sorted(meaningful_allophone_inventories):
    # For each phoneme in that inventory
    for _, phoneme in df[df['InventoryID'] == inventory].iterrows():
        # For each allophone of that phoneme
        for allophone in phoneme['Allophones'].split():
            inventory_allophones[inventory].add(allophone)

In [68]:
# Check to see how many of the allophones already exist within the phonemic inventory
allophone_percent_outside_inventory = {}
# For each inventory with meaningful allophones
for inventory in sorted(meaningful_allophone_inventories):
    allophone_percent_outside_inventory[inventory] = len(inventory_allophones[inventory]) / len(inventory_phonemes[inventory]) - 1

In [72]:
average_allophone_percent_outside_inventory = sum(allophone_percent_outside_inventory.values() ) / len(allophone_percent_outside_inventory)
print(f'The average percent of allophones outside the phonemic inventory is {average_allophone_percent_outside_inventory}')
print(f'This means that {average_allophone_percent_outside_inventory / (average_allophone_phoneme_ratio - 1)}% of allophones lie outside the inventory')

The average percent of allophones outside the phonemic inventory is 0.33712105332857684
This means that 0.9214753476976% of allophones lie outside the inventory


# Get data on the allophones

In [59]:
# Return the number of allophones and unique allophones
print(f'There are {len(allophone_counts)} allophones')
print(f'{len(undefined_allophones)} don\'t have feature representations')

There are 3034 allophones
1016 don't have feature representations


In [60]:
# Account for marginality
marginal_defined_allophones = set()
nonmarginal_undefined_allophones = set()
for undefined_allophone in undefined_allophones:
    # If it's marginal and exists nonmarginally
    if (undefined_allophone.startswith("<") and is_defined(undefined_allophone[1:-1]) ) or (is_defined("<" + undefined_allophone + ">") ):
        marginal_defined_allophones.add(undefined_allophone)
    else:
        nonmarginal_undefined_allophones.add(undefined_allophone)


In [61]:
# Return info on the marginality of some allophones
print(f'Of the {len(undefined_allophones)} without feature representations:')
print(f'{len(marginal_defined_allophones)} have feature representations when accounting for marginality')
print(f'These are {marginal_defined_allophones}')
print(f'{len(nonmarginal_undefined_allophones)} still don\'t feature representations when accounting for marginality')

Of the 1016 without feature representations:
7 have feature representations when accounting for marginality
These are {'<q>', '<ɲ̟>', '<o>', '<œ>', '<ɯ>', '<y>', '<ɾ>'}
1009 still don't feature representations when accounting for marginality


In [65]:
# Filter by the number of times they appear
nonunique_allophones = {key for key, value in allophone_counts.items() if value > 1}
nonunique_undefined_allophones = {key for key, value in allophone_counts.items() if value > 1 and key in nonmarginal_undefined_allophones}
print(f'There are {len(nonunique_allophones)} allophones that appear more than once.')
print(f'Of those, {len(nonmarginal_undefined_allophones)} don\'t have feature representations.')
print(f'These are {nonunique_undefined_allophones}')

There are 1294 allophones that appear more than once.
Of those, 1009 don't have feature representations.
These are {'ʊ̥', 'h̃ʲ', 'tʷʲ', 'ɡ͉', 'kⁿ', 'œə', 'k̚', 'ɱvʷ', 'wə', 'ɓ̚', 'dɪ̯', 'ʲh', 'tʰɪ̯', '↘', 'ɲʲ', 'kʔ', 'k͈ʷː', 'ɖʲ', 'pˡ', 'jɪ', 'l̪̥ˠ', 'kp̚', 'h̃ʷ', 'dˡ', 'ɽ̥', 'θ̬', 'ɟ̥', 't̚', 'ɖ̚', 't̃', 'ʔn', 'β̞̜', 'ă̟', 'k̚ʷ', 'ə̥̆', 't̠ʃ̚', 'ʈʲʰ', 'ŋ̚', 'd̃', 'wa', 'ɣ̞', 'ŋw̃', 'nʷˠ', 'pʔ', 'ɐ̯', 'l̻̥', 'ɟ̚', 'k̠', 'ɡ̠', 'ʒ̥', 'ɰʷ', 'lɪ̯', 'ɡ̃', 'tˡ', 'ʈ̚', 'ɪ̥', 'vʷˠ', 'uʔ', 'nn̥', 'ñ', 'o̥', 'tːʃ', 'ɛ˞', 'ɔ̥', 'd̠ʒ̥', 'ɤ̞̥', 'nɪ̯', 't̪̚', 'wʊ', 'tʔ', 'ɨ̟', 'n̟', 'xˑ', 'ŋm̩', 'ɻ̥', 'ʔb', 'r̞', 'ʔ̚', 'ɽʲ', 'ʒ̃', 'f̃', 'p̃', 'ɡ̥', 'nʒ', 'd̠̈', 'c̚', 'p̚ˀ', 'ʔm', 'ʊ˞', 'β̥', 'x̟', 'z͇̥', 'ĭ̥', 'ʛ', 'd̚', 'ɜ̠', 'ɯ̥', 'bˡ', 'ɡ̚', 'm̚', 'ʊ̥̆', 'hw', 'ʔp', 'i̠ː', 'ɛ̥', 'tⁿ', 'ʌ̆', 's̬', 'o̞ə̯', 'ɟɲ', 'mm̥', 'p̬', 'ja', 'n̠d̠ʒʲ', 'k̠ʰ', 'ɐ˞', 'n̚', 'l̩ˠ', 'k͉̚', 'pfʷ', 'ɗ̚', 'ʀ̥', 'ʔk', 'ʲt̚', 'b̚', 'ĭ̃', 'ʔʰ', 'ʃˤː', 'ɳ̩', 'ʔt', 'ɐ̥', 'd̠̥z̠̥ʲ', '∅', 'k̬', 'ə̝', 'ʃ̩', 'ə̠', 'lʷˠ', '