In [15]:
# Import necessary collections
# Make sure you have a conda environment with pandas downloaded

import pandas as pd
import tqdm

In [3]:
# Get the data from Phobile
phoible = pd.read_csv('https://raw.githubusercontent.com/phoible/dev/master/data/phoible.csv', low_memory=False)

In [8]:
# See what the data looks like
phoible
phoible.columns

Index(['InventoryID', 'Glottocode', 'ISO6393', 'LanguageName',
       'SpecificDialect', 'GlyphID', 'Phoneme', 'Allophones', 'Marginal',
       'SegmentClass', 'Source', 'tone', 'stress', 'syllabic', 'short', 'long',
       'consonantal', 'sonorant', 'continuant', 'delayedRelease',
       'approximant', 'tap', 'trill', 'nasal', 'lateral', 'labial', 'round',
       'labiodental', 'coronal', 'anterior', 'distributed', 'strident',
       'dorsal', 'high', 'low', 'front', 'back', 'tense',
       'retractedTongueRoot', 'advancedTongueRoot', 'periodicGlottalSource',
       'epilaryngealSource', 'spreadGlottis', 'constrictedGlottis', 'fortis',
       'lenis', 'raisedLarynxEjective', 'loweredLarynxImplosive', 'click'],
      dtype='object')

In [36]:
# Create a new dataframe with the columns that we care about
nasal_counter = pd.DataFrame(
    columns=["Glottocode", "num_oral_cons", "num_nasal_cons", "num_oral_vowel", "num_nasal_vowel"]
)
# These are the glottocodes we've encountered
seen_glottocodes = set()
# Iterate over each inventory
for inventory_id, inventory_group in phoible.groupby('InventoryID'):
    # Check to see if we've processed this inventory:
    current_glottocode = inventory_group.iloc[0]["Glottocode"]
    if current_glottocode in seen_glottocodes:
        continue
    else:
        seen_glottocodes.add(current_glottocode)
    # Count the number of each permutation
    oral_cons = 0
    oral_vowel = 0
    nasal_cons = 0
    nasal_vowel = 0
    # Iterate over each inventory
    for index, row in inventory_group.iterrows():
        # Nasal vowel
        if row["nasal"] == "+" and row["syllabic"] == "+":
            nasal_vowel += 1
        # Nasal consonant
        elif row["nasal"] == "+" and row["syllabic"] == "-":
            nasal_cons += 1
        # Oral vowel
        elif row["nasal"] == "-" and row["syllabic"] == "+":
            oral_vowel += 1
        # Oral consonant
        elif row["nasal"] == "-" and row["syllabic"] == "-":
            oral_cons += 1
        # Not a phoneme (e.g. tone)
        elif row["nasal"] == "0" and row["syllabic"] == "0":
            continue
        # We shouldn't get here, this is a sanity check
        else:
            print(row)
            print(row["Phoneme"])
            continue
            raise Exception("Huh?")
    # Save it to a new dataframe
    nasal_counter.loc[len(nasal_counter.index)] = [
        current_glottocode, oral_cons, nasal_cons, oral_vowel, nasal_vowel
    ]
    

InventoryID                      13
Glottocode                 seda1262
ISO6393                         sed
LanguageName                 Sedang
SpecificDialect                 NaN
GlyphID                   006D+0062
Phoneme                          mb
Allophones                       mb
Marginal                        NaN
SegmentClass              consonant
Source                          spa
tone                              0
stress                            -
syllabic                          -
short                             -
long                              -
consonantal                       +
sonorant                        +,-
continuant                        -
delayedRelease                    -
approximant                       -
tap                               -
trill                             -
nasal                           +,-
lateral                           -
labial                            +
round                             -
labiodental                 

In [37]:
nasal_counter

Unnamed: 0,Glottocode,num_oral_cons,num_nasal_cons,num_oral_vowel,num_nasal_vowel
0,kore1280,19.0,3.0,18.0,0.0
1,kett1243,14.0,4.0,14.0,0.0
2,lakk1252,58.0,2.0,9.0,0.0
3,kaba1278,47.0,2.0,7.0,0.0
4,nucl1302,27.0,2.0,6.0,0.0
...,...,...,...,...,...
2172,ward1246,12.0,5.0,5.0,0.0
2173,yang1288,12.0,5.0,5.0,0.0
2174,guwa1243,11.0,4.0,6.0,0.0
2175,west2437,13.0,5.0,6.0,0.0
