# Country Prediction and Analysis
Dastan Abdulla  
Ling 1340: Data Science for Linguists  
04/08/2024  

In [1]:
# Suppress scikit-learn ConvergenceWarning messages so they do not clutter
# cell output; the filter is installed before the remaining imports run.
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Imports
import pandas as pd
import numpy as np
# For plotting
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

In [2]:
# Per-speaker metadata (native language, country, age, ...).
# NOTE: unpickling can execute arbitrary code; only load trusted project files.
bio_df = pd.read_pickle("../data/bio_df.pkl")
bio_df.head()

Unnamed: 0,speakerid,native_language,country,age,gender,onset_age,english_residence,length_of_residence,learning_style,speech_sample,phonetic_transcription,ethnologue_language_code,language_name,macroarea,coordinate
0,1,afrikaans,south africa,27.0,female,9.0,usa,0.5,academic,afrikaans1.wav,afrikaans1.txt,afr,Afrikaans,Africa,"(-30.559482, 22.937506)"
1,2,afrikaans,south africa,40.0,male,5.0,usa,10.0,academic,afrikaans2.wav,afrikaans2.txt,afr,Afrikaans,Africa,"(-30.559482, 22.937506)"
2,3,agni,côte d'ivoire,25.0,male,15.0,usa,1.2,academic,agni1.wav,agny1.txt,any,Anyin,Africa,"(7.539989, -5.54708)"
3,4,albanian,serbia,19.0,male,6.0,usa,3.0,naturalistic,albanian1.wav,albanian1.txt,als,Tosk Albanian,Eurasia,"(44.016521, 21.005859)"
4,5,albanian,albania,33.0,male,15.0,usa,0.04,naturalistic,albanian2.wav,albanian2.txt,aln,Gheg Albanian,Eurasia,"(41.153332, 20.168331)"


In [3]:
# Per-speaker transcription together with one column per phonological feature.
# NOTE: unpickling can execute arbitrary code; only load trusted project files.
features_df = pd.read_pickle("../data/features_df.pkl")
features_df.head()

Unnamed: 0,speakerid,transcription,anterior,consonantal,labial,sagittal,back,constricted_glottis,continuant,coronal,...,long,nasal,round,sonorant,syllabic,velaric,voice,distributed,strident,tense
0,1,pʰlis kɔl stɛːlʌ ɑsk˺ ɜ tə bɹɪ̃ŋ ðiz θɪ̃ŋz̥ wɪ...,114,126,38,1,74,1,166,87,...,4,24,31,138,89,0,165,12,11,45
1,2,pʰliːz̥ kʰɔl stɛ̆lʌ ɔsk hɜ tŭ bɹiŋ ðiz θiŋz̥ ...,109,125,37,6,69,1,160,85,...,6,24,31,131,84,0,162,15,9,43
2,3,pliz kɑl stelə æs hɚ tu bɹɪ̃ŋ viz fɪŋ wɪf hɜɹ̆...,111,119,45,1,59,1,158,78,...,0,26,31,134,81,0,164,4,16,51
3,4,p̬liz kʰɔl stɛla æs xɜɹ tu bɹɪ̃ŋ ðɪs θɪ̃ŋks wɪ...,119,129,39,5,79,0,173,95,...,0,33,39,145,87,0,173,13,8,47
4,5,pliz kɔl stɛlə æsk hɛɹ tu bɹɪ̃ŋ ðɪs θɪ̃ŋs wɪð ...,122,136,37,1,63,0,171,97,...,4,35,33,138,87,0,159,15,11,41


In [4]:
# Place the metadata and feature columns side by side (rows are matched on index).
# NOTE(review): both frames carry a `speakerid` column, so the result has it twice,
# and alignment relies on identical row order in the two pickles — confirm.
saa_tran_df = pd.concat((bio_df, features_df), axis="columns")

In [5]:
# Preview the first five rows of the combined frame.
saa_tran_df.head()

Unnamed: 0,speakerid,native_language,country,age,gender,onset_age,english_residence,length_of_residence,learning_style,speech_sample,...,long,nasal,round,sonorant,syllabic,velaric,voice,distributed,strident,tense
0,1,afrikaans,south africa,27.0,female,9.0,usa,0.5,academic,afrikaans1.wav,...,4,24,31,138,89,0,165,12,11,45
1,2,afrikaans,south africa,40.0,male,5.0,usa,10.0,academic,afrikaans2.wav,...,6,24,31,131,84,0,162,15,9,43
2,3,agni,côte d'ivoire,25.0,male,15.0,usa,1.2,academic,agni1.wav,...,0,26,31,134,81,0,164,4,16,51
3,4,albanian,serbia,19.0,male,6.0,usa,3.0,naturalistic,albanian1.wav,...,0,33,39,145,87,0,173,13,8,47
4,5,albanian,albania,33.0,male,15.0,usa,0.04,naturalistic,albanian2.wav,...,4,35,33,138,87,0,159,15,11,41


# Preliminary Setup

In [6]:
# Names of the phonological feature columns that the per-feature tests iterate over.
feature_full_names = [
    "anterior", "consonantal", "labial", "sagittal",
    "back", "constricted_glottis", "continuant", "coronal",
    "delayed_release", "high", "lateral", "low",
    "long", "nasal", "round", "sonorant",
    "syllabic", "velaric", "voice", "distributed",
    "strident", "tense",
]


In [11]:
import pandas as pd
from scipy.stats import kruskal
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

# Split the rows by country once. The grouping does not depend on the feature
# being tested, so it is built a single time instead of once per iteration.
country_groups = [group for _, group in saa_tran_df.groupby("country")]

# For every feature, test whether its values differ between countries using
# (1) a Kruskal-Wallis H-test over the per-country samples and
# (2) an ANOVA table computed from an OLS fit of `feature ~ C(country)`.
for feature in feature_full_names:
    print('Feature: ', feature)
    # One array of feature values per country, passed as separate samples.
    stat, p = kruskal(*[group[feature].values for group in country_groups])
    # `!s` keeps the exact str() rendering the original %s formatting produced.
    print(f'Kruskal-Wallis Test stat={stat!s}, p={p!s}')
    print("ANOVA: ")
    model = ols(f'{feature} ~ C(country)', data=saa_tran_df).fit()
    anova_results = anova_lm(model)
    print(anova_results)


Feature:  anterior
Kruskal-Wallis Test stat=445.8068455209281, p=3.011819761826233e-29
ANOVA: 
                df        sum_sq     mean_sq         F        PR(>F)
C(country)   158.0  16539.849716  104.682593  3.722804  1.076991e-37
Residual    1110.0  31212.407179   28.119286       NaN           NaN
Feature:  consonantal
Kruskal-Wallis Test stat=477.9805875285458, p=6.883080471855056e-34
ANOVA: 
                df        sum_sq     mean_sq         F        PR(>F)
C(country)   158.0  22747.338286  143.970495  3.649383  1.757617e-36
Residual    1110.0  43790.207813   39.450638       NaN           NaN
Feature:  labial
Kruskal-Wallis Test stat=324.99008468067825, p=1.259808651639137e-13
ANOVA: 
                df       sum_sq   mean_sq         F        PR(>F)
C(country)   158.0  1443.475809  9.135923  2.432171  7.402329e-17
Residual    1110.0  4169.474545  3.756283       NaN           NaN
Feature:  sagittal
Kruskal-Wallis Test stat=376.2625769353314, p=7.575598552550014e-20
ANOVA: 
      

Even though the output above is truncated, nearly all of the features were significant, as shown by their very low p-values; the one exception is `delayed_release`, with a p-value of 0.59.