# Богданов Александр Иванович, Б05-003

## Задача 5.2

Выборка: архив записей голоса

Требуется:

1. Отобрать записи, соответствующие странам с минимум 30 респондентами в выборке;
2. Получить сумму zero-crossing по каждой из записей;
3. Провести ANOVA-анализ по атрибутам родного языка, пола и возраста для уровня значимости 0.15. Дискретность признака zero-crossing игнорировать.

## Решение

In [75]:
import pandas as pd
import numpy as np
import librosa
from pathlib import Path
from statsmodels.formula.api import ols
import statsmodels.api as sm

Загрузим данные.

In [86]:
# Read the speaker metadata table from the CSV file.
speakers_csv = 'data/speakers_all.csv'
data = pd.read_csv(speakers_csv)
data

Unnamed: 0,age,age_onset,birthplace,filename,native_language,sex,speakerid,country,file_missing?,Unnamed: 9,Unnamed: 10,Unnamed: 11
0,24.0,12.0,"koussi, senegal",balanta,balanta,male,788,senegal,True,,,
1,18.0,10.0,"buea, cameroon",cameroon,cameroon,male,1953,cameroon,True,,,
2,48.0,8.0,"hong, adamawa, nigeria",fulfulde,fulfulde,male,1037,nigeria,True,,,
3,42.0,42.0,"port-au-prince, haiti",haitian,haitian,male,1165,haiti,True,,,
4,40.0,35.0,"port-au-prince, haiti",haitian,haitian,male,1166,haiti,True,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2167,46.0,5.0,"lagos, nigeria",yoruba3,yoruba,female,766,nigeria,False,,,
2168,46.0,12.0,"lagos, nigeria",yoruba4,yoruba,male,851,nigeria,False,,,
2169,47.0,2.0,"ibadan, nigeria",yoruba5,yoruba,female,2023,nigeria,False,,,
2170,31.0,1.0,"bethel, alaska, usa",yupik1,yupik,female,571,usa,False,,,


Удалим ненужные столбцы и строки с потерянными файлами.

In [87]:
# Keep only the rows whose 'file_missing?' flag is False.
has_recording = data['file_missing?'].eq(False)

# Drop the unused 'Unnamed: 9..11' columns together with the flag itself,
# which is no longer informative after the filter above.
unused_columns = ['Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'file_missing?']
data = data.loc[has_recording].drop(columns=unused_columns)
data

Unnamed: 0,age,age_onset,birthplace,filename,native_language,sex,speakerid,country
32,27.0,9.0,"virginia, south africa",afrikaans1,afrikaans,female,1,south africa
33,40.0,5.0,"pretoria, south africa",afrikaans2,afrikaans,male,2,south africa
34,43.0,4.0,"pretoria, transvaal, south africa",afrikaans3,afrikaans,male,418,south africa
35,26.0,8.0,"pretoria, south africa",afrikaans4,afrikaans,male,1159,south africa
36,19.0,6.0,"cape town, south africa",afrikaans5,afrikaans,male,1432,south africa
...,...,...,...,...,...,...,...,...
2167,46.0,5.0,"lagos, nigeria",yoruba3,yoruba,female,766,nigeria
2168,46.0,12.0,"lagos, nigeria",yoruba4,yoruba,male,851,nigeria
2169,47.0,2.0,"ibadan, nigeria",yoruba5,yoruba,female,2023,nigeria
2170,31.0,1.0,"bethel, alaska, usa",yupik1,yupik,female,571,usa


Отберем записи, соответствующие странам с минимум 30 респондентами в выборке.

In [88]:
# Count the remaining rows per country and keep only the countries that
# reach the minimum number of respondents.
min_respondents = 30
country_counts = data['country'].value_counts()
enough_respondents = country_counts.ge(min_respondents)
selected_countries = country_counts.loc[enough_respondents].index

data = data.loc[data['country'].isin(selected_countries)]
data

Unnamed: 0,age,age_onset,birthplace,filename,native_language,sex,speakerid,country
50,20.0,5.0,"addis ababa, ethiopia",amharic1,amharic,female,6,ethiopia
51,29.0,19.0,"gonder, ethiopia",amharic10,amharic,female,998,ethiopia
52,24.0,17.0,"addis ababa, ethiopia",amharic11,amharic,female,1129,ethiopia
53,26.0,15.0,"addis ababa, ethiopia",amharic12,amharic,female,1130,ethiopia
54,28.0,6.0,"addis ababa, ethiopia",amharic13,amharic,female,1131,ethiopia
...,...,...,...,...,...,...,...,...
2157,23.0,10.0,"zhuzhou, hunan, china",xiang4,xiang,female,2163,china
2158,19.0,13.0,"yakutsk, russia",yakut1,yakut,female,1252,russia
2161,78.0,6.0,"winnipeg, manitoba, canada",yiddish2,yiddish,female,405,canada
2162,52.0,5.0,"brooklyn, new york, usa",yiddish3,yiddish,male,1161,usa


Получим сумму zero-crossing по каждой из записей.

In [89]:
def compute_zero_crossing(file_name):
    """Return the total number of zero crossings in one recording.

    The recording is read from ``data/recordings/<file_name>.mp3`` with
    ``librosa.load`` called with its default arguments.

    Args:
        file_name: Recording name without the ``.mp3`` extension, as stored
            in the ``filename`` column of the speaker table.

    Returns:
        The number of samples flagged by ``librosa.zero_crossings``, as a
        plain ``int``.
    """
    audio, _ = librosa.load(Path('data', 'recordings', f'{file_name}.mp3'))
    zero_crossings = librosa.zero_crossings(audio, pad=False)
    # Count inside NumPy: the builtin sum() would walk the per-sample mask
    # element by element in Python, which is slow for every recording.
    return int(np.count_nonzero(zero_crossings))

In [111]:
# Build the column with assign() instead of writing through .loc: `data`
# was produced by boolean filtering, so in-place assignment can trigger
# pandas' SettingWithCopyWarning.
data = data.assign(
    zero_crossing=data['filename'].apply(compute_zero_crossing)
)

In [112]:
# Display the table, which now includes the zero_crossing column.
data

Unnamed: 0,age,age_onset,birthplace,filename,native_language,sex,speakerid,country,zero_crossing
50,20.0,5.0,"addis ababa, ethiopia",amharic1,amharic,female,6,ethiopia,68406
51,29.0,19.0,"gonder, ethiopia",amharic10,amharic,female,998,ethiopia,73988
52,24.0,17.0,"addis ababa, ethiopia",amharic11,amharic,female,1129,ethiopia,81180
53,26.0,15.0,"addis ababa, ethiopia",amharic12,amharic,female,1130,ethiopia,64650
54,28.0,6.0,"addis ababa, ethiopia",amharic13,amharic,female,1131,ethiopia,104104
...,...,...,...,...,...,...,...,...,...
2157,23.0,10.0,"zhuzhou, hunan, china",xiang4,xiang,female,2163,china,81333
2158,19.0,13.0,"yakutsk, russia",yakut1,yakut,female,1252,russia,50062
2161,78.0,6.0,"winnipeg, manitoba, canada",yiddish2,yiddish,female,405,canada,63399
2162,52.0,5.0,"brooklyn, new york, usa",yiddish3,yiddish,male,1161,usa,52182


Проведем ANOVA-анализ.

In [117]:
# Linear model of zero_crossing on age, sex, native language and all of
# their interactions (the `*` operator in the formula).
# NOTE(review): typ=1 requests Type I (sequential) sums of squares, which
# depend on term order when the design is unbalanced — confirm that this is
# the intended ANOVA type.
anova_formula = 'zero_crossing ~ age * C(sex) * C(native_language)'
lm = ols(formula=anova_formula, data=data).fit()
table = sm.stats.anova_lm(lm, typ=1)
table

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(sex),1.0,18338970000.0,18338970000.0,25.390356,5.643279e-07
C(native_language),55.0,273225000000.0,4967728000.0,6.877833,4.971828e-40
C(sex):C(native_language),55.0,107674700000.0,1957721000.0,2.71047,1.251458e-09
age,1.0,15225740000.0,15225740000.0,21.080076,5.020936e-06
age:C(sex),1.0,329889000.0,329889000.0,0.456732,0.4993261
age:C(native_language),55.0,64061720000.0,1164759000.0,1.612612,0.003796413
age:C(sex):C(native_language),55.0,31047530000.0,564500500.0,0.781553,0.8751763
Residual,914.0,660164800000.0,722280900.0,,


In [118]:
# Keep only the model terms whose p-value is below the 0.15 significance
# level required by the task.
alpha = 0.15
table.loc[table['PR(>F)'].lt(alpha)]

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(sex),1.0,18338970000.0,18338970000.0,25.390356,5.643279e-07
C(native_language),55.0,273225000000.0,4967728000.0,6.877833,4.971828e-40
C(sex):C(native_language),55.0,107674700000.0,1957721000.0,2.71047,1.251458e-09
age,1.0,15225740000.0,15225740000.0,21.080076,5.020936e-06
age:C(native_language),55.0,64061720000.0,1164759000.0,1.612612,0.003796413


Как мы видим, на уровне значимости 0.15 zero-crossing зависит от возраста, родного языка и пола; кроме того, в данной выборке значимы взаимодействия пола и родного языка, а также возраста и родного языка — выборка не сбалансирована.