In [1]:
import os
import pandas as pd

In [2]:
# Folder holding the per-language metadata files (listed in the next cell).
metadata_rootpath = "../data/MSV_CommonVoice_data/metadata"
language_metadata_paths = os.listdir(metadata_rootpath)

# How many metadata files were found.
num_metadata_files = len(language_metadata_paths)
print(num_metadata_files)

9


In [3]:
# Show the name of every metadata file, one per line.
for metadata_filename in language_metadata_paths:
    print(metadata_filename)

train_list_ja.txt
train_list_ta.txt
train_list_uz.txt
train_list_hi.txt
train_list_vi.txt
train_list_fr.txt
train_list_zh-CN.txt
train_list_th.txt
train_list_en.txt


In [4]:
def data_statistic(metadata_rootpath: str, result_path: str):
    """Build a per-speaker, per-language utterance table from the metadata files.

    Every entry of ``metadata_rootpath`` named ``train_list_<abbrev>.txt`` is read
    as tab-separated ``speaker_id<TAB>wav_path`` lines. Utterances are grouped by
    speaker within each language, the rows are sorted by
    (language, number of utterances, speaker id), written to ``result_path`` as
    CSV and returned.

    Args:
        metadata_rootpath: Directory containing the ``train_list_<abbrev>.txt`` files.
            Entries that do not follow that naming pattern (e.g. notebook
            checkpoint folders) are ignored.
        result_path: Destination CSV path; missing parent directories are created.

    Returns:
        A ``pandas.DataFrame`` with the columns ``"Speaker ID"``, ``"Language"``,
        ``"#Utterances"`` and ``"Wav paths"`` (a list of wav paths per row).

    Raises:
        KeyError: If a metadata file uses a language abbreviation that is not known.
        ValueError: If a non-empty line does not consist of exactly two
            tab-separated fields.
    """
    # NOTE: the display names are written to the CSV, so they are kept exactly as
    # they have always been emitted (spelling included) to stay compatible with
    # previously generated results.
    abbrev2lan = {"en": "English", "fr": "French", "hi": "Hindi", "ja": "Japanese", "ta": "Tamil", "th": "Thai", "uz": "Uzbekistan", "vi": "Vietnameese", "zh-CN": "Chinese"}
    prefix, suffix = "train_list_", ".txt"
    languages_stat = []  # rows of (speaker_id, language, #utterances, [wav_paths])

    # Sorted so that files are processed in a reproducible order on every platform.
    for filename in sorted(os.listdir(metadata_rootpath)):
        # Skip anything that is not a metadata list (hidden files, checkpoints, ...).
        if not (filename.startswith(prefix) and filename.endswith(suffix)):
            continue

        language_abbrev = filename[len(prefix):-len(suffix)]
        if language_abbrev not in abbrev2lan:
            raise KeyError(
                f"Unknown language abbreviation {language_abbrev!r} in metadata file {filename!r}"
            )
        language = abbrev2lan[language_abbrev]

        # speaker_id -> wav paths, in order of first appearance in the file.
        wav_paths_by_speaker = {}
        with open(os.path.join(metadata_rootpath, filename), encoding="utf-8") as f_read:
            for line_number, line in enumerate(f_read, start=1):
                record = line.strip()
                if not record:
                    # Blank (typically trailing) lines carry no utterance.
                    continue
                fields = record.split("\t")
                if len(fields) != 2:
                    raise ValueError(
                        f"{filename}:{line_number}: expected 'speaker_id<TAB>wav_path', got {record!r}"
                    )
                speaker_id, wav_path = fields
                wav_paths_by_speaker.setdefault(speaker_id, []).append(wav_path)

        languages_stat.extend(
            (speaker_id, language, len(wav_paths), wav_paths)
            for speaker_id, wav_paths in wav_paths_by_speaker.items()
        )

    # Order by language, then utterance count, then speaker id.
    languages_stat.sort(key=lambda row: (row[1], row[2], row[0]))
    stat_df = pd.DataFrame(languages_stat, columns=["Speaker ID", "Language", "#Utterances", "Wav paths"])

    # Make sure the destination folder exists before writing the CSV.
    result_dir = os.path.dirname(result_path)
    if result_dir:
        os.makedirs(result_dir, exist_ok=True)
    stat_df.to_csv(result_path, index=False)

    return stat_df

In [5]:
# Compute the per-speaker statistics and save them as a CSV file.
data_stat_df = data_statistic(
    metadata_rootpath="../data/MSV_CommonVoice_data/metadata",
    result_path="../output/data_stat.csv",
)

In [6]:
# Working frame with the statistics columns in a fixed, explicit order.
stat_columns = ["Speaker ID", "Language", "#Utterances", "Wav paths"]
stat_df = pd.DataFrame(data_stat_df, columns=stat_columns)

In [7]:
# Preview the first rows of the per-speaker statistics table.
stat_df.head()

Unnamed: 0,Speaker ID,Language,#Utterances,Wav paths
0,02ec74191c6ccc7dcf6ecaa217268263c477273b4de93f...,Chinese,1,[common_voice_zh-CN_22069600.wav]
1,0431cf00d4491b99a93700d7aa0b1948a057b2c162a620...,Chinese,1,[common_voice_zh-CN_22006851.wav]
2,04742f27bccab99619bd4ec3f256b36c639afd058c8664...,Chinese,1,[common_voice_zh-CN_22115132.wav]
3,0648def3862cbb968eec23fad967f50e35fc8e0eea67b4...,Chinese,1,[common_voice_zh-CN_22120171.wav]
4,0697ece1f99a08477906d0f3b4e74e1d6ffca76c20a7db...,Chinese,1,[common_voice_zh-CN_18646658.wav]


In [8]:
# (rows, columns) of the statistics table; one row per (speaker, language) pair.
stat_df.shape

(17864, 4)

In [9]:
# Summary statistics of the numeric column(s), i.e. the utterance count per row.
stat_df.describe()

Unnamed: 0,#Utterances
count,17864.0
mean,33.439431
std,473.867425
min,1.0
25%,3.0
50%,5.0
75%,10.0
max,44728.0


In [10]:
# Check whether any speaker appears in more than one language.
# After the groupby, the "Language" column holds the number of distinct languages per speaker.
languages_per_speaker = (
    stat_df.groupby("Speaker ID")["Language"]
    .nunique()
    .reset_index()
)
speaker_multi_lan_df = languages_per_speaker.loc[lambda frame: frame["Language"] > 1]

print(len(speaker_multi_lan_df))
print(speaker_multi_lan_df["Language"].unique())

108
[2 3 4]


In [11]:
# Number of distinct speakers and total number of utterances for each language.
# In this summary the "Speaker ID" column holds the count of unique speakers.
speaker_utterance_per_lan = (
    stat_df.groupby("Language")
    .agg({"Speaker ID": "nunique", "#Utterances": "sum"})
    .reset_index()
)
speaker_utterance_per_lan.sort_values(by=["Speaker ID", "#Utterances"], ascending=False)

Unnamed: 0,Language,Speaker ID,#Utterances
6,Thai,5515,126058
1,English,3975,100991
0,Chinese,3716,45160
2,French,2495,90035
7,Uzbekistan,901,79704
5,Tamil,475,112919
4,Japanese,465,29447
3,Hindi,226,9189
8,Vietnameese,96,3859
