This notebook preprocesses the Common Voice dataset: it converts the clips to .wav format at a 16,000 Hz sample rate and computes a syllable-count label for each clip.

In [120]:
!ls 

en  es	hy-AM  it  Preprocessing.ipynb


In [121]:
!pwd

/home/dianasimonyan/Desktop/Thesis/SpeakingRateEstimation/data/CommonVoice


In [123]:
!ls es

clips	 invalidated.tsv  reported.tsv	train.tsv
dev.tsv  other.tsv	  test.tsv	validated.tsv


In [1]:
import pandas as pd
import os
import subprocess
import tqdm
from IPython.display import Audio
import glob


In [2]:
# Load the validated-clips metadata (a tab-separated file) and list its columns
file_path = 'es/validated.tsv'
df = pd.read_table(file_path)
df.columns


Index(['client_id', 'path', 'sentence', 'up_votes', 'down_votes', 'age',
       'gender', 'accents', 'locale', 'segment'],
      dtype='object')

In [3]:
# Keep only the clip file name and its transcript
data = df.loc[:, ['path', 'sentence']]
data.head()

Unnamed: 0,path,sentence
0,common_voice_es_19698530.mp3,Habita en aguas poco profundas y rocosas.
1,common_voice_es_19987333.mp3,Opera principalmente vuelos de cabotaje y regi...
2,common_voice_es_19691402.mp3,Para visitar contactar primero con la dirección.
3,common_voice_es_23747242.mp3,En los dos años siguientes trabajó de manera c...
4,common_voice_es_22044222.mp3,tres


In [143]:
# it_alphabet = set()
# for i, row in tqdm.tqdm(data.iterrows()):
#     for ltr in row['sentence'].lower():
#         it_alphabet.add(ltr)


In [144]:
# for s in it_alphabet:
#     print(s, ord(s))

In [145]:
# # vowels extracted from our dataset
# it_reg_vowel_codes = [97, 161, 111, 1077, 1086, 105, 101, 1072, 97]
# it_stressed_vowel_codes =[246, 228, 333, 1105, 337, 279, 365, 7877, 226, 225, 281, 232, 252, 242, 235, 283, 250, 251, 227, 240, 244, 1104,
#            237, 238, 233, 229, 261, 236, 249, 239, 117, 243, 257, 363, 224, 234]

In [4]:
print(data.shape)

(293025, 2)


In [5]:
data.isna().sum()

path        0
sentence    0
dtype: int64

In [6]:
data = data.dropna(how='any')

In [7]:
data.isna().sum()


path        0
sentence    0
dtype: int64

In [8]:
data.duplicated().sum()

0

In [9]:
print(data.shape)

(293025, 2)


In [10]:
def AM_syl_computing(sentence):
    """Return the approximate number of syllables in an Armenian sentence.

    Each vowel letter counts as one syllable: ա, ե, է, ը, ի, ո, օ.
    The digraph ու is counted once, through its first letter ո (its second
    letter ւ is not in the vowel table). The ligature և (U+0587) stands for
    ե + ւ and therefore carries one vowel as well, so it is counted too.

    Parameters
    ----------
    sentence : str
        Armenian text; letter case is ignored.

    Returns
    -------
    int
        Number of vowel letters found in ``sentence``.
    """
    # Code points of: ա ե է ը ի ո օ և
    vowel_codes = {1377, 1381, 1383, 1384, 1387, 1400, 1413, 1415}
    return sum(1 for ch in sentence.lower() if ord(ch) in vowel_codes)
    

In [11]:
def Eng_syl_computing(sentence):
    """Return the number of vowel letters in an English sentence.

    The letters a, e, i, o, u and y are matched case-insensitively; every
    occurrence adds one to the result, which serves as the syllable count.
    """
    vowel_letters = frozenset('aeiouy')
    return len([ch for ch in sentence.lower() if ch in vowel_letters])

In [12]:
 def Ru_syl_computing(sentence):
     """Return the number of vowel letters in a Russian sentence.

     The sentence is lower-cased and every character whose code point is one
     of а, я, у, ю, о, е, ё, э, и, ы adds one to the syllable count.
     """
     # Code points of: а я у ю о е ё э и ы
     vowel_codes = (1072, 1103, 1091, 1102, 1086, 1077, 1105, 1101, 1080, 1099)
     return sum(1 for ch in sentence.lower() if ord(ch) in vowel_codes)
    

In [13]:
def It_syl_computing(sentence):
    """Return the approximate number of syllables in an Italian sentence.

    Each vowel letter counts as one syllable. Both the plain vowels
    (a, e, i, o, u) and the accented ones (à, è, é, ì, ò, ó, ù) are matched,
    case-insensitively.

    Parameters
    ----------
    sentence : str
        Italian text.

    Returns
    -------
    int
        Number of vowel letters found in ``sentence``.
    """
    # Code points of: a e i o u
    reg_vowels_codes = {0x0061, 0x0065, 0x0069, 0x006F, 0x0075}
    # Code points of: à è é ì ò ó ù
    stressed_vowels_codes = {0x00E0, 0x00E8, 0x00E9, 0x00EC, 0x00F2, 0x00F3, 0x00F9}
    # The original body also built two character lists from the names
    # `it_reg_vowels` / `it_stressed_vowels`, which are not defined anywhere,
    # so every call raised NameError. The lists were never used; removed.
    vowel_codes = reg_vowels_codes | stressed_vowels_codes
    return sum(1 for ch in sentence.lower() if ord(ch) in vowel_codes)

In [14]:
def Sp_syl_computing(sentence):
    """Return the number of vowel letters in a Spanish sentence.

    The sentence is lower-cased and each plain vowel (a, e, i, o, u) or
    marked vowel (á, é, í, ó, ú, ü) adds one to the syllable count.
    """
    spanish_vowels = frozenset('aeiou' 'áéíóúü')
    total = 0
    for ch in sentence.lower():
        if ch in spanish_vowels:
            total += 1
    return total

In [None]:
# Convert every listed mp3 clip to a 16 kHz wav file named "<row index>_<syllable count>.wav".
root_dir = '/home/dianasimonyan/Desktop/Thesis/SpeakingRateEstimation/data/CommonVoice/es/clips'
# Sibling directory of the clips folder. Built from the parent directory so
# that only the last path component changes (str.replace would rewrite every
# occurrence of the folder name anywhere in the path).
save_dir = os.path.join(os.path.dirname(root_dir), 'clips_wav_16khz_labeled')
os.makedirs(save_dir, exist_ok=True)
sample_rate = 16000

# Equivalent shell command: ffmpeg -i input.mp3 -ar 16000 output.wav
for i, row in tqdm.tqdm(data.iterrows(), total=len(data)):
    file = row['path']
    src_path = os.path.join(root_dir, file)

    if not os.path.exists(src_path):
        print('file does not exist')
        # Nothing to convert; without this ffmpeg would be run on a missing input.
        continue

    # The label is the syllable count of the transcript.
    syl_count = Sp_syl_computing(row['sentence'])
    dst_path = os.path.join(save_dir, f"{i}_{syl_count}.wav")

    # -nostdin: never wait for keyboard input; -y: overwrite an existing
    # output instead of stopping at the "Overwrite?" prompt on a re-run.
    cmd = ['ffmpeg', '-nostdin', '-y', '-i', src_path, '-ar', f'{sample_rate}', dst_path]

    # run() drains both pipes while waiting. Popen(...).wait() with PIPEs can
    # block forever once the child fills a pipe buffer.
    proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    return_code = proc.returncode

    if return_code:
        print(f"Subprocess failed with return code {return_code}.")
        print(f'#{i}. ', file)
        print(' '.join(cmd))
        # Print the captured text; undecodable bytes are replaced, not fatal.
        print(proc.stdout.decode(errors='replace'))
        print(proc.stderr.decode(errors='replace'))
        print()
    

275154it [1:06:04, 74.70it/s]

In [15]:
# Sanity check: collect the paths of all converted wav clips.
data_dir = '/home/dianasimonyan/Desktop/Thesis/SpeakingRateEstimation/data/CommonVoice/es/clips_wav_16khz_labeled/**/*.wav'
audios = glob.glob(data_dir, recursive=True)
# Preview only the first few paths; displaying the whole list floods the
# notebook output with one line per converted clip.
audios[:10]

['/home/dianasimonyan/Desktop/Thesis/SpeakingRateEstimation/data/CommonVoice/es/clips_wav_16khz_labeled/23418_33.wav',
 '/home/dianasimonyan/Desktop/Thesis/SpeakingRateEstimation/data/CommonVoice/es/clips_wav_16khz_labeled/119396_11.wav',
 '/home/dianasimonyan/Desktop/Thesis/SpeakingRateEstimation/data/CommonVoice/es/clips_wav_16khz_labeled/122570_16.wav',
 '/home/dianasimonyan/Desktop/Thesis/SpeakingRateEstimation/data/CommonVoice/es/clips_wav_16khz_labeled/204350_33.wav',
 '/home/dianasimonyan/Desktop/Thesis/SpeakingRateEstimation/data/CommonVoice/es/clips_wav_16khz_labeled/47688_15.wav',
 '/home/dianasimonyan/Desktop/Thesis/SpeakingRateEstimation/data/CommonVoice/es/clips_wav_16khz_labeled/180875_18.wav',
 '/home/dianasimonyan/Desktop/Thesis/SpeakingRateEstimation/data/CommonVoice/es/clips_wav_16khz_labeled/132541_26.wav',
 '/home/dianasimonyan/Desktop/Thesis/SpeakingRateEstimation/data/CommonVoice/es/clips_wav_16khz_labeled/119779_30.wav',
 '/home/dianasimonyan/Desktop/Thesis/Speak

In [17]:
print(len(audios))

292303


In [26]:
# Spot check: play one converted clip next to its transcript.
sample_path = audios[-1]
# File names are "<row index>_<syllable count>.wav", so the matching row label
# is read from the name itself. A hard-coded list position and row label only
# agree for one particular ordering of the glob result.
row_index = int(os.path.basename(sample_path).split('_')[0])
print(sample_path)
print(data.loc[row_index, 'sentence'])
Audio(sample_path)

/home/dianasimonyan/Desktop/Thesis/SpeakingRateEstimation/data/CommonVoice/es/clips_wav_16khz_labeled/238452_24.wav
Marchó hacia el Oriente hasta los llanos y luego al sur hasta Pasto.


Italian vowels: [article](https://www.berlitz.com/blog/italian-vowels)  
Spanish vowels and accent marks: [article](https://baselang.com/blog/advanced-grammar/spanish-accent-marks/)