This notebook preprocesses the Common Voice dataset: it converts the audio clips to .wav at a 16 kHz sample rate and computes a syllable-count label for each clip from its transcript.

In [1]:
!ls 

en  hy-AM  it  Preprocessing.ipynb  ru


In [2]:
!pwd

/home/dianasimonyan/Desktop/Thesis/SpeakingRateEstimation/data/CommonVoice


In [3]:
!ls it

clip_durations.tsv  other.tsv	  unvalidated_sentences.tsv
clips		    reported.tsv  validated_sentences.tsv
dev.tsv		    test.tsv	  validated.tsv
invalidated.tsv     train.tsv


In [4]:
import pandas as pd
import os
import subprocess
import tqdm

In [5]:
# Validated-clip metadata of the Italian subset (tab-separated values);
# the path is relative to the notebook's working directory.
file_path = 'it/validated.tsv'
df = pd.read_csv(file_path, delimiter='\t')
df.columns


Index(['client_id', 'path', 'sentence_id', 'sentence', 'sentence_domain',
       'up_votes', 'down_votes', 'age', 'gender', 'accents', 'variant',
       'locale', 'segment'],
      dtype='object')

In [6]:
# Keep only the clip file name and its transcript; the other metadata
# columns are not used for labelling.
data = df.loc[:, ['path', 'sentence']]
data.head()

Unnamed: 0,path,sentence
0,common_voice_it_23606167.mp3,Il libro ha suscitato molte polemiche a causa ...
1,common_voice_it_20045040.mp3,Fin dall'inizio la sede episcopale è stata imm...
2,common_voice_it_26970935.mp3,"Fu il fondatore di molti chiostri, ospedali e ..."
3,common_voice_it_17544185.mp3,Il vuoto assoluto?
4,common_voice_it_20042813.mp3,"Dopo alcuni anni, egli decise di tornare in In..."


In [57]:
# it_alphabet = set()
# for i, row in tqdm.tqdm(data.iterrows()):
#     for ltr in row['sentence'].lower():
#         it_alphabet.add(ltr)


In [58]:
# for s in it_alphabet:
#     print(s, ord(s))

In [59]:
# # vowels extracted from our dataset
# it_reg_vowel_codes = [97, 161, 111, 1077, 1086, 105, 101, 1072, 97]
# it_stressed_vowel_codes =[246, 228, 333, 1105, 337, 279, 365, 7877, 226, 225, 281, 232, 252, 242, 235, 283, 250, 251, 227, 240, 244, 1104,
#            237, 238, 233, 229, 261, 236, 249, 239, 117, 243, 257, 363, 224, 234]

In [60]:
# (rows, columns) of the path/sentence table.
print(data.shape)

(236600, 2)


In [61]:
# Number of missing values in each column.
data.isna().sum()

path        0
sentence    0
dtype: int64

In [62]:
# Discard every row in which the path or the sentence is missing.
data = data.dropna(axis=0, how='any')

In [63]:
# Missing values per column, checked again on the current `data`.
data.isna().sum()


path        0
sentence    0
dtype: int64

In [64]:
# Number of rows that exactly repeat an earlier (path, sentence) row.
data.duplicated().sum()

0

In [65]:
# (rows, columns) of the current `data` table.
print(data.shape)

(236600, 2)


In [59]:
def AM_syl_computing(sentence):
    """Estimate the number of syllables in an Armenian sentence.

    The estimate is the number of vowel letters in the lower-cased text.

    Parameters
    ----------
    sentence : str
        Armenian text.

    Returns
    -------
    int
        Number of vowel letters found in ``sentence``.
    """
    # Code points of ա, ե, է, ը, ի, ո, օ. The digraph 'ու' (ո + ւ) is counted
    # once through its first letter 'ո'.
    # 1415 is the ligature 'և' (ե + ւ): it carries one vowel but is a single
    # character that lower() leaves unchanged, so it has to be listed explicitly
    # (otherwise e.g. 'Երևան' is counted as 2 instead of 3).
    unicode_arm_vowels = {1377, 1381, 1383, 1384, 1387, 1400, 1413, 1415}

    return sum(1 for s in sentence.lower() if ord(s) in unicode_arm_vowels)
    

In [60]:
def Eng_syl_computing(sentence):
    """Return the number of vowel letters (a, e, i, o, u, y) in an English sentence.

    Matching is case-insensitive; the resulting count is used as the
    syllable estimate for the sentence.
    """
    vowel_letters = ('a', 'e', 'i', 'o', 'u', 'y')
    return sum(1 for ch in sentence.lower() if ch in vowel_letters)

In [61]:
 def Ru_syl_computing(sentence):
    """Return the number of syllables in a Russian sentence.

    The sentence is lower-cased and every Russian vowel letter
    (а, я, у, ю, о, е, ё, э, и, ы) contributes one syllable.
    """
    # Unicode code points of the ten vowel letters listed above.
    vowel_code_points = (1072, 1103, 1091, 1102, 1086, 1077, 1105, 1101, 1080, 1099)

    matched = [ch for ch in sentence.lower() if ord(ch) in vowel_code_points]
    return len(matched)
    

In [52]:
def It_syl_computing(sentence):
    """Estimate the number of syllables in an Italian sentence.

    The estimate is the number of vowel letters (plain or accented) in the
    lower-cased text.

    Parameters
    ----------
    sentence : str
        Italian text.

    Returns
    -------
    int
        Number of vowel letters found in ``sentence``.
    """
    # a, e, i, o, u
    reg_vowels_codes = [0x0061, 0x0065, 0x0069, 0x006F, 0x0075]
    # à, è, é, ì, ò, ó, ù
    stressed_vowels_codes = [0x00E0, 0x00E8, 0x00E9, 0x00EC, 0x00F2, 0x00F3, 0x00F9]
    # The former chr() conversions of `it_reg_vowels` / `it_stressed_vowels` were
    # removed: those names are not defined in this function and the converted
    # lists were never used, so the function failed with NameError.
    vowel_codes = set(reg_vowels_codes) | set(stressed_vowels_codes)

    return sum(1 for s in sentence.lower() if ord(s) in vowel_codes)

In [None]:
# Convert every validated clip to a 16 kHz .wav file whose name encodes the
# row index and the syllable-count label: "<index>_<syllables>.wav".

# Relative to the notebook's working directory, like 'it/validated.tsv' above.
root_dir = os.path.join('it', 'clips')
# Sibling of `root_dir`. Built from the parent directory so that only the last
# path component changes (str.replace would rewrite every 'clips' in the path).
save_dir = os.path.join(os.path.dirname(root_dir), 'clips_wav_16khz_labeled')
os.makedirs(save_dir, exist_ok=True)
sample_rate = 16000

# Equivalent shell command: ffmpeg -y -i input.mp3 -ar 16000 output.wav
for i, row in tqdm.tqdm(data.iterrows(), total=len(data)):
    file = row['path']
    src_path = os.path.join(root_dir, file)

    if not os.path.exists(src_path):
        print('file does not exist')
        print(f'#{i}. ', file)
        # Nothing to convert: do not launch ffmpeg on a missing input.
        continue

    syl_count = It_syl_computing(row['sentence'])
    dst_path = os.path.join(save_dir, f"{i}_{syl_count}.wav")

    # '-y' overwrites an existing output, so the cell can be re-run; stdin is
    # detached so that ffmpeg can never block waiting for keyboard input.
    cmd = ['ffmpeg', '-y', '-i', src_path, '-ar', f'{sample_rate}', dst_path]

    # subprocess.run drains both pipes while waiting; Popen(...).wait() with
    # PIPE can deadlock once ffmpeg fills the OS pipe buffer with log output.
    proc = subprocess.run(cmd, stdin=subprocess.DEVNULL,
                          stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    return_code = proc.returncode

    if return_code:
        print(f"Subprocess failed with return code {return_code}.")
        print(f'#{i}. ', file)
        print(' '.join(cmd))
        # Show the captured text itself rather than the pipe objects.
        print(proc.stdout.decode(errors='replace'))
        print(proc.stderr.decode(errors='replace'))
        print()
    

22747it [05:48, 71.41it/s]