This notebook preprocesses the Common Voice dataset: it converts the clips to .wav at a 16 kHz sample rate and computes a syllable-count label for each clip.

In [1]:
!ls 

hy-AM  Preprocessing.ipynb  ru


In [2]:
!pwd

/home/dianasimonyan/Desktop/Thesis/SpeakingRateEstimation/data/CommonVoice


In [3]:
!ls ru

clip_durations.tsv  other.tsv	  unvalidated_sentences.tsv
clips		    reported.tsv  validated_sentences.tsv
dev.tsv		    test.tsv	  validated.tsv
invalidated.tsv     train.tsv


In [4]:
import pandas as pd
import os
import subprocess
import tqdm

In [5]:
# Load the validated-clips metadata (tab-separated) and list its columns.
file_path = 'ru/validated.tsv'
df = pd.read_table(file_path)
df.columns


Index(['client_id', 'path', 'sentence_id', 'sentence', 'sentence_domain',
       'up_votes', 'down_votes', 'age', 'gender', 'accents', 'variant',
       'locale', 'segment'],
      dtype='object')

In [6]:
# Keep only the clip file name and its transcript.
data = df.loc[:, ['path', 'sentence']]
data.head()

Unnamed: 0,path,sentence
0,common_voice_ru_23589751.mp3,Масштабы финансово-экономического кризиса и те...
1,common_voice_ru_22727258.mp3,Масштабы финансово-экономического кризиса и те...
2,common_voice_ru_26587318.mp3,"К сожалению, эти предложения не нашли отражени..."
3,common_voice_ru_39228705.mp3,"Наконец, я хочу поблагодарить всех присутствую..."
4,common_voice_ru_36292648.mp3,"Толпа озвереет, будет тереться, ощетинит ножки..."


In [7]:
# Show the (rows, columns) size of the selected subset.
print(data.shape)

(163387, 2)


In [8]:
# Count missing values in each column.
data.isna().sum()

path        0
sentence    0
dtype: int64

In [9]:
# Keep only the rows in which every column has a value.
data = data[data.notna().all(axis=1)]

In [10]:
# Re-check the per-column missing-value counts after dropping incomplete rows.
data.isna().sum()


path        0
sentence    0
dtype: int64

In [11]:
# Count rows that exactly repeat an earlier (path, sentence) pair.
data.duplicated().sum()

0

In [12]:
def AM_syl_computing(sentence):
    """Estimate the number of syllables in an Armenian sentence.

    The estimate is the number of vowel characters in the lower-cased text.

    * The digraph 'ու' is counted once, through its first letter 'ո'
      (the second letter 'ւ' is not in the vowel set).
    * The ligature 'և' (U+0587) stands for 'եւ' and therefore contributes
      one vowel as well; without it, words such as 'բարև' are under-counted.

    Args:
        sentence (str): Armenian text.

    Returns:
        int: number of vowel characters found in ``sentence``.
    """
    # Single-character vowels plus the 'և' ligature; a frozenset gives
    # constant-time membership tests.
    arm_vowels = frozenset('աեէըիոօև')
    return sum(1 for ch in sentence.lower() if ch in arm_vowels)
    

In [13]:
def Eng_syl_computing(sentence):
    """Approximate the syllable count of an English sentence.

    Every occurrence of the letters a, e, i, o, u and y is counted,
    ignoring case.

    Args:
        sentence (str): English text.

    Returns:
        int: number of vowel letters found in ``sentence``.
    """
    return sum(1 for letter in sentence.lower() if letter in 'aeiouy')

In [14]:
 def Ru_syl_computing(sentence):
     """Approximate the syllable count of a Russian sentence.

     Every occurrence of one of the ten Russian vowel letters
     (а, я, у, ю, о, е, ё, э, и, ы) is counted, ignoring case.

     Args:
         sentence (str): Russian text.

     Returns:
         int: number of vowel letters found in ``sentence``.
     """
     vowel_chars = frozenset('аяуюоеёэиы')
     total = 0
     for letter in sentence.lower():
         if letter in vowel_chars:
             total += 1
     return total
    

In [15]:
# Source directory with the original .mp3 clips.
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
root_dir = '/home/dianasimonyan/Desktop/Thesis/SpeakingRateEstimation/data/CommonVoice/ru/clips'
# Output directory is a sibling of `root_dir`. Built from the parent directory
# rather than with str.replace, which would rewrite every occurrence of the
# last path component anywhere in the path.
save_dir = os.path.join(os.path.dirname(root_dir), 'clips_wav_16khz_labeled')
os.makedirs(save_dir, exist_ok=True)
sample_rate = 16000

# Convert every clip to a `sample_rate` Hz .wav named "<row index>_<syllable count>.wav".
# Equivalent shell command: ffmpeg -i input.mp3 -ar 16000 output.wav
for i, row in tqdm.tqdm(data.iterrows(), total=len(data)):
    file = row['path']
    src_path = os.path.join(root_dir, file)

    # Skip missing clips; launching ffmpeg on them can only fail.
    if not os.path.exists(src_path):
        print('file does not exist:', src_path)
        continue

    # The syllable count is the label and is encoded in the output file name.
    syl_count = Ru_syl_computing(row['sentence'])
    dst_path = os.path.join(save_dir, f'{i}_{syl_count}.wav')

    # -nostdin: never wait for interactive input; -y: overwrite an existing
    # output so the cell can be re-run.
    cmd = ['ffmpeg', '-nostdin', '-y', '-i', src_path, '-ar', str(sample_rate), dst_path]

    # subprocess.run drains both pipes while waiting, so a chatty ffmpeg cannot
    # block on a full pipe buffer (which Popen(...PIPE...).wait() allows).
    result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    return_code = result.returncode

    if return_code:
        print(f"Subprocess failed with return code {return_code}.")
        print(f'#{i}. ', file)
        print(' '.join(cmd))
        # Print the captured text itself, not the pipe objects.
        print(result.stdout.decode(errors='replace'))
        print(result.stderr.decode(errors='replace'))
        print()
    

163387it [37:11, 73.23it/s]
