# Data Science Lifecycle
©️ 2021, Mohamed Bani Hani and Ahmad Ibrahim. All Rights Reserved.

------
## Case Study: SPOKEN-LANGUAGE-IDENTIFICATION
*Objective:*
Automatically identify the spoken language from a speech audio signal with TensorFlow.

## Dataset Description
The dataset used in the notebooks is based on Mozilla’s Common Voice. You will need to download the English and the Arabic datasets. Each dataset contains an audio file and five Excel files. Each Excel file contains information about the donors (age, gender, accent, sentence, client_id).


In [1]:
# Names of the dataset splits and of the languages to tell apart.
train, test = 'train', 'test'
eng, ara = 'english', 'arabic'

categories = [train, test]
languages = [eng, ara]

# Folder of the downloaded corpus for each language.
original_dataset_paths = {
    eng: 'C:/Users/GTS/Desktop/EN &AR/cv-corpus-6.1-2020-12-11/en/',
    ara: 'C:/Users/GTS/Desktop/EN &AR/cv-corpus-6.1-2020-12-11/ar/',
}

# Root folder that holds the <category>/<language> sub-folders.
target_root_path = 'C:/Users/GTS/Desktop/EN &AR/cv-corpus-6.1-2020-12-11/data/'

# Number of audio files taken for each language, and the share of them
# that is used for training (the rest is used for testing).
num_files_to_take_for_each_language = 20000
train_rate = 0.8

In [41]:
# Check that the source folder of every language and every
# <target_root_path>/<category>/<language> folder exist before any file is
# copied. Stops with an error naming the first missing folder.
import os

for lang in languages:
    source_dir = original_dataset_paths[lang]
    if not os.path.isdir(source_dir):
        # A bare `raise` has no active exception to re-raise here and would
        # only produce an uninformative RuntimeError.
        raise FileNotFoundError('Dataset folder "' + source_dir + '" for language ' + lang + ' does not exist!')
    for category in categories:
        target_dir = target_root_path + category + '/' + lang
        if not os.path.isdir(target_dir):
            raise FileNotFoundError('Target folder "' + target_dir + '" does not exist!')


In [36]:
import librosa as lr
from glob import glob
from random import shuffle
from shutil import copy2
import numpy as np
import pandas as pd
import warnings

In [2]:
def copy_audio_files_for_language(lang):
    """Copy a random selection of usable validated clips into the train/test folders.

    The clip names are read from the ``path`` column of ``validated.tsv`` in
    the language's source folder and shuffled. A clip is copied when it loads
    without error, is not completely silent, is a one-dimensional signal and
    lasts more than 1.5 and less than 10 seconds. The first
    ``int(num_files_to_take_for_each_language * train_rate)`` copied clips go
    to the train folder, the following ones to the test folder, and copying
    stops once ``num_files_to_take_for_each_language`` clips were copied.

    Clips that cannot be loaded or checked are reported with a printed
    warning and skipped.

    Args:
        lang: Key into ``original_dataset_paths``; also used as the name of
            the language sub-folder below ``target_root_path``.
    """
    print('')
    print('Copying files for language ' + lang + '...')
    print('')

    # Only take validated speech data
    df = pd.read_csv(original_dataset_paths[lang] + 'validated.tsv', sep='\t')
    all_filenames = df['path'].tolist()
    shuffle(all_filenames)

    # Number of copied clips after which the target switches from train to test.
    num_train_files = int(num_files_to_take_for_each_language * train_rate)

    counter = 0
    category = train

    # picking clean audio
    for filename in all_filenames:
        file = original_dataset_paths[lang] + 'clips/' + filename
        try:
            audio_segment, sample_rate = lr.load(file)
            if np.count_nonzero(audio_segment) == 0:
                raise Exception('Audio is silent!')
            if audio_segment.ndim != 1:
                raise Exception('Audio signal has wrong number of dimensions: ' + str(audio_segment.ndim))
            # Pass the signal by keyword: newer librosa releases accept `y`
            # only as a keyword argument, and the TypeError raised by a
            # positional call would be swallowed below, skipping every clip.
            duration_sec = lr.get_duration(y=audio_segment, sr=sample_rate)
        except Exception as e:
            # Deliberately broad: one unreadable clip must not abort the whole run.
            print('WARNING! Error while loading file \"' + file + '\": ' + str(e) + ' - Skipping...')
            continue

        # Only copy audio files whose duration lies inside the accepted range
        if not 1.5 < duration_sec < 10.0:
            continue

        copy2(file, target_root_path + category + '/' + lang)
        counter += 1

        # Switch to the test folder once the train quota is filled
        if counter == num_train_files:
            category = test
        # Stop after collecting enough files
        if counter == num_files_to_take_for_each_language:
            break
    else:
        # Only reached when the loop was not left through `break`, i.e. the
        # corpus ran out of clips before the requested number was copied.
        print('WARNING! Only ' + str(counter) + ' of ' + str(num_files_to_take_for_each_language)
              + ' files could be copied for language ' + lang + '.')

In [38]:
# Silence UserWarnings only while the files are copied. catch_warnings()
# restores the previous warning filters on exit, also when the copy fails.
with warnings.catch_warnings():
    warnings.simplefilter('ignore', UserWarning)
    copy_audio_files_for_language(ara)


Copying files for language arabic...



In [77]:
# start at 5 
# Silence UserWarnings only while the files are copied. catch_warnings()
# restores the previous warning filters on exit, also when the copy fails.
with warnings.catch_warnings():
    warnings.simplefilter('ignore', UserWarning)
    copy_audio_files_for_language(eng)


Copying files for language english...



In [78]:
# Make sure the copy step produced the expected number of files in every
# <category>/<language> folder (with the settings above: 16K files for
# training and 4K files for testing per language).
num_train_files = int(num_files_to_take_for_each_language * train_rate)
# Derive the test count by integer subtraction. int(total * (1.0 - train_rate))
# truncates 3999.99... to 3999 for 20000 files and a rate of 0.8 because
# 1.0 - 0.8 is not exactly 0.2 in floating point.
num_test_files = num_files_to_take_for_each_language - num_train_files

for category in categories:

    if category == train:
        num_files = num_train_files
    else:
        num_files = num_test_files

    for lang in languages:
        folder = target_root_path + category + '/' + lang + '/'
        all_files = glob(folder + '*.mp3')

        # Exact comparison: the expected counts are exact integers now, so no
        # tolerance of one file is needed.
        if len(all_files) < num_files:
            raise Exception('Folder \"' + folder + '\" only contains ' + str(len(all_files)) + ' files instead of ' + str(num_files) + '!')

print('Okay!')

Okay!


In [79]:
# Print the total duration in hours of the mp3 files in every
# <category>/<language> folder. UserWarnings are silenced only inside the
# `with` block; catch_warnings() restores the previous filters on exit, also
# when reading a file fails.
with warnings.catch_warnings():
    warnings.simplefilter('ignore', UserWarning)

    for category in categories:
        for lang in languages:
            folder = target_root_path + category + '/' + lang + '/'
            all_files = glob(folder + '*.mp3')

            # NOTE(review): newer librosa releases appear to rename the
            # `filename` keyword to `path` - confirm against the installed version.
            duration_sec = sum((lr.core.get_duration(filename=file) for file in all_files), 0.0)

            duration_h = duration_sec / 60.0 / 60.0
            print('Total duration of ' + lang + ' ' + category + ' is ' + str(round(duration_h, 1)) + ' h')

Total duration of english train is 18.2 h
Total duration of arabic train is 17.8 h
Total duration of english test is 4.6 h
Total duration of arabic test is 4.5 h
