In [1]:
import os
from glob import glob
from tqdm.notebook import tqdm
import shutil

import numpy as np
import librosa
import pandas as pd

import re
import hazm
import unicodedata
import Levenshtein

import torch
from torch.utils.data import Dataset, DataLoader

In [2]:
# Root directory; each entry directly below it is treated as a language code.
data_path = "../Data/KeyWords"

# FILENAME_DICT[lang][folder] -> list of .opus clip paths found under
# <data_path>/<lang>/clips/<folder>/.
FILENAME_DICT = {}
for lang in tqdm(os.listdir(data_path)):
    all_filenames = glob(os.path.join(data_path, "{}/clips/*/*.opus".format(lang)))

    filename_dict = {}
    for filename in all_filenames:
        # The clip's parent directory name is the grouping key.
        folder = filename.split(os.path.sep)[-2]
        filename_dict.setdefault(folder, []).append(filename)

    FILENAME_DICT[lang] = filename_dict

  0%|          | 0/39 [00:00<?, ?it/s]

In [3]:
# Total number of clip paths over every language and every folder.
print(sum(len(clips) for per_lang in FILENAME_DICT.values() for clips in per_lang.values()))

7514350


In [4]:
# hazm normalizer applied to keyword folder names (see filter_filename_dict).
normalizer = hazm.Normalizer(persian_style=False)

# Source characters of a character-for-character mapping: the i-th character
# here is paired with the i-th character of translation_dst below.
# NOTE(review): both strings are later assigned onto `normalizer`; whether hazm
# reads these attributes at normalize() time depends on the installed version —
# TODO confirm, and confirm the two strings have equal length.
translation_src = "إ٫ٲٳٴڃڄﭪﭬﯔﯕﯖﯗﯘﯙﯞﯧﯨﯩﯼﯽﯾﯿﴽﺓةﺔۀأؤیؠػػؽؾؿكيٮٯٷٸٹٺٻټٽٿڀځٵٶٷٸٹٺٻټٽٿڀځڂڅڇڈډڊڋڌڍڎڏڐڑڒړڔڕږڗڙښڛڜڝڞڟڠڡڢڣڤڥڦڧڨڪګڬڭڮڰڱڲڳڴڵڶڷڸڹںڻڼڽھڿہۂۃۄۅۆۇۈۉۊۋۏۍێېۑےۓەۮۯۺۻۼۿݐݑݒݓݔݕݖݗݘݙݚݛݜݝݞݟݠݡݢݣݤݥݦݧݨݩݪݫݬݭݮݯݰݱݲݳݴݵݶݷݸݹݺݻݼݽݾݿࢠࢡࢢࢣࢤࢥࢦࢧࢨࢩࢪࢫࢮࢯࢰࢱࢬࢲࢳࢴࢶࢷࢸࢹࢺࢻࢼࢽﭐﭑﭒﭓﭔﭕﭖﭗﭘﭙﭚﭛﭜﭝﭞﭟﭠﭡﭢﭣﭤﭥﭦﭧﭨﭩﭮﭯﭰﭱﭲﭳﭴﭵﭶﭷﭸﭹﭺﭻﭼﭽﭾﭿﮀﮁﮂﮃﮄﮅﮆﮇﮈﮉﮊﮋﮌﮍﮎﮏﮐﮑﮒﮓﮔﮕﮖﮗﮘﮙﮚﮛﮜﮝﮞﮟﮠﮡﮢﮣﮤﮥﮦﮧﮨﮩﮪﮫﮬﮭﮮﮯﮰﮱﺀﺁﺃﺄﺅﺆﺇﺈﺉﺊﺋﺌﺍﺎﺏﺐﺑﺒﺕﺖﺗﺘﺙﺚﺛﺜﺝﺞﺟﺠﺡﺢﺣﺤﺥﺦﺧﺨﺩﺪﺫﺬﺭﺮﺯﺰﺱﺲﺳﺴﺵﺶﺷﺸﺹﺺﺻﺼﺽﺾﺿﻀﻁﻂﻃﻄﻅﻆﻇﻈﻉﻊﻋﻌﻍﻎﻏﻐﻑﻒﻓﻔﻕﻖﻗﻘﻙﻚﻛﻜﻝﻞﻟﻠﻡﻢﻣﻤﻥﻦﻧﻨﻩﻪﻫﻬﻭﻮﻯﻰﻱﻲﻳﻴىكي“” "
# Replacement characters, position-aligned with translation_src.
translation_dst = (
            'ا.ااءججفقکککوووویییییییاههههاوییککیییکیبقویتتبتتتبحاوویتتبتتتبحححچدددددددددررررررررسسسصصطعففففففققکککککگگگگگللللنننننهچهههوووووووووییییییهدرشضغهبببببببححددرسعععففکککممنننلررسححسرحاایییووییحسسکببجطفقلمییرودصگویزعکبپتریفقنااببببپپپپببببتتتتتتتتتتتتففففححححححححچچچچچچچچددددددددژژررککککگگگگگگگگگگگگننننننههههههههههییییءاااووااییییااببببتتتتثثثثججججححححخخخخددذذررززسسسسششششصصصصضضضضططططظظظظععععغغغغففففققققککککللللممممننننههههوویییییییکی"" '
        )

# Digit tables, position-aligned: each character of the source string is
# replaced by the ASCII character at the same index of the destination string.
number_translation_src = "۰۱۲۳۴۵۶۷۸۹٪٠١٢٣٤٥٦٧٨٩"
number_translation_dst = "0123456789%0123456789"


# Install the custom character and digit tables on the shared normalizer.
normalizer.translation_src, normalizer.translation_dst = translation_src, translation_dst

normalizer.number_translation_src, normalizer.number_translation_dst = (
    number_translation_src,
    number_translation_dst,
)

In [5]:
# Characters that disqualify a keyword, one entry per line of the file.
# The encoding is pinned so the result does not depend on the machine's locale
# (assumes the file is stored as UTF-8 — TODO confirm).
with open("special_characters.txt", "r", encoding="utf-8") as f:
    SPECIAL_CHARS = f.read().strip().split("\n")
    
def remove_special_characters(text, replace= ""):
    """Replace every character listed in ``SPECIAL_CHARS`` with ``replace``.

    Args:
        text: String to clean.
        replace: Replacement for each matched character (default: delete it).

    Returns:
        The cleaned string. ``text`` is returned unchanged when
        ``SPECIAL_CHARS`` contributes no characters at all.
    """
    charset = "".join(SPECIAL_CHARS)
    if not charset:
        # An empty list (or an empty file, which yields [""]) would build the
        # pattern "[]", which `re` rejects as an unterminated character set.
        return text
    return re.sub('[' + re.escape(charset) + ']', replace, text)

def strip_accents(s):
    """Return ``s`` without its combining marks.

    The string is decomposed with Unicode NFD and every code point whose
    category is ``Mn`` is dropped. The result is not recomposed.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            continue
        kept.append(ch)
    return ''.join(kept)

In [6]:
def filter_filename_dict(filename_dict, folders, n_thershold=50):
    """Normalize keyword folders, merge duplicates and drop rare keywords.

    Each folder name is passed through ``normalizer.normalize`` and
    ``strip_accents``. A folder is discarded when the normalized name still
    contains a character that ``remove_special_characters`` would remove.
    Folders that share a normalized name are merged, and merged keywords with
    fewer than ``n_thershold`` files are dropped.

    Args:
        filename_dict: Mapping of folder name -> list of file paths.
        folders: Folder names (keys of ``filename_dict``) to consider.
        n_thershold: Minimum number of files a merged keyword must have.

    Returns:
        A new dict mapping normalized folder name -> new list of file paths.
        The lists inside ``filename_dict`` are never modified.
    """
    merged = {}
    # dict.fromkeys de-duplicates `folders` while keeping first-seen order.
    for folder in dict.fromkeys(folders):
        folder_ = strip_accents(normalizer.normalize(folder))
        if remove_special_characters(folder_) != folder_:
            continue
        # Extend a fresh list: the previous `buffer[k] = filename_dict[folder]`
        # followed by `+=` grew the caller's own list in place, so re-running
        # the filter on the same data duplicated files.
        merged.setdefault(folder_, []).extend(filename_dict[folder])

    return {
        folder: files
        for folder, files in merged.items()
        if len(files) >= n_thershold
    }


def filter_vocabs(filename_dict, folders):
    """Pick one representative keyword from each group of near-identical keywords.

    Two keywords are "confusable" when their Levenshtein distance is exactly 1.
    Keywords are visited in sorted order. At each step the keyword and its
    confusable neighbours form a candidate set, from which are removed:
    names shorter than 3 characters, names already selected or discarded, and
    names confusable with an already selected keyword. The remaining candidate
    with the most files is selected and the others are discarded.

    NOTE(review): a keyword with no distance-1 neighbour never enters a
    candidate set and so is never returned. Confirm that dropping such
    keywords is intended.

    Args:
        filename_dict: Mapping of keyword -> list of file paths.
        folders: Ignored. The keyword list is always rebuilt from
            ``filename_dict``; the parameter is kept for caller compatibility.

    Returns:
        List of selected keywords, in selection order.
    """
    folders = sorted(filename_dict.keys())

    # folder_mapping[k] -> all other keywords at edit distance exactly 1 from k.
    # Keys are unique, so the distance to any other keyword is >= 1.
    folder_mapping = {}
    for folder in tqdm(folders):
        for folder_ in folders:
            if folder_ == folder:
                continue
            if Levenshtein.distance(folder, folder_, weights=(1,1,1)) == 1:
                folder_mapping.setdefault(folder, []).append(folder_)

    selected_folders = []
    decided = set()  # every keyword already selected or discarded
    for folder in sorted(folder_mapping.keys()):
        candidates = {f for f in [folder] + folder_mapping[folder] if len(f) >= 3}
        candidates -= decided

        # Sorting makes tie-breaking reproducible; iterating a set of strings
        # can change order from one interpreter run to the next.
        candidates = sorted(
            f for f in candidates
            if all(
                Levenshtein.distance(f, kept, weights=(1,1,1)) != 1
                for kept in selected_folders
            )
        )
        if not candidates:
            continue

        # max() always yields a winner, even when every candidate has zero
        # files; the manual "LEN > MAX" scan left `selected` unbound or stale.
        selected = max(candidates, key=lambda f: len(filename_dict[f]))
        candidates.remove(selected)

        decided.update(candidates)
        decided.add(selected)
        selected_folders.append(selected)

    return selected_folders

In [7]:
# Per-language selection after both filters, keyed by language code.
BUFFER = {}

for lang in tqdm(os.listdir(data_path)):
    print("*" * 5, lang, "*" * 5)

    # Stage 1: normalize, merge and threshold the keyword folders.
    filename_dict = FILENAME_DICT[lang].copy()
    folders = sorted(filename_dict)
    filename_dict = filter_filename_dict(filename_dict, folders)
    folders = sorted(filename_dict)
    print(len(folders), sum(len(clips) for clips in filename_dict.values()))

    # Languages with 20 or fewer surviving keywords are skipped.
    if len(folders) <= 20:
        print("Lang has not good resources !!!")
        continue

    # Stage 2: keep one keyword per group of confusable keywords.
    selected_folders = filter_vocabs(filename_dict, folders)
    LEN = sum(len(filename_dict[folder]) for folder in selected_folders)
    print(len(selected_folders), LEN)

    buffer = {folder: filename_dict[folder] for folder in selected_folders}

    BUFFER[lang] = {
        "filename_dict" : buffer,
        "dataset_length": LEN,
        "folders" : selected_folders,
    }

  0%|          | 0/1 [00:00<?, ?it/s]

***** fa *****
3343 956883


  0%|          | 0/3343 [00:00<?, ?it/s]

851 329303


In [8]:
# Totals over all languages kept in BUFFER: clip count, then keyword count.
print(sum(entry["dataset_length"] for entry in BUFFER.values()))
print(sum(len(entry["folders"]) for entry in BUFFER.values()))

329303
851


In [9]:
import json

# Persist BUFFER as JSON; mode 'w' replaces any existing FILENAME_DICT.json.
with open('FILENAME_DICT.json', 'w') as json_file:
    json.dump(BUFFER, json_file)

In [10]:
# Per-language selection using only the normalize/merge/threshold filter
# (no confusable-keyword pruning), keyed by language code.
BUFFER = {}

for lang in tqdm(os.listdir(data_path)):
    print("*" * 5, lang, "*" * 5)

    filename_dict = FILENAME_DICT[lang].copy()
    folders = sorted(filename_dict)
    filename_dict = filter_filename_dict(filename_dict, folders)
    folders = sorted(filename_dict)
    print(len(folders), sum(len(clips) for clips in filename_dict.values()))

    # Languages with 20 or fewer surviving keywords are skipped.
    if len(folders) <= 20:
        print("Lang has not good resources !!!")
        continue

    LEN = sum(len(filename_dict[folder]) for folder in folders)

    BUFFER[lang] = {
        "filename_dict" : filename_dict,
        "dataset_length": LEN,
        "folders" : folders,
    }

  0%|          | 0/39 [00:00<?, ?it/s]

***** ar *****
18 4292
Lang has not good resources !!!
***** mn *****
22 4450
***** pl *****
293 100705
***** et *****
45 10061
***** rm-sursilv *****
12 2112
Lang has not good resources !!!
***** ha *****
4 282
Lang has not good resources !!!
***** tr *****
28 10054
***** fr *****
1445 918718
***** cs *****
64 13506
***** rw *****
1022 448855
***** ro *****
11 2032
Lang has not good resources !!!
***** sl *****
3 447
Lang has not good resources !!!
***** lt *****
5 558
Lang has not good resources !!!
***** lv *****
12 2275
Lang has not good resources !!!
***** mt *****
17 3504
Lang has not good resources !!!
***** fa *****
851 329303
***** nl *****
116 60075
***** fy-NL *****
22 5798
***** cnh *****
5 1001
Lang has not good resources !!!
***** ky *****
19 4867
Lang has not good resources !!!
***** eo *****
183 65788
***** id *****
19 7979
Lang has not good resources !!!
***** sah *****
3 475
Lang has not good resources !!!
***** de *****
1596 1316509
***** pt *****
134 58181
***** ga-

In [11]:
# Totals over all languages kept in BUFFER: clip count, then keyword count.
print(sum(entry["dataset_length"] for entry in BUFFER.values()))
print(sum(len(entry["folders"]) for entry in BUFFER.values()))

7477085
12030


In [12]:
import json

# Persist BUFFER as JSON; mode 'w' replaces any existing FILENAME_DICT.json.
with open('FILENAME_DICT.json', 'w') as json_file:
    json.dump(BUFFER, json_file)

In [None]:
import os
import IPython
from glob import glob
from tqdm.notebook import tqdm

import textgrid
import librosa
import numpy as np
import matplotlib.pyplot as plt
import scipy.io.wavfile
from mpire import WorkerPool

In [None]:
import json

# Reload the saved selection; it is indexed below as
# FILENAME_DICT[<language>]["filename_dict"].
with open('FILENAME_DICT.json', 'r') as json_file:
    FILENAME_DICT = json.load(json_file)

In [None]:
# Language code processed by the segmentation cells below.
LANG = "fa"
# Forced-alignment TextGrid files for LANG. The path is built from LANG rather
# than hard-coding "fa", so changing LANG cannot silently pair one language's
# clips with another language's alignments.
textgrid_filenames = glob(
    f"../Data/alignments/{LANG}/mnt/disks/std750/data/common-voice-forced-alignments/{LANG}/alignments/*/*.TextGrid"
)

In [None]:
# Path template of the source utterance for LANG; "{}" receives the clip's
# base name without extension. Built from LANG instead of a hard-coded "fa".
SPEECH_TEMPLATE = f"../Data/cv-corpus/{LANG}/clips/{{}}.mp3"

In [None]:
# Invert keyword -> [clip paths] into clip path -> keyword for per-file lookup.
filename2keyword = {}

filename_dict = FILENAME_DICT[LANG]["filename_dict"]
for keyword, keyword_files in tqdm(filename_dict.items()):
    filename2keyword.update(dict.fromkeys(keyword_files, keyword))

In [None]:
def segment_from_textgrid(filename, sr=16_000, segment_length=0.7):
    """Cut one keyword out of its source utterance and save it as a WAV file.

    The clip's base name is used to find a matching ``<name>.TextGrid`` in
    ``textgrid_filenames`` and the utterance ``SPEECH_TEMPLATE.format(<name>)``.
    The interval whose mark equals the clip's parent folder name is located in
    tier 0 of the TextGrid. Intervals shorter than ``segment_length`` are
    widened to exactly ``segment_length`` seconds where the audio allows it.
    The result is written to ``./Processed_Keywords/<LANG>/<keyword>/<name>.wav``.

    Args:
        filename: Keyword clip path; must be a key of ``filename2keyword``.
        sr: Sample rate used to load the utterance and write the segment.
        segment_length: Minimum segment duration in seconds.

    Returns:
        None. Nothing is written when no TextGrid matches the clip.

    Raises:
        KeyError: ``filename`` is not in ``filename2keyword``.
        ValueError: the folder name is not among the TextGrid interval marks.
    """
    keyword = filename2keyword[filename]
    # splitext replaces the old [:-4] / [:-5] slices, which were only correct
    # for a four-letter extension such as ".opus".
    clip_stem = os.path.splitext(os.path.basename(filename))[0]

    # NOTE(review): this is a linear scan per clip; a basename -> path dict
    # built once outside the function would avoid it.
    textgrid_basename = clip_stem + ".TextGrid"
    textgrid_filename = next(
        (path for path in textgrid_filenames
         if os.path.basename(path) == textgrid_basename),
        None,
    )
    if textgrid_filename is None:
        return

    speech_path = SPEECH_TEMPLATE.format(clip_stem)
    speech_wav, _ = librosa.load(speech_path, sr=sr)
    speech_length = len(speech_wav) / sr

    words = textgrid.TextGrid.fromFile(textgrid_filename)[0]
    target_word = filename.split(os.path.sep)[-2]
    words_list = [word.mark for word in words]
    segment_word = words[words_list.index(target_word)]

    minTime, maxTime = segment_word.minTime, segment_word.maxTime
    delta_time = segment_length - (maxTime - minTime)
    if delta_time > 0:
        # Start half the missing duration earlier when there is room for it,
        # otherwise anchor the window at the start of the utterance.
        if minTime - delta_time > 0:
            minTime = minTime - delta_time / 2
        else:
            minTime = 0.0
        maxTime = minTime + segment_length

        # A window running past the end of the audio used to be truncated by
        # the slice below, giving a clip shorter than segment_length. Shift it
        # back so it ends at the last sample.
        if maxTime > speech_length:
            maxTime = speech_length
            minTime = max(0.0, maxTime - segment_length)

    save_path = f"./Processed_Keywords/{LANG}/{keyword}"
    os.makedirs(save_path, exist_ok=True)
    save_path = f"{save_path}/{clip_stem}.wav"

    keyword_wav = speech_wav[int(sr*minTime):int(sr*maxTime)]
    scipy.io.wavfile.write(save_path, sr, keyword_wav)

In [None]:
# Segment every known clip with 8 mpire workers, one clip path per task.
with WorkerPool(n_jobs=8) as worker_pool:
    results = worker_pool.map(segment_from_textgrid, list(filename2keyword), progress_bar=True)

In [2]:
import os
from glob import glob
from tqdm.notebook import tqdm

from mpire import WorkerPool

In [12]:
# Every .opus file matching <anything>/clips/<anything>/ under ../Data/audio.
opus_pattern = "../Data/audio/*/clips/*/*.opus"
filenames = glob(opus_pattern)
len(filenames)

22521349

In [13]:
def check_empty_file(filename, min_size=100):
    """Print ``filename`` when the file is smaller than ``min_size`` bytes.

    Args:
        filename: Path of the file to inspect.
        min_size: Size in bytes below which the file is reported. The default
            of 100 is the threshold that was previously hard-coded.

    Returns:
        None. The path is reported on stdout only.

    Raises:
        OSError: the file does not exist or is inaccessible
            (raised by ``os.path.getsize``).
    """
    if os.path.getsize(filename) < min_size:
        print(filename)

In [None]:
# Size-check every clip with 16 mpire workers; undersized paths are printed.
with WorkerPool(n_jobs=16) as worker_pool:
    results = worker_pool.map(check_empty_file, filenames, progress_bar=True)

In [25]:
# Copy every selected "fa" clip to ../Data/KeyWords/fa/clips/<folder>/<file>,
# where <folder> is the clip's current parent directory name.
# NOTE(review): the BUFFER key is not used for the destination, so clips keep
# their on-disk folder name — confirm this is intended.
created_folders = set()
for folder in tqdm(BUFFER["fa"]["filename_dict"]):
    for filename in BUFFER["fa"]["filename_dict"][folder]:
        splited = filename.split(os.path.sep)

        new_folder = f"../Data/KeyWords/fa/clips/{splited[-2]}"
        new_filename = f"../Data/KeyWords/fa/clips/{splited[-2]}/{splited[-1]}"

        # Create each destination directory once instead of once per file.
        if new_folder not in created_folders:
            os.makedirs(new_folder, exist_ok=True)
            created_folders.add(new_folder)

        try:
            shutil.copy(filename, new_filename)
        except shutil.SameFileError:
            # The clip is already at its destination, which happens when the
            # paths were globbed from ../Data/KeyWords itself. Nothing to copy.
            continue

  0%|          | 0/851 [00:00<?, ?it/s]