In [None]:
#!:bash
# Install NeMo directly from the main branch of its GitHub repository.
# NOTE(review): `@main` is a moving target; consider pinning a tag or commit so the notebook stays reproducible.
python3 -m pip install  git+https://github.com/NVIDIA/NeMo.git@main

In [None]:
%pip install ffmpeg 
%pip install braceexpand 
%pip install webdataset 
%pip install g2p_en 
%pip install frozendict 
%pip install unidecode 
%pip install torch_stft 
%pip install editdistance 
%pip install install sphfile

In [None]:
import os
import glob
import json
import wget
import copy
import ffmpeg
import tarfile
import numpy as np 
import pandas as pd

import librosa
import librosa.display
import IPython.display as ipd

import pytorch_lightning as pl
import matplotlib.pyplot as plt

import nemo
import nemo.collections.asr as nemo_asr

from omegaconf import DictConfig
from pathlib import PurePath
from pydub import AudioSegment
from joblib import Parallel, delayed, cpu_count
from tqdm import tqdm

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [25]:
# Name of the LibriSpeech archive/subset every step below operates on.
dataset_name = 'dev-clean'
# Everything is downloaded to and extracted under the current working directory.
data_dir = os.getcwd()

In [46]:
def download_extract_libri(data_dir, dataset_name):
    '''
    Download a LibriSpeech archive and extract it into data_dir.

    Both steps are skipped when their result is already on disk, so the
    function can be re-run safely.

    Args:
        data_dir: directory that receives `{dataset_name}.tar.gz` and the
            extracted `LibriSpeech/` tree.
        dataset_name: archive name without the `.tar.gz` suffix
            (e.g. 'dev-clean').
    '''
    libri_path = os.path.join(data_dir, f'{dataset_name}.tar.gz')
    if not os.path.exists(libri_path):
        libri_url = f'http://www.openslr.org/resources/12/{dataset_name}.tar.gz'
        libri_path = wget.download(libri_url, data_dir)
        print(f"Dataset in .tar format downloaded at: {libri_path}")
    else:
        print("Tarfile already exists")

    # The rest of the notebook reads from `LibriSpeech/{dataset_name}`, so
    # check that exact (case-sensitive) directory; checking it per dataset
    # also lets several subsets share the same data_dir.
    extracted_dir = os.path.join(data_dir, 'LibriSpeech', dataset_name)
    if not os.path.exists(extracted_dir):
        # Context manager guarantees the archive handle is closed.
        with tarfile.open(libri_path) as tar:
            tar.extractall(path=data_dir)
        print(f"Tarfile extracted in {extracted_dir}")
    else:
        print(f"Tarfile already extracted in {extracted_dir}")
        
def flac2wav(data_dir, dataset_name):
    '''
    Convert every .flac file of the dataset to .wav using the ffmpeg CLI.

    The .wav is written next to the source file. The .flac is deleted only
    after a successful conversion; on failure it is kept so the file can be
    converted again later.

    Args:
        data_dir: directory containing the extracted `LibriSpeech/` tree.
        dataset_name: subset folder inside `LibriSpeech/` (e.g. 'dev-clean').
    '''
    # Local import: subprocess is not among the notebook's top-level imports.
    import subprocess

    flac_list = glob.glob(os.path.join(data_dir, f'LibriSpeech/{dataset_name}/**/**/*.flac'))
    for flac_path in tqdm(flac_list, position=0, leave=False):
        wav_path = os.path.splitext(flac_path)[0] + '.wav'
        # Argument list (no shell) keeps paths with spaces or shell
        # metacharacters intact; -y overwrites a stale .wav left by an
        # interrupted run instead of waiting for confirmation.
        result = subprocess.run(['ffmpeg', '-y', '-i', flac_path, wav_path])
        if result.returncode == 0 and os.path.exists(wav_path):
            os.remove(flac_path)
        else:
            print(f"ffmpeg failed for {flac_path} (exit code {result.returncode}); keeping the .flac")
        
def build_manifest(data_dir, dataset_name, test_manifest=None):
    '''
    Write a JSON-lines manifest describing every utterance of the dataset.

    Each line of the manifest is a JSON object with the keys
    "audio_filepath", "duration" and "text" (lower-cased transcript).

    Args:
        data_dir: directory containing the extracted `LibriSpeech/` tree.
        dataset_name: subset folder inside `LibriSpeech/` (e.g. 'dev-clean').
        test_manifest: output path; defaults to
            `{data_dir}/LibriSpeech/test_manifest.json`.
    '''
    # Resolve the default at call time so it follows the data_dir argument
    # instead of whatever the global data_dir was when the cell was run.
    if test_manifest is None:
        test_manifest = data_dir + '/LibriSpeech/test_manifest.json'

    # Only the reference transcripts: a plain `*.txt` would also match the
    # `*.trans_jasper.txt` files this notebook writes next to them.
    transcripts = glob.glob(os.path.join(data_dir, f'LibriSpeech/{dataset_name}/**/**/*.trans.txt'))
    with open(test_manifest, 'w') as out_file:
        for trans_path in tqdm(transcripts, position=0, leave=False):
            path_to_folder = os.path.dirname(trans_path)
            with open(trans_path, 'r') as file:
                for line in file:
                    # Strip only the newline; slicing off the last character
                    # would truncate a final line that has no newline.
                    transcript = line.rstrip('\n').lower()
                    if not transcript:
                        continue
                    # "<utterance id> <text>": the id is also the audio file name.
                    audio_name, _, text = transcript.partition(' ')
                    audio_path = os.path.join(path_to_folder, audio_name + '.wav')
                    duration = librosa.core.get_duration(filename=audio_path)

                    metadata = {
                        "audio_filepath": audio_path,
                        "duration": duration,
                        "text": text,
                    }
                    json.dump(metadata, out_file)
                    out_file.write('\n')
                    
def process_broken_files(data_dir, dataset_name):
    '''
    Re-convert .wav files that came out broken (smaller than 100 bytes).

    For each such file the sibling .flac (same path, different extension)
    is decoded with pydub and exported over the broken .wav. Files whose
    .flac is no longer on disk are reported and skipped.

    Args:
        data_dir: directory containing the extracted `LibriSpeech/` tree.
        dataset_name: subset folder inside `LibriSpeech/` (e.g. 'dev-clean').
    '''
    wav_files = glob.glob(os.path.join(data_dir, f'LibriSpeech/{dataset_name}/**/**/*.wav'))
    broken_files = [wav for wav in wav_files if os.path.getsize(wav) < 100]
    for wav_path in broken_files:
        # Derive the source path from the .wav itself rather than splitting
        # on a hard-coded dataset name, so this works for any dataset_name.
        flac_path = os.path.splitext(wav_path)[0] + '.flac'
        if not os.path.exists(flac_path):
            print(f"Cannot re-convert {wav_path}: {flac_path} not found")
            continue
        flac_audio = AudioSegment.from_file(flac_path, format="flac")
        flac_audio.export(wav_path, format="wav")
        
def save_translation(data_dir, dataset_name):
    '''
    Merge all model transcriptions of the dataset into a single file.

    Concatenates every `*.trans_jasper.txt` file into
    `{dataset_name}-translation.txt` in the current working directory
    (a single file is easier to download from datasphere).

    Args:
        data_dir: directory containing the extracted `LibriSpeech/` tree.
        dataset_name: subset folder inside `LibriSpeech/` (e.g. 'dev-clean').
    '''
    # Sorted so the merged file has a stable order across runs.
    transcripts_j = sorted(
        glob.glob(os.path.join(data_dir, f'LibriSpeech/{dataset_name}/**/**/*.trans_jasper.txt'))
    )
    # Open the output once, in write mode: appending would duplicate every
    # line each time the cell is re-run.
    with open(f"{dataset_name}-translation.txt", "w") as out_file:
        for j_path in transcripts_j:
            with open(j_path, 'r') as j_file:
                out_file.writelines(j_file)

In [None]:
# Data preparation, in order: fetch and unpack the archive, convert the
# .flac audio to .wav, then re-convert any .wav that came out broken.
download_extract_libri(data_dir, dataset_name)
flac2wav(data_dir, dataset_name)
process_broken_files(data_dir, dataset_name)

In [None]:
#!L
# Load the pretrained "Jasper10x5Dr-En" CTC model through NeMo's from_pretrained.
# NOTE(review): the `#!L` first line looks like a platform cell directive; keep it as the first line.
asr_model = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name="Jasper10x5Dr-En")

In [None]:
#!L
# Make predictions with the loaded model.
# For every reference transcript file (`*.trans.txt`) a sibling
# `*.trans_jasper.txt` is written with the model output in the same
# "<utterance id> <TEXT>" layout; reference/hypothesis pairs are also
# collected and saved to `{dataset_name}.csv`.
ground_truth, hypothesis = [], []
transcripts = glob.glob(os.path.join(data_dir, f'LibriSpeech/{dataset_name}/**/**/*.trans.txt'))
for trans_path in tqdm(transcripts, position=0, leave=False):
    path_to_folder = os.path.dirname(trans_path)
    audio_names, texts = [], []
    with open(trans_path, 'r') as file:
        for line in file:
            # Strip only the newline; slicing off the last character would
            # truncate a final line that has no newline.
            transcript = line.rstrip('\n').lower()
            if not transcript:
                continue
            audio_name, _, text = transcript.partition(' ')
            audio_names.append(audio_name)
            texts.append(text)

    # One transcribe call per transcript file instead of one per utterance:
    # the model receives the whole list and returns results in input order.
    audio_paths = [os.path.join(path_to_folder, name + '.wav') for name in audio_names]
    jasper_texts = asr_model.transcribe(paths2audio_files=audio_paths) if audio_paths else []

    trans_jasper_path = trans_path.rsplit('.', 1)[0] + '_jasper.txt'
    with open(trans_jasper_path, 'w') as out_file:
        for audio_name, jasper_text in zip(audio_names, jasper_texts):
            out_file.write(f'{audio_name} {jasper_text.upper()}\n')

    ground_truth.extend(texts)
    hypothesis.extend(jasper_texts)

# Column 0 holds the reference text, column 1 the model hypothesis.
df = pd.DataFrame(zip(ground_truth, hypothesis))
df.to_csv(f"{dataset_name}.csv", index=False)

In [None]:
# Collect every per-chapter `*.trans_jasper.txt` into one `{dataset_name}-translation.txt` file.
save_translation(data_dir, dataset_name)