<h1>Reading the csv</h1>

In [1]:
import pandas as pd
from pathlib import Path
import os
import numpy as np

In [2]:
# Every data file in this notebook is resolved relative to the directory the
# notebook is launched from.
cwd = Path.cwd()
csv_path = cwd / "transcript.csv"

In [3]:
# Load the reference transcripts and append two empty placeholder columns that
# later cells fill in: "asr" (recognised text) and "WER" (word error rate).
df = pd.read_csv(csv_path)
for placeholder in ("asr", "WER"):
    df[placeholder] = ""
df

Unnamed: 0,File,Transcription,Language,asr,WER
0,checkin.wav,where is the check in desk,EN,,
1,checkin_child.wav,where is the check in desk,EN,,
2,parents.wav,i have lost my parents,EN,,
3,parents_child.wav,i have lost my parents,EN,,
4,suitcase.wav,please I have lost my suitcase,EN,,
5,suitcase_child.wav,please I have lost my suitcase,EN,,
6,what_time.wav,what time is my plane,EN,,
7,what_time_child.wav,what time is my plane,EN,,
8,where.wav,where are the restaurants and shops,EN,,
9,where_child.wav,where are the restaurants and shops,EN,,


<h1>Automatic Speech Recognition based on Mozilla DeepSpeech</h1>

In [4]:
import deepspeech
import librosa as lr
from deepspeech import Model, version
from scipy.io import wavfile
import noisereduce
from pydub import AudioSegment, effects



<h2>Mozilla DeepSpeech</h2>

In [5]:
# DeepSpeech models that are already loaded, keyed by (model path, scorer path),
# so each model/scorer pair is read from disk only once per kernel session.
_DEEPSPEECH_MODELS = {}


def _loadDeepSpeechModel(scorer, model):
    """Return a cached DeepSpeech ``Model`` with its external scorer enabled."""
    key = (str(model), str(scorer))
    if key not in _DEEPSPEECH_MODELS:
        ds = Model(model)
        ds.enableExternalScorer(scorer)
        _DEEPSPEECH_MODELS[key] = ds
    return _DEEPSPEECH_MODELS[key]


def DeepSpeechASR(scorer, model, audioFile):
    """Transcribe one audio file with Mozilla DeepSpeech.

    Args:
        scorer: Path of the external scorer file passed to
            ``Model.enableExternalScorer``.
        model: Path of the acoustic model file passed to ``Model``.
        audioFile: Path of the audio file to transcribe.

    Returns:
        Whatever ``Model.stt`` returns for the audio (the recognised text).
    """
    ds = _loadDeepSpeechModel(scorer, model)
    # librosa resamples the file to the sample rate the model reports.
    audio = lr.load(audioFile, sr=ds.sampleRate())[0]
    # Clip before scaling: any sample outside [-1, 1] (e.g. resampling
    # overshoot) would otherwise overflow when cast to 16-bit integers.
    audio = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
    return ds.stt(audio)

<h2>Function to help with noisy environments</h2>

In [6]:
def _spectralDenoise(src, dst):
    """Read the WAV file ``src``, run ``noisereduce`` on it and write it to ``dst``."""
    rate, data = wavfile.read(src)
    reduced_noise = noisereduce.reduce_noise(y=data, sr=rate)
    wavfile.write(dst, rate, reduced_noise)


def _normalizeLoudness(src, dst):
    """Read the WAV file ``src``, apply pydub ``effects.normalize`` and export it to ``dst``."""
    rawSound = AudioSegment.from_file(src, "wav")
    normalizedSound = effects.normalize(rawSound)
    normalizedSound.export(str(dst), format="wav")


def reduceNoise():
    """Write a cleaned copy of every audio file listed in the global ``df``.

    Files are read from ``Ex4_audio_files/<LANG>`` under ``cwd`` and written,
    under the same file name, to ``Ex4_audio_files/<LANG>/clean``. The output
    folder is created when it does not exist yet.

    Clean-up recipe per row:
      * EN rows with index 0 or 1 and all ES rows: noise reduction.
      * All other EN rows and all IT rows: loudness normalisation.
      * Rows with any other language code are skipped.
    """
    audio_root = cwd / "Ex4_audio_files"

    for i in df.index:
        language = df.loc[i, "Language"]
        if language == "EN":
            # NOTE(review): only the first two rows are denoised; presumably
            # these were the noisy recordings - confirm against the data set.
            cleaner = _spectralDenoise if i in (0, 1) else _normalizeLoudness
        elif language == "IT":
            cleaner = _normalizeLoudness
        elif language == "ES":
            cleaner = _spectralDenoise
        else:
            continue

        source_dir = audio_root / language
        clean_dir = source_dir / "clean"
        # Without this the first write fails on a fresh checkout.
        clean_dir.mkdir(parents=True, exist_ok=True)

        file_name = str(df.loc[i, "File"])
        cleaner(source_dir / file_name, clean_dir / file_name)

In [7]:
# Write the cleaned copy of every audio file listed in `df` into the
# per-language "clean" folders that the ASR cells below read from.
reduceNoise()

<h2>Creating our ASR system</h2>

In [8]:
def ASRsystem():
    """Transcribe every file listed in the global ``df`` with DeepSpeech.

    For each row the language code in ``df["Language"]`` selects the scorer
    and acoustic model under ``cwd / "Models"``; the audio is read from
    ``cwd / "Ex4_audio_files" / <LANG> / "clean"``. The recognised text is
    stored in ``df["asr"]``. Rows whose language is not EN, IT or ES are
    left untouched.

    Returns:
        The global ``df`` (modified in place).
    """
    models_dir = cwd / "Models"
    audio_root = cwd / "Ex4_audio_files"

    # language code -> (scorer file name, acoustic model file name)
    resources = {
        "EN": ("deepspeech-0.9.3-models.scorer", "deepspeech-0.9.3-models.pbmm"),
        "IT": ("kenlm_it.scorer", "output_graph_it.pbmm"),
        "ES": ("kenlm_es.scorer", "output_graph_es.pbmm"),
    }

    for i in df.index:
        language = df.loc[i, "Language"]
        if language not in resources:
            continue
        scorer_name, model_name = resources[language]
        audio_file = audio_root / language / "clean" / str(df.loc[i, "File"])
        speech = DeepSpeechASR(
            str(models_dir / scorer_name),
            str(models_dir / model_name),
            audio_file,
        )
        # Single .loc write instead of chained df["asr"][i] = ..., which
        # may write to a temporary copy and leave df unchanged.
        df.loc[i, "asr"] = speech
    return df

In [9]:
# Fill df["asr"] with the DeepSpeech transcription of every listed file,
# then display the updated table.
ASRsystem()
df

Unnamed: 0,File,Transcription,Language,asr,WER
0,checkin.wav,where is the check in desk,EN,where is the check in desk,
1,checkin_child.wav,where is the check in desk,EN,where is the check in de,
2,parents.wav,i have lost my parents,EN,i had lost my parents,
3,parents_child.wav,i have lost my parents,EN,i had lost my parents,
4,suitcase.wav,please I have lost my suitcase,EN,please i have lost my suitcase,
5,suitcase_child.wav,please I have lost my suitcase,EN,three i had lost my safe,
6,what_time.wav,what time is my plane,EN,what time is my plain,
7,what_time_child.wav,what time is my plane,EN,what time is my plan,
8,where.wav,where are the restaurants and shops,EN,where are the restaurants and shops,
9,where_child.wav,where are the restaurants and shops,EN,where are the restaurants and shops,


<h2>Word error rate calculation on our speech recognition system using the jiwer library</h2>

In [10]:
import jiwer

In [11]:
# Apply the same normalisation to reference and hypothesis so that casing,
# punctuation and spacing differences are not counted as word errors.
transformation = jiwer.Compose([
    jiwer.ToLowerCase(),
    jiwer.RemovePunctuation(),
    jiwer.RemoveWhiteSpace(replace_by_space=True),
    jiwer.RemoveMultipleSpaces(),
    jiwer.ReduceToListOfListOfWords(word_delimiter=" ")
])

# Score each reference/hypothesis pair and assign the whole column at once.
# The previous df["WER"][i] = ... was a chained assignment, which pandas may
# apply to a temporary copy instead of df.
df["WER"] = [
    jiwer.wer(
        truth,
        hypothesis,
        truth_transform=transformation,
        hypothesis_transform=transformation,
    )
    for truth, hypothesis in zip(df["Transcription"], df["asr"])
]
df

Unnamed: 0,File,Transcription,Language,asr,WER
0,checkin.wav,where is the check in desk,EN,where is the check in desk,0.0
1,checkin_child.wav,where is the check in desk,EN,where is the check in de,0.166667
2,parents.wav,i have lost my parents,EN,i had lost my parents,0.2
3,parents_child.wav,i have lost my parents,EN,i had lost my parents,0.2
4,suitcase.wav,please I have lost my suitcase,EN,please i have lost my suitcase,0.0
5,suitcase_child.wav,please I have lost my suitcase,EN,three i had lost my safe,0.5
6,what_time.wav,what time is my plane,EN,what time is my plain,0.2
7,what_time_child.wav,what time is my plane,EN,what time is my plan,0.2
8,where.wav,where are the restaurants and shops,EN,where are the restaurants and shops,0.0
9,where_child.wav,where are the restaurants and shops,EN,where are the restaurants and shops,0.0


In [12]:
def _averageWerPercent(values):
    """Return the mean of ``values`` as a percentage, or NaN when ``values`` is empty."""
    # Guard the empty case: a language with no rows in the table would
    # otherwise abort the whole cell with ZeroDivisionError.
    return 100 * sum(values) / len(values) if values else float("nan")


# Collect the per-file WER values of each language.
en_wer = df.loc[df["Language"] == "EN", "WER"].tolist()
it_wer = df.loc[df["Language"] == "IT", "WER"].tolist()
es_wer = df.loc[df["Language"] == "ES", "WER"].tolist()

enWER = _averageWerPercent(en_wer)
itWER = _averageWerPercent(it_wer)
esWER = _averageWerPercent(es_wer)

print("EN WER = %0.3f \n\n" % enWER)
print("IT WER = %0.3f \n\n" % itWER)
print("ES WER = %0.3f " % esWER)

EN WER = 19.722 


IT WER = 30.714 


ES WER = 29.714 


<h1>Evaluation of other ASR systems</h1>

In [13]:
# Separate copy of the transcript table used to compare other ASR systems,
# with empty placeholder columns for the SpeechBrain text and its WER.
# NOTE(review): skipfooter=10 drops the last ten rows of the CSV; presumably
# this is meant to leave only the English rows - confirm against transcript.csv.
df_compare = pd.read_csv(
    csv_path,
    sep=",",
    skipfooter=10,
    engine="python",
    encoding="utf-8-sig",
)
for result_column in ("speechbrain", "speechbrainWER"):
    df_compare[result_column] = ""
df_compare

Unnamed: 0,File,Transcription,Language,speechbrain,speechbrainWER
0,checkin.wav,where is the check in desk,EN,,
1,checkin_child.wav,where is the check in desk,EN,,
2,parents.wav,i have lost my parents,EN,,
3,parents_child.wav,i have lost my parents,EN,,
4,suitcase.wav,please I have lost my suitcase,EN,,
5,suitcase_child.wav,please I have lost my suitcase,EN,,
6,what_time.wav,what time is my plane,EN,,
7,what_time_child.wav,what time is my plane,EN,,
8,where.wav,where are the restaurants and shops,EN,,
9,where_child.wav,where are the restaurants and shops,EN,,


The first ASR system I evaluate against is SpeechBrain.

In [14]:
import speechbrain
from speechbrain.pretrained import EncoderDecoderASR

The torchaudio backend is switched to 'soundfile'. Note that 'sox_io' is not supported on Windows.
torchvision is not available - cannot save figures
The torchaudio backend is switched to 'soundfile'. Note that 'sox_io' is not supported on Windows.


In [15]:
# Pretrained SpeechBrain models that are already loaded, keyed by source name,
# so the model is instantiated once instead of once per audio file.
_SPEECHBRAIN_MODELS = {}


def speechbrainASR(file):
    """Transcribe one audio file with the pretrained SpeechBrain CRDNN model.

    Args:
        file: Path of the audio file; converted with ``str()`` before it is
            handed to ``transcribe_file``.

    Returns:
        Whatever ``EncoderDecoderASR.transcribe_file`` returns for the file
        (the recognised text).
    """
    source = "speechbrain/asr-crdnn-transformerlm-librispeech"
    if source not in _SPEECHBRAIN_MODELS:
        _SPEECHBRAIN_MODELS[source] = EncoderDecoderASR.from_hparams(
            source=source,
            savedir="pretrained_models/asr-crdnn-transformerlm-librispeech",
        )
    return _SPEECHBRAIN_MODELS[source].transcribe_file(str(file))

In [16]:
# Transcribe every file listed in df_compare from the cleaned English folder.
EN_clean_audio = Path(cwd/"Ex4_audio_files/EN/clean")
for i in df_compare.index:
    speech = speechbrainASR(EN_clean_audio / str(df_compare.loc[i, "File"]))
    # Single .loc write instead of chained df_compare["speechbrain"][i] = ...,
    # which may write to a temporary copy and leave df_compare unchanged.
    df_compare.loc[i, "speechbrain"] = speech

In [17]:
df_compare

Unnamed: 0,File,Transcription,Language,speechbrain,speechbrainWER
0,checkin.wav,where is the check in desk,EN,WHERE IS THE JACKET DESK,
1,checkin_child.wav,where is the check in desk,EN,WHERE IS THE JACK IN OFFICE,
2,parents.wav,i have lost my parents,EN,I HAVE LOST MY HEAD,
3,parents_child.wav,i have lost my parents,EN,LOST MY PARENTS,
4,suitcase.wav,please I have lost my suitcase,EN,PLEASE I HAVE LOST MYSELF,
5,suitcase_child.wav,please I have lost my suitcase,EN,PLEASE I'VE LOST MY SUIT CASE,
6,what_time.wav,what time is my plane,EN,WHAT TIME IS MY PLAN,
7,what_time_child.wav,what time is my plane,EN,WHAT TIME WAS MY OWN,
8,where.wav,where are the restaurants and shops,EN,WHERE ARE THE RESTAURANTS AND SHOPS,
9,where_child.wav,where are the restaurants and shops,EN,BURY THE RESTAURANTS AND SHOPS,


In [18]:
# Apply the same normalisation to reference and hypothesis so that casing,
# punctuation and spacing differences are not counted as word errors.
transformation = jiwer.Compose([
    jiwer.ToLowerCase(),
    jiwer.RemovePunctuation(),
    jiwer.RemoveWhiteSpace(replace_by_space=True),
    jiwer.RemoveMultipleSpaces(),
    jiwer.ReduceToListOfListOfWords(word_delimiter=" ")
])

# One WER value per row, in row order.
werList = [
    jiwer.wer(
        truth,
        hypothesis,
        truth_transform=transformation,
        hypothesis_transform=transformation,
    )
    for truth, hypothesis in zip(df_compare["Transcription"], df_compare["speechbrain"])
]
# Whole-column assignment replaces the chained df_compare[...][i] = ... write,
# which pandas may apply to a temporary copy.
df_compare["speechbrainWER"] = werList

# NaN rather than ZeroDivisionError when the table has no rows.
avgWER = 100 * sum(werList) / len(werList) if werList else float("nan")
print("Average WER = %0.3f \n" % avgWER)
df_compare

Average WER = 32.778 



Unnamed: 0,File,Transcription,Language,speechbrain,speechbrainWER
0,checkin.wav,where is the check in desk,EN,WHERE IS THE JACKET DESK,0.333333
1,checkin_child.wav,where is the check in desk,EN,WHERE IS THE JACK IN OFFICE,0.333333
2,parents.wav,i have lost my parents,EN,I HAVE LOST MY HEAD,0.2
3,parents_child.wav,i have lost my parents,EN,LOST MY PARENTS,0.4
4,suitcase.wav,please I have lost my suitcase,EN,PLEASE I HAVE LOST MYSELF,0.333333
5,suitcase_child.wav,please I have lost my suitcase,EN,PLEASE I'VE LOST MY SUIT CASE,0.666667
6,what_time.wav,what time is my plane,EN,WHAT TIME IS MY PLAN,0.2
7,what_time_child.wav,what time is my plane,EN,WHAT TIME WAS MY OWN,0.4
8,where.wav,where are the restaurants and shops,EN,WHERE ARE THE RESTAURANTS AND SHOPS,0.0
9,where_child.wav,where are the restaurants and shops,EN,BURY THE RESTAURANTS AND SHOPS,0.333333


The next ASR system I evaluate against is Facebook's Wav2Vec 2.0.

In [19]:
# Empty placeholder columns for the Wav2Vec transcription and its WER.
for result_column in ("wav2vec", "wav2vecWER"):
    df_compare[result_column] = ""
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
import soundfile as sf

In [20]:
# (processor, model) pairs that are already loaded, keyed by checkpoint name,
# so the checkpoint is loaded once instead of once per audio file.
_WAV2VEC_CACHE = {}


def wav2vecASR(file):
    """Transcribe one audio file with the pretrained Wav2Vec2 CTC model.

    Args:
        file: Path of the audio file, as a ``pathlib.Path`` or a string.

    Returns:
        The first entry of ``processor.batch_decode`` for the arg-max token
        ids (the recognised text).
    """
    checkpoint = "facebook/wav2vec2-base-960h"
    if checkpoint not in _WAV2VEC_CACHE:
        _WAV2VEC_CACHE[checkpoint] = (
            Wav2Vec2Processor.from_pretrained(checkpoint),
            Wav2Vec2ForCTC.from_pretrained(checkpoint),
        )
    processor, model = _WAV2VEC_CACHE[checkpoint]

    # Read the samples and close the file before running inference.
    with Path(file).open("rb") as sound:
        data, sr = sf.read(sound)

    # NOTE(review): the file's own sample rate is forwarded unchanged;
    # presumably the recordings already match the rate the checkpoint
    # expects - confirm.
    inputs = processor(data, sampling_rate=sr, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)[0]

In [21]:
# Transcribe every file listed in df_compare from the cleaned English folder.
EN_clean_audio = Path(cwd/"Ex4_audio_files/EN/clean")
for i in df_compare.index:
    speech = wav2vecASR(EN_clean_audio / str(df_compare.loc[i, "File"]))
    # Single .loc write instead of chained df_compare["wav2vec"][i] = ...,
    # which may write to a temporary copy and leave df_compare unchanged.
    df_compare.loc[i, "wav2vec"] = speech

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']

In [22]:
df_compare

Unnamed: 0,File,Transcription,Language,speechbrain,speechbrainWER,wav2vec,wav2vecWER
0,checkin.wav,where is the check in desk,EN,WHERE IS THE JACKET DESK,0.333333,WHERE IS THE CHECKEN DESK,
1,checkin_child.wav,where is the check in desk,EN,WHERE IS THE JACK IN OFFICE,0.333333,WHERE IS THE JACKEN DESS,
2,parents.wav,i have lost my parents,EN,I HAVE LOST MY HEAD,0.2,I HAVE LOST MY PARENIS,
3,parents_child.wav,i have lost my parents,EN,LOST MY PARENTS,0.4,HAVE LOST MY PARENTTS,
4,suitcase.wav,please I have lost my suitcase,EN,PLEASE I HAVE LOST MYSELF,0.333333,PLEASE OWI HAVE LOST MY SICCESSE,
5,suitcase_child.wav,please I have lost my suitcase,EN,PLEASE I'VE LOST MY SUIT CASE,0.666667,DREAS I LOST MY THREET CASE,
6,what_time.wav,what time is my plane,EN,WHAT TIME IS MY PLAN,0.2,WHAT TIME IS MY PLAYING,
7,what_time_child.wav,what time is my plane,EN,WHAT TIME WAS MY OWN,0.4,WOT TINE IS MY GRA,
8,where.wav,where are the restaurants and shops,EN,WHERE ARE THE RESTAURANTS AND SHOPS,0.0,WHERE ARE THE RESTAURATS AND SHOPS,
9,where_child.wav,where are the restaurants and shops,EN,BURY THE RESTAURANTS AND SHOPS,0.333333,WARALE RESTRATS AND SHOPS,


In [23]:
# Apply the same normalisation to reference and hypothesis so that casing,
# punctuation and spacing differences are not counted as word errors.
transformation = jiwer.Compose([
    jiwer.ToLowerCase(),
    jiwer.RemovePunctuation(),
    jiwer.RemoveWhiteSpace(replace_by_space=True),
    jiwer.RemoveMultipleSpaces(),
    jiwer.ReduceToListOfListOfWords(word_delimiter=" ")
])

# One WER value per row, in row order.
werList2 = [
    jiwer.wer(
        truth,
        hypothesis,
        truth_transform=transformation,
        hypothesis_transform=transformation,
    )
    for truth, hypothesis in zip(df_compare["Transcription"], df_compare["wav2vec"])
]
# Whole-column assignment replaces the chained df_compare[...][i] = ... write,
# which pandas may apply to a temporary copy.
df_compare["wav2vecWER"] = werList2

# NaN rather than ZeroDivisionError when the table has no rows.
averageWER = 100 * sum(werList2) / len(werList2) if werList2 else float("nan")
print("Average WER = %0.3f \n" % averageWER)
df_compare

Average WER = 35.556 



Unnamed: 0,File,Transcription,Language,speechbrain,speechbrainWER,wav2vec,wav2vecWER
0,checkin.wav,where is the check in desk,EN,WHERE IS THE JACKET DESK,0.333333,WHERE IS THE CHECKEN DESK,0.333333
1,checkin_child.wav,where is the check in desk,EN,WHERE IS THE JACK IN OFFICE,0.333333,WHERE IS THE JACKEN DESS,0.5
2,parents.wav,i have lost my parents,EN,I HAVE LOST MY HEAD,0.2,I HAVE LOST MY PARENIS,0.2
3,parents_child.wav,i have lost my parents,EN,LOST MY PARENTS,0.4,HAVE LOST MY PARENTTS,0.4
4,suitcase.wav,please I have lost my suitcase,EN,PLEASE I HAVE LOST MYSELF,0.333333,PLEASE OWI HAVE LOST MY SICCESSE,0.333333
5,suitcase_child.wav,please I have lost my suitcase,EN,PLEASE I'VE LOST MY SUIT CASE,0.666667,DREAS I LOST MY THREET CASE,0.666667
6,what_time.wav,what time is my plane,EN,WHAT TIME IS MY PLAN,0.2,WHAT TIME IS MY PLAYING,0.2
7,what_time_child.wav,what time is my plane,EN,WHAT TIME WAS MY OWN,0.4,WOT TINE IS MY GRA,0.6
8,where.wav,where are the restaurants and shops,EN,WHERE ARE THE RESTAURANTS AND SHOPS,0.0,WHERE ARE THE RESTAURATS AND SHOPS,0.166667
9,where_child.wav,where are the restaurants and shops,EN,BURY THE RESTAURANTS AND SHOPS,0.333333,WARALE RESTRATS AND SHOPS,0.666667
