In [76]:
# Structred Data
import numpy as np
import pandas as pd
import torch
import pickle

# System Libraries
import os
import librosa
import soundfile as sf
import nltk

# Garbage Collection
import gc

# Visualization
from IPython.display import Audio, display
import matplotlib.pyplot as plt

# Whisper Model
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset

#### Set Paths

In [2]:
%cd ..
DIRECTORY_PATH = os.getcwd()

d:\ML\Real-Time-Speech-Recognition-and-Translation-


In [20]:
# Paths
VALIDATED_DATA_PATH = r"\Common_Voice\validated.tsv"
VALIDATED_SENTENCES_PATH = r"\Common_Voice\unvalidated_sentences.tsv"
CLIP_DURATION_PATH = r"\Common_Voice\clip_durations.tsv"
CLIPS_PATH = r"\Common_Voice\clips\\"
CLIPS_WAV_PATH = r"\Common_Voice\clips_wav\\"

# Constants
NYQUIST_SAMPLING_RATE = 16000

In [12]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
print(f"Using device: {device}")

Using device: cuda:0


In [13]:
#Load Model
model_id = "openai/whisper-large-v3-turbo"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)


pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

In [90]:
def ConvertToWav(paths):
    # Add Padding
    for path in paths:
        wavform, sr = librosa.load(DIRECTORY_PATH + CLIPS_PATH + path)
        wavform = librosa.resample(wavform, orig_sr=sr, target_sr=NYQUIST_SAMPLING_RATE)
        sf.write(DIRECTORY_PATH + CLIPS_WAV_PATH + path[:-4] + ".wav", wavform, sr, format='wav')

def getTokenizedSentences(sentences):
    tokenized_sentence = processor.tokenizer.tokenize(sentences)   #False --> Word
    return tokenized_sentence

def TranscribeAudios(paths):
    Transcribed_Sentences = pipe(paths)
    return Transcribed_Sentences

def fixPaths(paths):
    new_paths = []
    for path in paths:
        new_paths.append( DIRECTORY_PATH + CLIPS_WAV_PATH + path[:-4] + ".wav")
    return new_paths

def CalculateAvgBLEUScore(Transcribed_Sentences, Validated_Sentences, tokenizer):
    BLEU_Scores = []
    for i in range(len(Transcribed_Sentences)):
        Valid_Sentence = tokenizer.tokenize(Validated_Sentences[i])
        Transcribed_Sentence = tokenizer.tokenize(Transcribed_Sentences[i]['text'])
        BLEU_Scores.append(nltk.translate.bleu_score.sentence_bleu([Valid_Sentence], Transcribed_Sentence))
    return np.mean(BLEU_Scores)

In [88]:
# AudioPreprocessing
unfiltered_dataset = pd.read_csv(DIRECTORY_PATH + VALIDATED_DATA_PATH , sep='\t')[["path", "sentence", "up_votes"]]
ConvertToWav(unfiltered_dataset["path"].tolist())
unfiltered_dataset["path"] = fixPaths(unfiltered_dataset["path"].tolist())

In [73]:
# Transcribe
Transcribed_Sentences = TranscribeAudios(unfiltered_dataset["path"].tolist())
Real_Sentences = unfiltered_dataset["sentence"].tolist()



In [91]:
# Calculate BLEU Score
BLEU_Score = CalculateAvgBLEUScore(Transcribed_Sentences, Real_Sentences, processor.tokenizer)
print(f"BLEU Score: {BLEU_Score}")

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU Score: 0.6801078611053675


In [63]:
for i in range(2):
    BLEUscore = nltk.translate.bleu_score.sentence_bleu([tokenized_real_Sentences[i]], tokenized_predicted_Sentences[i])
    print(BLEUscore)

1.0
1.0


In [11]:
display(unfiltered_dataset.head(5))

Unnamed: 0,path,sentence,up_votes
0,common_voice_en_40353084.wav,She hides her grief and joins in the homecomin...,2
1,common_voice_en_40632268.wav,Proksch stayed at the palace of his personal f...,4
2,common_voice_en_40613175.wav,She had an uneventful career.,3
3,common_voice_en_40434934.wav,Let's eat the chocolate tonight.,2
4,common_voice_en_40401859.wav,Newport was a county borough.,2
