In [12]:
# Structured Data
import numpy as np
import pandas as pd
import torch
import pickle

# System Libraries
import os
import librosa
import soundfile as sf
import nltk

# Garbage Collection
import gc

# Visualization
from IPython.display import Audio, display
import matplotlib.pyplot as plt

# Whisper Model
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset

# Evaluation
from rouge_score import rouge_scorer

#### Set Paths

In [2]:
# Step up one directory and record the resulting working directory.
# Later cells build file paths by string concatenation onto DIRECTORY_PATH.
# NOTE(review): `%cd ..` is relative, so re-running this cell moves up one more level each time.
%cd ..
DIRECTORY_PATH = os.getcwd()

d:\ML\Real-Time-Speech-Recognition-and-Translation-


In [3]:
# Paths
# Windows-style suffixes; the ones used in this notebook are appended to
# DIRECTORY_PATH by plain string concatenation, hence the leading backslash.
VALIDATED_DATA_PATH = r"\Common_Voice\validated.tsv"
# NOTE(review): the name says "validated" but the file is unvalidated_sentences.tsv — confirm which is intended.
VALIDATED_SENTENCES_PATH = r"\Common_Voice\unvalidated_sentences.tsv"
CLIP_DURATION_PATH = r"\Common_Voice\clip_durations.tsv"
# In a raw string the trailing `\\` stays as two literal backslashes, so both
# directory suffixes below end in a doubled separator.
CLIPS_PATH = r"\Common_Voice\clips\\"
CLIPS_WAV_PATH = r"\Common_Voice\clips_wav\\"

# Constants
# Target sample rate passed to the resampler when clips are converted to WAV.
# NOTE(review): 16000 is used as a sampling rate, not a Nyquist frequency — the name may mislead.
NYQUIST_SAMPLING_RATE = 16000

In [4]:
# Pick the compute device and the matching tensor precision in one place:
# half precision on the first CUDA GPU, single precision on CPU.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32
print(f"Using device: {device}")

Using device: cuda:0


In [5]:
#Load Model
# Checkpoint identifier shared by the model and processor loads below.
model_id = "openai/whisper-large-v3-turbo"

# Load the speech seq2seq model using the dtype selected for the current device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

# The processor supplies the tokenizer and feature extractor handed to the pipeline.
processor = AutoProcessor.from_pretrained(model_id)


# Speech-recognition pipeline called by TranscribeAudios; it reuses the model,
# dtype and device configured above.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

In [38]:
def ConvertToWav(paths):
    """Write a WAV copy of each clip, sampled at NYQUIST_SAMPLING_RATE.

    Each clip is read from DIRECTORY_PATH + CLIPS_PATH and written to
    DIRECTORY_PATH + CLIPS_WAV_PATH. The output name is the input name with
    its last four characters replaced by ".wav" (this assumes a
    three-letter extension such as ".mp3" — TODO confirm for all inputs).

    Args:
        paths: iterable of clip file names relative to the clips directory.

    Returns:
        None. The converted files are written to disk.
    """
    for path in paths:
        # Decode straight to the target rate. Without `sr`, librosa first
        # resamples to its own default rate, which costs an extra pass.
        wavform, _ = librosa.load(
            DIRECTORY_PATH + CLIPS_PATH + path, sr=NYQUIST_SAMPLING_RATE
        )
        # The rate written to the WAV header must be the rate of the samples.
        # Writing the pre-resample rate here makes every reader play the clip
        # at the wrong speed and pitch.
        sf.write(
            DIRECTORY_PATH + CLIPS_WAV_PATH + path[:-4] + ".wav",
            wavform,
            NYQUIST_SAMPLING_RATE,
            format='wav',
        )

def getTokenizedSentences(sentences):
    """Tokenize text with the tokenizer of the notebook-level `processor`.

    Args:
        sentences: text forwarded unchanged to `processor.tokenizer.tokenize`.

    Returns:
        Whatever `processor.tokenizer.tokenize` returns for that input.
    """
    return processor.tokenizer.tokenize(sentences)

def TranscribeAudios(paths):
    """Run the notebook-level speech-recognition `pipe` on the given inputs.

    Args:
        paths: audio file path(s), forwarded unchanged to `pipe`.

    Returns:
        The pipeline's output for `paths`, unmodified.
    """
    return pipe(paths)

def fixPaths(paths):
    """Map clip file names to the full paths of their converted WAV files.

    The last four characters of every name are dropped and ".wav" is appended,
    then the result is prefixed with DIRECTORY_PATH + CLIPS_WAV_PATH.

    Args:
        paths: iterable of clip file names.

    Returns:
        A new list of path strings, in the same order as `paths`.
    """
    return [
        DIRECTORY_PATH + CLIPS_WAV_PATH + clip_name[:-4] + ".wav"
        for clip_name in paths
    ]

def CalculateAvgBLEUScore(Transcribed_Sentences, Validated_Sentences, tokenizer):
    """Average sentence-level BLEU of transcriptions against their references.

    Args:
        Transcribed_Sentences: sequence of mappings, each with a 'text' entry
            holding the transcription.
        Validated_Sentences: sequence of reference strings, indexed in
            parallel with `Transcribed_Sentences`.
        tokenizer: object with a `tokenize(text)` method, applied to both the
            reference and the transcription.

    Returns:
        `np.mean` of the per-sentence BLEU scores (nan when there are none).
    """
    per_sentence = []
    for idx, hypothesis in enumerate(Transcribed_Sentences):
        reference_tokens = tokenizer.tokenize(Validated_Sentences[idx])
        hypothesis_tokens = tokenizer.tokenize(hypothesis['text'])
        # sentence_bleu takes a list of references; there is exactly one here.
        score = nltk.translate.bleu_score.sentence_bleu(
            [reference_tokens], hypothesis_tokens
        )
        per_sentence.append(score)
    return np.mean(per_sentence)

def CalculateAvgROUGEScore(Transcribed_Sentences, Validated_Sentences):
    """Average ROUGE-1, ROUGE-2 and ROUGE-L over all sentence pairs.

    Args:
        Transcribed_Sentences: sequence of mappings, each with a 'text' entry
            holding the transcription.
        Validated_Sentences: sequence of reference strings, indexed in
            parallel with `Transcribed_Sentences`.

    Returns:
        Dict with keys "rouge1", "rouge2" and "rougeL". Each value is a list
        of three means taken from positions 0, 1 and 2 of the per-sentence
        score (precision, recall, F1).
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    metric_names = ("rouge1", "rouge2", "rougeL")
    # collected[name][k] gathers component k of metric `name` for every pair.
    collected = {name: ([], [], []) for name in metric_names}

    for idx, hypothesis in enumerate(Transcribed_Sentences):
        # The reference goes first, the transcription second.
        result = scorer.score(Validated_Sentences[idx], hypothesis['text'])
        for name in metric_names:
            for component in range(3):
                collected[name][component].append(result[name][component])

    return {
        name: [np.mean(values) for values in collected[name]]
        for name in metric_names
    }
        
    

In [17]:
# AudioPreprocessing
# Read the tab-separated clip table and keep only the path, sentence and up_votes columns.
unfiltered_dataset = pd.read_csv(DIRECTORY_PATH + VALIDATED_DATA_PATH , sep='\t')[["path", "sentence", "up_votes"]]
# Write a WAV copy of every clip listed in the table.
ConvertToWav(unfiltered_dataset["path"].tolist())
# Replace the clip file names with the full paths of the converted WAV files.
unfiltered_dataset["path"] = fixPaths(unfiltered_dataset["path"].tolist())

In [73]:
# Transcribe (do NOT run this cell; it is time-consuming)
# Runs the speech-recognition pipeline over every WAV path in the dataset.
# The next cell assigns Transcribed_Sentences from a pickle file instead.
Transcribed_Sentences = TranscribeAudios(unfiltered_dataset["path"].tolist())
# Reference sentences, taken from the same table as the paths above.
Real_Sentences = unfiltered_dataset["sentence"].tolist()



In [18]:
# Load transcriptions from a pickle file rather than calling the pipeline.
# NOTE(review): pickle.load can execute arbitrary code — only load files created locally.
# NOTE(review): the relative path is resolved against the current working directory.
with open('Pickle_Files/Transcribed_Sentences.pkl', 'rb') as f:
    Transcribed_Sentences = pickle.load(f)
# Reference sentences; assumed to line up index-for-index with the pickled
# transcriptions — TODO confirm the pickle was built from this same table.
Real_Sentences = unfiltered_dataset["sentence"].tolist()

In [32]:
# Calculate BLEU Score
# Mean sentence-level BLEU; references and transcriptions are both tokenized
# with the processor's tokenizer before scoring.
BLEU_Score = CalculateAvgBLEUScore(Transcribed_Sentences, Real_Sentences, processor.tokenizer)
print(f"BLEU Score: {BLEU_Score}")

BLEU Score: 0.6801078611053675
ROUGE Score: [{'rouge1': Score(precision=1.0, recall=1.0, fmeasure=1.0), 'rouge2': Score(precision=1.0, recall=1.0, fmeasure=1.0), 'rougeL': Score(precision=1.0, recall=1.0, fmeasure=1.0)}, {'rouge1': Score(precision=0.8461538461538461, recall=0.8461538461538461, fmeasure=0.8461538461538461), 'rouge2': Score(precision=0.75, recall=0.75, fmeasure=0.75), 'rougeL': Score(precision=0.8461538461538461, recall=0.8461538461538461, fmeasure=0.8461538461538461)}, {'rouge1': Score(precision=1.0, recall=1.0, fmeasure=1.0), 'rouge2': Score(precision=1.0, recall=1.0, fmeasure=1.0), 'rougeL': Score(precision=1.0, recall=1.0, fmeasure=1.0)}, {'rouge1': Score(precision=1.0, recall=1.0, fmeasure=1.0), 'rouge2': Score(precision=1.0, recall=1.0, fmeasure=1.0), 'rougeL': Score(precision=1.0, recall=1.0, fmeasure=1.0)}, {'rouge1': Score(precision=1.0, recall=1.0, fmeasure=1.0), 'rouge2': Score(precision=1.0, recall=1.0, fmeasure=1.0), 'rougeL': Score(precision=1.0, recall=1.0

In [47]:
# Average ROUGE values over all sentence pairs, printed one metric per row
# under a precision / recall / F1 header.
ROUGE_Scores = CalculateAvgROUGEScore(Transcribed_Sentences, Real_Sentences)
print("               Precision           | Recall             | F1")
for row_label, metric_key in (("Rouge_1", "rouge1"), ("Rouge_2", "rouge2"), ("Rouge_L", "rougeL")):
    print(f"{row_label} Score: {ROUGE_Scores[metric_key]}")

               Precision           | Recall             | F1
Rouge_1 Score: [0.9415822418889903, 0.9426635481083333, 0.9419391677910781]
Rouge_2 Score: [0.9064298857243641, 0.9075811032866249, 0.906802859490404]
Rouge_L Score: [0.9415822418889903, 0.9426635481083333, 0.9419391677910781]
