In [1]:
# Structred Data
import numpy as np
import pandas as pd
import torch
import pickle

# System Libraries
import os
import librosa
import soundfile as sf
import nltk
from pydub import AudioSegment

# Garbage Collection
import gc

# Visualization
from IPython.display import Audio, display
import matplotlib.pyplot as plt

# Whisper Model
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset

# Helsinki-NLP
from transformers import MarianMTModel, MarianTokenizer

# Evaluation
from rouge_score import rouge_scorer

#### Set Paths

In [3]:
%cd ..
DIRECTORY_PATH = os.getcwd()

d:\ML\Real-Time-Speech-Recognition-and-Translation-


In [2]:
# Paths
VALIDATED_DATA_PATH = r"\Common_Voice\validated.tsv"
VALIDATED_SENTENCES_PATH = r"\Common_Voice\unvalidated_sentences.tsv"
CLIP_DURATION_PATH = r"\Common_Voice\clip_durations.tsv"
CLIPS_PATH = r"\Common_Voice\clips\\"
CLIPS_WAV_PATH = r"\Common_Voice\clips_wav\\"

# Constants
NYQUIST_SAMPLING_RATE = 16000

In [3]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
print(f"Using device: {device}")

Using device: cuda:0


In [4]:
#Load Whisper Model
model_id = "openai/whisper-large-v3-turbo"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, attn_implementation="sdpa"
)
# Load Helsinki-NLP Model
model_name_translate = "Helsinki-NLP/opus-mt-en-ar"
tokenizer_translation = MarianTokenizer.from_pretrained(model_name_translate)
model_translate = MarianMTModel.from_pretrained(model_name_translate)
model_translate.to(device)

AttributeError: 'WhisperForConditionalGeneration' object has no attribute 'save'

In [19]:
processor = AutoProcessor.from_pretrained(model_id)


pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

In [20]:
def ConvertToWav(paths):
    # Add Padding
    for path in paths:
        wavform, sr = librosa.load(DIRECTORY_PATH + CLIPS_PATH + path)
        wavform = librosa.resample(wavform, orig_sr=sr, target_sr=NYQUIST_SAMPLING_RATE)
        sf.write(DIRECTORY_PATH + CLIPS_WAV_PATH + path[:-4] + ".wav", wavform, sr, format='wav')

def getTokenizedSentences(sentences):
    tokenized_sentence = processor.tokenizer.tokenize(sentences)   #False --> Word
    return tokenized_sentence

def TranscribeAudios(paths):
    Transcribed_Sentences = pipe(paths, generate_kwargs={"task": "translate", "language": "ar"})
    return Transcribed_Sentences

def fixPaths(paths):
    new_paths = []
    for path in paths:
        new_paths.append( DIRECTORY_PATH + CLIPS_WAV_PATH + path[:-4] + ".wav")
    return new_paths

def CalculateAvgBLEUScore(Transcribed_Sentences, Validated_Sentences, tokenizer):
    BLEU_Scores = []
    for i in range(len(Transcribed_Sentences)):
        Valid_Sentence = tokenizer.tokenize(Validated_Sentences[i])
        Transcribed_Sentence = tokenizer.tokenize(Transcribed_Sentences[i]['text'])
        BLEU_Scores.append(nltk.translate.bleu_score.sentence_bleu([Valid_Sentence], Transcribed_Sentence))
    return np.mean(BLEU_Scores)

def CalculateAvgROUGEScore(Transcribed_Sentences, Validated_Sentences):
    rs = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    ROUGE1_Percision_Scores = []
    ROUGE1_Recall_Scores = []
    ROUGE1_F1_Scores = []
    ROUGE2_Percision_Scores = []
    ROUGE2_Recall_Scores = []
    ROUGE2_F1_Scores = []
    ROUGEL_Percision_Scores = []
    ROUGEL_Recall_Scores = []
    ROUGEL_F1_Scores = []
    for i in range(len(Transcribed_Sentences)):
        rouge_score = rs.score(Validated_Sentences[i], Transcribed_Sentences[i]['text'])
        ROUGE1_Percision_Scores.append(rouge_score['rouge1'][0])
        ROUGE1_Recall_Scores.append(rouge_score['rouge1'][1])
        ROUGE1_F1_Scores.append(rouge_score['rouge1'][2])
        ROUGE2_Percision_Scores.append(rouge_score['rouge2'][0])
        ROUGE2_Recall_Scores.append(rouge_score['rouge2'][1])
        ROUGE2_F1_Scores.append(rouge_score['rouge2'][2])
        ROUGEL_Percision_Scores.append(rouge_score['rougeL'][0])
        ROUGEL_Recall_Scores.append(rouge_score['rougeL'][1])
        ROUGEL_F1_Scores.append(rouge_score['rougeL'][2])
    return {"rouge1" : [np.mean(ROUGE1_Percision_Scores), np.mean(ROUGE1_Recall_Scores), np.mean(ROUGE1_F1_Scores)], 
            "rouge2" : [np.mean(ROUGE2_Percision_Scores), np.mean(ROUGE2_Recall_Scores), np.mean(ROUGE2_F1_Scores)],
            "rougeL" : [np.mean(ROUGEL_Percision_Scores), np.mean(ROUGEL_Recall_Scores), np.mean(ROUGEL_F1_Scores)]}
        
def TranslateSentence(Transcribed_Sentences):
    Translated_Sentences = []
    for sentence in Transcribed_Sentences:
        batch = tokenizer_translation([sentence['text']], return_tensors="pt")
        generated_ids = model_translate.generate(batch["input_ids"].to(device))
        Translated_Sentences.append(tokenizer_translation.batch_decode(generated_ids, skip_special_tokens=True)[0])
    return Translated_Sentences
        

In [21]:
# AudioPreprocessing
unfiltered_dataset = pd.read_csv(DIRECTORY_PATH + VALIDATED_DATA_PATH , sep='\t')[["path", "sentence", "up_votes"]]
ConvertToWav(unfiltered_dataset["path"].tolist())
unfiltered_dataset["path"] = fixPaths(unfiltered_dataset["path"].tolist())

In [22]:
# Transcribe (Do NOT Run this cell it is time consuming)
Transcribed_Sentences = TranscribeAudios(unfiltered_dataset["path"].tolist()[:3])
Real_Sentences = unfiltered_dataset["sentence"].tolist()[:3]




In [17]:
from transformers.utils import is_torch_sdpa_available
print(is_torch_sdpa_available())

True


In [23]:
print(Transcribed_Sentences)
print(Real_Sentences)

[{'text': ' ترجمة نانسي قنقر'}, {'text': ' راكشك في الهرناندز مرخوز'}, {'text': ' ترجمة نانسي قرار'}]
['She hides her grief and joins in the homecoming celebration.', 'Proksch stayed at the palace of his personal friend, the dictator Ferdinand Marcos.', 'She had an uneventful career.']


In [9]:
with open('Pickle_Files/Transcribed_Sentences.pkl', 'rb') as f:
    Transcribed_Sentences = pickle.load(f)
Real_Sentences = unfiltered_dataset["sentence"].tolist()

In [10]:
# Calculate BLEU Score
BLEU_Score = CalculateAvgBLEUScore(Transcribed_Sentences, Real_Sentences, processor.tokenizer)
print(f"BLEU Score: {BLEU_Score}")

BLEU Score: 0.6801078611053675


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [11]:
ROUGE_Scores = CalculateAvgROUGEScore(Transcribed_Sentences, Real_Sentences)
print("               Precision           | Recall             | F1")
print(f"Rouge_1 Score: {ROUGE_Scores['rouge1']}")
print(f"Rouge_2 Score: {ROUGE_Scores['rouge2']}")
print(f"Rouge_L Score: {ROUGE_Scores['rougeL']}")

               Precision           | Recall             | F1
Rouge_1 Score: [0.9415822418889903, 0.9426635481083333, 0.9419391677910781]
Rouge_2 Score: [0.9064298857243641, 0.9075811032866249, 0.906802859490404]
Rouge_L Score: [0.9415822418889903, 0.9426635481083333, 0.9419391677910781]


In [39]:
Arabic_Sentences = TranslateSentence(Transcribed_Sentences)
print(Arabic_Sentences)

['إنها تخفي حزنها وتنضم إلى احتفالات العودة للوطن', 'ومكث روكشيك في قصر صديقه الشخصي، الدكتاتور فيرنانديز ماركوس.', 'كان لديها حياة وظيفية غير مستقرة.', 'دعنا نأكل الشوكولاتة الليلة', '(نيوبورت) كانت مقاطعة.', 'نموذج هيكمان يقع في هذا النوع.', 'وفقدت هذه الكتابة أسطولا من الحبوب في الركن الريفي الجنوبي الغربي من سانت كاتارنيس.', 'ويتميز بكاميرا نهارية وليلية وتتابع الفيديو.', 'والصيغ الأكثر حداثة للضريبة لم تعد تتطلب طابعا فعليا.', 'ويستند التصنيف إلى توقيت الاستخلاص الذي ينظم مؤقتاً.', 'لقد وضع جليسة بيضاء على العشاء', 'وهي تشكل أكبر بحيرة اصطناعية في ماوي، بحيرة مانانتالي.', '(فوكس) قام بتكرار بث البرنامج مراراً وتكراراً ليفسح المجال لعروض أخرى.', 'وأنتج فيرنر أشرطة فيديو وثائقية قصيرة عن التاريخ القانوني والقانون المقارن.', 'عندما شخص اللاعب يلمس وحشاً، يموتون.', 'وافتتح مؤخرا مكتبا الجمارك وحماية الحدود في مبنى الشحن السابق في أقصى الشرق.', 'حصل زملاء الدراسة والأصدقاء على منحة دراسية باسمه في كارنيج ميلن.', 'وقد نشأ ذلك في الازمنة التي لم يُسمح فيها للرقيق ان يلعبوا الآلات الموسيق