In [4]:
from pydub import AudioSegment
import os

# Set FFmpeg path to the virtual environment's ffmpeg folder
ffmpeg_path = os.path.join(os.environ['VIRTUAL_ENV'], "ffmpeg", "ffmpeg.exe")
AudioSegment.converter = ffmpeg_path

ffprobe_path = os.path.join(os.environ['VIRTUAL_ENV'], "ffmpeg", "ffprobe.exe")
AudioSegment.ffprobe = ffprobe_path


In [6]:
import os
import torch
import librosa
from pydub import AudioSegment
from transformers import AutoProcessor, AutoModelForCTC

# Load the model and processor
processor = AutoProcessor.from_pretrained("mohammed/arabic-speech-recognition")
model = AutoModelForCTC.from_pretrained("mohammed/arabic-speech-recognition")

# Function to convert .m4a to .mp3
def convert_m4a_to_mp3(m4a_file, mp3_file):
    try:
        if not os.path.exists(m4a_file):
            raise FileNotFoundError(f"File {m4a_file} does not exist.")
        audio = AudioSegment.from_file(m4a_file, format="m4a")
        audio.export(mp3_file, format="mp3")
        print(f"Converted {m4a_file} to {mp3_file}")
    except Exception as e:
        print(f"Error converting {m4a_file} to mp3: {e}")

# Function to transcribe audio and return the letter said
def transcribe_audio(audio_file):
    try:
        # Convert .m4a to .mp3 if necessary
        if audio_file.endswith('.m4a'):
            mp3_file = audio_file.replace('.m4a', '.mp3')
            convert_m4a_to_mp3(audio_file, mp3_file)
            audio_file = mp3_file

        # Ensure the audio file exists
        if not os.path.exists(audio_file):
            raise FileNotFoundError(f"Audio file {audio_file} not found.")

        # Load the audio file
        speech, _ = librosa.load(audio_file, sr=16000)

        # Process the audio file
        input_values = processor(speech, return_tensors="pt", sampling_rate=16000).input_values

        # Perform inference
        with torch.no_grad():
            logits = model(input_values).logits

        # Decode the predicted ids to text
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)[0]

        # Extract the letter from the transcription
        letter = transcription.strip()

        return letter
    except Exception as e:
        print(f"Error transcribing audio: {e}")
        return None

# Example usage
audio_file = "../uploads/raOut.m4a"
if not os.path.exists(audio_file):
    print(f"Error: File {audio_file} does not exist. Please check the file path.")
else:
    letter = transcribe_audio(audio_file)
    if letter:
        print(f"The letter said in the audio is: {letter}")
    else:
        print("Failed to transcribe the audio.")




Error converting ../uploads/raOut.m4a to mp3: [WinError 2] The system cannot find the file specified
The letter said in the audio is: رء
