# Read Audio Files

Read WAV or MP3 Files with Python and transcribe to text.

### Install the Required Libraries

In [None]:
%pip install azure-cognitiveservices-speech
%pip install openai

### Python Imports

In [1]:
import azure.cognitiveservices.speech as speech_sdk
import sys
import time
import os
from dotenv import load_dotenv
from openai import AzureOpenAI

# Make the sibling `code` directory importable. os.path.join picks the right
# separator for the host OS (the previous literal '..\\code' only resolved on Windows).
sys.path.append(os.path.join('..', 'code'))

# Load settings from a .env file; override=True lets values from the file
# replace variables that are already set in the process environment.
load_dotenv(override=True)


True

### Make sure we have the Azure Speech information

We will need the Speech APIKEY, REGION and LANGUAGE for this notebook.

When running the cell below, the values should reflect the Azure Speech resource you have created, as configured in your `.env` file.

In [None]:
# Azure Speech settings read from the process environment.
# A value is None when the corresponding variable is not set.
speech_info = {
    'SPEECH_APIKEY': os.environ.get('SPEECH_APIKEY'),
    'SPEECH_REGION': os.environ.get('SPEECH_REGION'),
    'SPEECH_LANGUAGE': os.environ.get('SPEECH_LANGUAGE'),
}

# Show the settings for verification without writing the secret into the saved
# notebook output: a configured API key is shown as '***', a missing one as None.
{
    name: ('***' if name == 'SPEECH_APIKEY' and value else value)
    for name, value in speech_info.items()
}

In [None]:
# Azure OpenAI (Whisper) settings read from the process environment.
# A value is None when the corresponding variable is not set.
# Each key appears exactly once (the original literal repeated
# 'AZURE_OPENAI_MODEL_WHISPER', which Python silently collapses).
model_info = {
    'AZURE_OPENAI_MODEL_WHISPER': os.environ.get('AZURE_OPENAI_MODEL_WHISPER'),
    'AZURE_OPENAI_KEY': os.environ.get('AZURE_OPENAI_KEY'),
    'AZURE_OPENAI_ENDPOINT_WHISPER': os.environ.get('AZURE_OPENAI_ENDPOINT_WHISPER'),
    'AZURE_OPENAI_VERSION_WHISPER': os.environ.get('AZURE_OPENAI_VERSION_WHISPER'),
}

# Show the settings for verification without writing the secret into the saved
# notebook output: a configured API key is shown as '***', a missing one as None.
{
    name: ('***' if name == 'AZURE_OPENAI_KEY' and value else value)
    for name, value in model_info.items()
}

### Code Definitions

Defining the functions that will read in the audio file and return the transcription.

In [4]:
# Configure the Azure Speech Service
def config_speech_service():
    """Build the Azure Speech configuration from the values in ``speech_info``.

    Returns:
        The ``speech_sdk.SpeechConfig`` created from the configured API key,
        region and recognition language, with the silence timeouts applied
        when the SDK accepts them.

    Raises:
        Any exception raised by ``speech_sdk.SpeechConfig`` itself (for example
        when the key or region is not usable). Previously such a failure was
        printed and then surfaced as an unrelated ``UnboundLocalError``.
    """
    # Create the config outside the try block: when this fails there is no
    # object to return, so the real error must reach the caller.
    speech_config = speech_sdk.SpeechConfig(
        subscription=speech_info['SPEECH_APIKEY'],
        region=speech_info['SPEECH_REGION'],
        speech_recognition_language=speech_info['SPEECH_LANGUAGE'])

    try:
        # Set parameters (per the property names, values are milliseconds given as strings)
        speech_config.set_property(speech_sdk.PropertyId.SpeechServiceConnection_InitialSilenceTimeoutMs, "5000")
        speech_config.set_property(speech_sdk.PropertyId.Speech_SegmentationSilenceTimeoutMs, "2000")
        speech_config.set_property(speech_sdk.PropertyId.SpeechServiceConnection_EndSilenceTimeoutMs, "5000")

    except Exception as ex:
        # Timeout tuning stays best-effort: report the problem and still
        # return the config that was created above.
        print(ex)

    return speech_config

# Execute the transcription from file with Azure Speech service
def speech_recognize_continuous_from_file(speech_config, filename):
    """Perform continuous speech recognition with input from an audio file.

    The call blocks until the recognizer reports that the session stopped or
    was canceled, or until a result with reason ``NoMatch`` is received.

    Args:
        speech_config: Speech configuration handed to ``speech_sdk.SpeechRecognizer``.
        filename: Path of the audio file handed to ``speech_sdk.AudioConfig``.

    Returns:
        list: The text of every result recognized as speech, in the order the
        ``recognized`` events arrived.
    """
    audio_config = speech_sdk.AudioConfig(filename=filename)

    speech_recognizer = speech_sdk.SpeechRecognizer(speech_config, audio_config)

    done = False
    transcription = []

    # Callback that signals to stop continuous recognition upon receiving an event `evt`
    def stop_cb(evt: speech_sdk.SessionEventArgs):
        print('CLOSING')
        nonlocal done
        done = True

    # Callback that signals the recognition has been canceled
    def speech_recognizer_recognition_canceled_cb(evt: speech_sdk.SessionEventArgs):
        print('Canceled event')

    # Callback that signals the recognition session has been stopped
    def speech_recognizer_session_stopped_cb(evt: speech_sdk.SessionEventArgs):
        print('SessionStopped event')

    # Callback while transcribing (intermediate, not yet final, hypotheses)
    def speech_recognizer_recognizing_cb(evt: speech_sdk.SpeechRecognitionEventArgs):
        print('Transcribing: ', evt.result.text)

    # Callback when a sentence has finished
    def speech_recognizer_transcribed_cb(evt: speech_sdk.SpeechRecognitionEventArgs):
        print('TRANSCRIBED:')
        if evt.result.reason == speech_sdk.ResultReason.RecognizedSpeech:
            print(f'\tText: {evt.result.text}')
            transcription.append(evt.result.text)
        elif evt.result.reason == speech_sdk.ResultReason.NoMatch:
            print(f'\tNOMATCH: Speech could not be TRANSCRIBED: {evt.result.no_match_details}')
            # A NoMatch result ends the wait loop below.
            stop_cb(evt)

    # Callback that signal the session has started
    def speech_recognizer_session_started_cb(evt: speech_sdk.SessionEventArgs):
        print('SessionStarted event')

    # Connect callbacks to the events fired by the speech recognizer
    speech_recognizer.recognizing.connect(speech_recognizer_recognizing_cb)
    speech_recognizer.recognized.connect(speech_recognizer_transcribed_cb)
    speech_recognizer.session_started.connect(speech_recognizer_session_started_cb)
    speech_recognizer.session_stopped.connect(speech_recognizer_session_stopped_cb)
    speech_recognizer.canceled.connect(speech_recognizer_recognition_canceled_cb)
    # stop transcribing on either session stopped or canceled events
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    # Start continuous speech recognition
    speech_recognizer.start_continuous_recognition()
    try:
        # Poll the flag that the callbacks set once recognition is finished.
        while not done:
            time.sleep(.5)
    finally:
        # Always stop the recognizer, including when the wait is interrupted
        # (for example by interrupting the kernel), so it does not keep running.
        speech_recognizer.stop_continuous_recognition()

    return transcription

def config_whisper():
    """Create the Azure OpenAI client used for Whisper transcription.

    The client is pointed directly at the Whisper deployment URL built from
    the endpoint and deployment name stored in ``model_info``.

    Returns:
        AzureOpenAI: client configured with the key, API version and
        deployment URL from ``model_info``.

    Raises:
        ValueError: if the endpoint or the deployment name is not configured,
            since no valid deployment URL can be built without them.
    """
    endpoint = model_info['AZURE_OPENAI_ENDPOINT_WHISPER']
    deployment = model_info['AZURE_OPENAI_MODEL_WHISPER']

    # Fail early with a readable message instead of building a URL such as
    # "None/openai/deployments/None" that only fails at request time.
    required = (
        ('AZURE_OPENAI_ENDPOINT_WHISPER', endpoint),
        ('AZURE_OPENAI_MODEL_WHISPER', deployment),
    )
    missing = [name for name, value in required if not value]
    if missing:
        raise ValueError(f"Missing Whisper configuration value(s): {', '.join(missing)}")

    whisper_client = AzureOpenAI(
        api_key=model_info['AZURE_OPENAI_KEY'],
        api_version=model_info['AZURE_OPENAI_VERSION_WHISPER'],
        # Strip a trailing slash so the URL never contains "//openai".
        base_url=f"{endpoint.rstrip('/')}/openai/deployments/{deployment}"
    )

    return whisper_client


def transcribe_with_whisper(whisper_client, filename):
    """Transcribe an audio file with the Whisper deployment.

    Args:
        whisper_client: Client exposing ``audio.transcriptions.create``
            (the object returned by ``config_whisper``).
        filename: Path of the audio file to send.

    Returns:
        The object returned by ``audio.transcriptions.create`` on success.
        On any failure (including a missing file) the caught exception
        instance is returned rather than raised; callers that need to
        distinguish the two should check ``isinstance(result, Exception)``.
    """
    try:
        # The context manager closes the file handle once the request is done,
        # whether it succeeded or failed.
        with open(filename, "rb") as audio_file:
            transcript = whisper_client.audio.transcriptions.create(
                file=audio_file,
                model=model_info['AZURE_OPENAI_MODEL_WHISPER']
            )
        return transcript

    except Exception as ex:
        # Kept for backward compatibility: errors are handed back to the caller.
        return ex

### Read Audio File

Read the audio file and print the transcription out.

In [6]:
# Transcribe a sample recording with the Azure Speech service.
# Other sample paths listed in this cell (presumably present under sample_data/):
#   'sample_data/sample_audio_parte_accidente.wav'
#   'sample_data/The_National_Park.wav'
#   'sample_data/CNVSample049.wav'
file_path = 'sample_data/call_recording_en.wav'
speech_config = config_speech_service()
transcript = speech_recognize_continuous_from_file(
    speech_config=speech_config,
    filename=file_path,
)
display(transcript)


SessionStarted event
Transcribing:  hi i just had an
Transcribing:  hi i just had an accident
Transcribing:  hi i just had an accident and wanted to
Transcribing:  hi i just had an accident and wanted to report it
Transcribing:  hi i just had an accident and wanted to report it hi
Transcribing:  hi i just had an accident and wanted to report it hi OK
Transcribing:  hi i just had an accident and wanted to report it hi OK i hope you're fine
Transcribing:  hi i just had an accident and wanted to report it hi OK i hope you're fine what
Transcribing:  hi i just had an accident and wanted to report it hi OK i hope you're fine what happened
Transcribing:  hi i just had an accident and wanted to report it hi OK i hope you're fine what happened i
Transcribing:  hi i just had an accident and wanted to report it hi OK i hope you're fine what happened i was
Transcribing:  hi i just had an accident and wanted to report it hi OK i hope you're fine what happened i was driving
Transcribing:  hi i just

["Hi. I just had an accident and wanted to report it. Hi. OK. I hope you're fine. What happened? I was driving on Comina Road and I hit another car. Are you all right? Yes, just a bit nervous. That's normal. Can you tell me your full name? Sure. My name is Alvaro Gomez Rodriguez. Do you know what caused the accident? I think I hit a car. OK. Where did the accident occur?",
 "Uncle MENA Rd. Passed exit 17. Has anyone else been injured? I don't think so, but I'm not sure. OK, we'll investigate. Can you give me the other driver's information? Yes. His name is Juan Delgado Rivera. OK, One moment, please. Can you tell me your ID, please? Yes, It's 12345678 F OK. What damages has the car suffered? The right front headlight is broken and a tire is punctured. Can you drive the car? I don't know. Isn't the tow truck coming to pick it up? OK, we'll need to inspect the car. I'll file the report to proceed with the inspection and repair request.",
 'Perfect. Thank you very much for your help.']

In [7]:
# Transcribe a sample recording with the Whisper deployment.
# Alternative sample path (presumably present under sample_data/):
#   'sample_data/CNVSample049.wav'
file_path = 'sample_data/call_recording_en.wav'
whisper_client = config_whisper()
transcript = transcribe_with_whisper(
    whisper_client=whisper_client,
    filename=file_path,
)
display(transcript)

Transcription(text="Hi, I just had an accident and wanted to report it. Hi, ok, I hope you're fine. What happened? I was driving on Comino Road and I hit another car. Are you alright? Yes, just a bit nervous. That's normal. Can you tell me your full name? Sure, my name is Alvaro Gomez Rodriguez. Do you know what caused the accident? I think I hit a car. Ok, where did the accident occur? On Comino Road, past exit 17. Has anyone else been injured? I don't think so, but I'm not sure. Ok, we'll investigate. Can you give me the other driver's information? Yes, his name is Juan Delgado Rivera. Ok, one moment, please. Can you tell me your ID, please? Yes, it's 12345678F. Ok. What damages has the car suffered? The right front headlight is broken and a tyre is punctured. Can you drive the car? I don't know. Isn't the tow truck coming to pick it up? Ok, we'll need to inspect the car. I'll file the report to proceed with the inspection and repair request. Perfect. Thank you very much for your hel