# Read Audio Files

Read WAV or MP3 Files with Python and transcribe to text.

### Install the Required Libraries

In [31]:
%pip install azure-cognitiveservices-speech
%pip install openai

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### Python Imports

In [1]:
import azure.cognitiveservices.speech as speech_sdk
import sys
import time
import os
from dotenv import load_dotenv
from openai import AzureOpenAI

sys.path.append('..\\code')

load_dotenv(override=True)


True

### Make sure we have the Azure Speech information

We will need the Speech APIKEY, REGION and LANGUAGE for this notebook.

When running the below cell, the values should reflect the Azure Speech reource you have created in 

In [2]:
speech_info = {
        'SPEECH_APIKEY': os.environ.get('SPEECH_APIKEY'),
        'SPEECH_REGION': os.environ.get('SPEECH_REGION'),
        'SPEECH_LANGUAGE': os.environ.get('SPEECH_LANGUAGE'),
}

speech_info

{'SPEECH_APIKEY': '6e067df5f9ff46cfac9cda6f8f122c60',
 'SPEECH_REGION': 'westeurope',
 'SPEECH_LANGUAGE': 'en-US'}

In [3]:
model_info = {
        'AZURE_OPENAI_MODEL_WHISPER': os.environ.get('AZURE_OPENAI_MODEL_WHISPER'),
        'AZURE_OPENAI_KEY': os.environ.get('AZURE_OPENAI_KEY'),
        'AZURE_OPENAI_MODEL_WHISPER': os.environ.get('AZURE_OPENAI_MODEL_WHISPER'),
        'AZURE_OPENAI_ENDPOINT_WHISPER': os.environ.get('AZURE_OPENAI_ENDPOINT_WHISPER'),
        'AZURE_OPENAI_VERSION_WHISPER': os.environ.get('AZURE_OPENAI_VERSION_WHISPER'),
}

model_info

{'AZURE_OPENAI_MODEL_WHISPER': 'whisper',
 'AZURE_OPENAI_KEY': 'daf52e67bb574e18ac4467cd6f787c83',
 'AZURE_OPENAI_ENDPOINT_WHISPER': 'https://openai-angels.openai.azure.com/',
 'AZURE_OPENAI_VERSION_WHISPER': '2024-02-01'}

### Code Definitions

Defining the functions that will read in the audio file and return the transcription.

In [11]:
# Configure the Azure Speech Service
def config_speech_service():
    try:
        speech_config = speech_sdk.SpeechConfig(
            subscription=speech_info['SPEECH_APIKEY'], 
            region=speech_info['SPEECH_REGION'], 
            speech_recognition_language=speech_info['SPEECH_LANGUAGE'])

        # Set parameters
        speech_config.set_property(speech_sdk.PropertyId.SpeechServiceConnection_InitialSilenceTimeoutMs, "5000")
        speech_config.set_property(speech_sdk.PropertyId.Speech_SegmentationSilenceTimeoutMs, "2000")
        speech_config.set_property(speech_sdk.PropertyId.SpeechServiceConnection_EndSilenceTimeoutMs, "5000")
    
    except Exception as ex:
        print(ex)

    return speech_config

# Execute the transcription from file with Azure Speech service 
def speech_recognize_continuous_from_file(speech_config, filename):
    # Performs continuous speech recognition with input from an audio file"""
    audio_config = speech_sdk.AudioConfig(filename=filename)

    speech_recognizer = speech_sdk.SpeechRecognizer(speech_config, audio_config)

    done = False
    transcription = []

    # Callback that signals to stop continuous recognition upon receiving an event `evt`
    def stop_cb(evt: speech_sdk.SessionEventArgs):
        print('CLOSING')
        nonlocal done
        done = True
    
    # Callback that signals the recognition has been canceled
    def speech_recognizer_recognition_canceled_cb(evt: speech_sdk.SessionEventArgs):
        print('Canceled event')

    # Callback that signals the recognition session has been stopped
    def speech_recognizer_session_stopped_cb(evt: speech_sdk.SessionEventArgs):
        print('SessionStopped event')

    # Callback while transcribing
    def speech_recognizer_recognizing_cb(evt: speech_sdk.SpeechRecognitionEventArgs):
        print('Transcribing: ', evt.result.text)

    # Callback when a sentence has finished
    def speech_recognizer_transcribed_cb(evt: speech_sdk.SpeechRecognitionEventArgs):
        print('TRANSCRIBED:')
        if evt.result.reason == speech_sdk.ResultReason.RecognizedSpeech:
            print(f'\tText: {evt.result.text}')
            transcription.append(evt.result.text)
        elif evt.result.reason == speech_sdk.ResultReason.NoMatch:
            print(f'\tNOMATCH: Speech could not be TRANSCRIBED: {evt.result.no_match_details}')
            stop_cb(evt)

    # Callback that signal the session has started
    def speech_recognizer_session_started_cb(evt: speech_sdk.SessionEventArgs):
        print('SessionStarted event')

    # Connect callbacks to the events fired by the speech recognizer
    speech_recognizer.recognizing.connect(speech_recognizer_recognizing_cb)
    speech_recognizer.recognized.connect(speech_recognizer_transcribed_cb)
    speech_recognizer.session_started.connect(speech_recognizer_session_started_cb)
    speech_recognizer.session_stopped.connect(speech_recognizer_session_stopped_cb)
    speech_recognizer.canceled.connect(speech_recognizer_recognition_canceled_cb)
    # stop transcribing on either session stopped or canceled events
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    # Start continuous speech recognition
    speech_recognizer.start_continuous_recognition()
    while not done:
        time.sleep(.5)
            
    final_text = ""
    for text in transcription:
        final_text += text + " \n"
    #print(f'TRANSCRIPTION: [{final_text}]')

    speech_recognizer.stop_continuous_recognition()

    return transcription

def config_whisper():
    whisper_client = AzureOpenAI(
        api_key=model_info['AZURE_OPENAI_KEY'],  
        api_version=model_info['AZURE_OPENAI_VERSION_WHISPER'],
        base_url=f"{model_info['AZURE_OPENAI_ENDPOINT_WHISPER']}/openai/deployments/{model_info['AZURE_OPENAI_MODEL_WHISPER']}"
    )

    return whisper_client


def transcribe_with_whisper(whisper_client, filename):
    try:
        transcript = whisper_client.audio.transcriptions.create(
            file=open(filename, "rb"), 
            model=model_info['AZURE_OPENAI_MODEL_WHISPER']
            )
        return transcript
    
    except Exception as ex:
        return ex

### Read Audio File

Read the audio file and print the transcription out.

In [29]:
# Usage with Azure Speech service
speech_config=config_speech_service()
#file_path = 'sample_data/sample_audio_parte_accidente.wav'
#file_path = 'sample_data/The_National_Park.wav'
#file_path = 'sample_data/CNVSample049.wav'
file_path = 'call_recording_en.wav'
transcript = speech_recognize_continuous_from_file(speech_config, file_path)
display(transcript)


SessionStarted event
Transcribing:  the patient report
Transcribing:  the patient reported no neurom
Transcribing:  the patient reported no neuromuscular
Transcribing:  the patient reported no neuromuscular complaints
Transcribing:  the patient reported no neuromuscular complaints and on
Transcribing:  the patient reported no neuromuscular complaints and on physical
Transcribing:  the patient reported no neuromuscular complaints and on physical exam showed
Transcribing:  the patient reported no neuromuscular complaints and on physical exam showed no overt mus
Transcribing:  the patient reported no neuromuscular complaints and on physical exam showed no overt muscle weakness
TRANSCRIBED:
	Text: The patient reported no neuromuscular complaints and on physical exam showed no overt muscle weakness.
Canceled event
CLOSING
SessionStopped event
CLOSING


['The patient reported no neuromuscular complaints and on physical exam showed no overt muscle weakness.']

In [13]:
# Usage with Whisper
whisper_client = config_whisper()
#file_path = 'sample_data/CNVSample049.wav'
file_path = 'sample_data/call_recording_en.wav'
transcript = transcribe_with_whisper(whisper_client, file_path)
display(transcript)

FileNotFoundError(2, 'No such file or directory')