#!/usr/bin/env python
# coding: utf-8
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
"""
Speech recognition samples for the Microsoft Cognitive Services Speech SDK
"""
import time
import wave
import string
import json
try:
import azure.cognitiveservices.speech as speechsdk
except ImportError:
print("""
Importing the Speech SDK for Python failed.
Refer to
https://docs.microsoft.com/azure/cognitive-services/speech-service/quickstart-python for
installation instructions.
""")
import sys
sys.exit(1)
# Set up the subscription info for the Speech Service:
# Replace with your own subscription key and service region (e.g., "westus").
speech_key, service_region = "YourSubscriptionKey", "YourServiceRegion"
# Specify the path to an audio file containing speech (mono WAV / PCM with a sampling rate of 16
# kHz).
weatherfilename = "whatstheweatherlike.wav"
weatherfilenamemp3 = "whatstheweatherlike.mp3"
def speech_recognize_once_from_mic():
    """performs one-shot speech recognition from the default microphone"""
    # <SpeechRecognitionWithMicrophone>
    config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    # A recognizer built without an audio config listens on the default
    # microphone; the default recognition language is "en-us".
    recognizer = speechsdk.SpeechRecognizer(speech_config=config)
    # recognize_once() blocks until one utterance has been processed: the end of
    # the utterance is detected via trailing silence, or after at most 15 seconds
    # of audio. Single-shot only (commands/queries); for long-running
    # multi-utterance input use start_continuous_recognition() instead.
    outcome = recognizer.recognize_once()
    # Report the outcome.
    if outcome.reason == speechsdk.ResultReason.RecognizedSpeech:
        print("Recognized: {}".format(outcome.text))
    elif outcome.reason == speechsdk.ResultReason.NoMatch:
        print("No speech could be recognized")
    elif outcome.reason == speechsdk.ResultReason.Canceled:
        details = outcome.cancellation_details
        print("Speech Recognition canceled: {}".format(details.reason))
        if details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(details.error_details))
    # </SpeechRecognitionWithMicrophone>
def speech_recognize_once_from_file():
    """performs one-shot speech recognition with input from an audio file"""
    # <SpeechRecognitionWithFile>
    config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    audio_input = speechsdk.audio.AudioConfig(filename=weatherfilename)
    # File-based recognizer with an explicit recognition language.
    recognizer = speechsdk.SpeechRecognizer(
        speech_config=config, language="de-DE", audio_config=audio_input)
    # recognize_once() blocks until a single utterance has been recognized
    # (trailing silence, or at most 15 seconds of audio). Suitable for
    # single-shot recognition only; use start_continuous_recognition() for
    # long-running multi-utterance input.
    outcome = recognizer.recognize_once()
    # Report the outcome.
    if outcome.reason == speechsdk.ResultReason.RecognizedSpeech:
        print("Recognized: {}".format(outcome.text))
    elif outcome.reason == speechsdk.ResultReason.NoMatch:
        print("No speech could be recognized: {}".format(outcome.no_match_details))
    elif outcome.reason == speechsdk.ResultReason.Canceled:
        details = outcome.cancellation_details
        print("Speech Recognition canceled: {}".format(details.reason))
        if details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(details.error_details))
    # </SpeechRecognitionWithFile>
def speech_recognize_once_from_file_with_detailed_recognition_results():
    """performs one-shot speech recognition with input from an audio file, showing detailed recognition results including word-level timing"""
    # <SpeechRecognitionFromFileWithDetailedRecognitionResults>
    config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    # Request the detailed (NBest) recognition result format.
    config.output_format = speechsdk.OutputFormat.Detailed
    # Requesting word-level timestamps implies detailed results, so the line
    # above could be omitted when this one is present.
    config.request_word_level_timestamps()
    audio_input = speechsdk.audio.AudioConfig(filename=weatherfilename)
    # File-based recognizer with an explicit recognition language.
    recognizer = speechsdk.SpeechRecognizer(
        speech_config=config, language="en-US", audio_config=audio_input)
    # recognize_once() blocks until one utterance has been recognized (trailing
    # silence, or at most 15 seconds of audio). Single-shot only; use
    # start_continuous_recognition() for multi-utterance input.
    outcome = recognizer.recognize_once()
    if outcome.reason == speechsdk.ResultReason.RecognizedSpeech:
        print("Recognized: {}".format(outcome.text))
        # Time units are in hundreds of nanoseconds (HNS); 10000 HNS == 1 ms.
        print("Offset: {}".format(outcome.offset))
        print("Duration: {}".format(outcome.duration))
        # Pull the detailed recognition results out of the raw JSON payload.
        detailed = json.loads(outcome.json)
        # The first NBest entry is the recognition result (NOT necessarily the
        # entry with the highest confidence number).
        best = detailed['NBest'][0]
        print("Detailed results - Lexical: {}".format(best['Lexical']))
        # ITN stands for Inverse Text Normalization.
        print("Detailed results - ITN: {}".format(best['ITN']))
        print("Detailed results - MaskedITN: {}".format(best['MaskedITN']))
        print("Detailed results - Display: {}".format(best['Display']))
        # Word-level timing; time units are HNS as above.
        print(f"Detailed results - Word timing:\nWord:\tOffset:\tDuration:")
        for word in best['Words']:
            print(f"{word['Word']}\t{word['Offset']}\t{word['Duration']}")
        # Alternative hypotheses are available via detailed['NBest'][i], i = 1, 2, ...
    elif outcome.reason == speechsdk.ResultReason.NoMatch:
        print("No speech could be recognized: {}".format(outcome.no_match_details))
    elif outcome.reason == speechsdk.ResultReason.Canceled:
        details = outcome.cancellation_details
        print("Speech Recognition canceled: {}".format(details.reason))
        if details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(details.error_details))
    # </SpeechRecognitionFromFileWithDetailedRecognitionResults>
def speech_recognize_once_compressed_input():
    """performs one-shot speech recognition with compressed input from an audio file"""
    # <SpeechRecognitionWithCompressedFile>
    class BinaryFileReaderCallback(speechsdk.audio.PullAudioInputStreamCallback):
        """Pull-stream callback that feeds raw bytes from a binary file to the SDK."""

        def __init__(self, filename: str):
            super().__init__()
            self._file_h = open(filename, "rb")

        def read(self, buffer: memoryview) -> int:
            """Fill `buffer` with up to `buffer.nbytes` bytes; return the byte
            count actually provided (0 signals end of stream)."""
            try:
                size = buffer.nbytes
                frames = self._file_h.read(size)
                buffer[:len(frames)] = frames
                return len(frames)
            except Exception as ex:
                print('Exception in `read`: {}'.format(ex))
                raise

        def close(self) -> None:
            """Release the underlying file handle."""
            print('closing file')
            try:
                self._file_h.close()
            except Exception as ex:
                print('Exception in `close`: {}'.format(ex))
                raise

    # Creates an audio stream format for the compressed container format of the
    # input file. For this example we are using an MP3 compressed file.
    compressed_format = speechsdk.audio.AudioStreamFormat(
        compressed_stream_format=speechsdk.AudioStreamContainerFormat.MP3)
    callback = BinaryFileReaderCallback(filename=weatherfilenamemp3)
    stream = speechsdk.audio.PullAudioInputStream(stream_format=compressed_format, pull_stream_callback=callback)
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    audio_config = speechsdk.audio.AudioConfig(stream=stream)
    # Creates a speech recognizer using the compressed stream as audio input.
    # The default language is "en-us". Keyword arguments are used here for
    # consistency with the other samples in this file (the original passed the
    # configs positionally).
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
    # Starts speech recognition, and returns after a single utterance is recognized. The end of a
    # single utterance is determined by listening for silence at the end or until a maximum of 15
    # seconds of audio is processed. It returns the recognition text as result.
    # Note: Since recognize_once() returns only a single utterance, it is suitable only for single
    # shot recognition like command or query.
    # For long-running multi-utterance recognition, use start_continuous_recognition() instead.
    result = speech_recognizer.recognize_once()
    # Check the result
    if result.reason == speechsdk.ResultReason.RecognizedSpeech:
        print("Recognized: {}".format(result.text))
    elif result.reason == speechsdk.ResultReason.NoMatch:
        print("No speech could be recognized: {}".format(result.no_match_details))
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print("Speech Recognition canceled: {}".format(cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(cancellation_details.error_details))
    # </SpeechRecognitionWithCompressedFile>
def speech_recognize_once_from_file_with_customized_model():
    """performs one-shot speech recognition with input from an audio file, specifying a custom
    model"""
    # <SpeechRecognitionUsingCustomizedModel>
    config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    # The source-language config couples the speech language with the endpoint
    # ID of your customized (CRIS) model. Replace both values with your own.
    lang_config = speechsdk.languageconfig.SourceLanguageConfig("zh-CN", "YourEndpointId")
    audio_input = speechsdk.audio.AudioConfig(filename=weatherfilename)
    # File-based recognizer using the source-language config above.
    recognizer = speechsdk.SpeechRecognizer(
        speech_config=config,
        source_language_config=lang_config,
        audio_config=audio_input)
    # recognize_once() blocks until a single utterance has been recognized
    # (trailing silence, or at most 15 seconds of audio). Single-shot only;
    # use start_continuous_recognition() for multi-utterance input.
    outcome = recognizer.recognize_once()
    # Report the outcome.
    if outcome.reason == speechsdk.ResultReason.RecognizedSpeech:
        print("Recognized: {}".format(outcome.text))
    elif outcome.reason == speechsdk.ResultReason.NoMatch:
        print("No speech could be recognized: {}".format(outcome.no_match_details))
    elif outcome.reason == speechsdk.ResultReason.Canceled:
        details = outcome.cancellation_details
        print("Speech Recognition canceled: {}".format(details.reason))
        if details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(details.error_details))
    # </SpeechRecognitionUsingCustomizedModel>
def speech_recognize_once_from_file_with_custom_endpoint_parameters():
    """performs one-shot speech recognition with input from an audio file, specifying an
    endpoint with custom parameters"""
    # Allow up to 15 seconds of leading silence before the service gives up.
    initial_silence_timeout_ms = 15 * 1e3
    template = "wss://{}.stt.speech.microsoft.com/speech/recognition" \
        "/conversation/cognitiveservices/v1?initialSilenceTimeoutMs={:d}"
    config = speechsdk.SpeechConfig(
        subscription=speech_key,
        endpoint=template.format(service_region, int(initial_silence_timeout_ms)))
    print("Using endpoint", config.get_property(speechsdk.PropertyId.SpeechServiceConnection_Endpoint))
    audio_input = speechsdk.audio.AudioConfig(filename=weatherfilename)
    # File-based recognizer; the default recognition language is "en-us".
    recognizer = speechsdk.SpeechRecognizer(speech_config=config, audio_config=audio_input)
    # recognize_once() blocks until a single utterance has been recognized
    # (trailing silence, or at most 15 seconds of audio). Single-shot only;
    # use start_continuous_recognition() for multi-utterance input.
    outcome = recognizer.recognize_once()
    # Report the outcome.
    if outcome.reason == speechsdk.ResultReason.RecognizedSpeech:
        print("Recognized: {}".format(outcome.text))
    elif outcome.reason == speechsdk.ResultReason.NoMatch:
        print("No speech could be recognized: {}".format(outcome.no_match_details))
    elif outcome.reason == speechsdk.ResultReason.Canceled:
        details = outcome.cancellation_details
        print("Speech Recognition canceled: {}".format(details.reason))
        if details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(details.error_details))
def speech_recognize_async_from_file():
    """performs one-shot speech recognition asynchronously with input from an audio file"""
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    audio_config = speechsdk.audio.AudioConfig(filename=weatherfilename)
    # Creates a speech recognizer using a file as audio input.
    # The default language is "en-us".
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
    # Perform recognition. `recognize_once_async` does not block until recognition is
    # complete, so other tasks can be performed while recognition is running.
    # However, recognition stops when the first utterance has been recognized.
    # For long-running recognition, use continuous recognition instead.
    result_future = speech_recognizer.recognize_once_async()
    print('recognition is running....')
    # Other tasks can be performed here...
    # Retrieve the recognition result. This blocks until recognition is complete.
    result = result_future.get()
    # Check the result
    if result.reason == speechsdk.ResultReason.RecognizedSpeech:
        print("Recognized: {}".format(result.text))
    elif result.reason == speechsdk.ResultReason.NoMatch:
        print("No speech could be recognized: {}".format(result.no_match_details))
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print("Speech Recognition canceled: {}".format(cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(cancellation_details.error_details))
def speech_recognize_continuous_from_file():
    """performs continuous speech recognition with input from an audio file"""
    # <SpeechContinuousRecognitionWithFile>
    config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    audio_input = speechsdk.audio.AudioConfig(filename=weatherfilename)
    recognizer = speechsdk.SpeechRecognizer(speech_config=config, audio_config=audio_input)
    finished = False

    def stop_cb(evt):
        """callback that signals to stop continuous recognition upon receiving an event `evt`"""
        print('CLOSING on {}'.format(evt))
        nonlocal finished
        finished = True

    # Wire up logging callbacks for every event fired by the recognizer.
    recognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt)))
    recognizer.recognized.connect(lambda evt: print('RECOGNIZED: {}'.format(evt)))
    recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
    recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
    recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))
    # Stop continuous recognition when the session stops or is canceled.
    recognizer.session_stopped.connect(stop_cb)
    recognizer.canceled.connect(stop_cb)
    # Run until one of the stop events arrives, polling twice per second.
    recognizer.start_continuous_recognition()
    while not finished:
        time.sleep(.5)
    recognizer.stop_continuous_recognition()
    # </SpeechContinuousRecognitionWithFile>
def speech_recognize_keyword_from_microphone():
    """performs keyword-triggered speech recognition with input microphone"""
    config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    # Keyword recognition model; update the path to point at your own model file.
    model = speechsdk.KeywordRecognitionModel("YourKeywordRecognitionModelFile.table")
    # The phrase the keyword recognition model triggers on.
    keyword = "YourKeyword"
    recognizer = speechsdk.SpeechRecognizer(speech_config=config)
    finished = False

    def stop_cb(evt):
        """callback that signals to stop continuous recognition upon receiving an event `evt`"""
        print('CLOSING on {}'.format(evt))
        nonlocal finished
        finished = True

    def recognizing_cb(evt):
        """callback for recognizing event"""
        reason = evt.result.reason
        if reason == speechsdk.ResultReason.RecognizingKeyword:
            print('RECOGNIZING KEYWORD: {}'.format(evt))
        elif reason == speechsdk.ResultReason.RecognizingSpeech:
            print('RECOGNIZING: {}'.format(evt))

    def recognized_cb(evt):
        """callback for recognized event"""
        reason = evt.result.reason
        if reason == speechsdk.ResultReason.RecognizedKeyword:
            print('RECOGNIZED KEYWORD: {}'.format(evt))
        elif reason == speechsdk.ResultReason.RecognizedSpeech:
            print('RECOGNIZED: {}'.format(evt))
        elif reason == speechsdk.ResultReason.NoMatch:
            print('NOMATCH: {}'.format(evt))

    # Wire up callbacks for every event fired by the recognizer.
    recognizer.recognizing.connect(recognizing_cb)
    recognizer.recognized.connect(recognized_cb)
    recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
    recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
    recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))
    # Stop when the session stops or is canceled.
    recognizer.session_stopped.connect(stop_cb)
    recognizer.canceled.connect(stop_cb)
    # Start keyword-triggered recognition and run until a stop event arrives.
    recognizer.start_keyword_recognition(model)
    print('Say something starting with "{}" followed by whatever you want...'.format(keyword))
    while not finished:
        time.sleep(.5)
    recognizer.stop_keyword_recognition()
def speech_recognition_with_pull_stream():
    """gives an example how to use a pull audio stream to recognize speech from a custom audio
    source"""
    class WavFileReaderCallback(speechsdk.audio.PullAudioInputStreamCallback):
        """Example class that implements the Pull Audio Stream interface to recognize speech from
        an audio file"""
        def __init__(self, filename: str):
            super().__init__()
            # wave.open with mode=None and a filename string opens for reading.
            self._file_h = wave.open(filename, mode=None)
            self.sample_width = self._file_h.getsampwidth()
            # The stream format announced below is 16 kHz / 16-bit / mono
            # uncompressed PCM, so reject any file that does not match it.
            assert self._file_h.getnchannels() == 1
            assert self._file_h.getsampwidth() == 2
            assert self._file_h.getframerate() == 16000
            assert self._file_h.getcomptype() == 'NONE'

        def read(self, buffer: memoryview) -> int:
            """read callback function"""
            # The SDK asks for up to `buffer.nbytes` bytes; return the number of
            # bytes actually provided (0 signals end of stream).
            size = buffer.nbytes
            # readframes() takes a frame count. For mono audio one frame is
            # `sample_width` bytes (the asserts above guarantee mono), so this
            # fills at most `size` bytes.
            frames = self._file_h.readframes(size // self.sample_width)
            buffer[:len(frames)] = frames
            return len(frames)

        def close(self):
            """close callback function"""
            self._file_h.close()

    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    # Specify the audio format of the pulled data; must agree with the asserts
    # in WavFileReaderCallback.__init__ above.
    wave_format = speechsdk.audio.AudioStreamFormat(samples_per_second=16000, bits_per_sample=16,
                                                    channels=1)
    # Set up the audio stream backed by the file-reader callback.
    callback = WavFileReaderCallback(weatherfilename)
    stream = speechsdk.audio.PullAudioInputStream(callback, wave_format)
    audio_config = speechsdk.audio.AudioConfig(stream=stream)
    # Instantiate the speech recognizer with pull stream input.
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
    done = False

    def stop_cb(evt):
        """callback that signals to stop continuous recognition upon receiving an event `evt`"""
        print('CLOSING on {}'.format(evt))
        nonlocal done
        done = True

    # Connect logging callbacks to the events fired by the speech recognizer.
    speech_recognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt)))
    speech_recognizer.recognized.connect(lambda evt: print('RECOGNIZED: {}'.format(evt)))
    speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
    speech_recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
    speech_recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))
    # Stop continuous recognition on either session stopped or canceled events.
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)
    # Start continuous speech recognition and poll until a stop event fires.
    speech_recognizer.start_continuous_recognition()
    while not done:
        time.sleep(.5)
    speech_recognizer.stop_continuous_recognition()
def speech_recognition_with_push_stream():
    """gives an example how to use a push audio stream to recognize speech from a custom audio
    source"""
    config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    # A push stream lets this function feed audio bytes to the SDK itself.
    stream = speechsdk.audio.PushAudioInputStream()
    audio_input = speechsdk.audio.AudioConfig(stream=stream)
    # Instantiate the speech recognizer with push stream input.
    recognizer = speechsdk.SpeechRecognizer(speech_config=config, audio_config=audio_input)
    # Wire up logging callbacks for every event fired by the recognizer.
    recognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt)))
    recognizer.recognized.connect(lambda evt: print('RECOGNIZED: {}'.format(evt)))
    recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
    recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
    recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))
    # Number of bytes pushed per chunk (16-bit samples -> n_bytes // 2 frames).
    n_bytes = 3200
    wav_fh = wave.open(weatherfilename)
    recognizer.start_continuous_recognition()
    try:
        # Push the file data chunk by chunk until end of file.
        while True:
            frames = wav_fh.readframes(n_bytes // 2)
            print('read {} bytes'.format(len(frames)))
            if not frames:
                break
            stream.write(frames)
            time.sleep(.1)
    finally:
        # Stop recognition and clean up regardless of how the loop exits.
        wav_fh.close()
        stream.close()
        recognizer.stop_continuous_recognition()
def speech_recognize_once_with_auto_language_detection_from_mic():
    """performs one-shot speech recognition from the default microphone with auto language detection"""
    config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    # Candidate source languages for automatic language detection.
    auto_detect_config = speechsdk.languageconfig.AutoDetectSourceLanguageConfig(
        languages=["de-DE", "en-US"])
    recognizer = speechsdk.SpeechRecognizer(
        speech_config=config, auto_detect_source_language_config=auto_detect_config)
    outcome = recognizer.recognize_once()
    # Report the outcome, including the detected language on success.
    if outcome.reason == speechsdk.ResultReason.RecognizedSpeech:
        detection = speechsdk.AutoDetectSourceLanguageResult(outcome)
        print("Recognized: {} in language {}".format(outcome.text, detection.language))
    elif outcome.reason == speechsdk.ResultReason.NoMatch:
        print("No speech could be recognized")
    elif outcome.reason == speechsdk.ResultReason.Canceled:
        details = outcome.cancellation_details
        print("Speech Recognition canceled: {}".format(details.reason))
        if details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(details.error_details))
def speech_recognize_with_auto_language_detection_UsingCustomizedModel():
    """performs speech recognition from the audio file with auto language detection, using customized model"""
    config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    audio_input = speechsdk.audio.AudioConfig(filename=weatherfilename)
    # Candidate languages are given in BCP-47 format; see
    # https://docs.microsoft.com/azure/cognitive-services/speech-service/language-support
    # for all supported languages. "en-US" here uses the base model ...
    en_config = speechsdk.languageconfig.SourceLanguageConfig("en-US")
    # ... while "fr-FR" is paired with the endpoint ID of a customized (CRIS)
    # model. Replace "myendpointId" with your own CRIS endpoint ID.
    fr_config = speechsdk.languageconfig.SourceLanguageConfig("fr-FR", "myendpointId")
    # Build the auto-detection config from the per-language configs.
    auto_detect_config = speechsdk.languageconfig.AutoDetectSourceLanguageConfig(
        sourceLanguageConfigs=[en_config, fr_config])
    recognizer = speechsdk.SpeechRecognizer(
        speech_config=config,
        auto_detect_source_language_config=auto_detect_config,
        audio_config=audio_input)
    outcome = recognizer.recognize_once()
    # Report the outcome, including the detected language on success.
    if outcome.reason == speechsdk.ResultReason.RecognizedSpeech:
        detection = speechsdk.AutoDetectSourceLanguageResult(outcome)
        print("Recognized: {} in language {}".format(outcome.text, detection.language))
    elif outcome.reason == speechsdk.ResultReason.NoMatch:
        print("No speech could be recognized")
    elif outcome.reason == speechsdk.ResultReason.Canceled:
        details = outcome.cancellation_details
        print("Speech Recognition canceled: {}".format(details.reason))
        if details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(details.error_details))
def speech_recognize_keyword_locally_from_microphone():
    """runs keyword spotting locally, with direct access to the result audio"""
    # Creates an instance of a keyword recognition model. Update this to
    # point to the location of your keyword recognition model.
    model = speechsdk.KeywordRecognitionModel("YourKeywordRecognitionModelFile.table")
    # The phrase your keyword recognition model triggers on.
    keyword = "YourKeyword"
    # Create a local keyword recognizer with the default microphone device for input.
    keyword_recognizer = speechsdk.KeywordRecognizer()
    # NOTE(review): `done` is set by the callbacks below but never read here;
    # this sample blocks on `result_future.get()` instead of polling the flag.
    done = False

    def recognized_cb(evt):
        # Only a keyword phrase is recognized. The result cannot be 'NoMatch'
        # and there is no timeout. The recognizer runs until a keyword phrase
        # is detected or recognition is canceled (by stop_recognition_async()
        # or due to the end of an input file or stream).
        result = evt.result
        if result.reason == speechsdk.ResultReason.RecognizedKeyword:
            print("RECOGNIZED KEYWORD: {}".format(result.text))
        nonlocal done
        done = True

    def canceled_cb(evt):
        result = evt.result
        if result.reason == speechsdk.ResultReason.Canceled:
            print('CANCELED: {}'.format(result.cancellation_details.reason))
        nonlocal done
        done = True

    # Connect callbacks to the events fired by the keyword recognizer.
    keyword_recognizer.recognized.connect(recognized_cb)
    keyword_recognizer.canceled.connect(canceled_cb)
    # Start keyword recognition; the future resolves when a keyword is spotted
    # or recognition is canceled.
    result_future = keyword_recognizer.recognize_once_async(model)
    print('Say something starting with "{}" followed by whatever you want...'.format(keyword))
    result = result_future.get()
    # Read result audio (incl. the keyword).
    if result.reason == speechsdk.ResultReason.RecognizedKeyword:
        time.sleep(2)  # give some time so the stream is filled
        result_stream = speechsdk.AudioDataStream(result)
        result_stream.detach_input()  # stop any more data from input getting to the stream
        save_future = result_stream.save_to_wav_file_async("AudioFromRecognizedKeyword.wav")
        print('Saving file...')
        # NOTE(review): the save operation's return value is kept but not
        # checked by this sample.
        saved = save_future.get()
    # If active keyword recognition needs to be stopped before results, it can be done with
    #
    # stop_future = keyword_recognizer.stop_recognition_async()
    # print('Stopping...')
    # stopped = stop_future.get()
def pronunciation_assessment_from_microphone():
    """Performs one-shot pronunciation assessment asynchronously with input from microphone.

    Loops prompting the user for a reference text on stdin, recognizes a single
    utterance from the default microphone, and prints sentence-level and word-level
    assessment scores. Entering empty text (or EOF) exits the loop.
    """
    # Creates an instance of a speech config with specified subscription key and service region.
    # Replace with your own subscription key and service region (e.g., "westus").
    # Note: The pronunciation assessment feature is currently only available on en-US language.
    config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)

    # The pronunciation assessment service has a longer default end silence timeout (5 seconds)
    # than normal STT, as pronunciation assessment is widely used in education scenarios where
    # kids have longer breaks in reading. You can adjust the end silence timeout based on your
    # real scenario.
    config.set_property(speechsdk.PropertyId.SpeechServiceConnection_EndSilenceTimeoutMs, "3000")

    reference_text = ""
    # Create pronunciation assessment config; set grading system, granularity and whether to
    # enable miscue based on your requirement.
    pronunciation_config = speechsdk.PronunciationAssessmentConfig(
        reference_text=reference_text,
        grading_system=speechsdk.PronunciationAssessmentGradingSystem.HundredMark,
        granularity=speechsdk.PronunciationAssessmentGranularity.Phoneme,
        enable_miscue=True)

    recognizer = speechsdk.SpeechRecognizer(speech_config=config)
    while True:
        # Receives reference text from console input.
        print('Enter reference text you want to assess, or enter empty text to exit.')
        print('> ')
        try:
            reference_text = input()
        except EOFError:
            break
        # Honor the prompt's promise: empty input exits the loop.
        if not reference_text:
            break

        pronunciation_config.reference_text = reference_text
        pronunciation_config.apply_to(recognizer)

        # Starts recognizing.
        print('Read out "{}" for pronunciation assessment ...'.format(reference_text))
        # Note: Since recognize_once() returns only a single utterance, it is suitable only for
        # single shot evaluation.
        # For long-running multi-utterance pronunciation evaluation, use
        # start_continuous_recognition() instead.
        result = recognizer.recognize_once_async().get()

        # Check the result
        if result.reason == speechsdk.ResultReason.RecognizedSpeech:
            print('Recognized: {}'.format(result.text))
            print(' Pronunciation Assessment Result:')
            pronunciation_result = speechsdk.PronunciationAssessmentResult(result)
            print(' Accuracy score: {}, Pronunciation score: {}, Completeness score : {}, FluencyScore: {}'.format(
                pronunciation_result.accuracy_score, pronunciation_result.pronunciation_score,
                pronunciation_result.completeness_score, pronunciation_result.fluency_score
            ))
            print(' Word-level details:')
            for idx, word in enumerate(pronunciation_result.words):
                print(' {}: word: {}, accuracy score: {}, error type: {};'.format(
                    idx + 1, word.word, word.accuracy_score, word.error_type
                ))
        elif result.reason == speechsdk.ResultReason.NoMatch:
            print("No speech could be recognized")
        elif result.reason == speechsdk.ResultReason.Canceled:
            cancellation_details = result.cancellation_details
            print("Speech Recognition canceled: {}".format(cancellation_details.reason))
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                print("Error details: {}".format(cancellation_details.error_details))
def pronunciation_assessment_continuous_from_file():
    """Performs continuous pronunciation assessment asynchronously with input from an audio file.

    Recognizes the whole file, accumulating per-utterance assessment data in event
    callbacks, then computes paragraph-level accuracy (duration-weighted average),
    fluency, and completeness scores. With miscue enabled, Insertion/Omission error
    words are derived by diffing recognized words against the reference text.
    """
    import difflib
    import json

    # Creates an instance of a speech config with specified subscription key and service region.
    # Replace with your own subscription key and service region (e.g., "westus").
    # Note: The pronunciation assessment feature is currently only available on en-US language.
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    audio_config = speechsdk.audio.AudioConfig(filename=weatherfilename)

    reference_text = "What's the weather like?"
    # Create pronunciation assessment config; set grading system, granularity and whether to
    # enable miscue based on your requirement.
    enable_miscue = True
    pronunciation_config = speechsdk.PronunciationAssessmentConfig(
        reference_text=reference_text,
        grading_system=speechsdk.PronunciationAssessmentGradingSystem.HundredMark,
        granularity=speechsdk.PronunciationAssessmentGranularity.Phoneme,
        enable_miscue=enable_miscue)

    # Creates a speech recognizer using a file as audio input.
    language = 'en-US'
    speech_recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config, language=language, audio_config=audio_config)
    # Apply pronunciation assessment config to the speech recognizer.
    pronunciation_config.apply_to(speech_recognizer)

    done = False
    recognized_words = []   # word-level results accumulated across utterances
    accuracy_scores = []    # per-utterance accuracy scores (weighted by duration below)
    durations = []          # per-utterance summed word durations (weights)
    valid_durations = []    # durations of correctly pronounced words (for fluency)
    start_offset, end_offset = None, None  # span of speech, for fluency denominator

    def stop_cb(evt):
        """callback that signals to stop continuous recognition upon receiving an event `evt`"""
        print('CLOSING on {}'.format(evt))
        nonlocal done
        done = True

    def recognized(evt):
        # Per-utterance callback: print the utterance scores and accumulate the data
        # needed for the paragraph-level scores computed after recognition stops.
        print('pronunciation assessment for: {}'.format(evt.result.text))
        pronunciation_result = speechsdk.PronunciationAssessmentResult(evt.result)
        print(' Accuracy score: {}, pronunciation score: {}, completeness score : {}, fluency score: {}'.format(
            pronunciation_result.accuracy_score, pronunciation_result.pronunciation_score,
            pronunciation_result.completeness_score, pronunciation_result.fluency_score
        ))
        nonlocal recognized_words, accuracy_scores, durations, valid_durations, start_offset, end_offset
        recognized_words += pronunciation_result.words
        accuracy_scores.append(pronunciation_result.accuracy_score)
        # The raw JSON result carries per-word offsets/durations that the typed
        # result object does not expose.
        json_result = evt.result.properties.get(speechsdk.PropertyId.SpeechServiceResponse_JsonResult)
        jo = json.loads(json_result)
        nb = jo['NBest'][0]
        durations.append(sum([int(w['Duration']) for w in nb['Words']]))
        if start_offset is None:
            start_offset = nb['Words'][0]['Offset']
        end_offset = nb['Words'][-1]['Offset'] + nb['Words'][-1]['Duration'] + 100000
        for w, d in zip(pronunciation_result.words, nb['Words']):
            if w.error_type == 'None':
                valid_durations.append(d['Duration'] + 100000)

    # Connect callbacks to the events fired by the speech recognizer
    speech_recognizer.recognized.connect(recognized)
    speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
    speech_recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
    speech_recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))
    # Stop continuous recognition on either session stopped or canceled events.
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    # Start continuous pronunciation assessment and poll until a stop event fires.
    speech_recognizer.start_continuous_recognition()
    while not done:
        time.sleep(.5)
    speech_recognizer.stop_continuous_recognition()

    # We can calculate whole accuracy and fluency scores by duration weighted averaging.
    # Guard against the case where nothing was recognized (e.g. silent/bad audio or an
    # immediate cancel): without these guards the average would divide by zero and the
    # final print below would raise NameError on fluency_score.
    if durations:
        accuracy_score = sum(i[0] * i[1] for i in zip(accuracy_scores, durations)) / sum(durations)
    else:
        accuracy_score = 0
    # Re-calculate fluency score
    fluency_score = 0
    if start_offset is not None:
        fluency_score = sum(valid_durations) / (end_offset - start_offset) * 100

    # We need to convert the reference text to lower case, and split to words, then
    # remove the punctuations.
    if language == 'zh-CN':
        # Use jieba package to split words for Chinese
        import jieba, zhon.hanzi
        jieba.suggest_freq([x.word for x in recognized_words], True)
        reference_words = [w for w in jieba.cut(reference_text) if w not in zhon.hanzi.punctuation]
    else:
        reference_words = [w.strip(string.punctuation) for w in reference_text.lower().split()]

    # For continuous pronunciation assessment mode, the service won't return the words with
    # `Insertion` or `Omission` even if miscue is enabled.
    # We need to compare with the reference text after receiving all recognized words to get
    # these error words.
    if enable_miscue:
        diff = difflib.SequenceMatcher(None, reference_words, [x.word for x in recognized_words])
        final_words = []
        for tag, i1, i2, j1, j2 in diff.get_opcodes():
            if tag in ['insert', 'replace']:
                # Recognized words absent from the reference are insertions.
                for word in recognized_words[j1:j2]:
                    if word.error_type == 'None':
                        word._error_type = 'Insertion'
                    final_words.append(word)
            if tag in ['delete', 'replace']:
                # Reference words never recognized are omissions; synthesize result objects.
                for word_text in reference_words[i1:i2]:
                    word = speechsdk.PronunciationAssessmentWordResult({
                        'Word': word_text,
                        'PronunciationAssessment': {
                            'ErrorType': 'Omission',
                        }
                    })
                    final_words.append(word)
            if tag == 'equal':
                final_words += recognized_words[j1:j2]
    else:
        final_words = recognized_words

    # Calculate whole completeness score: fraction of reference words pronounced correctly.
    completeness_score = len([w for w in final_words if w.error_type == 'None']) / len(reference_words) * 100

    print(' Paragraph accuracy score: {}, completeness score: {}, fluency score: {}'.format(
        accuracy_score, completeness_score, fluency_score
    ))
    for idx, word in enumerate(final_words):
        print(' {}: word: {}\taccuracy score: {}\terror type: {};'.format(
            idx + 1, word.word, word.accuracy_score, word.error_type
        ))