Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SDK v1.13 draft: add KeywordRecognizer support to UWP VA #500

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 11 additions & 10 deletions clients/csharp-uwp/UWPVoiceAssistantSample/ActivityWrapper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@

namespace UWPVoiceAssistantSample
{
using Newtonsoft.Json.Linq;

using Newtonsoft.Json.Linq;
using System.Globalization;

/// <summary>
/// Class determines the activity received from the Bot and deserializes the response.
/// </summary>
Expand All @@ -18,31 +19,31 @@ public ActivityWrapper(string activityJson)
{
var activityObj = JObject.Parse(activityJson);

switch (activityObj["type"]?.ToString().ToLower())
switch (activityObj["type"]?.ToString().ToUpperInvariant())
{
case "trace":
case "TRACE":
this.Type = ActivityType.Trace;
break;
case "message":
case "MESSAGE":
this.Type = ActivityType.Message;
break;
case "event":
case "EVENT":
this.Type = ActivityType.Event;
break;
default:
this.Type = ActivityType.Unrecognized;
break;
}

switch (activityObj["inputHint"]?.ToString().ToLower())
switch (activityObj["inputHint"]?.ToString().ToUpperInvariant())
{
case "ignoringinput":
case "IGNORINGINPUT":
this.InputHint = InputHintType.IgnoringInput;
break;
case "acceptinginput":
case "ACCEPTINGINPUT":
this.InputHint = InputHintType.AcceptingInput;
break;
case "expectinginput":
case "EXPECTINGINPUT":
this.InputHint = InputHintType.ExpectingInput;
break;
default:
Expand Down
10 changes: 4 additions & 6 deletions clients/csharp-uwp/UWPVoiceAssistantSample/App.xaml.cs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ public sealed partial class App : Application, IDisposable
{
private readonly ILogProvider logger;
private readonly IDialogManager dialogManager;
private readonly IKeywordRegistration keywordRegistration;
private readonly IAgentSessionManager agentSessionManager;
private BackgroundTaskDeferral deferral;
private bool alreadyDisposed = false;
Expand All @@ -45,21 +46,18 @@ public App()
this.Suspending += this.OnSuspending;
MVARegistrationHelpers.UnlockLimitedAccessFeature();

var keywordRegistration = new KeywordRegistration(
new Version(1, 0, 0, 0));

this.keywordRegistration = new KeywordRegistration();
this.agentSessionManager = new AgentSessionManager();

this.dialogManager = new DialogManager<List<byte>>(
new DirectLineSpeechDialogBackend(),
keywordRegistration,
this.keywordRegistration,
new AgentAudioInputProvider(),
this.agentSessionManager,
new MediaPlayerDialogAudioOutputAdapter());

var serviceCollection = new ServiceCollection();
serviceCollection.AddSingleton(this.dialogManager);
serviceCollection.AddSingleton<IKeywordRegistration>(keywordRegistration);
serviceCollection.AddSingleton(this.keywordRegistration);
serviceCollection.AddSingleton(this.agentSessionManager);
this.Services = serviceCollection.BuildServiceProvider();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ public enum WaveHeaderLengthOption
/// <summary>
/// An encapsulation of extra data and operations needed to apply a WAVEFORMAT header to an existing stream.
/// </summary>
public class WaveHeader
public static class WaveHeader
{
/// <summary>
/// Writes a standard WAVEFORMAT header (RIFF) to the provided stream that matches the provided PCM encoding
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,16 @@ public class AgentAudioInputProvider
protected AudioEncodingProperties outputEncoding;
protected IAgentSessionWrapper agentSession;
protected AudioGraph inputGraph;
protected AudioDeviceInputNode inputNode;
protected AudioFrameOutputNode outputNode;
protected bool graphRunning;
protected bool disposed;
protected SemaphoreSlim debugAudioOutputFileSemaphore;
protected Stream debugAudioOutputFileStream;
private readonly ILogProvider logger;
private AudioDeviceInputNode inputNode;
private bool dataAvailableInitialized = false;
private int bytesToSkip;
private int bytesAlreadySkipped;
private ILogProvider logger;

/// <summary>
/// Initializes a new instance of the <see cref="AgentAudioInputProvider"/> class.
Expand All @@ -65,7 +65,7 @@ public AgentAudioInputProvider()
/// a keyword in an audio stream. Some amount of silence/audio prior to the keyword is necessary for
/// normal operation.
/// </summary>
public static TimeSpan InitialKeywordTrimDuration { get; } = new TimeSpan(0, 0, 0, 0, 2000);
public static TimeSpan InitialKeywordTrimDuration { get; } = new TimeSpan(0, 0, 0, 0, 2250);

/// <summary>
/// Gets or sets a value indicating whether debug audio output to local file capture
Expand Down Expand Up @@ -154,6 +154,9 @@ public async Task StopAsync()
await this.FinishDebugAudioDumpIfNeededAsync();

this.inputGraph.Stop();
this.inputNode.Stop();
this.inputNode.Dispose();
this.inputGraph.Dispose();

this.logger.Log(LogMessageLevel.AudioLogs, "Audio Graph Stopped");
this.graphRunning = false;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

namespace UWPVoiceAssistantSample.AudioInput
{
using Microsoft.CognitiveServices.Speech;
using UWPVoiceAssistantSample.AudioCommon;
using Windows.Media.MediaProperties;

/// <summary>
/// Describes a source of audio that a PullAudioInputSink can read from, together with the padding policy and
/// the encoding associated with that source.
/// </summary>
public class PullAudioDataSource
{
    // Instances are only created through the predefined static sources and the factory method below.
    private PullAudioDataSource(object baseSource, bool padWithZeroes = true)
    {
        this.PadWithZeroes = padWithZeroes;
        this.BaseSource = baseSource;
    }

    /// <summary>
    /// Gets the predefined source that represents empty input (all zeroes). It carries no underlying object.
    /// </summary>
    public static PullAudioDataSource EmptyInput { get; } = new PullAudioDataSource(baseSource: null);

    /// <summary>
    /// Gets the predefined source indicating that data is pushed manually into the consuming sink. It carries
    /// no underlying object and is a distinct instance from <see cref="EmptyInput"/>, so the two can be told
    /// apart by reference.
    /// </summary>
    public static PullAudioDataSource PushedData { get; } = new PullAudioDataSource(baseSource: null);

    /// <summary>
    /// Gets the object backing this source, or null when the source has no backing object.
    /// </summary>
    public object BaseSource { get; }

    /// <summary>
    /// Gets a value indicating whether reads that come up short should be filled out with zeroes.
    /// </summary>
    public bool PadWithZeroes { get; }

    /// <summary>
    /// Gets or sets the encoding of the audio provided by this source. Defaults to the encoding of
    /// DirectLineSpeechAudio.DefaultInput.
    /// </summary>
    public AudioEncodingProperties AudioFormat { get; set; } = DirectLineSpeechAudio.DefaultInput.Encoding;

    /// <summary>
    /// Builds a source whose base object is the AudioDataStream obtained from a keyword recognition result.
    /// </summary>
    /// <param name="result"> The keyword recognition result whose audio should be consumed. </param>
    /// <param name="padWithZeroes"> Whether reads that come up short should be filled out with zeroes. </param>
    /// <returns> A new PullAudioDataSource wrapping the stream derived from <paramref name="result"/>. </returns>
    public static PullAudioDataSource FromKeywordResult(KeywordRecognitionResult result, bool padWithZeroes = true)
    {
        var keywordAudio = AudioDataStream.FromResult(result);
        return new PullAudioDataSource(keywordAudio, padWithZeroes);
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

namespace UWPVoiceAssistantSample.AudioInput
{
using System;
using System.Collections.Generic;
using System.Diagnostics.Contracts;
using System.Threading;
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
using Windows.Media.MediaProperties;

/// <summary>
/// Helper class that encapsulates state management for a reusable PullAudioInputStream object that may use a variety
/// of underlying sources. Reads are served from whichever <see cref="PullAudioDataSource"/> is currently selected:
/// manually pushed bytes, the audio stream of a keyword recognition result, or nothing at all.
/// </summary>
public class PullAudioInputSink : PullAudioInputStreamCallback
{
    private readonly List<byte> pushDataBuffer = new List<byte>();

    // Guards pushDataBuffer and every swap, disposal, or use of the stream behind the active data source.
    private readonly object dataSourceLock = new object();
    private PullAudioDataSource dataSource;

    /// <summary>
    /// Raised upon the first read that crosses the current BookmarkPosition, as counted since last reset.
    /// The argument is the total audio duration read since the last reset, including the crossing read.
    /// </summary>
    public event Action<TimeSpan> BookmarkReached;

    /// <summary>
    /// Gets the duration of audio pulled from this sink since its last reset operation.
    /// </summary>
    public TimeSpan AudioReadSinceReset { get; private set; } = TimeSpan.Zero;

    /// <summary>
    /// Gets or sets the next position when BookmarkReached will be fired if a read causes AudioReadSinceReset to
    /// cross the position.
    /// </summary>
    public TimeSpan BookmarkPosition { get; set; } = TimeSpan.Zero;

    /// <summary>
    /// Gets or sets a friendly label to associate with this input sink.
    /// </summary>
    public string Label { get; set; }

    /// <summary>
    /// Gets or sets the active data source to be used for subsequent reads from this input sink. Replacing a
    /// source that is backed by an AudioDataStream detaches and disposes that stream.
    /// </summary>
    public PullAudioDataSource DataSource
    {
        get => this.dataSource;
        set
        {
            // Take the same lock the read helpers hold while touching the stream, so the outgoing stream is
            // never disposed while a read is in the middle of using it.
            lock (this.dataSourceLock)
            {
                // Re-assigning the current source must not dispose the stream that stays in use.
                if (ReferenceEquals(this.dataSource, value))
                {
                    return;
                }

                if (this.dataSource?.BaseSource is AudioDataStream resultStream)
                {
                    resultStream.DetachInput();
                    resultStream.Dispose();
                }

                this.dataSource = value;
            }
        }
    }

    /// <summary>
    /// Adds the provided data to the buffer to be consumed when an appropriate source type is set.
    /// </summary>
    /// <param name="bytes"> The bytes to enqueue to the buffer. </param>
    public void PushData(IEnumerable<byte> bytes)
    {
        lock (this.dataSourceLock)
        {
            this.pushDataBuffer.AddRange(bytes);
        }
    }

    /// <summary>
    /// Resets the source and data consumption count of this input sink: the data source becomes null, any
    /// pushed-but-unread bytes are discarded and AudioReadSinceReset returns to zero.
    /// </summary>
    public void Reset()
    {
        this.DataSource = null;
        lock (this.dataSourceLock)
        {
            this.pushDataBuffer.Clear();
        }

        this.AudioReadSinceReset = TimeSpan.Zero;
    }

    /// <summary>
    /// Implemented for PullAudioInputStreamCallback. Fills the provided buffer from the currently selected data
    /// source and optionally pads incomplete base reads with zeroes to prevent a stream termination.
    /// </summary>
    /// <param name="dataBuffer"> The buffer to fill with data. </param>
    /// <param name="size"> The size of the buffer. Not consulted; the length of dataBuffer is used. </param>
    /// <returns> The final number of bytes populated in dataBuffer. </returns>
    /// <exception cref="ArgumentNullException"> Thrown when dataBuffer is null. </exception>
    /// <exception cref="ArgumentException"> Thrown when the active source has an unsupported base object. </exception>
    public override int Read(byte[] dataBuffer, uint size)
    {
        if (dataBuffer == null)
        {
            throw new ArgumentNullException(nameof(dataBuffer));
        }

        // Capture the source once so the dispatch below is decided against a single, consistent value.
        var sourceAtStart = this.DataSource;
        var baseSource = sourceAtStart?.BaseSource;

        var bytesRead =
            sourceAtStart == PullAudioDataSource.PushedData ? this.ReadFromBuffer(dataBuffer)
            : baseSource is AudioDataStream ? this.ReadFromKeyword(dataBuffer)
            : baseSource is null ? 0
            : throw new ArgumentException("Unsupported PullAudioDataSource");

        // The helpers above may wait, and the source can be swapped or reset meanwhile. Padding and duration
        // accounting follow whatever source is active now; reading the property once avoids dereferencing a
        // source that became null between a null check and its use.
        var sourceAfterRead = this.DataSource;

        if (sourceAfterRead != null && sourceAfterRead.PadWithZeroes)
        {
            Array.Fill<byte>(dataBuffer, 0, bytesRead, dataBuffer.Length - bytesRead);
            bytesRead = dataBuffer.Length;
        }

        // A zero bitrate would turn the division into infinity, which TimeSpan.FromSeconds rejects; skip the
        // duration bookkeeping in that case rather than failing the read.
        if (sourceAfterRead?.AudioFormat is AudioEncodingProperties audioFormat && audioFormat.Bitrate > 0)
        {
            var priorTotalReadDuration = this.AudioReadSinceReset;

            // bytes * 8 = bits; bits / (bits per second) = seconds. Double precision limits accumulated drift.
            this.AudioReadSinceReset += TimeSpan.FromSeconds(8.0 * bytesRead / audioFormat.Bitrate);

            if (priorTotalReadDuration < this.BookmarkPosition
                && this.AudioReadSinceReset >= this.BookmarkPosition)
            {
                this.BookmarkReached?.Invoke(this.AudioReadSinceReset);
            }
        }

        return bytesRead;
    }

    /// <summary>
    /// Reads from the AudioDataStream behind the active data source. Polls until the stream can supply a full
    /// buffer, stops reporting PartialData, or is no longer the active source, then reads as many bytes as the
    /// stream reports it can provide.
    /// </summary>
    /// <param name="buffer"> The buffer to populate. </param>
    /// <returns> The number of bytes the stream reported as read; 0 if the source is no longer a stream. </returns>
    private int ReadFromKeyword(byte[] buffer)
    {
        for (var doneWaiting = false; !doneWaiting;)
        {
            lock (this.dataSourceLock)
            {
                doneWaiting = !(this.DataSource?.BaseSource is AudioDataStream keywordAudioSource)
                    || keywordAudioSource.GetStatus() != StreamStatus.PartialData
                    || keywordAudioSource.CanReadData((uint)buffer.Length);
            }

            if (!doneWaiting)
            {
                // Sleep outside the lock so producers and source changes are not blocked while waiting.
                Thread.Sleep(50);
            }
        }

        lock (this.dataSourceLock)
        {
            // The source may have been replaced while the lock was released; re-check before reading.
            if (!(this.DataSource?.BaseSource is AudioDataStream keywordAudioSource))
            {
                return 0;
            }

            var bytesToRead = buffer.Length;

            // Shrink the request until the stream can satisfy it. The lower bound keeps the counter from going
            // negative (and wrapping when cast to uint) should the stream refuse even an empty read.
            while (bytesToRead > 0 && !keywordAudioSource.CanReadData((uint)bytesToRead))
            {
                bytesToRead--;
            }

            // A shorter scratch buffer is used for partial reads because the requested amount is conveyed
            // through the length of the array handed to ReadData.
            var bufferToUse = bytesToRead == buffer.Length ? buffer : new byte[bytesToRead];
            var result = keywordAudioSource.ReadData(bufferToUse);

            if (bufferToUse != buffer)
            {
                Array.Copy(bufferToUse, buffer, bufferToUse.Length);
            }

            return (int)result;
        }
    }

    /// <summary>
    /// Reads from the manually pushed data. Waits until enough bytes have been pushed to fill the buffer, or
    /// until the active source stops being PushedData, in which case whatever is buffered is returned.
    /// </summary>
    /// <param name="readBuffer"> The buffer to populate. </param>
    /// <returns> The number of bytes copied into readBuffer. </returns>
    private int ReadFromBuffer(byte[] readBuffer)
    {
        while (true)
        {
            lock (this.dataSourceLock)
            {
                if (this.pushDataBuffer.Count >= readBuffer.Length || this.DataSource != PullAudioDataSource.PushedData)
                {
                    var bytesAvailable = Math.Min(readBuffer.Length, this.pushDataBuffer.Count);
                    this.pushDataBuffer.CopyTo(0, readBuffer, 0, bytesAvailable);
                    this.pushDataBuffer.RemoveRange(0, bytesAvailable);
                    return bytesAvailable;
                }
            }

            // Sleep outside the lock so PushData can make progress.
            Thread.Sleep(50);
        }
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ public class DirectLineSpeechAudioOutputStream
/// Initializes a new instance of the <see cref="DirectLineSpeechAudioOutputStream"/> class.
/// </summary>
/// <param name="audioSource"> The PullAudioOutputStream that should be read from. </param>
/// <param name="format"> The format information to associate with this audio source. </param>
public DirectLineSpeechAudioOutputStream(PullAudioOutputStream audioSource, DialogAudio format)
: base(format)
{
Expand Down
Loading