-
Notifications
You must be signed in to change notification settings - Fork 18
/
AzureAISearchProvider.cs
300 lines (266 loc) · 12.3 KB
/
AzureAISearchProvider.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
using Azure;
using Azure.Identity;
using Azure.Search.Documents;
using Azure.Search.Documents.Indexes;
using Azure.Search.Documents.Indexes.Models;
using Azure.Search.Documents.Models;
using Microsoft.Azure.WebJobs.Extensions.OpenAI.Search;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
namespace Microsoft.Azure.WebJobs.Extensions.OpenAI.AzureAISearch;
sealed class AzureAISearchProvider : ISearchProvider
{
// Application configuration. Read to resolve the search endpoint (by connection name)
// and the optional API key (by the setting name stored in searchAPIKeySetting).
readonly IConfiguration configuration;
// Logger created in the constructor from the injected logger factory.
readonly ILogger logger;
// Overwritten in the constructor from the options. When true, SearchAsync issues
// semantic queries (SearchQueryType.Semantic) against the "default" semantic configuration.
readonly bool isSemanticSearchEnabled = false;
// Overwritten in the constructor from the options. When true, SearchAsync requests
// extractive captions for semantic queries and builds the result text from them.
readonly bool useSemanticCaptions = false;
// Dimension count of the "embeddings" vector field used when an index is created.
// The constructor replaces this default with the validated option value (2..3072).
readonly int vectorSearchDimensions = 1536;
// Name of the configuration setting that holds the search API key. The constructor
// keeps this default when the corresponding option is null.
readonly string searchAPIKeySetting = "SearchAPIKey";
// Index name used when the connection info does not specify a collection name.
const string defaultSearchIndexName = "openai-index";
// Name given to the HNSW algorithm configuration of indexes created by this provider.
const string vectorSearchConfigName = "openai-vector-config";
// Name of the vector search profile that ties the "embeddings" field to the algorithm configuration.
const string vectorSearchProfile = "openai-vector-profile";
/// <summary>
/// Gets or sets the provider name; initialized to "AzureAISearch".
/// </summary>
public string Name { get; set; } = "AzureAISearch";
/// <summary>
/// Initializes AI Search provider.
/// </summary>
/// <param name="configuration">The configuration used to resolve the search endpoint and API key.</param>
/// <param name="loggerFactory">The logger factory.</param>
/// <param name="azureAiSearchConfigOptions">
/// The Azure AI Search options: semantic search flags, the name of the API key setting
/// and the number of vector search dimensions.
/// </param>
/// <exception cref="ArgumentNullException">Thrown if any argument (or the options value) is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">Thrown if the configured vector search dimensions are not between 2 and 3072.</exception>
public AzureAISearchProvider(IConfiguration configuration, ILoggerFactory loggerFactory, IOptions<AzureAISearchConfigOptions> azureAiSearchConfigOptions)
{
    this.configuration = configuration ?? throw new ArgumentNullException(nameof(configuration));
    if (loggerFactory == null)
    {
        throw new ArgumentNullException(nameof(loggerFactory));
    }

    // Fail with a descriptive argument exception instead of a NullReferenceException
    // when the options (or their value) were not supplied.
    AzureAISearchConfigOptions options = azureAiSearchConfigOptions?.Value
        ?? throw new ArgumentNullException(nameof(azureAiSearchConfigOptions));

    this.isSemanticSearchEnabled = options.IsSemanticSearchEnabled;
    this.useSemanticCaptions = options.UseSemanticCaptions;

    // Keep the built-in default setting name when none is configured.
    this.searchAPIKeySetting = options.SearchAPIKeySetting ?? this.searchAPIKeySetting;

    int value = options.VectorSearchDimensions;
    if (value < 2 || value > 3072)
    {
        throw new ArgumentOutOfRangeException(nameof(AzureAISearchConfigOptions.VectorSearchDimensions), value, "Vector search dimensions must be between 2 and 3072");
    }

    this.vectorSearchDimensions = value;
    this.logger = loggerFactory.CreateLogger<AzureAISearchProvider>();
}
/// <summary>
/// Add a document to the search index, creating the index first if it does not exist.
/// </summary>
/// <param name="document">The searchable document.</param>
/// <param name="cancellationToken">The cancellation token.</param>
/// <returns>Returns a task that completes when the document is successfully saved.</returns>
/// <exception cref="ArgumentNullException">Thrown if the document or its connection info is null.</exception>
/// <exception cref="InvalidOperationException">Thrown if the endpoint setting named by the connection info is missing or empty.</exception>
public async Task AddDocumentAsync(SearchableDocument document, CancellationToken cancellationToken)
{
    if (document is null)
    {
        throw new ArgumentNullException(nameof(document));
    }

    if (document.ConnectionInfo is null)
    {
        throw new ArgumentNullException(nameof(document.ConnectionInfo));
    }

    string endpoint = this.configuration.GetValue<string>(document.ConnectionInfo.ConnectionName);
    if (string.IsNullOrWhiteSpace(endpoint))
    {
        // Without this check a missing setting surfaces as an opaque
        // "Value cannot be null (uriString)" from the Uri constructor.
        throw new InvalidOperationException(
            $"The Azure AI Search endpoint setting '{document.ConnectionInfo.ConnectionName}' is missing or empty.");
    }

    // Resolve the target index once; fall back to the default index name.
    string searchIndexName = document.ConnectionInfo.CollectionName ?? defaultSearchIndexName;

    SearchIndexClient searchIndexClient = this.GetSearchIndexClient(endpoint);
    SearchClient searchClient = this.GetSearchClient(endpoint, searchIndexName);

    await this.CreateIndexIfDoesntExist(searchIndexClient, searchIndexName, cancellationToken);
    await this.IndexSectionsAsync(searchClient, document, cancellationToken);
}
/// <summary>
/// Search for documents using the provided request.
/// </summary>
/// <param name="request">The search request. A text query, embeddings, or both must be provided.</param>
/// <returns>Search response containing the title and text of every hit that has both.</returns>
/// <exception cref="ArgumentNullException">Thrown if the request or its connection info is null.</exception>
/// <exception cref="ArgumentException">Throws argument exception if query or embeddings is null.</exception>
/// <exception cref="InvalidOperationException">
/// Throws the invalid operation exception if the endpoint setting is missing or the search result response is null.
/// </exception>
public async Task<SearchResponse> SearchAsync(SearchRequest request)
{
    if (request is null)
    {
        throw new ArgumentNullException(nameof(request));
    }

    if (request.Query is null && request.Embeddings.IsEmpty)
    {
        throw new ArgumentException("Either query or embeddings must be provided");
    }

    if (request.ConnectionInfo is null)
    {
        throw new ArgumentNullException(nameof(request.ConnectionInfo));
    }

    string endpoint = this.configuration.GetValue<string>(request.ConnectionInfo.ConnectionName);
    if (string.IsNullOrWhiteSpace(endpoint))
    {
        // Without this check a missing setting surfaces as an opaque
        // "Value cannot be null (uriString)" from the Uri constructor.
        throw new InvalidOperationException(
            $"The Azure AI Search endpoint setting '{request.ConnectionInfo.ConnectionName}' is missing or empty.");
    }

    SearchClient searchClient = this.GetSearchClient(endpoint, request.ConnectionInfo.CollectionName ?? defaultSearchIndexName);

    SearchOptions searchOptions = new()
    {
        Size = request.MaxResults,
    };

    if (this.isSemanticSearchEnabled)
    {
        searchOptions.QueryType = SearchQueryType.Semantic;
        searchOptions.SemanticSearch = new()
        {
            SemanticConfigurationName = "default",
            QueryCaption = new(this.useSemanticCaptions
                ? QueryCaptionType.Extractive
                : QueryCaptionType.None),
        };
    }

    // Use vector search if embeddings are provided.
    if (!request.Embeddings.IsEmpty)
    {
        VectorizedQuery vectorQuery = new(request.Embeddings)
        {
            // Use a higher K value for semantic search to get better results.
            KNearestNeighborsCount = this.isSemanticSearchEnabled ? Math.Max(50, request.MaxResults) : request.MaxResults,
        };
        vectorQuery.Fields.Add("embeddings");
        searchOptions.VectorSearch = new();
        searchOptions.VectorSearch.Queries.Add(vectorQuery);
    }

    Response<SearchResults<SearchDocument>> searchResultResponse = await searchClient.SearchAsync<SearchDocument>(
        request.Query, searchOptions);
    if (searchResultResponse.Value is null)
    {
        throw new InvalidOperationException($"Failed to get search result from Azure AI Search instance: {searchClient.ServiceName} and index: {searchClient.IndexName}");
    }

    SearchResults<SearchDocument> searchResult = searchResultResponse.Value;
    List<SearchResult> results = new(capacity: request.MaxResults);

    // Enumerate asynchronously so that fetching further result pages does not block the thread.
    await foreach (SearchResult<SearchDocument> doc in searchResult.GetResultsAsync())
    {
        doc.Document.TryGetValue("title", out object? titleValue);

        // Captions are only present when the service actually produced them (semantic
        // search enabled and captions returned); tolerate their absence instead of
        // dereferencing null.
        List<string>? captionTexts = this.useSemanticCaptions
            ? doc.SemanticSearch?.Captions?.Select(c => c.Text).ToList()
            : null;

        string? contentValue = null;
        if (captionTexts is { Count: > 0 })
        {
            contentValue = string.Join(" . ", captionTexts);
        }
        else if (doc.Document.TryGetValue("text", out object? content))
        {
            // No captions requested or available: use the stored section text.
            // A soft cast keeps a non-string value from throwing; such hits are skipped below.
            contentValue = content as string;
        }

        if (titleValue is string title && contentValue is string text)
        {
            results.Add(new SearchResult(title, text));
        }
    }

    return new SearchResponse(results);
}
/// <summary>
/// Creates the search index with the text, title, embeddings and timestamp fields
/// (plus vector and semantic configuration) unless an index with that name already exists.
/// </summary>
/// <param name="searchIndexClient">The client used to list and create indexes.</param>
/// <param name="searchIndexName">The name of the index; compared case-insensitively against existing names.</param>
/// <param name="cancellationToken">The cancellation token, honored while listing and while creating.</param>
async Task CreateIndexIfDoesntExist(SearchIndexClient searchIndexClient, string searchIndexName, CancellationToken cancellationToken = default)
{
    // Pass the token to the listing call as well, so cancellation is observed
    // before the create request is reached.
    AsyncPageable<string> indexNames = searchIndexClient.GetIndexNamesAsync(cancellationToken);
    await foreach (string indexName in indexNames)
    {
        if (string.Equals(indexName, searchIndexName, StringComparison.OrdinalIgnoreCase))
        {
            this.logger.LogDebug("Search index - {searchIndexName} already exists", searchIndexName);
            return;
        }
    }

    SearchIndex index = new(searchIndexName)
    {
        VectorSearch = new()
        {
            Algorithms =
            {
                new HnswAlgorithmConfiguration(vectorSearchConfigName)
            },
            Profiles =
            {
                new VectorSearchProfile(vectorSearchProfile, vectorSearchConfigName)
            }
        },
        Fields =
        {
            new SimpleField("id", SearchFieldDataType.String) { IsKey = true },
            new SearchableField("text") { AnalyzerName = LexicalAnalyzerName.EnMicrosoft },
            new SimpleField("title", SearchFieldDataType.String) { IsFacetable = true },
            new SearchField("embeddings", SearchFieldDataType.Collection(SearchFieldDataType.Single))
            {
                VectorSearchDimensions = this.vectorSearchDimensions,
                IsSearchable = true,
                VectorSearchProfileName = vectorSearchProfile,
            },
            new SimpleField("timestamp", SearchFieldDataType.DateTimeOffset) { IsFacetable = true }
        },
        SemanticSearch = new()
        {
            Configurations =
            {
                // "default" is the configuration name SearchAsync refers to for semantic queries.
                new SemanticConfiguration("default", new()
                {
                    ContentFields =
                    {
                        new SemanticField("text")
                    },
                    TitleField = new SemanticField("title")
                })
            }
        }
    };

    await searchIndexClient.CreateIndexAsync(index, cancellationToken);
}
/// <summary>
/// Uploads one search document per embedded section of <paramref name="document"/>,
/// sending them to the index in batches of at most one thousand.
/// </summary>
/// <param name="searchClient">The client bound to the target index.</param>
/// <param name="document">The document whose embeddings request/response pairs are indexed.</param>
/// <param name="cancellationToken">The cancellation token.</param>
/// <exception cref="InvalidOperationException">Thrown if embeddings exist but the request input they were generated from is missing.</exception>
async Task IndexSectionsAsync(SearchClient searchClient, SearchableDocument document, CancellationToken cancellationToken = default)
{
    // Flush threshold: a batch is sent once it holds this many actions.
    const int maxBatchSize = 1_000;

    var embeddings = document.Embeddings;
    var data = embeddings?.Response?.Data;
    if (embeddings is null || data is null || data.Count == 0)
    {
        // Nothing was embedded, so there is nothing to index.
        return;
    }

    // Section i of the input produced embedding i of the response; the text is required.
    var inputs = embeddings.Request.Input
        ?? throw new InvalidOperationException("The embeddings request has no input text for the generated embeddings.");

    string title = Path.GetFileNameWithoutExtension(document.Title);
    IndexDocumentsBatch<SearchDocument> batch = new();
    for (int i = 0; i < data.Count; i++)
    {
        batch.Actions.Add(new IndexDocumentsAction<SearchDocument>(
            IndexActionType.MergeOrUpload,
            new SearchDocument
            {
                ["id"] = Guid.NewGuid().ToString("N"),
                ["text"] = inputs[i],
                ["title"] = title,
                ["embeddings"] = data[i].Embedding.ToArray(),
                ["timestamp"] = DateTime.UtcNow
            }));

        if (batch.Actions.Count == maxBatchSize)
        {
            // Every one thousand documents, batch create.
            await this.IndexDocumentsBatchAsync(searchClient, batch, cancellationToken);
            batch = new();
        }
    }

    if (batch.Actions.Count > 0)
    {
        // Any remaining documents, batch create.
        await this.IndexDocumentsBatchAsync(searchClient, batch, cancellationToken);
    }
}
/// <summary>
/// Sends one batch of index actions to the search service and logs the outcome.
/// </summary>
/// <param name="searchClient">The client bound to the target index.</param>
/// <param name="batch">The batch of actions to submit.</param>
/// <param name="cancellationToken">The cancellation token.</param>
async Task IndexDocumentsBatchAsync(SearchClient searchClient, IndexDocumentsBatch<SearchDocument> batch, CancellationToken cancellationToken)
{
    IndexDocumentsResult result = await searchClient.IndexDocumentsAsync(batch, cancellationToken: cancellationToken);

    int succeeded = 0;
    foreach (IndexingResult item in result.Results)
    {
        if (item.Succeeded)
        {
            succeeded++;
            continue;
        }

        // Surface which section failed and why; the summary below only reports a count.
        this.logger.LogWarning(
            "Failed to index section {Key}: status {Status}, error: {ErrorMessage}",
            item.Key,
            item.Status,
            item.ErrorMessage);
    }

    this.logger.LogInformation(
        "Indexed {Count} sections, {Succeeded} succeeded",
        batch.Actions.Count,
        succeeded);
}
/// <summary>
/// Builds an index-management client for the given service endpoint.
/// </summary>
/// <param name="endpoint">The search service endpoint URI as a string.</param>
/// <returns>
/// A client authenticated with the configured API key when one is set,
/// otherwise with <see cref="DefaultAzureCredential"/>.
/// </returns>
SearchIndexClient GetSearchIndexClient(string endpoint)
{
    string? apiKey = this.configuration.GetValue<string>(this.searchAPIKeySetting);

    // No key configured: fall back to identity-based authentication.
    return string.IsNullOrEmpty(apiKey)
        ? new SearchIndexClient(new Uri(endpoint), new DefaultAzureCredential())
        : new SearchIndexClient(new Uri(endpoint), new AzureKeyCredential(apiKey));
}
/// <summary>
/// Builds a query/indexing client for one index on the given service endpoint.
/// </summary>
/// <param name="endpoint">The search service endpoint URI as a string.</param>
/// <param name="searchIndexName">The name of the index the client is bound to.</param>
/// <returns>
/// A client authenticated with the configured API key when one is set,
/// otherwise with <see cref="DefaultAzureCredential"/>.
/// </returns>
SearchClient GetSearchClient(string endpoint, string searchIndexName)
{
    string? apiKey = this.configuration.GetValue<string>(this.searchAPIKeySetting);

    if (!string.IsNullOrEmpty(apiKey))
    {
        return new SearchClient(new Uri(endpoint), searchIndexName, new AzureKeyCredential(apiKey));
    }

    // No key configured: fall back to identity-based authentication.
    return new SearchClient(new Uri(endpoint), searchIndexName, new DefaultAzureCredential());
}
}