# Ingest GitHub Issues into Qdrant

This sample shows how to get started with loading and querying embeddings of GitHub issues from the dotnet/runtime repository, using Azure OpenAI and the Qdrant SDK.

## Install packages

In [None]:
#r "nuget: Azure.AI.OpenAI, 1.0.0-beta.9"
#r "nuget: Qdrant.Client, 1.6.0-alpha.1"

In [None]:
#r "nuget: Octokit, 9.0.0"
#r "nuget: Octokit.Reactive, 9.0.0"

In [None]:
#r "nuget:Microsoft.DotNet.Interactive.AIUtilities, 1.0.0-beta.23557.4"

## Add using statements

In [None]:
using Azure;
using Azure.AI.OpenAI;
using Microsoft.DotNet.Interactive;
using Microsoft.DotNet.Interactive.AIUtilities;
using Octokit;

## Configure Azure OpenAI credentials

In [26]:
// Prompt for the Azure OpenAI settings; the key is collected through the password prompt.
var azureOpenAIKey = await Kernel.GetPasswordAsync("Provide your OPEN_AI_KEY");
var azureOpenAIEndpoint = await Kernel.GetInputAsync("Provide the OPEN_AI_ENDPOINT");
// Passed as the first argument of EmbeddingsOptions when embeddings are requested later on.
var embeddingDeployment = await Kernel.GetInputAsync("Provide embedding name");

## Configure GitHub credentials 

You will need an access token with rights to query and update issues.

In [None]:
// Prompt for the GitHub token (may be left empty; the client then stays anonymous)
// and for the repository/organization whose issues are downloaded.
var githubKey = await Kernel.GetPasswordAsync("Provide your Github api key");
var repoName = await Kernel.GetInputAsync("Provide repo");
var org = await Kernel.GetInputAsync("Provide org");

## Configure OpenAI client

In [27]:
// Client bound to the Azure OpenAI endpoint and authenticated with the key entered earlier.
var openAIClient = new OpenAIClient(
    new System.Uri(azureOpenAIEndpoint),
    new AzureKeyCredential(azureOpenAIKey.GetClearTextPassword()));

## Configure GitHub client

In [None]:
var options = new ApiOptions();
var gitHubClient = new GitHubClient(new ProductHeaderValue("notebook"));

// Stay anonymous when no token was entered; otherwise attach the token as credentials.
if (string.IsNullOrEmpty(githubKey.GetClearTextPassword()))
{
    Console.WriteLine("Using anonymous GitHub API");
}
else
{
    Console.WriteLine("Using GitHub API token");
    gitHubClient.Credentials = new Credentials(githubKey.GetClearTextPassword());
}

## Download data from GitHub

### Get labels from the repository

In [None]:
// Fetch every label defined on the org/repoName repository.
var allLabels = await gitHubClient.Issue.Labels.GetAllForRepository(org, repoName);

In [None]:
// Keep only the labels whose name starts with "area-" (case-insensitive).
var areaLabels =
    (from label in allLabels
     where label.Name.StartsWith("area-", StringComparison.OrdinalIgnoreCase)
     select label)
    .ToList();

### Get all issues from the repository

In [None]:
// Accumulator for the issues downloaded per area label in the next cell.
var allIssues = new List<Issue>();

In [None]:
// For every area label, request a single page of 50 issues carrying that label
// and append the result to allIssues.
foreach (var areaLabel in areaLabels)
{
    var issueRequest = new RepositoryIssueRequest
    {
        Filter = IssueFilter.All,
        Labels = { areaLabel.Name }
    };

    var singlePage = new ApiOptions { PageSize = 50, PageCount = 1 };

    var labelIssues = await gitHubClient.Issue.GetAllForRepository(org, repoName, issueRequest, singlePage);
    allIssues.AddRange(labelIssues);
}

In [None]:
// Display the number of issues collected so far.
allIssues.Count

In [None]:
/// <summary>Flattened view of a GitHub issue used throughout the notebook.</summary>
/// <param name="Title">Issue title.</param>
/// <param name="Text">Issue body.</param>
/// <param name="Area">Area name derived from the issue's first "area-" label; null when the issue has none.</param>
/// <param name="Url">HTML URL of the issue.</param>
public record GitHubIssue(string Title, string Text, string Area, string Url);

In [None]:
// Project the de-duplicated issues into GitHubIssue records.
// The area comes from the first label starting with "area-" (case-insensitive):
// only that leading prefix is removed and the remaining dashes become spaces.
// A plain Replace("area-", "") would miss differently-cased prefixes and would
// also strip "area-" from the middle of a label name.
// The area is null when the issue carries no such label.
var dataCollection =
    allIssues
        .DistinctBy(issue => issue.Id)
        .Select(issue =>
        {
            const string areaPrefix = "area-";

            var areaLabelName = issue.Labels?
                .Where(l => l.Name.StartsWith(areaPrefix, StringComparison.OrdinalIgnoreCase))
                .FirstOrDefault()?
                .Name;

            var area = areaLabelName?
                .Substring(areaPrefix.Length)
                .Replace("-", " ");

            return new GitHubIssue(issue.Title, issue.Body, area, issue.HtmlUrl);
        });

## Helper functions to save and load to disk

In [None]:
using System.IO;
using System.Text.Json;
using System.Text.Json.Serialization;

/// <summary>
/// Serializes <paramref name="data"/> as indented JSON and writes it to
/// <paramref name="fileName"/> under the relative path ../../../Data.
/// </summary>
/// <param name="data">Issues to persist.</param>
/// <param name="fileName">File name inside the Data folder.</param>
public async Task SaveIssuesToFileAsync(IEnumerable<GitHubIssue> data, string fileName)
{
    var targetPath = Path.Combine("..", "..", "..", "Data", fileName);

    var indented = new JsonSerializerOptions(JsonSerializerOptions.Default)
    {
        WriteIndented = true
    };

    await File.WriteAllTextAsync(targetPath, JsonSerializer.Serialize(data, indented));
}

/// <summary>
/// Reads <paramref name="fileName"/> from the relative path ../../../Data and
/// deserializes its JSON content into an array of <see cref="GitHubIssue"/>.
/// </summary>
/// <param name="fileName">File name inside the Data folder.</param>
/// <returns>The deserialized issues.</returns>
public async Task<GitHubIssue[]> LoadIssuesFromFileAsync(string fileName)
{
    var sourcePath = Path.Combine("..", "..", "..", "Data", fileName);
    var json = await File.ReadAllTextAsync(sourcePath);

    return JsonSerializer.Deserialize<GitHubIssue[]>(json);
}

In [None]:
// Persist the downloaded issues so later cells can start from disk instead of the GitHub API.
await SaveIssuesToFileAsync(dataCollection, "issues.json");

## Chunk issues

In [None]:
// Reload the issues written to issues.json; dataCollection is a GitHubIssue[] from here on.
var dataCollection = await LoadIssuesFromFileAsync("issues.json");

In [None]:
// Display how many issues were loaded.
dataCollection.Count()

### Initialize collection of issues with chunks

In [None]:
/// <summary>A piece of text together with its embedding vector.</summary>
public record TextWithEmbedding(string Text, float[] Embedding);
/// <summary>An issue together with the embedded text chunks generated from it.</summary>
public record IssueWithChunks(GitHubIssue Issue,List<TextWithEmbedding> Chunks);

In [None]:
// Wrap every issue with an initially empty chunk list; the chunks are filled in later.
var issuesWithChunksCollection =
    (from issue in dataCollection
     select new IssueWithChunks(issue, new List<TextWithEmbedding>()))
    .ToArray();

### Get embedding deployment name

In [None]:
// Ask for the embedding deployment name used by the embedding requests below
// (the prompt text previously misspelled "deployment").
var embeddingDeployment = await Kernel.GetInputAsync("Provide embedding deployment name");

### Helper functions to save and load chunks to disk

In [1]:
using System.IO;
using System.Text.Json;
using System.Text.Json.Serialization;

/// <summary>
/// Serializes <paramref name="data"/> as indented JSON and writes it to
/// <paramref name="fileName"/> under the relative path ../../../Data.
/// </summary>
/// <param name="data">Issues with their chunk embeddings.</param>
/// <param name="fileName">File name inside the Data folder.</param>
public async Task SaveIssuesWithChunksToFileAsync(IEnumerable<IssueWithChunks> data, string fileName)
{
    var targetPath = Path.Combine("..", "..", "..", "Data", fileName);

    var indented = new JsonSerializerOptions(JsonSerializerOptions.Default)
    {
        WriteIndented = true
    };

    await File.WriteAllTextAsync(targetPath, JsonSerializer.Serialize(data, indented));
}

/// <summary>
/// Reads <paramref name="fileName"/> from the relative path ../../../Data and
/// deserializes its JSON content into an array of <see cref="IssueWithChunks"/>.
/// </summary>
/// <param name="fileName">File name inside the Data folder.</param>
/// <returns>The deserialized issues with their chunks.</returns>
public async Task<IssueWithChunks[]> LoadIssuesWithChunksFromFileAsync(string fileName)
{
    var sourcePath = Path.Combine("..", "..", "..", "Data", fileName);
    var json = await File.ReadAllTextAsync(sourcePath);

    return JsonSerializer.Deserialize<IssueWithChunks[]>(json);
}

SyntaxError: invalid syntax (4041573465.py, line 1)

### Chunk data and generate embeddings

In [None]:
// Tokenizer used to split issue bodies into chunks.
var tokenizer = await Tokenizer.CreateAsync(TokenizerModel.ada2);

// Number of issues embedded so far; drives the periodic checkpoint below.
var counter = 0;

// Only the first 100 issues are processed.
foreach(var item in issuesWithChunksCollection.Take(100))
{
    var fullText = item.Issue.Text;

    // Issues without a body have nothing to embed.
    if(string.IsNullOrWhiteSpace(fullText))
    {
        continue;
    }

    // Split the body with ChunkByTokenCountWithOverlap(fullText, 3000, 50), prefix each
    // piece with the issue title and area, and group the pieces into batches of 16 so
    // that each embedding request carries one batch.
    var chunks = 
        tokenizer
            .ChunkByTokenCountWithOverlap(fullText, 3000, 50)
            .Select(t => 
$"""
Title: {item.Issue.Title}
Area: {item.Issue.Area}

{t}
""")
            .Chunk(16)
            .ToArray();

    foreach(var chunk in chunks)
    {
        var embeddingResponse = await openAIClient.GetEmbeddingsAsync(new EmbeddingsOptions(embeddingDeployment,chunk));

        // d.Index is used to pair each returned embedding with its input text in the batch.
        item.Chunks.AddRange(
            embeddingResponse.Value.Data.Select(d => 
                new TextWithEmbedding(chunk[d.Index],d.Embedding.ToArray())));
    }

    // Checkpoint after every 50 embedded issues. The counter is incremented before the
    // test so the first checkpoint happens after 50 issues rather than after the first one.
    counter++;
    if(counter % 50 == 0)
    {
        await SaveIssuesWithChunksToFileAsync(issuesWithChunksCollection, "areaIssuesWithEmbeddingsSubset.json");
    }
}

// Final save so the file reflects everything processed by the loop.
await SaveIssuesWithChunksToFileAsync(issuesWithChunksCollection, "areaIssuesWithEmbeddingsSubset.json");

In [None]:
// Preview the first five entries as a table.
dataCollection.Take(5).DisplayTable();

In [None]:
// Persist the issues together with their chunk embeddings. The notebook defines no
// SaveDataToFileAsync helper, and dataCollection holds plain issues without embeddings,
// so the IssueWithChunks helper and collection are used here.
await SaveIssuesWithChunksToFileAsync(issuesWithChunksCollection, "issueWithEmbeddings.json");

## Naive Search

In [None]:
// Reload the issues with their embeddings as IssueWithChunks[], the element type the
// search helpers in this section take. The notebook defines no LoadDataFromFileAsync
// helper, so the IssueWithChunks loader is used.
var dataCollection = await LoadIssuesWithChunksFromFileAsync("issueWithEmbeddings.json");

In [None]:
/// <summary>
/// Returns the bodies of the first <paramref name="resultLimit"/> issues whose text
/// contains <paramref name="query"/> as a substring.
/// </summary>
/// <param name="query">Text to look for inside each issue body.</param>
/// <param name="data">Issues to scan; issues with a null body are skipped.</param>
/// <param name="resultLimit">Maximum number of results to return.</param>
public string[] NaiveSearch(string query, IEnumerable<IssueWithChunks> data,int resultLimit = 1)
{
    var matchingBodies =
        from entry in data
        let body = entry.Issue.Text
        where body?.Contains(query) == true
        select body;

    return matchingBodies.Take(resultLimit).ToArray();
}

In [None]:
// Try the substring search with a natural-language question and display the result.
NaiveSearch("What are the latest issues for AOT",dataCollection).Display();

## Embedding Search

In [None]:
#r "nuget: System.Numerics.Tensors, 8.0.0-rc.2.23479.6"

In [None]:
using System.Numerics.Tensors;

In [None]:
/// <summary>Scores two embedding vectors by their cosine similarity.</summary>
public class SimilarityComparer : ISimilarityComparer<float[]>
{
    /// <summary>Returns the cosine similarity of <paramref name="a"/> and <paramref name="b"/>.</summary>
    public float Score(float[] a, float[] b) => TensorPrimitives.CosineSimilarity(a, b);
}

In [None]:
/// <summary>
/// Embeds <paramref name="query"/> and returns the texts of the chunks most similar to it.
/// </summary>
/// <param name="query">Question or phrase to search for.</param>
/// <param name="data">Issues whose chunk embeddings are compared against the query.</param>
/// <param name="resultLimit">Maximum number of chunk texts to return.</param>
/// <returns>Chunk texts with a similarity score above 0.5, best match first.</returns>
public async Task<string[]> EmbeddingSearchAsync(string query, IEnumerable<IssueWithChunks> data,int resultLimit = 1)
{
    var embeddingResponse = await openAIClient.GetEmbeddingsAsync(new EmbeddingsOptions(embeddingDeployment,new [] {query}));
    var embeddingVector = embeddingResponse.Value.Data[0].Embedding.ToArray();

    // Filter on the score threshold before sorting: the result is the same, but only
    // the candidates that can actually be returned are ordered.
    var searchResults = 
        data
        .SelectMany(d => d.Chunks)
        .ScoreBySimilarityTo(embeddingVector,new SimilarityComparer(),c => c.Embedding)
        .Where(e => e.Value > 0.5)
        .OrderByDescending(e => e.Value)
        .Take(resultLimit)
        .Select(e => e.Key.Text)
        .ToArray();
    
    return searchResults;
}

In [None]:
// Run the embedding-based search for the same question, asking for up to three results.
(await EmbeddingSearchAsync("What are the latest issues for AOT", dataCollection, 3)).Display();

## Store in DB

### Start DB locally

In [None]:
# Start the qdrant/qdrant image detached, publishing ports 6333 and 6334 and mounting
# qdrant_storage from the current directory at /qdrant/storage.
docker run -d -p 6333:6333 -p 6334:6334 -v "$pwd/qdrant_storage:/qdrant/storage:z" qdrant/qdrant

In [None]:
using Qdrant.Client;
using Qdrant.Client.Grpc;

### Initialize Qdrant client

In [None]:
// Connect to the local Qdrant instance on port 6334 without HTTPS.
var qdrantClient = new QdrantClient(host: "localhost",port: 6334,https:false);

### Create collection

In [None]:
// Name of the Qdrant collection that stores the issue embeddings.
var collectionName = "gh_issues";

In [30]:
// Drop an existing collection with exactly this name so the notebook can be re-run.
// The match must be exact: a substring test would also fire for names such as
// "gh_issues_old" and then try to delete a collection that may not exist.
var collections = await qdrantClient.ListCollectionsAsync();
if (collections.Any(name => string.Equals(name, collectionName, StringComparison.Ordinal)))
{
    await qdrantClient.DeleteCollectionAsync(collectionName);
}

In [31]:
// Create the collection with cosine distance and 1536-dimensional vectors.
// NOTE(review): 1536 presumably matches the embedding length of the configured deployment — confirm.
await qdrantClient.CreateCollectionAsync(collectionName, new VectorParams { Size=1536, Distance=Distance.Cosine})

### Map issue embeddings to points

In [32]:
// Flatten every embedded chunk into an (Embedding, Text) pair; Text wraps the issue
// title, URL, area and the chunk itself in simple XML-like tags.
var vectors =
    (from entry in issuesWithChunksCollection
     where entry.Chunks.Count > 0
     from piece in entry.Chunks
     select new {
         Embedding=piece.Embedding,
         Text=$"<issueTitle>{entry.Issue.Title}</issueTitle>\n<issueUrl>{entry.Issue.Url}</issueUrl>\n<issueArea>{entry.Issue.Area}</issueArea>\n<issueSnippet>{piece.Text}</issueSnippet>"
     })
    .ToList();
  

In [33]:
// Turn each (Embedding, Text) pair into a Qdrant point with a fresh GUID as its id
// and the text stored under the "text" payload key.
var points = vectors
    .Select(vector => new PointStruct
    {
        Id = new PointId { Uuid = Guid.NewGuid().ToString() },
        Vectors = vector.Embedding,
        Payload = { ["text"] = vector.Text }
    })
    .ToList();


### Insert data into Qdrant collection

In [34]:
// Upload all points to the collection.
await qdrantClient.UpsertAsync(collectionName,points);

## Get Count

In [35]:
// Display the number of points now stored in the collection.
await qdrantClient.CountAsync(collectionName)

## Search with Qdrant

In [36]:
/// <summary>
/// Embeds <paramref name="query"/> and asks Qdrant for the closest points in
/// <paramref name="collectionName"/>.
/// </summary>
/// <param name="query">Question or phrase to search for.</param>
/// <param name="collectionName">Qdrant collection to search.</param>
/// <param name="resultLimit">Maximum number of hits requested from Qdrant.</param>
/// <returns>The "text" payload value of every hit, in the order Qdrant returned them.</returns>
public async Task<string[]> SearchWithQdrantAsync(string query, string collectionName, int resultLimit = 1)
{
    var embeddingRequest = new EmbeddingsOptions(embeddingDeployment,new [] {query});
    var embeddingReply = await openAIClient.GetEmbeddingsAsync(embeddingRequest);
    var queryVector = embeddingReply.Value.Data[0].Embedding.ToArray();

    var hits = await qdrantClient.SearchAsync(collectionName, queryVector, limit:(ulong)resultLimit);

    var texts = new List<string>();
    foreach (var hit in hits)
    {
        texts.Add(hit.Payload["text"].StringValue);
    }

    return texts.ToArray();
}

In [37]:
// Run the same question against Qdrant, asking for up to three results.
(await SearchWithQdrantAsync("What are the latest issues for AOT", collectionName, 3)).Display();