# Ingest GitHub Issues into Qdrant

In [None]:
#r "nuget: Azure.AI.OpenAI, 1.0.0-beta.9"
#r "nuget: Qdrant.Client, 1.6.0-alpha.1"

In [None]:
#r "nuget: Octokit, 9.0.0"
#r "nuget: Octokit.Reactive, 9.0.0"

In [None]:
#r "nuget:Microsoft.DotNet.Interactive.AIUtilities, 1.0.0-beta.23557.4"

In [None]:
using Azure;
using Azure.AI.OpenAI;
using Microsoft.DotNet.Interactive;
using Microsoft.DotNet.Interactive.AIUtilities;
using Octokit;

In [26]:
// Prompt the notebook user for the Azure OpenAI settings used by later cells:
// the API key (requested through the password prompt), the service endpoint URL,
// and the name of the embedding deployment passed to the embeddings requests.
var azureOpenAIKey = await Kernel.GetPasswordAsync("Provide your OPEN_AI_KEY");
var azureOpenAIEndpoint = await Kernel.GetInputAsync("Provide the OPEN_AI_ENDPOINT");
var embeddingDeployment = await Kernel.GetInputAsync("Provide embedding name");

## Access to GitHub
You will need an access token with rights to query and update issues.

In [None]:
// Prompt for the GitHub API token (may be left empty; the client cell then falls
// back to anonymous access) and for the organization and repository to read issues from.
var githubKey = await Kernel.GetPasswordAsync("Provide your Github api key");
var repoName = await Kernel.GetInputAsync("Provide repo");
var org = await Kernel.GetInputAsync("Provide org");

In [27]:
// Create the OpenAI client from the endpoint and key collected above.
// The key is unwrapped to clear text only at the point where the credential is built.
OpenAIClient openAIClient = new (new System.Uri(azureOpenAIEndpoint), new AzureKeyCredential(azureOpenAIKey.GetClearTextPassword()));

In [None]:
// Build the GitHub client. "notebook" is the product name sent with each request.
var options = new ApiOptions();
var gitHubClient = new GitHubClient(new ProductHeaderValue("notebook"));

// Authenticate only when the user actually supplied a token; otherwise the client
// stays anonymous.
if (string.IsNullOrEmpty(githubKey.GetClearTextPassword()))
{
    Console.WriteLine("Using anonymous GitHub API");
}
else
{
    Console.WriteLine("Using GitHub API token");
    gitHubClient.Credentials = new Credentials(githubKey.GetClearTextPassword());
}

# Fetch issues from GitHub

The code below is using the Octokit library, which is a .NET client for interacting with the GitHub API.

The first part of the code creates a new instance of `RepositoryIssueRequest` named `last6Months`. This object specifies the parameters of a request to fetch issues from a GitHub repository. The `Filter` property is set to `IssueFilter.All`, which means the request is not restricted to issues that the authenticated user is assigned to, created, or is mentioned in. Note that `Filter` does not select by issue state (open or closed); that is controlled by the separate `State` property, which is not set here. The `Since` property is set to a date roughly six months before the current date (`DateTimeOffset.UtcNow.Subtract(TimeSpan.FromDays(30*6))`), so the request is meant to return only the issues that have been updated in the last six months.

The second part of the code makes an asynchronous request to fetch the issues of a specific repository. It calls the `GetAllForRepository` method exposed through the `Issue` property of the `gitHubClient` object. The `org` and `repoName` variables specify the organization and the name of the repository from which to fetch the issues. The method returns a list of issues from that repository. The `await` keyword waits for the call to complete before moving on to the next line of code; this is necessary because the method is asynchronous, meaning it may not complete immediately.

In [None]:
// Request parameters: issues regardless of the user's relationship to them
// (IssueFilter.All), limited to those updated within roughly the last six months.
// NOTE(review): State is not set here, so the request uses the library default;
// set State explicitly if closed issues should be included - confirm intent.
var last6Months = new RepositoryIssueRequest
{
    Filter = IssueFilter.All,
    Since = DateTimeOffset.UtcNow.Subtract(TimeSpan.FromDays(30*6))
};

// Pass the request to the API call. Previously `last6Months` was built but never
// used, so the `Since` restriction was silently ignored.
var allIssues = await gitHubClient.Issue.GetAllForRepository(org, repoName, last6Months);

In [None]:
/// <summary>A GitHub issue reduced to the fields this notebook works with.</summary>
/// <param name="Title">Issue title.</param>
/// <param name="Text">Issue body; may be null or empty when the issue has no description.</param>
/// <param name="Url">Link to the issue.</param>
public record GitHubIssue(string Title, string Text, string Url);

/// <summary>A piece of text paired with the embedding vector computed for it.</summary>
/// <param name="Text">The text that was embedded.</param>
/// <param name="Embedding">The embedding vector for <paramref name="Text"/>.</param>
public record TextWithEmbedding(string Text, float[] Embedding);

/// <summary>An issue together with the embedded text chunks derived from it.</summary>
public class IssueWithChunks
{
    /// <summary>The source issue.</summary>
    public GitHubIssue Issue {get;set;}

    /// <summary>
    /// Embedded chunks of the issue text. Starts as an empty list so that code which
    /// appends to it or enumerates it does not hit a null reference when the instance
    /// was created (or deserialized from JSON lacking this property) without chunks.
    /// </summary>
    public List<TextWithEmbedding> Chunks {get;set;} = new();
}

In [None]:
// Project every fetched GitHub issue into the notebook's own model: keep the title,
// body and HTML URL, and start each entry with an empty chunk list to be filled in
// by the chunking/embedding step.
var dataCollection =
    (from issue in allIssues
     select new IssueWithChunks
     {
         Issue = new GitHubIssue(issue.Title, issue.Body, issue.HtmlUrl),
         Chunks = new List<TextWithEmbedding>()
     })
    .ToArray();

## Helper functions to save and load to disk

In [None]:
using System.IO;
using System.Text.Json;
using System.Text.Json.Serialization;

/// <summary>
/// Serializes <paramref name="data"/> as indented JSON and writes it to
/// <paramref name="fileName"/> inside the <c>Data</c> folder three levels above the
/// current directory, replacing any existing file content.
/// </summary>
/// <param name="data">Issues (with their chunks) to persist.</param>
/// <param name="fileName">File name relative to the <c>Data</c> folder.</param>
public async Task SaveDataToFileAsync(IEnumerable<IssueWithChunks> data, string fileName)
{
    // Start from the default serializer settings and only switch on indentation.
    var serializerOptions = new JsonSerializerOptions(JsonSerializerOptions.Default)
    {
        WriteIndented = true
    };
    var targetPath = Path.Combine("..", "..", "..", "Data", fileName);

    await File.WriteAllTextAsync(targetPath, JsonSerializer.Serialize(data, serializerOptions));
}

/// <summary>
/// Reads <paramref name="fileName"/> from the <c>Data</c> folder three levels above
/// the current directory and deserializes it into an array of issues.
/// </summary>
/// <param name="fileName">File name relative to the <c>Data</c> folder.</param>
/// <returns>
/// The deserialized issues; an empty array when the file contains the JSON literal
/// <c>null</c>, so callers can enumerate the result without a null check.
/// </returns>
public async Task<IssueWithChunks[]> LoadDataFromFileAsync(string fileName)
{
    var filePath = Path.Combine("..","..","..","Data",fileName);
    var text = await File.ReadAllTextAsync(filePath);

    // Deserialize yields null for a JSON `null` document; normalise that to an
    // empty array rather than handing a null collection to callers.
    return JsonSerializer.Deserialize<IssueWithChunks[]>(text) ?? Array.Empty<IssueWithChunks>();
}

In [None]:
// Persist the raw issues (no chunks yet) so later sections can reload them
// without querying GitHub again.
await SaveDataToFileAsync(dataCollection, "issues.json");

## Chunk issues

In [None]:
// Reload the issues saved by the ingestion section; this redeclares dataCollection
// so the chunking step can start from the file on disk.
var dataCollection = await LoadDataFromFileAsync("issues.json");

In [None]:
// Show the first loaded entry as a quick sanity check of the data.
dataCollection.First().Display()

In [None]:
// Tokenizer used to split issue bodies into token-bounded segments.
var tokenizer = await Tokenizer.CreateAsync(TokenizerModel.ada2);

foreach(var item in dataCollection)
{
    var issueBody = item.Issue.Text;

    // Issues without a body produce no chunks and do not trigger a save.
    if(!string.IsNullOrWhiteSpace(issueBody))
    {
        // Split the body into segments of up to 3000 tokens with a 50 token overlap,
        // and prefix every segment with the issue title and URL.
        var labelledSegments =
            tokenizer
                .ChunkByTokenCountWithOverlap(issueBody, 3000, 50)
                .Select(segment =>
$"""
Issue Title: {item.Issue.Title}
Issue Url: {item.Issue.Url} 
Chunk: 
{segment}
""");

        // Send the segments to the embedding deployment in batches of 16 per request.
        foreach(var batch in labelledSegments.Chunk(16).ToArray())
        {
            var request = new EmbeddingsOptions(embeddingDeployment, batch);
            var response = await openAIClient.GetEmbeddingsAsync(request);

            // Each returned item carries the index of the input it belongs to;
            // use it to pair the vector with its source text.
            foreach(var embedded in response.Value.Data)
            {
                item.Chunks.Add(new TextWithEmbedding(batch[embedded.Index], embedded.Embedding.ToArray()));
            }
        }

        // Write the whole collection after each processed issue so progress
        // already made is on disk if a later request fails.
        await SaveDataToFileAsync(dataCollection, "issueWithEmbeddings.json");
    }
}

In [None]:
// Render the first five entries as a table to inspect the result of the embedding step.
dataCollection.Take(5).DisplayTable();

In [None]:
// Final save of the collection, including the chunks and embeddings added above.
await SaveDataToFileAsync(dataCollection, "issueWithEmbeddings.json");

## Naive Search

In [None]:
// Reload the issues together with their stored embeddings so the search sections
// can run without recomputing them.
var dataCollection = await LoadDataFromFileAsync("issueWithEmbeddings.json");

In [None]:
/// <summary>
/// Plain substring search: returns the bodies of the first issues whose text
/// contains <paramref name="query"/> verbatim.
/// </summary>
/// <param name="query">Exact text to look for in each issue body.</param>
/// <param name="data">Issues to search.</param>
/// <param name="resultLimit">Maximum number of bodies to return.</param>
/// <returns>Up to <paramref name="resultLimit"/> matching issue bodies, in source order.</returns>
public string[] NaiveSearch(string query, IEnumerable<IssueWithChunks> data,int resultLimit = 1)
{
    var matchingBodies =
        from entry in data
        let body = entry.Issue.Text
        // Issues without a body can never match.
        where body is not null && body.Contains(query)
        select body;

    return matchingBodies.Take(resultLimit).ToArray();
}

In [None]:
// Try the substring search with a natural-language question; it only matches
// issues whose body contains this exact sentence.
NaiveSearch("What are the latest issues for AOT",dataCollection).Display();

## Embedding Search

In [None]:
#r "nuget: System.Numerics.Tensors, 8.0.0-rc.2.23479.6"

In [None]:
using System.Numerics.Tensors;

In [None]:
/// <summary>
/// Similarity comparer for embedding vectors that scores two vectors by their
/// cosine similarity.
/// </summary>
public class SimilarityComparer : ISimilarityComparer<float[]>
{
    /// <summary>Returns the cosine similarity of <paramref name="a"/> and <paramref name="b"/>.</summary>
    /// <param name="a">First vector.</param>
    /// <param name="b">Second vector.</param>
    public float Score(float[] a, float[] b) => TensorPrimitives.CosineSimilarity(a, b);
}

In [None]:
/// <summary>
/// Semantic search over the in-memory collection: embeds <paramref name="query"/>,
/// scores every stored chunk by cosine similarity to it, and returns the text of
/// the best-matching chunks.
/// </summary>
/// <param name="query">Natural-language query to embed and search for.</param>
/// <param name="data">Issues whose chunks are searched.</param>
/// <param name="resultLimit">Maximum number of chunk texts to return.</param>
/// <returns>
/// Texts of up to <paramref name="resultLimit"/> chunks with a similarity score above 0.5,
/// ordered from most to least similar.
/// </returns>
public async Task<string[]> EmbeddingSearchAsync(string query, IEnumerable<IssueWithChunks> data,int resultLimit = 1)
{
    var embeddingResponse = await openAIClient.GetEmbeddingsAsync(new EmbeddingsOptions(embeddingDeployment,new [] {query}));
    var embeddingVector = embeddingResponse.Value.Data[0].Embedding.ToArray();

    var searchResults = 
        data
        .SelectMany(d => d.Chunks)
        .ScoreBySimilarityTo(embeddingVector,new SimilarityComparer(),c => c.Embedding)
        // Drop weak matches before sorting so only candidates that can be returned
        // are ordered; the resulting sequence is the same as sorting first.
        .Where(e => e.Value > 0.5)
        .OrderByDescending(e => e.Value)
        .Take(resultLimit)
        .Select(e => e.Key.Text)
        .ToArray();
    
    return searchResults;
}

In [None]:
// Run the same question through the embedding-based search, asking for up to three chunks.
(await EmbeddingSearchAsync("What are the latest issues for AOT", dataCollection, 3)).Display();

## Store in DB

In [None]:
# Start Qdrant in a detached container, publishing ports 6333 and 6334 and mounting
# ./qdrant_storage from the current directory as the container's storage folder.
docker run -d -p 6333:6333 -p 6334:6334 -v "$pwd/qdrant_storage:/qdrant/storage:z" qdrant/qdrant

In [None]:
using Qdrant.Client;
using Qdrant.Client.Grpc;

In [None]:
// Connect to the local Qdrant container over a non-TLS connection.
// NOTE(review): 6334 is presumably Qdrant's gRPC port (the one published by the docker cell) - confirm.
var client = new QdrantClient(host: "localhost",port: 6334,https:false);

In [None]:
// Name of the Qdrant collection that holds the issue embeddings.
var collectionName = "gh_issues";

In [30]:
// Drop the collection if it already exists so the notebook can be re-run from a clean state.
var collections = await client.ListCollectionsAsync();

// Compare names for equality. The previous substring test also fired for any
// collection whose name merely contained "gh_issues" (e.g. "gh_issues_old") and
// then tried to delete a collection that might not exist.
if(collections.Any(x => x == collectionName))
{
    await client.DeleteCollectionAsync(collectionName);
}

In [31]:
// Create the collection with cosine distance as the similarity metric.
// NOTE(review): Size must equal the length of the stored embedding vectors; 1536 is
// presumably the dimension of the embedding model in use - confirm.
await client.CreateCollectionAsync(collectionName, new VectorParams { Size=1536, Distance=Distance.Cosine})

In [32]:
// Flatten the collection into one record per chunk: the chunk's embedding plus a
// tagged text payload that carries the issue title, URL and chunk text.
var vectors =
    (from entry in dataCollection
     where entry.Chunks.Count > 0
     from chunk in entry.Chunks
     select new
     {
         chunk.Embedding,
         Text = $"<issueTitle>{entry.Issue.Title}</issueTitle>\n<issueUrl>{entry.Issue.Url}</issueUrl>\n<issueSnippet>{chunk.Text}</issueSnippet>"
     })
    .ToList();
  

In [33]:
// Turn every chunk record into a Qdrant point: a freshly generated UUID as the id,
// the embedding as the vector, and the tagged text stored under the "text" payload key.
var points =
    vectors
        .Select(vector => new PointStruct
        {
            Id = new PointId { Uuid = Guid.NewGuid().ToString() },
            Vectors = vector.Embedding,
            Payload = { ["text"] = vector.Text }
        })
        .ToList();


In [34]:
// Upload all points to the collection in a single upsert call.
await client.UpsertAsync(collectionName,points);

## Get Count

In [35]:
// Ask Qdrant how many points the collection holds; the value is shown as the cell output.
await client.CountAsync(collectionName)

## Search with Qdrant

In [36]:
/// <summary>
/// Semantic search backed by Qdrant: embeds <paramref name="query"/> and asks the
/// vector database for the closest stored points.
/// </summary>
/// <param name="query">Natural-language query to embed and search for.</param>
/// <param name="collectionName">Qdrant collection to search.</param>
/// <param name="resultLimit">Maximum number of hits to request.</param>
/// <returns>The "text" payload of each hit, in the order Qdrant returned them.</returns>
public async Task<string[]> SearchWithQdrantAsync(string query, string collectionName, int resultLimit = 1)
{
    // Embed the query with the same deployment that produced the stored vectors.
    var request = new EmbeddingsOptions(embeddingDeployment, new [] {query});
    var response = await openAIClient.GetEmbeddingsAsync(request);
    var queryVector = response.Value.Data[0].Embedding.ToArray();

    var hits = await client.SearchAsync(collectionName, queryVector, limit:(ulong)resultLimit);

    // Pull the stored text out of every hit's payload.
    var texts = new List<string>();
    foreach (var hit in hits)
    {
        texts.Add(hit.Payload["text"].StringValue);
    }
    return texts.ToArray();
}

In [37]:
// Run the sample question against Qdrant, asking for up to three results.
(await SearchWithQdrantAsync("What are the latest issues for AOT", collectionName, 3)).Display();