In [1]:
import Pkg; 
Pkg.add("HTTP");
Pkg.add("JSON3");
Pkg.add("LinearAlgebra");
Pkg.add("DotEnv");

[32m[1m    Updating[22m[39m registry at `~/.julia/registries/General.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.10/Project.toml`
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.10/Manifest.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.10/Project.toml`
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.10/Manifest.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.10/Project.toml`
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.10/Manifest.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.10/Project.toml`
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.10/Manifest.toml`


In [2]:
using HTTP, JSON3
using LinearAlgebra
using DotEnv

DotEnv.load!()

In [3]:
const MODEL_CHAT = "gpt-3.5-turbo"
const MODEL_EMBEDDING = "text-embedding-ada-002"
const API_KEY = ENV["OPENAI_API_KEY"];

In [4]:
Pkg.add("OpenAI")

[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.10/Project.toml`
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.10/Manifest.toml`


In [5]:
using OpenAI

In [6]:
function create_chat(api_key::String,
    model::String,
    messages::Vector{Dict{String, String}};
    http_kwargs::NamedTuple = NamedTuple(), api_kwargs::NamedTuple = NamedTuple())
url = "https://api.openai.com/v1/chat/completions"
headers = Dict("Authorization" => "Bearer $api_key",
    "Content-Type" => "application/json")
# JSON-encoded data (string)
body = (; messages, model, api_kwargs...) |> JSON3.write
resp = HTTP.request("POST", url; body, headers, http_kwargs...)
return (;
    response = JSON3.read(resp.body),
    status = resp.status)
end

create_chat (generic function with 1 method)

In [7]:
# Define shared abstract type for custom printing
abstract type AbstractBuildingBlock end

In [8]:
# Convenience for building "messages" for the chatbot faster (see the standard format in the API calls section...)
@kwdef struct PromptTemplate <: AbstractBuildingBlock
    system_prompt::Union{String, Nothing} = nothing
    user_prompt::String = ""
end

PromptTemplate

In [9]:
# Return type for the AI model
@kwdef struct AIMessage <: AbstractBuildingBlock
    content::AbstractString
    status::Union{Int, Nothing} = nothing
    tokens::Tuple{Int, Int} = (-1, -1)
    elapsed::Float64 = -1.0
end

AIMessage

In [10]:
# Stores document chunks and their embeddings
@kwdef struct ChunkIndex{T <: AbstractString} <: AbstractBuildingBlock
    id::Symbol = gensym("ChunkIndex")
    embeddings::Matrix{Float32}
    chunks::Vector{T}
    sources::Vector{<:AbstractString}
end

In [11]:
# Structured show method for easier reading (each kwarg on a new line)
function Base.show(io::IO, t::AbstractBuildingBlock)
    dump(IOContext(io, :limit => true), t, maxdepth = 1)
end

In [12]:
# Utitity to be able to combine indices from different sources/documents easily
function Base.vcat(i1::ChunkIndex{T}, i2::ChunkIndex{T}) where {T <: AbstractString}
    ChunkIndex(;
        embeddings = hcat(i1.embeddings, i2.embeddings),
        chunks = vcat(i1.chunks, i2.chunks),
        sources = vcat(i1.sources, i2.sources))
end

In [13]:
"Builds a history of the conversation (=messages) to provide the prompt to the API. All kwargs are passed as replacements such that `{{key}} => value` in the template.}}"
function render(prompt::PromptTemplate; kwargs...)
    conversation = Dict{String, String}[]
    !isnothing(prompt.system_prompt) &&
        push!(conversation, Dict("role" => "system", "content" => prompt.system_prompt))
    # Replace any handlebar-style placeholders like `{{key}}` in the user_prompt with user-provided kwargs
    user_prompt = replace(prompt.user_prompt, ["{{$(k)}}" => v for (k, v) in kwargs]...)
    push!(conversation, Dict("role" => "user", "content" => user_prompt))
    return conversation
end

"Builds the request to the API and waits for the response."
function aigenerate(template::PromptTemplate;
        api_key::String = API_KEY,
        model::String = MODEL_CHAT,
        # Let's use smart defaults because OpenAI is a bit fiddly...
        http_kwargs::NamedTuple = (;
            retry_non_idempotent = true,
            retries = 10,
            readtimeout = 30), api_kwargs::NamedTuple = NamedTuple(),
        kwargs...)
    ##
    conversation = render(template; kwargs...)
    time = @elapsed r = create_chat(api_key,
        model,
        conversation;
        http_kwargs,
        api_kwargs...)
    return AIMessage(; content = r.response[:choices][begin][:message][:content] |> strip,
        status = Int(r.status),
        tokens = (r.response[:usage][:prompt_tokens],
            r.response[:usage][:completion_tokens]),
        elapsed = time)
end
"Creates embeddings for `docs` (string or array of strings) and returns a normalized matrix (column-wise)"
function aiembed(docs::Union{AbstractString, Vector{<:AbstractString}},
        postprocess::F = normalize;
        api_key::String = API_KEY,
        model::String = MODEL_EMBEDDING,
        http_kwargs::NamedTuple = NamedTuple(), api_kwargs::NamedTuple = NamedTuple(),
        kwargs...) where {F <: Function}
    r = create_embeddings(api_key, docs, model; http_kwargs, api_kwargs...)
    return mapreduce(x -> postprocess(x[:embedding]), hcat, r.response.data)
end

function aiembed_not_norm(docs::Union{AbstractString, Vector{<:AbstractString}},
    api_key::String = API_KEY,
    model::String = MODEL_EMBEDDING,
    http_kwargs::NamedTuple = NamedTuple(), api_kwargs::NamedTuple = NamedTuple(),
    kwargs...)
r = create_embeddings(api_key, docs, model; http_kwargs, api_kwargs...)
return hcat(r.response.data...)
end

"Finds the indices of chunks (represented by embeddings in `emb`) that are closest (cosine similarity) to query embedding (`query_emb`). Returns only `top_k` closest indices."
function find_closest(emb::AbstractMatrix{<:Real},
        query_emb::AbstractVector{<:Real};
        top_k::Int = 100)
    query_emb' * emb |> vec |> sortperm |> reverse |> x -> first(x, top_k)
end
function find_closest(index::ChunkIndex, query_emb::AbstractVector{<:Real}; top_k::Int=100)
    find_closest(index.embeddings, query_emb; top_k)
end

find_closest (generic function with 2 methods)

In [14]:
emb = aiembed_not_norm("Hello, how are you?")

1×1 Matrix{JSON3.Object{Vector{UInt8}, SubArray{UInt64, 1, Vector{UInt64}, Tuple{UnitRange{Int64}}, true}}}:
 {
      "object": "embedding",
       "index": 0,
   "embedding": [
                  -0.008593396,
                  -0.0006966989,
                  0.0034827178,
                  -0.033106014,
                  -0.012054367,
                  0.019125434,
                  -0.0093266005,
                  -0.009270678,
                  -0.017447764,
                  -0.010494756,
                  0.031515334,
                  0.010805435,
                  -0.01688854,
                  -0.008549902,
                  0.00771728,
                  -0.016180191,
                  0.026494753,
                  -0.0075743673,
                  0.029676111,
                  -0.012986405,
                  -0.021399608,
                  0.0033615527,
                  0.018205822,
                  -0.0022943686,
                  0.002022524,
                  -0.0105320

In [15]:
emb = aiembed("Turn me into numbers.")

1536-element Vector{Float64}:
 -0.03651858529992288
 -0.02723107922364567
  0.010464890085946919
 -0.009537455078330004
 -0.007840446064392667
  0.009938686081625268
 -0.011490989094374151
 -0.013069602107339115
 -0.018956514155687637
 -0.0328351562696713
  ⋮
 -0.01949587316011733
  0.01724634914164225
 -0.018838119154715272
  0.013733936112795213
 -0.002927012024039208
 -0.008386383068876384
  0.007860178064554725
  0.0005303154043554185
 -0.027388940224942166

In [16]:
template = PromptTemplate(system_prompt = "You are a knowledgeable assistant.", 
                          user_prompt = "How do I create a DataFrame with {{package}} in Julia?")

PromptTemplate
  system_prompt: String "You are a knowledgeable assistant."
  user_prompt: String "How do I create a DataFrame with {{package}} in Julia?"


In [17]:
rendered_prompt = render(template, package = "DataFrames.jl")

2-element Vector{Dict{String, String}}:
 Dict("role" => "system", "content" => "You are a knowledgeable assistant.")
 Dict("role" => "user", "content" => "How do I create a DataFrame with DataFrames.jl in Julia?")

In [18]:
msg = aigenerate(PromptTemplate(;user_prompt="Say hi five times."))
println(msg.content)

1. Hi!
2. Hi there!
3. Hi, how are you?
4. Hi, good to see you!
5. Hi, what's new?


In [19]:
emb = aiembed("Turn me into numbers.")

1536-element Vector{Float64}:
 -0.03651858529992288
 -0.02723107922364567
  0.010464890085946919
 -0.009537455078330004
 -0.007840446064392667
  0.009938686081625268
 -0.011490989094374151
 -0.013069602107339115
 -0.018956514155687637
 -0.0328351562696713
  ⋮
 -0.01949587316011733
  0.01724634914164225
 -0.018838119154715272
  0.013733936112795213
 -0.002927012024039208
 -0.008386383068876384
  0.007860178064554725
  0.0005303154043554185
 -0.027388940224942166

In [20]:
sum(emb .* emb) == 1.0 #similarity to itself is exact, so it should be 1.0
# it's the same as dot(emb, emb), because the above is just a "dot product"

false

In [21]:
emb2 = aiembed("I like Cauliflower, but it must be grilled.")

1536-element Vector{Float64}:
 -0.00442583357894998
 -0.01989252335485164
 -0.0010416051185806133
 -0.025941268462751833
 -0.004650447082956735
  0.007535622134423764
 -0.0221196773945806
 -0.03540666863159984
  0.0032553128580697123
 -0.018665059332955595
  ⋮
 -0.015172479270653403
  0.006934544123701469
 -0.025207320449659343
 -0.011477429204739466
 -0.011964619213430176
 -0.008839012157674214
  0.011306597201692088
 -0.004732699684423993
 -0.017399631310382326

In [22]:
# similarity between two different texts
sum(emb .* emb2) # 0.73 --> smaller than 1.0 because it's less similar

0.7332274421932716

In [23]:
"Splits `doc` into text chunks of size at most `max_size` (in characters), ie, it accumulates smaller chunks to match the desired size"
function build_chunks(doc::AbstractString;
        max_size::Int = 128,
        split_pattern::Union{String, Regex} = r"\n|\. ",
        join_key::String = "\n")
    ## shortcut if doc is too short
    length(doc) < max_size && return [doc]
    ## proceed
    texts = split(doc, split_pattern)
    doc_chunks = Vector{eltype(texts)}()
    start, counter = 1, 0
    # accumulate chunks until we reach the max size
    for i in eachindex(texts)
        l = length(texts[i])
        # if it doesn't fit, we push all preceeding docs, reset the counter and start a new chunk
        if l == 0 || (counter + l >= max_size)
            push!(doc_chunks, join(texts[start:max(i - 1, 1)], join_key))
            start = i # current text becomes the next chunk
            counter = 0
        end
        counter += l
    end
    # last chunk is never pushed in, so we need to do it manually
    push!(doc_chunks, join(texts[start:end], join_key))
    return doc_chunks
end

build_chunks

In [24]:
doc = """
One wisdom.


Second wisdom.


Third wisdom.
"""
build_chunks(doc; max_size = 20)
# Output:
# 6-element Vector{SubString{String}}:
#  "One wisdom."
#  ""
#  "\nSecond wisdom."
#  ""
#  "\nThird wisdom."
#  ""

6-element Vector{SubString{String}}:
 "One wisdom."
 ""
 "\nSecond wisdom."
 ""
 "\nThird wisdom."
 ""

In [25]:
Pkg.add("DataFrames")

[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.10/Project.toml`
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.10/Manifest.toml`


In [26]:
using DataFrames

In [27]:
dir_raw = joinpath("documents") # this is where the files are saved
files = ["comparison_with_python.txt", "database_style_joins.txt", "what_is_dataframes.txt"]
labels = ["DataFrames-Python", "DataFrames-Joins", "DataFrames-WhatIs"]

indices = ChunkIndex[]
for (fn, lbl) in zip(files, labels)
    doc_raw = read(joinpath(dir_raw, fn), String)
    # split into chunks, if you want to start simple - just do `split(text,"\n\n")`
    doc_chunks = build_chunks(doc_raw; max_size = 256, split_pattern = "\n\n") |>
                 texts -> mapreduce(x -> build_chunks(x;
            max_size = 256,
            split_pattern = r"\n|\. "),
        vcat,
        texts)
    # Notice that we embed all doc_chunks at once, not one by one
    # OpenAI supports embedding multiple documents if they are short enough, ie, it's only because the documentation pages are small
    embeddings = aiembed(doc_chunks) .|> Float32
    index = ChunkIndex(;
        embeddings,
        chunks = doc_chunks,
        sources = fill(lbl, length(doc_chunks)))
    push!(indices, index)
end
index = reduce(vcat, indices) # combine it across several documents

ChunkIndex{SubString{String}}
  id: Symbol ##ChunkIndex#229
  embeddings: Array{Float32}((1536, 228)) Float32[-0.016311117 -0.01765479 … -0.031875867 -0.0243088; 0.01046924 -0.00164377 … -0.007595862 -0.008313151; … ; -0.03530909 -0.0263283 … -0.022456719 0.007854495; -0.024453105 -0.06127415 … -0.018289203 -0.032220628]
  chunks: Array{SubString{String}}((228,))
  sources: Array{String}((228,))


In [28]:
# Define a template for our RAG system
rag_template = PromptTemplate(;
    system_prompt = "Act as a world-class AI assistant and an expert in Julia language. Answer the question based only on the provided context. Be brief and concise.",
    user_prompt = """
      # Context

      {{context}}

      # Question

      {{question}}

      # Answer
      """)

# user question
question = "I like dplyr, what is the equivalent in Julia?"
question_emb = aiembed(question)

# Build the context of similar docs -- take the top 3 closest chunks
idxs = find_closest(index, question_emb; top_k = 3)

# We add 2 chunks before and after each of the closest chunk
close_chunks = [join(index.chunks[max(begin, i - 2):min(end, i + 2)], "\n")
                for i in idxs]
answer = aigenerate(rag_template;
    question,
    context = join(close_chunks, "\n\n"))
println(answer.content)

If you are familiar with `dplyr` in R and looking for an equivalent in Julia, you may find the `DataFramesMeta.jl` package helpful. DataFramesMeta.jl provides convenience syntax similar to dplyr for data manipulation tasks in Julia.


In [29]:
"RAG wrapper that answers the given question and inject the context if needed from `index`"
function airag(index::ChunkIndex, rag_template::PromptTemplate;
        question::AbstractString, top_k::Int = 3, kwargs...)
    question_emb = aiembed(question;)

    idxs = find_closest(index, question_emb; top_k)
    # We add 2 chunks before and after each of the closest chunk
    close_chunks = [join(index.chunks[max(begin, i - 2):min(end, i + 2)], "\n")
                    for i in idxs]
    return aigenerate(rag_template;
        question,
        context = join(close_chunks, "\n\n"),
        kwargs...)
end

airag

In [30]:
question = "I like dplyr, what is the equivalent in Julia?"
answer = airag(index, rag_template; question)

AIMessage
  content: SubString{String}
  status: Int64 200
  tokens: Tuple{Int64, Int64}
  elapsed: Float64 2.139417333


In [31]:
answer

AIMessage
  content: SubString{String}
  status: Int64 200
  tokens: Tuple{Int64, Int64}
  elapsed: Float64 2.139417333
