In [None]:
!pip install python-dotenv openai langchain faiss-cpu langchain-community ollama chromadb

%load_ext dotenv
%dotenv

In [19]:
import os

%reload_ext dotenv

if 'OPENAI_API_KEY' not in os.environ: print('`OPENAI_API_KEY` environment variable is missing.')
if 'OPENAI_API_BASE_URL' not in os.environ: print('`OPENAI_API_BASE_URL` environment variable is missing.')

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_API_BASE_URL = os.environ.get("OPENAI_API_BASE_URL")

from openai import AzureOpenAI
def get_client(base_url=OPENAI_API_BASE_URL, api_key=OPENAI_API_KEY):
    client = AzureOpenAI(
        api_key=api_key,
        api_version="2023-03-15-preview",
        base_url=base_url
    )
    return client

In [70]:
import glob
files = glob.glob('./output/contents/*')
files[0]

'./output/contents/https-blog-burntsushi-net-transducers'

In [71]:
scraped_files = []
for filepath in files:
    content = '\n'.join([
        line.strip()
        for line
            in open(filepath, 'r').read().strip().split('\n')
        if len(line.strip())
    ])
    filename = filepath.split('/')[-1]
    scraped_files.append((filename, content))

len(scraped_files)

71

In [74]:
print(scraped_files[0][1][:100])

Index 1,600,000,000 Keys with Automata and Rust - Andrew Gallant's Blog
It turns out that finite sta


In [32]:

CLEANED_DIR = './output/contents'
def try_get_cleaned_content(fname):
    fpath = os.path.join(CLEANED_DIR, fname)
    if not os.path.exists(fpath):
        return None
    return open(fpath, 'r').read()

EMBEDDINGS_DIR = "./output/embeddings"
def store_embeddings(fname, embeddings):
    fpath = os.path.join(EMBEDDINGS_DIR, fname)
    import json
    open(fpath, 'w').write(json.dumps(embeddings))

def get_embedding(text, client, model="text-embedding-3-small"):
   text = text.replace("\n", " ")
   return client.embeddings.create(input = [text], model=model).data[0].embedding

import ollama
def get_embedding_ollama(text, model="mxbai-embed-large"):
    return ollama.embeddings(
        model=model,
        prompt=text,
    )

def split_text_into_chunks(text):
    from langchain.text_splitter import CharacterTextSplitter
    text_splitter = CharacterTextSplitter(
        separator='\n',
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len
    )
    chunks = text_splitter.split_text(text)
    return chunks

import chromadb
def get_chromadb_client():
    return chromadb.Client()

In [None]:
chromadb_client = get_chromadb_client()
chromadb_collection = None
collection_name = 'rust-walkthrough-articles'
try:
    chromadb_collection = chromadb_client.create_collection(collection_name)
except:
    chromadb_collection = chromadb_client.get_collection(collection_name)

# embeddings - generate and store
for (fname, content) in scraped_files[:10]:
    maybe_cleaned_content = try_get_cleaned_content(fname)
    if maybe_cleaned_content is None:
        continue
    content = maybe_cleaned_content
    text_chunks = split_text_into_chunks(content)
    chunked_embeddings = [(fname, i, get_embedding_ollama(chunk)['embedding']) for (i, chunk) in enumerate(text_chunks)]
    chromadb_collection.add(
        ids=[f"{fname}-{i}" for (fname, i, _) in chunked_embeddings],
        embeddings=[embedding for (_, _, embedding) in chunked_embeddings],
        documents = text_chunks,
    )

In [67]:
query = get_embedding_ollama('distributed systems')['embedding']
chromadb_collection.query(query)

{'ids': [['http-laurocaetano-com-programming-2021-01-23-raft-leader-election-rust-0',
   'http-laurocaetano-com-programming-2021-01-23-raft-leader-election-rust-4',
   'https-blog-tarkalabs-com-how-to-build-a-web-application-using-rust-part-iii-ed6511ebaa97-1',
   'http-laurocaetano-com-programming-2021-01-23-raft-leader-election-rust-1',
   'https-www-diegofreijo-com-blog-rlox-vm-a-lox-interpreter-in-rust-part-1-11',
   'https-konghq-com-blog-writing-an-ebpf-xdp-load-balancer-in-rust-22',
   'http-laurocaetano-com-programming-2021-01-23-raft-leader-election-rust-7',
   'https-konghq-com-blog-writing-an-ebpf-xdp-load-balancer-in-rust-5',
   'https-konghq-com-blog-writing-an-ebpf-xdp-load-balancer-in-rust-6',
   'https-onevariable-com-blog-phase-locked-state-machines-2']],
 'distances': [[221.2075958251953,
   226.83206176757812,
   230.14593505859375,
   230.21128845214844,
   230.98770141601562,
   235.8857879638672,
   238.7687530517578,
   240.87452697753906,
   241.87466430664062,


In [68]:
#response = get_client().chat.completions.create(
#  model="gpt-35-turbo", # replace this value with the deployment name you chose when you deployed the associated model.
#  messages = [{
#      "role": "system",
#      "content": ""
#  },
#  {
#      "role": "user",
#      "content": "",
#  }],
#  temperature=2,
#  top_p=0.95,
#  frequency_penalty=0,
#  presence_penalty=0,
#  stop=None)
#
#print(response.choices[0].message.content)
