In [1]:
!pip install python-dotenv openai langchain faiss-cpu langchain-community ollama chromadb tqdm

%load_ext dotenv
%dotenv



In [2]:
import os

%reload_ext dotenv

if 'OPENAI_API_KEY' not in os.environ: print('`OPENAI_API_KEY` environment variable is missing.')
if 'OPENAI_API_BASE_URL' not in os.environ: print('`OPENAI_API_BASE_URL` environment variable is missing.')

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_API_BASE_URL = os.environ.get("OPENAI_API_BASE_URL")

from openai import AzureOpenAI
def get_client(base_url=OPENAI_API_BASE_URL, api_key=OPENAI_API_KEY):
    client = AzureOpenAI(
        api_key=api_key,
        api_version="2023-03-15-preview",
        base_url=base_url
    )
    return client

In [3]:
import glob
files = glob.glob('./output/contents/*')
print(f"{len(files)} files")
files[0]

322 files


'./output/contents/https-rust-code-maven-com-multi-crate-project'

In [4]:
import json

scraped_files = []
for filepath in files:
    article = json.loads(open(filepath, 'r').read())
    filename = filepath.split('/')[-1]
    scraped_files.append((filename, article))

len(scraped_files)

322

In [5]:
print(scraped_files[0][1])

{'title': 'Rust Multi-crate project in a monorepo', 'link': 'https://rust.code-maven.com/multi-crate-project', 'content': "Rust Multi-crate project in a monorepo\nAs your project growth at one point your might feel that splitting the code into multiple crates could be a good idea. It might make the\ncode cleaner and more reusable across projects. You might even decide to publish some of these crates separately.\nThere are at least two ways to manage this. One is to put each crate in its own repository. It has the advantage of making them totally\nindependent, but it will create some extra overhead.\nThe alternative is to create workspaces. Though the Rust documentation does not\nmention this, this is similar to the idea of using monorepo.\nLet's see how we go about doing this.\nThe Cargo.toml file looks like this:\nWe edit the src/main.rs file to contain:\nAt this point we can cargo run or cargo test`. There is nothing special about it.\nThis is how the directory tree looks like:\nCrat

In [6]:

import json

CLEANED_DIR = './output/contents'
def try_get_cleaned_content(fname):
    fpath = os.path.join(CLEANED_DIR, fname)
    if not os.path.exists(fpath):
        return None
    return json.loads(open(fpath, 'r').read())

EMBEDDINGS_DIR = "./output/embeddings"
def store_embeddings(fname, embeddings):
    fpath = os.path.join(EMBEDDINGS_DIR, fname)
    import json
    open(fpath, 'w').write(json.dumps(embeddings))

def get_embedding(text, client, model="text-embedding-3-small"):
   text = text.replace("\n", " ")
   return client.embeddings.create(input = [text], model=model).data[0].embedding

import ollama
def get_embedding_ollama(text, model="mxbai-embed-large"):
    return ollama.embeddings(
        model=model,
        prompt=text,
    )

def split_text_into_chunks(text):
    from langchain.text_splitter import CharacterTextSplitter
    text_splitter = CharacterTextSplitter(
        separator='\n',
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len
    )
    chunks = text_splitter.split_text(text)
    return chunks

import chromadb
def get_chromadb_client():
    return chromadb.Client()

In [7]:
from tqdm import tqdm

chromadb_client = get_chromadb_client()
chromadb_collection = None
collection_name = 'rust-walkthrough-articles'
try:
    chromadb_collection = chromadb_client.create_collection(collection_name)
except:
    chromadb_collection = chromadb_client.get_collection(collection_name)

# embeddings - generate and store
for (fname, article) in tqdm(scraped_files[:100]):
    maybe_cleaned_article = try_get_cleaned_content(fname)
    if maybe_cleaned_article is None:
        continue
    article = maybe_cleaned_article
    text_chunks = split_text_into_chunks(article['content'])
    chunked_embeddings = [(fname, i, get_embedding_ollama(chunk)['embedding']) for (i, chunk) in enumerate(text_chunks)]
    chromadb_collection.add(
        ids=[f"{fname}-{i}" for (fname, i, _) in chunked_embeddings],
        embeddings=[embedding for (_, _, embedding) in chunked_embeddings],
        documents = text_chunks,
        metadatas=[{'title': article['title'], 'link': article['link']} for _ in text_chunks]
    )

  9%|▉         | 9/100 [00:06<01:01,  1.49it/s]Created a chunk of size 1067, which is longer than the specified 1000
Created a chunk of size 1067, which is longer than the specified 1000
Created a chunk of size 1067, which is longer than the specified 1000
Created a chunk of size 1067, which is longer than the specified 1000
 21%|██        | 21/100 [00:16<01:27,  1.10s/it]Created a chunk of size 1130, which is longer than the specified 1000
Created a chunk of size 1419, which is longer than the specified 1000
Created a chunk of size 2735, which is longer than the specified 1000
 27%|██▋       | 27/100 [00:25<01:38,  1.35s/it]Created a chunk of size 1359, which is longer than the specified 1000
Created a chunk of size 1097, which is longer than the specified 1000
Created a chunk of size 1089, which is longer than the specified 1000
Created a chunk of size 1122, which is longer than the specified 1000
 32%|███▏      | 32/100 [00:33<01:31,  1.35s/it]Created a chunk of size 1030, which is 

In [16]:
query = get_embedding_ollama('distributed systems')['embedding']
res = chromadb_collection.query(query)
[(id, meta, doc) for (id, meta, doc) in zip(res['ids'][0], res['metadatas'][0], res['documents'][0])]

[('https-manuel-bernhardt-io-posts-2024-01-26-rust-fearless-concurrency-cats-raspberry-pi-2',
  {'link': 'https://manuel.bernhardt.io/posts/2024-01-26-rust-fearless-concurrency-cats-raspberry-pi/',
   'title': 'Fearless concurrency with Rust, cats, and a few Raspberry PIs'},
  'Let’s go!\nAny distributed system needs the following building blocks in order to operate:\ndiscovery: a way to discover other member nodes\ntransport: a way to communicate (the pipes)\nprotocol: a common language to understand each other\nconsensus: a way to agree on the state of things\nAt first, this definition may look somewhat elaborate, so let’s take some time and talk about these different aspects to demystify them.\nIn order to talk to other nodes on the network, we need to know about them. There can be various degrees of discovery in a distributed system, ranging from a simple, hard-coded list of IP addresses on each node to full-fledged group membership (the foundation of clustered applications). If yo

In [68]:
#response = get_client().chat.completions.create(
#  model="gpt-35-turbo", # replace this value with the deployment name you chose when you deployed the associated model.
#  messages = [{
#      "role": "system",
#      "content": ""
#  },
#  {
#      "role": "user",
#      "content": "",
#  }],
#  temperature=2,
#  top_p=0.95,
#  frequency_penalty=0,
#  presence_penalty=0,
#  stop=None)
#
#print(response.choices[0].message.content)
