In [None]:
%pip install chromadb
%pip install openai
%pip install matplotlib
%pip install open-clip-torch
%pip install langchain-text-splitters
%pip install -U langchain-community pypdf
%pip install torch
%pip install tiktoken

## Image Similarity Search using CLIP Model
### Setup

In [None]:
import chromadb
from chromadb.utils.embedding_functions import OpenCLIPEmbeddingFunction
from chromadb.utils.data_loaders import ImageLoader
import os
import torch

# Where Chroma persists its data, and the folder holding the images to index.
CHROMA_PATH='./chroma_db'
DATA_PATH='' # images path

chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)

# Run the CLIP embedding model on the GPU when one is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
multimodal_clip_collection = chroma_client.get_or_create_collection(
    name='multimodal',
    embedding_function=OpenCLIPEmbeddingFunction(device=device),
    data_loader=ImageLoader()
)

# os.listdir returns entries in arbitrary order; sorting keeps the index-based
# ids below attached to the same files from run to run. Hidden entries
# (e.g. ".DS_Store") and sub-directories are skipped because they are not images.
image_paths = sorted(
    p for p in os.listdir(DATA_PATH)
    if not p.startswith('.') and os.path.isfile(os.path.join(DATA_PATH, p))
)

if image_paths:
    # upsert (rather than add) keeps this cell re-runnable against the
    # persistent collection: ids that already exist are overwritten with the
    # current file instead of colliding with the earlier run.
    multimodal_clip_collection.upsert(
        ids=[str(i) for i in range(len(image_paths))],
        uris=[os.path.join(DATA_PATH, p) for p in image_paths]
    )
else:
    print(f"No image files found in {DATA_PATH!r}; nothing was indexed.")

print(multimodal_clip_collection.count())

### Querying

In [None]:
import pandas as pd
from IPython.display import display, HTML

def print_results(queries, query_res):
    """Render the matches of each query as an HTML table with inline images.

    Args:
        queries: Query strings, in the same order they were sent to the
            collection.
        query_res: Query result mapping. ``query_res['ids'][i]``,
            ``query_res['distances'][i]`` and ``query_res['uris'][i]`` must
            hold the matches for ``queries[i]``.

    The query text, ids and URIs are HTML-escaped before being placed in the
    markup, so characters such as ``<``, ``&`` or ``"`` in a query or file
    name cannot break the rendered output.
    """
    import html  # local import: the cell's import lines do not provide it

    def image_display(uri):
        # quote=True because the value sits inside a double-quoted attribute.
        return f'<img src="{html.escape(str(uri), quote=True)}"/>'

    def text_display(value):
        return html.escape(str(value))

    for i, query in enumerate(queries):
        display(HTML(f"<h3>Results for '{html.escape(str(query))}'</h3>"))

        rows = [
            {
                "Id": id,
                "Distance": dist,
                "URI": uri,
                "Image": uri
            }
            for id, dist, uri in zip(
                query_res['ids'][i],
                query_res['distances'][i],
                query_res['uris'][i]
            )
        ]
        # Rows are shown in the reverse of the order the query returned them.
        rows.reverse()
        df = pd.DataFrame(rows)

        with pd.option_context('display.max_colwidth', None):
            # escape=False is required so the <img> tag is rendered; the
            # formatters take over escaping for the text columns.
            display(HTML(df.to_html(
                escape=False,
                formatters={
                    'Id': text_display,
                    'URI': text_display,
                    'Image': image_display,
                }
            )))

# Text prompts to search the image collection with (fill in before running)
query_texts = []

# Ask the collection for up to five results per prompt
query_res = multimodal_clip_collection.query(
    include=['distances', 'data', 'uris'],
    n_results=5,
    query_texts=query_texts,
)

# Render one result table per prompt
print_results(query_texts, query_res)

## Implement a Retrieval-Augmented Generation (RAG) Pipeline with LangChain, ChromaDB, and OpenAI
![RAG pipeline diagram](./RAG_diagram.svg.png)

Image source: https://en.wikipedia.org/wiki/Retrieval-augmented_generation

### Chunk document with LangChain

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
import os

# Folder that is scanned for PDF files (placeholder: set it before running).
# NOTE: this rebinds DATA_PATH, which the image-search setup cell also assigns
# for its image folder; re-run that cell's assignment before re-running it.
DATA_PATH='' # path to documents

def load_and_chunk_pdfs(folder_path, chunk_size=500, chunk_overlap=100):
    """Load every PDF directly inside ``folder_path`` and split it into chunks.

    Args:
        folder_path: Directory to scan. Only its direct entries are
            considered; files are matched on a case-insensitive ``.pdf``
            suffix.
        chunk_size: ``chunk_size`` passed to the text splitter.
        chunk_overlap: ``chunk_overlap`` passed to the text splitter.

    Returns:
        A single list with the chunks of all PDFs, as returned by the
        splitter's ``split_documents``. Files are processed in sorted name
        order so the result is stable from run to run.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", " ", ""]
    )

    chunks_all_docs = []

    # os.listdir returns entries in arbitrary order; sorting makes the chunk
    # order (and any ids derived from chunk positions) reproducible.
    for filename in sorted(os.listdir(folder_path)):
        # Case-insensitive match so files named "REPORT.PDF" are not skipped.
        if not filename.lower().endswith(".pdf"):
            continue

        loader = PyPDFLoader(os.path.join(folder_path, filename))
        pages = loader.load()
        chunks_all_docs.extend(splitter.split_documents(pages))

    return chunks_all_docs

# Load every PDF under DATA_PATH and split it into chunks
chunks = load_and_chunk_pdfs(DATA_PATH)

# Total chunks, first and last chunk data
print("Chunks:", len(chunks))
if chunks:
    print(chunks[0].metadata)
    print(chunks[-1].metadata)
else:
    # Indexing an empty list would raise IndexError; report the cause instead.
    print(f"No PDF chunks were produced from {DATA_PATH!r}.")



### Create embeddings and store them in ChromaDB

In [None]:
from openai import OpenAI
import os
import time
# The API key is read from the environment so it never appears in the notebook.
OPENAI_KEY=os.environ['OPENAI_KEY']
openai_client = OpenAI(api_key=OPENAI_KEY)

EMBEDDING_MODEL = "text-embedding-3-small"
# Number of chunks sent per embedding request (and per write to the collection).
BATCH_SIZE=50

documents = [chunk.page_content for chunk in chunks]
metadatas = [chunk.metadata for chunk in chunks]
ids = [str(i) for i in range(len(chunks))]

rag_collection = chroma_client.get_or_create_collection('pdf-rag')

# Post request to openai, use embedding model
embeddings = []
for start in range(0, len(documents), BATCH_SIZE):
    stop = start + BATCH_SIZE

    resp = openai_client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=documents[start:stop]
    )
    # Order by the reported index so each vector is paired with its own chunk.
    batch_embeddings = [
        item.embedding for item in sorted(resp.data, key=lambda item: item.index)
    ]
    embeddings += batch_embeddings

    # Write batch by batch: a single call with every record can exceed the
    # number of records Chroma accepts per call. upsert (rather than add)
    # keeps the cell re-runnable against the persistent collection, because
    # existing ids are overwritten instead of colliding. With no documents
    # the loop body never runs, so nothing is written and nothing fails.
    rag_collection.upsert(
        ids=ids[start:stop],
        documents=documents[start:stop],
        metadatas=metadatas[start:stop],
        embeddings=batch_embeddings
    )
    # Short pause between requests to stay gentle on the API rate limit.
    time.sleep(0.05)
    

### Querying

In [None]:
# Query text to try: the question to answer. It is embedded for retrieval and
# repeated verbatim in the chat prompt further down this cell.
# Placeholder: fill it in before running.
query_text = ''

def get_query_embedding(query_text):
    """Embed ``query_text`` with the ``text-embedding-3-small`` model.

    Sends one embeddings request through ``openai_client`` and returns the
    ``embedding`` of the first item in the response's ``data``.
    """
    embedding_response = openai_client.embeddings.create(
        input=query_text,
        model="text-embedding-3-small",
    )
    first_item = embedding_response.data[0]
    return first_item.embedding

# Retrieve stored chunks for the embedded query
query_embedding = get_query_embedding(query_text)
results = rag_collection.query(
    include=["documents", "metadatas", "distances"],
    query_embeddings=[query_embedding],
)

# Only one query was sent, so its matches are at index 0
retrieved_texts = list(results['documents'][0])
retrieved_metadatas = list(results['metadatas'][0])

# One labelled block per retrieved chunk, separated by blank lines
context_parts = [
    f"Source: {md['source']}\n Page: {md['page']}\nText: {text}"
    for text, md in zip(retrieved_texts, retrieved_metadatas)
]
context = "\n\n".join(context_parts)

# Ask the chat model to answer from the retrieved context only
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": f"Answer this question using only the following context:\n{context}\nQuestion: {query_text}"},
]
response = openai_client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=messages,
    temperature=0,
)

content = response.choices[0].message.content
print(content)
