[Reference](https://levelup.gitconnected.com/building-vector-databases-with-fastapi-and-chromadb-0a1cd96fab08)

In [1]:
from pydantic import BaseModel

# Declare a data model by subclassing pydantic's BaseModel
class User(BaseModel):
    user: str
    age: int

# age is passed as the string "21"; the printed output below shows it stored as the int 21
user = User(user="Om",age="21")
print(user)

user='Om' age=21


# Setting up FastAPI

In [3]:
pip install fastapi

Collecting fastapi
  Downloading fastapi-0.111.0-py3-none-any.whl (91 kB)
Collecting starlette<0.38.0,>=0.37.2 (from fastapi)
  Downloading starlette-0.37.2-py3-none-any.whl (71 kB)
Collecting fastapi-cli>=0.0.2 (from fastapi)
  Downloading fastapi_cli-0.0.4-py3-none-any.whl (9.5 kB)
Collecting httpx>=0.23.0 (from fastapi)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
Collecting python-multipart>=0.0.7 (from fastapi)
  Downloading python_multipart-0.0.9-py3-none-any.whl (22 kB)
Collecting ujson!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0,>=4.0.1 (from fastapi)
  Downloading ujson-5.10.0-cp310-cp310-manylinux_2_17_x8

In [4]:
from fastapi import FastAPI

app = FastAPI()


@app.get("/")
async def root():
    """Handle GET / by returning a fixed placeholder message."""
    payload = {"message": "Whatchamacallit"}
    return payload

# Building a real-world API using FastAPI

## Chunking the PDF Document using Langchain

In [5]:
from langchain_community.document_loaders import PyPDFLoader

# Point the loader at the sample PDF (path is relative to the working directory).
loader = PyPDFLoader("files/samples.pdf")
# NOTE(review): presumably yields one Document per PDF page — confirm against the PyPDFLoader docs.
pages = loader.load()

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded pages into overlapping chunks; chunk length is measured with len().
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=False,
)
chunks = text_splitter.split_documents(pages)

## Generating sentence embeddings for the chunks using an open-source embedding model

In [9]:
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)

# Embedding function backed by the "all-MiniLM-L6-v2" model; it is handed to Chroma in the cells below.
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

## Uploading the chunk embeddings to the vector database

In [10]:
from langchain_chroma import Chroma

# One string id per chunk, numbered from "1". The ids list must line up
# one-to-one with the documents being stored, so the chunks (not the unsplit
# pages) are what gets embedded and written to the "chroma_db" directory.
ids = [str(i) for i in range(1, len(chunks) + 1)]
Chroma.from_documents(chunks, embedding_function, persist_directory="chroma_db", ids=ids)

## Fetching the nearest neighbouring chunks to the user query using similarity search

In [11]:
# Re-create the embedding function with the same model name used when the store was built.
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
# Open the store persisted under "chroma_db".
db = Chroma(persist_directory="chroma_db",embedding_function=embedding_function)
# NOTE(review): `query` is not defined by any earlier cell in this notebook. It appears to
# stand for a request object exposing `.query` (search text) and `.neighbours` (result count),
# as in the main.py endpoint — define one before running this cell.
results = db.similarity_search(query.query, k=query.neighbours)

## Deleting the database

In [12]:
# Imported here because no earlier cell brings os/shutil into scope.
import os
import shutil

# isdir() tests the single path directly and ignores a plain file that merely
# shares the name (shutil.rmtree would fail on such a file).
if os.path.isdir("chroma_db"):
    shutil.rmtree("chroma_db")
    print("Deleted database and its contents.")
else:
    raise FileNotFoundError("Database not found.")

# Create endpoints for the functions in FastAPI

```
functions.py
```

In [13]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_chroma import Chroma
import warnings
import shutil
import os

warnings.filterwarnings('ignore')

#Creating the database
def create_db():
    """Build the persisted Chroma store from ``files/samples.pdf``.

    The PDF is loaded and split with a target chunk size of 1000 characters
    and a 100-character overlap. The chunks are embedded with the
    ``all-MiniLM-L6-v2`` model and stored under the ``chroma_db`` directory
    with string ids ``"1"`` .. ``"N"``. The number of chunks is printed.
    """
    loader = PyPDFLoader("files/samples.pdf")
    pages = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,
    )
    chunks = text_splitter.split_documents(pages)
    print(len(chunks))

    # One id per chunk, numbered from "1".
    ids = [str(i) for i in range(1, len(chunks) + 1)]

    # create the open-source embedding function
    embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

    # Store the chunks, not the unsplit pages: the ids list has one entry per
    # chunk, so the documents passed in must be the chunks for them to line up.
    Chroma.from_documents(chunks, embedding_function, persist_directory="chroma_db", ids=ids)


#Deleting the database
def delete_persisted_db():
    """Delete the ``chroma_db`` directory in the current working directory.

    Prints a confirmation message after the directory tree is removed.

    Raises:
        FileNotFoundError: if there is no ``chroma_db`` directory in the
            current working directory. A regular file with that name does
            not count as the database and is left untouched.
    """
    db_dir = "chroma_db"
    # isdir() tests the single path directly instead of listing the whole
    # working directory, and it rejects a plain file that merely shares the
    # name (shutil.rmtree would fail on it with NotADirectoryError).
    if not os.path.isdir(db_dir):
        raise FileNotFoundError("Database not found.")
    shutil.rmtree(db_dir)
    print("Deleted database and its contents.")

```
main.py
```

In [14]:
from fastapi import FastAPI, HTTPException
from models import Query
from langchain_chroma import Chroma
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from functions import create_db, delete_persisted_db

app = FastAPI()


@app.get("/")
async def root():
    """Handle GET / by returning a fixed placeholder message."""
    payload = {"message": "Whatchamacallit"}
    return payload

#Create database
@app.get("/create/")
async def create_database():
    """Run ``create_db`` and report that the database was created."""
    create_db()
    outcome = {"message": "Database created."}
    return outcome

#Delete database
@app.delete("/delete/")
async def delete_database():
    """Delete the persisted database via ``delete_persisted_db``.

    Raises:
        HTTPException: with status 404 and the underlying message when
            ``delete_persisted_db`` raises ``FileNotFoundError``.
    """
    try:
        delete_persisted_db()
    except FileNotFoundError as missing:
        raise HTTPException(status_code=404, detail=str(missing))
    return {"message": "Database deleted."}

#Fetch Chunks
@app.post("/neighbours/")
async def fetch_item(query: Query):
    """Run a similarity search for ``query.query`` against the ``chroma_db`` store.

    A new embedding function and Chroma handle are created on every call;
    ``query.neighbours`` is passed as ``k`` to the search. The search results
    are returned under the ``"results"`` key.
    """
    store = Chroma(
        persist_directory="chroma_db",
        embedding_function=SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2"),
    )
    matches = store.similarity_search(query.query, k=query.neighbours)
    return {"message": "Nearest neighbours found.", "results": matches}

```
models.py
```

In [15]:
# Create a basic model for the FastAPI

from pydantic import BaseModel

class Query(BaseModel):
    """Request body for a similarity search."""
    # Text to search for.
    query: str
    # Number of results to request; defaults to 3 when omitted.
    neighbours: int = 3