# 1 Imports

In [1]:
import pandas as pd
import numpy as np
import pypdf
import os
import requests

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

In [2]:
from dotenv import load_dotenv

# 2 Paths and envs

Load the environment variables from `.env` and define the data paths.

In [3]:
# Load the variables defined in the local .env file; the call's return value is the cell output.
load_dotenv(dotenv_path='.env')

True

In [4]:
# Data folder (relative to the notebook's working directory)
data_folder = r"../data"

# os.listdir() returns entries in arbitrary order and also lists non-PDF
# files, so select the PDFs explicitly and sort them: the chosen document is
# then the same on every run and machine.
_pdf_names = sorted(
    name for name in os.listdir(data_folder) if name.lower().endswith(".pdf")
)
if not _pdf_names:
    raise FileNotFoundError(f"No PDF files found in {data_folder!r}")

# Path of the PDF that is handed to the loader in the ingestion section.
DOC_PATH = os.path.join(data_folder, _pdf_names[0])

In [5]:
# API keys
# Both values are read with os.environ[...], so a missing variable raises
# KeyError in this cell.
# NOTE(review): neither key is referenced again in the visible cells —
# confirm they are still required.
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
llangchain_key = os.environ['LANGCHAIN_API_KEY']

# 3 Ingestion

### 3.1 Loaders

In [6]:
# PDF loader bound to the document selected in DOC_PATH.
loader = PyPDFLoader(file_path=DOC_PATH)

In [7]:
# Run the loader; the resulting documents are split into chunks in section 3.2.
pages = loader.load()

### 3.2 Chunks

In [8]:
# Break the loaded pages into overlapping chunks before embedding them.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=200,
)
chunks = text_splitter.split_documents(pages)

### 3.3 Embeddings

In [9]:
# Plain-text content of every chunk.
# NOTE(review): `texts` is not used by any later visible cell — confirm it is needed.
texts = [doc.page_content for doc in chunks]

In [10]:
# Embedding model, constructed with the library defaults (no model name is
# passed); it is handed to Chroma as the embedding function in section 3.4.
embed = HuggingFaceEmbeddings()

  embed = HuggingFaceEmbeddings()
  embed = HuggingFaceEmbeddings()
  from tqdm.autonotebook import tqdm, trange


### 3.4 Vector DB

In [11]:
# Directory passed to Chroma as `persist_directory` when the database is built.
CHROMA_PATH = "test"

In [12]:
# Embed every chunk with `embed` and store the vectors in a Chroma database
# persisted under CHROMA_PATH.
# NOTE(review): re-running this cell against an existing CHROMA_PATH may add
# the same chunks again — confirm before a second run.
db_chroma = Chroma.from_documents(
    documents=chunks,
    embedding=embed,
    persist_directory=CHROMA_PATH,
)

### 3.5 POST request to the server

In [14]:
# Base URL of the local server; the helper functions below append
# `/health` and `/completion` to it.
base_url = 'http://localhost:8080'

In [15]:
def get_server_health(timeout=10):
    """Query the server's ``/health`` endpoint and return its JSON body.

    Args:
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Defaults to 10.

    Returns:
        The decoded JSON payload of the response.
    """
    # Without an explicit timeout, requests waits indefinitely on a server
    # that accepts the connection but never answers.
    response = requests.get(f'{base_url}/health', timeout=timeout)
    return response.json()

In [16]:
def post_completion(user_input):
    """Answer ``user_input`` using context retrieved from the vector store.

    The 15 chunks returned by ``db_chroma`` for ``user_input`` are joined
    into a context string, which is sent together with the question to the
    ``/completion`` endpoint at ``base_url``.

    Args:
        user_input: The user's question as a string.

    Returns:
        The stripped ``content`` field of the server's JSON reply when the
        response status is 200; otherwise a fixed error-message string.
    """
    # Vector DB search. Use the caller's question rather than the
    # notebook-level `query` variable, so every call retrieves context for
    # its own input and the function works on a fresh kernel.
    docs = db_chroma.similarity_search_with_score(user_input, k=15)

    # Context: retrieved chunk texts separated by blank lines (scores unused).
    context = "\n\n".join(doc.page_content for doc, _score in docs)

    prompt = f"{context}\nUser: {user_input}\nAssistant:"
    data = {
        'prompt': prompt,
        'temperature': 0.6,
        'top_k': 20,
        'top_p': 0.95,
        'n_predict': 400,
        # Stop generating as soon as the model starts a new dialogue turn.
        'stop': ["</s>", "Assistant:", "User:"]
    }
    headers = {'Content-Type': 'application/json'}
    response = requests.post(f'{base_url}/completion', json=data, headers=headers)

    if response.status_code != 200:
        return "Error processing your request. Please try again."
    return response.json()['content'].strip()

In [13]:
# This is an example of a user question (query).
# NOTE(review): "neccesarry" is misspelled inside the string; it is left
# untouched here because the literal is the text actually sent for retrieval.
query = 'Tell me about the neccesarry permissions for coffee exports'

In [17]:
# Ask the first example question; the returned string is the cell output.
post_completion(user_input=query)

'In order to export coffee, you need to have the following permissions: \n1. Permission to export \n2. Permission to export coffee\n3. Permission to export coffee with a certain quality level\n4. Permission to export coffee with a certain quality level and a certain country of origin\n5. Permission to export coffee with a certain quality level, a certain country of origin, and a certain destination'

In [18]:
# Second example question.
query2 = "What certifications do I need?"

In [19]:
# Ask the second example question; the returned string is the cell output.
post_completion(query2)

'You need the following certifications:  \n \n\uf0b7 IHCAFE (Instituto Hondureño de Café)  \n\uf0b7 IHCAFE (Instituto Hondureño de Café)  \n\uf0b7 IHCAFE (Instituto Hondureño de Café)  \n\uf0b7 IHCAFE (Instituto Hondureño de Café)  \n\uf0b7 IHCAFE (Instituto Hondureño de Café)  \n\uf0b7 IHCAFE (Instituto Hondureño de Café)  \n\uf0b7 IHCAFE (Instituto Hondureño de Café)  \n\uf0b7 IHCAFE (Instituto Hondureño de Café)  \n\uf0b7 IHCAFE (Instituto Hondureño de Café)  \n\uf0b7 IHCAFE (Instituto Hondureño de Café)  \n\uf0b7 IHCAFE (Instituto Hondureño de Café)  \n\uf0b7 IHCAFE (Instituto Hondureño de Café)  \n\uf0b7 IHCAFE (Instituto Hondureño de Café)  \n\uf0b7 IHCAFE (Instituto Hondureño de Café)  \n\uf0b7 IHCAFE (Instituto Hondureño de Café)  \n\uf0b7 IHCAFE (Instituto Hondureño de Café)  \n\uf0b7 IHCAFE (Instituto Hondureño de Café)  \n\uf0b7 IHCAFE (Instituto Hondureño de Café)  \n\uf0b7 IHCAFE (Instituto Hondureño de Café)  \n\uf0b7 IHCAFE (Instituto Hondureño de Café)  \n\uf0b7 IHCAFE 