In [None]:
!pip3 install -q -r requirements.txt
!pip3 install -q huggingface-hub
!huggingface-cli download TheBloke/Mistral-7B-Instruct-v0.2-GGUF mistral-7b-instruct-v0.2.Q4_K_M.gguf --local-dir . --local-dir-use-symlinks False
!huggingface-cli download dranger003/SFR-Embedding-Mistral-GGUF ggml-sfr-embedding-mistral-q4_k_m.gguf --local-dir . --local-dir-use-symlinks False

In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings
import torch
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.chroma import Chroma
from numba import cuda
import os
from langchain.schema import (
    SystemMessage,
    HumanMessage,
    AIMessage
)
from langchain.prompts import ChatPromptTemplate
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain_community.embeddings import LlamaCppEmbeddings

In [None]:
def create_vector_emb(pdf_file, chunk_size=300):
    """Embed a PDF into a Chroma vector store persisted on disk.

    The PDF is loaded with PyPDFLoader, split into overlapping character
    chunks, embedded with the local ``ggml-sfr-embedding-mistral-q4_k_m.gguf``
    model and written to a directory named after the PDF.

    Args:
        pdf_file: Path to the PDF file to index.
        chunk_size: Maximum number of characters per chunk (consecutive
            chunks overlap by 50 characters).

    Returns:
        None. Side effects: creates/persists the Chroma directory, prints a
        status line, and resets the current CUDA device through numba.
    """
    model = LlamaCppEmbeddings(
        model_path='ggml-sfr-embedding-mistral-q4_k_m.gguf',
        n_gpu_layers=-1,
    )

    # Load the raw documents only; load_and_split() would first chunk them with
    # a default splitter, so the explicit splitter below would split twice.
    loader = PyPDFLoader(pdf_file)
    doc = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=50,
        length_function=len,  # sizes are measured in characters
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(doc)

    # Store directory = file name without its folder, cut at the first dot.
    # Callers that check whether the store exists must derive the name the same way.
    DB_file_name = f"{pdf_file.split('/')[-1].split('.')[0]}"
    db = Chroma.from_documents(
        chunks, model, persist_directory=DB_file_name
    )
    db.persist()
    print('db created sucessfuly')

    # Drop the local reference to the embedding model, then reset the CUDA
    # device via numba.
    # NOTE(review): presumably this frees GPU memory before the LLM is loaded;
    # it also affects anything else using the device in this process — confirm.
    del model
    device = cuda.get_current_device()
    device.reset()

In [None]:
# Build the vector store for the PDF only when its directory is not on disk yet.
pdf_file_name = 'American-Fiction-Read-The-Screenplay.pdf'
# Directory name: file name without its folder, cut at the first dot
# (the same rule create_vector_emb applies when it persists the store).
DB_file_name = pdf_file_name.rsplit('/', 1)[-1].partition('.')[0]
if os.path.exists(DB_file_name):
    print('vector embeddings already exist')
else:
    print('creating db embeddings')
    create_vector_emb(pdf_file_name)
    print('vector embeddings created')

In [None]:
# Embedding model used to embed queries; it loads the same GGUF file that
# create_vector_emb embeds the document chunks with.
model = LlamaCppEmbeddings(
    model_path='ggml-sfr-embedding-mistral-q4_k_m.gguf',
)

In [None]:
# Open the store persisted by the creation cell. The directory name is derived
# from pdf_file_name with the same rule create_vector_emb uses, so this cell
# loads the database this notebook actually builds rather than a hard-coded
# directory that nothing here creates (which would open an empty store).
DB_file_name = pdf_file_name.split('/')[-1].split('.')[0]
db = Chroma(persist_directory=DB_file_name, embedding_function=model)

In [None]:
# Ad-hoc retrieval check: fetch the 80 highest-scoring chunks, each paired with
# its relevance score, for a sample question.
question = 'what dataset was used to train the model'
results = db.similarity_search_with_relevance_scores(query=question, k=80)

In [None]:
from langchain_community.llms import LlamaCpp

# Settings for the local generation model.
model_path = 'mistral-7b-instruct-v0.2.Q4_K_M.gguf'
n_gpu_layers = -1  # -1: place the whole LLM on the GPU
n_batch = 512

llm = LlamaCpp(
    n_ctx=10000,
    temperature=0.5,
    n_batch=n_batch,
    n_gpu_layers=n_gpu_layers,
    model_path=model_path,
)

In [None]:
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""


def get_prompt(question, k=100):
    """Build the retrieval-augmented prompt for ``question``.

    Retrieves the ``k`` most relevant chunks from the module-level ``db``,
    joins their text with ``---`` separators and fills ``PROMPT_TEMPLATE``.

    Args:
        question: The user question, used for retrieval and in the prompt.
        k: Number of chunks to retrieve as context (default 100, the value
            that was previously hard-coded).

    Returns:
        The formatted prompt as a string, exactly as produced by
        ``ChatPromptTemplate.format``.
    """
    # TODO: find a smarter retrieval strategy so that less context is needed.
    results = db.similarity_search_with_relevance_scores(question, k=k)
    # Relevance scores are not used; only the chunk text goes into the context.
    context_text = "\n\n---\n\n".join(doc.page_content for doc, _score in results)
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    return prompt_template.format(context=context_text, question=question)

In [None]:
prompt = get_prompt('How many african-american characters are there?')
# ChatPromptTemplate.format() renders the single message as "Human: <text>".
# Remove that role tag and the template's leading newline by matching them
# explicitly; a fixed-length slice would cut into the prompt if the prefix changed.
role_prefix = 'Human: '
if prompt.startswith(role_prefix):
    prompt = prompt[len(role_prefix):]
prompt = prompt.lstrip('\n')
llm.invoke(prompt)