In [None]:
#install dependencies
!pip3 install -q -r requirements.txt

In [None]:
#downlaod model used
!pip3 install huggingface-hub
!huggingface-cli download TheBloke/neural-chat-7B-v3-1-GGUF neural-chat-7b-v3-1.Q4_K_M.gguf --local-dir . --local-dir-use-symlinks False

In [2]:
from langchain_community.embeddings import HuggingFaceEmbeddings
import torch
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.chroma import Chroma
from numba import cuda
import os
from langchain.schema import (
    SystemMessage,
    HumanMessage,
    AIMessage
)
from langchain.prompts import ChatPromptTemplate

In [3]:
def create_vector_emb(pdf_file, chunk_size=150, chunk_overlap=5, device=None):
    """Embed a PDF into a persistent Chroma store named after the file.

    The store directory is the PDF's base name up to its first dot
    (``'docs/yolo.pdf'`` -> ``'yolo'``), relative to the working directory.

    Args:
        pdf_file: Path to the PDF to index.
        chunk_size: Maximum number of characters per chunk.
        chunk_overlap: Number of characters shared by neighbouring chunks.
        device: Torch device for the embedding model. ``None`` picks
            ``'cuda'`` when it is available and ``'cpu'`` otherwise.

    Raises:
        ValueError: If the PDF yields no text chunks to index.
    """
    import gc  # local import: only needed for the cleanup at the end

    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    hf = HuggingFaceEmbeddings(
        model_name="hkunlp/instructor-xl",
        model_kwargs={'device': device},
        encode_kwargs={'normalize_embeddings': False},
    )

    # load() returns the documents unsplit. load_and_split() would run its own
    # default splitter first, so the text would be split twice and start_index
    # would no longer be relative to the loaded document.
    pages = PyPDFLoader(pdf_file).load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(pages)
    if not chunks:
        # Fail before an empty store directory is left behind on disk.
        raise ValueError(f'no text chunks could be extracted from {pdf_file!r}')

    DB_file_name = pdf_file.split('/')[-1].split('.')[0]
    db = Chroma.from_documents(chunks, hf, persist_directory=DB_file_name)
    db.persist()
    print('db created sucessfuly')

    # Release the embedding model's GPU memory without resetting the CUDA
    # device: a device reset would also invalidate any other model this
    # process already holds on the GPU. The store keeps its own reference to
    # the embedding object, so both names have to go before collecting.
    del db, hf
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

In [4]:
# PDF to index. The vector store directory is the file's base name up to its
# first dot.
pdf_file_name = 'yolo.pdf'
DB_file_name = pdf_file_name.rsplit('/', 1)[-1].split('.', 1)[0]

In [6]:
from langchain_community.llms import LlamaCpp
# Load the local GGUF model through llama.cpp.
model_path = 'neural-chat-7b-v3-1.Q4_K_M.gguf'
n_gpu_layers, n_batch = -1, 512
llm = LlamaCpp(
    model_path=model_path, n_gpu_layers=n_gpu_layers, n_batch=n_batch,
    temperature=0.5, n_ctx=10000,
)

ggml_init_cublas: GGML_CUDA_FORCE_MMQ:   no
ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes
ggml_init_cublas: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 3070 Ti, compute capability 8.6, VMM: yes
llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from neural-chat-7b-v3-1.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = intel_neural-chat-7b-v3-1
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.

In [7]:
# Create the vector embeddings of the PDF, unless a store directory for it is
# already present on disk.
if os.path.exists(DB_file_name):
    print('vector embeddings already exist')
else:
    print('creating db embeddings')
    create_vector_emb(pdf_file_name)
    print('vector embeddings created')

vector embeddings already exist


In [8]:
# Initialize the embedding model used for search queries (runs on the CPU).
model_name = "hkunlp/instructor-xl"
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=False)
hf = HuggingFaceEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

In [9]:
# Open the persisted store; queries are embedded with the model in `hf`.
db = Chroma(
    embedding_function=hf,
    persist_directory=DB_file_name,
)

In [10]:
# Prompt skeleton: the retrieved context first, then the question to answer.
PROMPT_TEMPLATE = (
    "\n"
    "Answer the question based only on the following context:\n"
    "\n"
    "{context}\n"
    "\n"
    "---\n"
    "\n"
    "Answer the question based on the above context: {question}\n"
)
def get_prompt(question, k=80):
    """Build the retrieval-augmented user message for `question`.

    Looks up the `k` stored chunks most relevant to the question in the
    module-level `db`, joins their text into the context slot of
    PROMPT_TEMPLATE and returns the result as a HumanMessage.

    Args:
        question: The user's question, used both for retrieval and in the prompt.
        k: Number of chunks to retrieve for the context.

    Returns:
        The HumanMessage holding the filled-in prompt.
    """
    results = db.similarity_search_with_relevance_scores(question, k=k)
    context_text = "\n\n---\n\n".join(doc.page_content for doc, _score in results)
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    # format_messages() yields the HumanMessage itself. format() renders the
    # chat to plain text, which would bake a "Human: " role prefix into the
    # message content.
    return prompt_template.format_messages(context=context_text, question=question)[0]

In [11]:
def _initial_messages():
    """Return a fresh copy of the seed conversation (system prompt plus greeting)."""
    return [
        SystemMessage(content="You are a helpful deep learning assistant."),
        HumanMessage(content="Hi AI, how are you today?"),
        AIMessage(content="I'm great thank you. How can I help you?"),
    ]


# Running conversation history used by the query cells.
messages = _initial_messages()


def make_question(prompt, messages, llm=llm):
    """Send `prompt` to the model within the conversation and print the answer.

    The prompt is appended to `messages` in place before the call. If the call
    raises, the history is replaced by a fresh seed conversation plus the
    prompt and the call is retried once; a failure of the retry propagates.

    Args:
        prompt: The message to ask (for example the result of get_prompt).
        messages: Conversation history so far; mutated in place.
        llm: Model object exposing `invoke(messages)`.

    Returns:
        The history including the prompt and the model's answer. This is the
        passed-in list unless the retry path replaced it, so callers should
        keep the returned value.
    """
    messages.append(prompt)
    try:
        answer = llm.invoke(messages)
    except Exception as err:
        # Exception (not a bare except) so KeyboardInterrupt/SystemExit still
        # stop the cell. Presumably the usual cause is a history that outgrew
        # the model's context window - TODO confirm.
        print(f'llm call failed ({err!r}); retrying with a fresh conversation')
        messages = _initial_messages()
        messages.append(prompt)
        answer = llm.invoke(messages)
    messages.append(AIMessage(content=answer))
    print(answer)
    return messages

In [None]:
# Build the context-augmented prompt for the question, ask the model, and keep
# the updated conversation history.
question = 'How many african-american characters are there?'
prompt = get_prompt(question)
messages = make_question(prompt, messages, llm=llm)