## requirements.txt

In [None]:
%%capture
# !pip install bitsandbytes==0.40.0
# !pip install accelerate==0.21.0
# !pip install langchain
# !pip install langchain_community
# !pip install PyPDF2
# !pip install pypdf
# !pip install pinecone
# !pip install sentence-transformers
# !pip install -U pinecone-client langchain
# !pip install langchain_pinecone
# !pip install gradio

In [None]:
import os
import torch
import pinecone
import transformers
import gradio as gr
from pinecone import Pinecone
from torch import cuda, bfloat16
from langchain.vectorstores import Pinecone
from pinecone import Pinecone, ServerlessSpec
from langchain.llms import HuggingFacePipeline
from langchain_pinecone import PineconeVectorStore
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from transformers import StoppingCriteria, StoppingCriteriaList
from langchain.chains import StuffDocumentsChain, LLMChain, ConversationalRetrievalChain

## model.py

In [None]:
%%capture
# Checkpoint to load from the Hugging Face Hub.
model_id = 'meta-llama/Llama-2-7b-chat-hf'
# Device the stop-sequence tensors are placed on: the current CUDA device when
# one is available, otherwise the CPU.
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# 4-bit NF4 quantisation with double quantisation; compute dtype is bfloat16.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

# SECURITY: the access token must never be committed to source control. It is
# read from the environment instead; any token that was previously hard-coded
# in this file should be treated as leaked and revoked.
# If neither variable is set, None is passed through and authentication is left
# to whatever credentials the Hugging Face libraries find on their own
# (NOTE(review): confirm this fallback for the installed transformers version).
hf_auth = os.environ.get('HF_TOKEN') or os.environ.get('HUGGING_FACE_HUB_TOKEN')

model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)

# Inference only: switch the module to evaluation mode.
model.eval()

tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

# Text sequences that should end a generation when they appear at its tail.
stop_list = ['\nHuman:', '\n```\n']

# Encode each stop sequence WITHOUT special tokens. With the default settings
# the tokenizer prepends a beginning-of-sequence id, and a sequence starting
# with that id can never equal the tail of generated output, which would make
# the stopping criterion dead code.
# NOTE(review): the tokenizer may still emit a leading-whitespace piece for
# these strings; verify the resulting ids against real generations.
stop_token_ids = [
    torch.LongTensor(tokenizer(x, add_special_tokens=False)['input_ids']).to(device)
    for x in stop_list
]

class StopOnTokens(StoppingCriteria):
    """Stop generation once the output ends with one of the stop sequences.

    The stop sequences are the tensors held in the module-level
    ``stop_token_ids`` list. Only the first sequence of the batch
    (``input_ids[0]``) is inspected.
    """

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True when the tail of ``input_ids[0]`` equals a stop sequence.

        Args:
            input_ids: Token ids produced so far; the first row is checked.
            scores: Unused; accepted to satisfy the stopping-criteria call
                signature.
            **kwargs: Unused.

        Returns:
            True if any stop sequence matches the end of the first row,
            otherwise False.
        """
        sequence = input_ids[0]
        for stop_ids in stop_token_ids:
            stop_len = len(stop_ids)
            # Skip sequences that cannot match: an empty stop sequence, or one
            # longer than what has been generated so far. Without this guard
            # torch.eq would be given tensors of different lengths.
            if stop_len == 0 or stop_len > len(sequence):
                continue
            if bool(torch.eq(sequence[-stop_len:], stop_ids).all()):
                return True
        return False

# Bundle the custom stop-sequence criterion into a StoppingCriteriaList.
stopping_criteria = StoppingCriteriaList([StopOnTokens()])

# Text-generation pipeline built around the model and tokenizer loaded above.
generate_text = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    # Return the prompt together with the completion.
    return_full_text=True,
    # Generation controls.
    # NOTE(review): do_sample is not set here; confirm that `temperature`
    # actually takes effect with the installed transformers version.
    temperature=0.1,
    repetition_penalty=1.1,
    max_new_tokens=512,
    stopping_criteria=stopping_criteria,
)

# Expose the pipeline to LangChain as an LLM object.
llm = HuggingFacePipeline(pipeline=generate_text)

## app.py

In [None]:
def gui(path):
    """Answer a fixed "summarise my expenses" question about an uploaded PDF.

    The PDF is loaded and split into chunks, the chunks are embedded and
    written to the Pinecone index ``"hannah"``, and a
    ``ConversationalRetrievalChain`` backed by the module-level ``llm`` is
    asked a fixed question with an empty chat history.

    Args:
        path: The upload as delivered by the UI: a filesystem path
            (``str`` / ``os.PathLike``), an object exposing the path as
            ``.name`` (e.g. a temporary-file wrapper), or ``None`` when
            nothing was uploaded.

    Returns:
        str: The chain's answer. If it contains the marker
        ``"Helpful Answer:"``, only the stripped text after the last marker
        is returned. When ``path`` is ``None`` a short prompt asking for a
        file is returned instead.

    Raises:
        RuntimeError: If the ``PINECONE_API_KEY`` environment variable is
            not set.
    """
    if path is None:
        return "Please upload a PDF file."

    # SECURITY: the Pinecone key must come from the environment, never from
    # source code. Any key previously committed here should be revoked.
    if not os.environ.get('PINECONE_API_KEY'):
        raise RuntimeError(
            "PINECONE_API_KEY environment variable is not set; "
            "export it before launching the app."
        )

    # Normalise the upload to a plain path string. Path-like values are used
    # as-is; other objects are expected to carry the path in `.name`.
    if isinstance(path, (str, os.PathLike)):
        pdf_path = os.fspath(path)
    else:
        pdf_path = getattr(path, "name", path)

    documents = PyPDFLoader(pdf_path).load()

    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    texts = text_splitter.split_documents(documents)

    embeddings = HuggingFaceEmbeddings()
    index_name = "hannah"
    # NOTE(review): every call writes its chunks to the same index, so chunks
    # from earlier uploads may also be retrieved; consider isolating uploads.
    docsearch = PineconeVectorStore.from_documents(texts, embeddings, index_name=index_name)

    qa_chain = ConversationalRetrievalChain.from_llm(
        llm,
        docsearch.as_retriever(search_kwargs={'k': 2}),
        return_source_documents=False,
    )

    # Prompt text is kept exactly as originally written (including whitespace).
    query = (
        "you're a helpful assistente\n"
        "          - Detail this file and sum my expenses:\n"
        "  "
    )

    result = qa_chain({'question': query, 'chat_history': []})
    answer = result["answer"]

    # The returned text may include the prompt; keep only what follows the
    # final "Helpful Answer:" marker when that marker is present.
    if "Helpful Answer:" in answer:
        return answer.split("Helpful Answer:")[-1].strip()
    return answer

# Gradio UI: one upload button feeds `gui`; its return value is shown as text.
demo = gr.Interface(
    fn=gui,
    inputs=[gr.UploadButton("Upload a file")],
    outputs="text",
)

# Start the app with debug mode enabled.
demo.launch(debug=True)