In [None]:
!pip install PyPDF2 qdrant_client langchain openai tiktoken


In [None]:
!pip install sentence_transformers

In [None]:
import os
from PyPDF2 import PdfReader, PdfFileMerger

from langchain import HuggingFaceHub
from langchain.llms import OpenAI
from langchain.schema import retriever
from langchain.chains import RetrievalQA
from langchain.vectorstores import Qdrant
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter

from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings


from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams



In [None]:
# Sample user profiles: each maps a document name to whether the user holds it.
# user_1 holds both documents.
user_1 = {'Certificado de discapacidad': True, 'Carne de discapacidad': True}
# user_2 lacks the certificate but holds the card.
user_2 = {'Certificado de discapacidad': False, 'Carne de discapacidad': True}


In [None]:
# Export the three service credentials to the process environment.
# For each name the value is taken, in order, from:
#   1. a variable of that name already defined in the notebook namespace,
#   2. an environment variable of that name that is already set,
#   3. an interactive hidden prompt.
# This keeps the cell runnable on a fresh kernel without writing secrets into
# the notebook itself.
from getpass import getpass

for _secret_name in ('OPENAI_API_KEY', 'HUGGINGFACEHUB_API_TOKEN', 'QDRANT_KEY_TOKEN'):
    _secret_value = globals().get(_secret_name) or os.environ.get(_secret_name)
    if not _secret_value:
        _secret_value = getpass(f'{_secret_name}: ')
    os.environ[_secret_name] = _secret_value

# Do not leave the loop variables (one of which holds a secret) in the namespace.
del _secret_name, _secret_value

In [None]:
def main(
    path_pdf='beca.pdf',
    url_db="https://a6cf0f79-0f82-46e5-b761-32c76ff34c3e.us-east-1-0.aws.cloud.qdrant.io:6333",
    collection_name="test_collection",
    size_length_stored=768,
    query='Puedo tener una beca ?',
    user=None,
):
    """Index a PDF into Qdrant, ask one question about it and print the answer.

    Called with no arguments this behaves as before: it indexes ``beca.pdf``
    into ``test_collection`` and asks the default question on behalf of the
    module-level ``user_2`` profile.

    Args:
        path_pdf: Path of the PDF whose text is indexed.
        url_db: URL of the Qdrant instance.
        collection_name: Name of the Qdrant collection to (re)create and fill.
        size_length_stored: Vector size of the collection. It must match the
            embedding model (e.g. HuggingFace 768, OpenAI 1536). More info:
            https://huggingface.co/spaces/mteb/leaderboard
        query: Question to ask.
        user: Dict describing the user; it is embedded in the prompt. When
            ``None`` the module-level ``user_2`` is used.

    Returns:
        None. The response dict is printed.
    """
    if user is None:
        # Resolved at call time so the notebook-level profile is picked up.
        user = user_2

    embedding = HuggingFaceEmbeddings()
    llm = OpenAI()

    vectorstore = vectorstore_save_data(url_db, collection_name, size_length_stored, embedding, path_pdf)
    response = query_to_db(query, user, llm, vectorstore)

    print(response)



In [None]:
# Run the end-to-end demo.
# NOTE(review): main() calls vectorstore_save_data and query_to_db, whose
# definitions appear in cells below this one. On a fresh kernel those cells
# have to be executed before this call, otherwise it raises NameError.
main()

client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
) model_name='sentence-transformers/all-mpnet-base-v2' cache_folder=None model_kwargs={} encode_kwargs={}
Uploading data to Qdrant vectorstore!
Uploaded!


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
{'result': ' No, según el Reglamento de Becas, los documentos requeridos para postular a una Beca Inclusión son un Certificado de Discapacidad emitido por el establecimiento de salud del Ministerio de Salud (MINSA) y un Carné de Discapacidad emitido por CONADIS. Por lo tanto, el usuario no puede postular a una Beca Inclusión ya que no cuenta con el Certificado de Discapacidad.', 'source': [Document(page_content='documento. 

In [None]:
def vectorstore_save_data(url_db, collection_name, size_length_stored, embedding, path_pdf):
    """Create the collection, wrap it in a vector store and load the PDF into it.

    Steps, in order: open a client for ``url_db``, (re)create
    ``collection_name`` with vectors of size ``size_length_stored``, build the
    vector store around that client with ``embedding``, and upload the text of
    ``path_pdf``.

    Returns:
        The vector store that now holds the PDF chunks.
    """
    qdrant = init_client_db(url_db)
    create_collection(qdrant, collection_name, size_length_stored)

    store = get_vectorstore(qdrant, collection_name, embedding)
    save_vector_data(path_pdf, store)
    return store

## Save data into db


In [None]:
def init_client_db(url_db):
    """Return a QdrantClient for ``url_db``.

    The API key is read from the ``QDRANT_KEY_TOKEN`` environment variable;
    ``None`` is passed when that variable is not set.
    """
    qdrant_api_key = os.environ.get("QDRANT_KEY_TOKEN")
    return QdrantClient(url=url_db, api_key=qdrant_api_key)

In [None]:
def create_collection(client_db, collection_name, size_length_stored):
    """(Re)create ``collection_name`` on ``client_db`` for cosine-distance vectors.

    Args:
        client_db: Client object exposing ``recreate_collection``.
        collection_name: Name of the collection.
        size_length_stored: Vector size configured for the collection.

    Returns:
        Always ``True``; the value returned by the client is discarded.
    """
    vector_settings = VectorParams(size=size_length_stored, distance=Distance.COSINE)
    client_db.recreate_collection(
        collection_name=collection_name,
        vectors_config=vector_settings,
    )
    return True

In [None]:
def get_vectorstore(client_db, collection_name, embeddings):
    """Return a ``Qdrant`` vector store built from the given client, collection name and embeddings."""
    return Qdrant(
        client=client_db,
        collection_name=collection_name,
        embeddings=embeddings,
    )

In [None]:
def save_vector_data(path_pdf, vectorstore):
    """Extract the text of ``path_pdf``, split it into chunks and add them to ``vectorstore``.

    Progress is reported with two printed messages around the upload.

    Returns:
        Always ``True``.
    """
    # Extraction and splitting happen before the first message is printed.
    pieces = get_chunks_from_long_text(pdf_to_text(path_pdf))

    print('Uploading data to Qdrant vectorstore!')
    vectorstore.add_texts(pieces)
    print('Uploaded!')

    return True


In [None]:
def pdf_to_text(pdf_direction):
    """Return the extracted text of every page of the PDF, concatenated in page order.

    Pages are joined with no separator between them.

    Args:
        pdf_direction: Whatever ``PdfReader`` accepts as its source argument.
    """
    document = PdfReader(pdf_direction)
    return "".join(page.extract_text() for page in document.pages)

In [None]:
def get_chunks_from_long_text(text):
    """Split ``text`` with a ``CharacterTextSplitter`` and return the resulting chunks.

    The splitter is configured with a newline separator, a chunk size of 1000,
    an overlap of 200, and ``len`` as the length function.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)

## Query db

In [None]:
def query_to_db(query, user, llm, vectorstore):
    """Enrich ``query`` with the ``user`` information and answer it via ``query_response``.

    Returns:
        Whatever ``query_response`` returns for the enriched query.
    """
    return query_response(llm, vectorstore, query_template(query, user))

In [None]:
def query_template(query, user):
    """Build the final prompt text that combines the user information and the question.

    Args:
        query: The question being asked.
        user: User information; it is interpolated into the prompt as-is.

    Returns:
        The formatted prompt string.
    """
    wording = "El usuario con la siguiente información: {user} pregunta lo siguiente: {query}. Responder en base a la información del usuario "
    prompt = PromptTemplate(input_variables=["query", "user"], template=wording)
    return prompt.format(query=query, user=user)

In [None]:
def query_response(llm, vectorstore, query_enriched):
    """Run ``query_enriched`` through a "stuff" RetrievalQA chain over ``vectorstore``.

    The chain is built on every call with ``verbose=True`` and
    ``return_source_documents=True``.

    Returns:
        dict with two keys: ``'result'`` (the chain's ``'result'``) and
        ``'source'`` (the chain's ``'source_documents'``).
    """
    chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever(),
        verbose=True,
        return_source_documents=True,
    )
    chain_output = chain(query_enriched)

    return {
        'result': chain_output['result'],
        'source': chain_output['source_documents'],
    }