First install all the required libraries

In [1]:
!pip install langchain
!pip install tiktoken
!pip install faiss-cpu
!pip install openai
!pip install PyPDF2

Collecting langchain
  Downloading langchain-0.0.251-py3-none-any.whl (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.6.0,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.5.14-py3-none-any.whl (26 kB)
Collecting langsmith<0.1.0,>=0.0.11 (from langchain)
  Downloading langsmith-0.0.18-py3-none-any.whl (31 kB)
Collecting openapi-schema-pydantic<2.0,>=1.2 (from langchain)
  Downloading openapi_schema_pydantic-1.2.4-py3-none-any.whl (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.0/90.0 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.6.0,>=0.5.7->langchain)
  Downloading marshmallow-3.20.1-py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting typing-inspect<1,>=0.4.0 (from dataclas

Import the required libraries

In [2]:
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
import os
#import en_core_web_sm
import pickle
import json
from flask import Flask, request, render_template, redirect
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import VectorDBQA, RetrievalQA, ConversationalRetrievalChain
from langchain.llms import OpenAI
from langchain.memory import ConversationBufferMemory
from langchain.vectorstores import FAISS
from langchain import PromptTemplate
import os

Define the functions needs to preprocess text from pdf.

In [3]:
def preprocess_texts(raw_text):
    '''
    @param raw_text: the concatinated text to be processed
    @return texts: the splitted and tokenized text
    '''
    text_splitter = CharacterTextSplitter(
                        separator = "\n",
                        chunk_size = 1024,
                        chunk_overlap  = 200,
                        length_function = len,
                    )
    texts = text_splitter.split_text(raw_text)
    return texts

def read_pdf_text(path, preprocess_langchain=False):
    '''
    @param path: the pdf object path
    @param preprocess_langchain: preprocessing flag from langchain
    @return texts: all the text from the pdf concatinated
    '''
    reader = PdfReader(path)
    raw_text = ''

    for i, page in enumerate(reader.pages):
        text = page.extract_text()
        if text:
            raw_text += text

    if preprocess_langchain:
        texts = preprocess_texts(raw_text)
    else:
        texts = raw_text
    return texts

def process_all_pdfs(directory_path, preprocess_langchain=False):
    '''
    @param directory_path: get the directory of the documentstore
    @param preprocess_langchain: if the preprocess for langchain to optimize token in chunks should be done
    @param returns: all the concatinated texts from pdfs
    '''
    all_texts = []
    for filename in os.listdir(directory_path):
        if filename.endswith('.pdf'):
            filepath = os.path.join(directory_path, filename)
            texts = read_pdf_text(filepath, preprocess_langchain)
            all_texts.extend(texts)
    return all_texts

Provide your openAI api key

In [4]:
os.environ["OPENAI_API_KEY"] = "your openai api key"

Provide the embedding model name and the llm

In [5]:
embedding_model_name ='text-embedding-ada-002'
llm_model_name = 'gpt-3.5-turbo-16k'

Initialize the embeddings

In [6]:
# initialize the embeddings using openAI ada text embedding library and the llm model using gpt-3.5-turbo-16k
embeddings = OpenAIEmbeddings(model=embedding_model_name)
llm = OpenAI(temperature=0, model_name=llm_model_name)



Ingest and read the pdf file; process them in chunks

In [7]:
# initialize and read the *.pdf object
texts = process_all_pdfs('/content/document_store', preprocess_langchain=True) # replace 'directory_path' with your directory

In [8]:
print(f"processed text chunks are: {texts}")
print(f"total number of text chunk is {len(texts)}")


total number of text chunk is 11


Obtain the embedding for using the pretrained embeddings; initialize the FAISS document store using the preprocessed text and the pretrained embeddings of the text.

In [9]:
# initialize the FAISS document store using the preprocessed text and initialized embeddings
docsearch = FAISS.from_texts(texts, embeddings)
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":2})

Define the prompt template; A prompt template has 3 key things


*   Instruction
*   Context
*   Question



In [10]:
prompt_template = """You are a helpful AI assistant. Use the following pieces of context to answer the question at the end.
       provide the answer in a easy and understandable way.
       if the question is not related to the context, please answer with "I do not have it in my context".

       Context: {context}

       User: {question}
       System: """

In [None]:
prompt_template = """You are a helpful AI assistant. Use the following pieces of context to answer the question at the end.
       provide the answer in a easy and understandable way.
       if the question is not related to the context, please answer with some made up texts.

       Context: {context}

       User: {question}
       System: """

In [None]:
prompt_template = """You are a helpful AI assistant. Use the following pieces of context to answer the question at the end.
       provide the answer in a easy and understandable way.
       if the question is not related to the context, please answer rude and angry behaviour.

       Context: {context}

       User: {question}
       System: """

Initialize the qa_prompt object which will be initialized by prompt template from langchain. Here, input variables and the prompt template is required

In [11]:
qa_prompt = PromptTemplate(input_variables=["context", "question"], template=prompt_template)

Define the conversational memory buffer to store ongoing conversation in the memory; Initialize a question answering object by providing all the things that we have initialized before. i.e, the retriever object, memory, llm and qa prompt object

In [12]:
# Create a conversation buffer memory
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
qa = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory, combine_docs_chain_kwargs={'prompt': qa_prompt})


Try to do conversation with the system

In [13]:
chat_history = []
while True:
  query = input("")
  result = qa({"question": query, "chat_history": chat_history})
  print(f'answer: {result["answer"]}')
  chat_history.append((query, result["answer"]))

what is this document about?
answer: This document provides tips and information on how to adapt to vision loss and improve visibility in everyday tasks. It suggests ways to brighten the lighting in your room, use bold markers and paper for writing, and put colored tape on stairs to prevent falls. It also mentions the importance of getting tested for low vision and seeking vision rehabilitation programs and aids.


KeyboardInterrupt: ignored

The following part is just for 1 shot question answering where we are not using any kind of memory.

In [None]:
qa_prompt = PromptTemplate(input_variables=["context",  "question"], template=prompt_template)

chain_type_kwargs = {"prompt": qa_prompt}
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, chain_type_kwargs=chain_type_kwargs, return_source_documents=True)


In [None]:
query = "How can I take care of my eyes?"
result = qa({"query": query})
print(f"answer: {result['result']}")
print(f"source documents: {result['source_documents']}")

answer: 
I'm sorry, that question is not related to the context provided. Please ask a question related to the information provided.
source documents: [Document(page_content='This order prescribes a uniform system for classifying, \nsafeguarding, and declassifying national security \ninformation, including information relating to defense \nagainst transnational terrorism. Our democratic principles \nrequire that the American people be informed of the \nactivities of their Government. Also, our Nation’s progress \ndepends on the free flow of information both within the \nGovernment and to the American people. Nevertheless, \nthroughout our history, the national defense has required \nthat certain information be maintained in confidence in \norder to protect our citizens, our democratic institutions, \nour homeland security, and our interactions with \nforeign nations. Protecting information critical to our \nNation’s security and demonstrating our commitment \nto open Government through