In [22]:
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
import os
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
import json
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
import pickle


In [4]:
loader = DirectoryLoader('pdfs/', glob="./*.pdf", loader_cls=PyPDFLoader)

In [5]:
print(loader.load())



In [22]:
def get_pdf_text():
    text = ""
    directory_pdf = 'pdfs/'
    pdf_docs = os.listdir(directory_pdf)
    for pdf in pdf_docs:
        if pdf.endswith('.pdf'):
            print(pdf)
            pdf_reader = PdfReader(directory_pdf+pdf)
            for page in pdf_reader.pages:
                text += page.extract_text()
    with open('pdfs/output.txt', 'w') as file:
        file.write(text)
    return text


raw_text = get_pdf_text()

CBAHI-PHC-Standards v1.1 Effective Feb.2017.pdf
DENTAL CBAHI.pdf
CBAHI Ambulatory 2019.pdf
CBAHI 4TH EDITION National Healthcare Priorities.pdf
NATIONAL STANDARDS FOR ACUTE CORONARY SYNDROME SER_240115_145528.pdf
VTE REVISED POLICY - VERSION 5.pdf


In [20]:
# print(raw_text)

In [6]:
def get_text_chunks():
    text = open('pdfs/output.txt', 'r').read()
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len
    )
    chunks = text_splitter.split_text(text)
    print("lenghth of chunks ",len(chunks))
    print(type(chunks))
    return chunks

text_chunks = get_text_chunks()

Created a chunk of size 2772, which is longer than the specified 1000
Created a chunk of size 2477, which is longer than the specified 1000
Created a chunk of size 2606, which is longer than the specified 1000
Created a chunk of size 2897, which is longer than the specified 1000
Created a chunk of size 4434, which is longer than the specified 1000
Created a chunk of size 4190, which is longer than the specified 1000
Created a chunk of size 4645, which is longer than the specified 1000
Created a chunk of size 3941, which is longer than the specified 1000
Created a chunk of size 3990, which is longer than the specified 1000
Created a chunk of size 3334, which is longer than the specified 1000
Created a chunk of size 4020, which is longer than the specified 1000
Created a chunk of size 3522, which is longer than the specified 1000
Created a chunk of size 3578, which is longer than the specified 1000
Created a chunk of size 4413, which is longer than the specified 1000
Created a chunk of s

lenghth of chunks  1611
<class 'list'>


In [7]:
### To save the chunk into text 
def save_chunks_to_text_file(chunks, filename):
    with open(filename, 'w') as file:
        for chunk in chunks:
            file.write(chunk + '\n')

# Assuming text_chunks is your list of chunks
save_chunks_to_text_file(text_chunks, 'pdfs/chunks.txt')


In [1]:
def load_chunks_from_text_file(filename):
    with open(filename, 'r') as file:
        return file.read().splitlines()

loaded_chunks = load_chunks_from_text_file('pdfs/chunks.txt')
print(loaded_chunks)



In [8]:
# to save it as json
def save_chunks(chunks, filename):
    with open(filename, 'w') as file:
        json.dump(chunks, file)

save_chunks(text_chunks, 'pdfs/chunks.json')

In [9]:
def load_chunks(filename):
    with open(filename, 'r') as file:
        return json.load(file)

loaded_chunks = load_chunks('pdfs/chunks.json')
print(type(loaded_chunks))

<class 'list'>


In [10]:
loaded_chunks = load_chunks('pdfs/chunks.json')
print(loaded_chunks)



In [23]:
def get_vectorstore(text_chunks):
    # embeddings = OpenAIEmbeddings()
    embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    return vectorstore


vectorstore = get_vectorstore(loaded_chunks)
print(vectorstore)
print(type(vectorstore))

  from tqdm.autonotebook import trange


load INSTRUCTOR_Transformer


  return self.fget.__get__(instance, owner)()


max_seq_length  512
<langchain.vectorstores.faiss.FAISS object at 0x2a6c1e110>
<class 'langchain.vectorstores.faiss.FAISS'>


In [24]:
# save the vectorstore to a file
def save_vectorstore(vectorstore, filename):
    with open(filename, 'wb') as file:
        pickle.dump(vectorstore, file)

# Save the vectorstore to a file
save_vectorstore(vectorstore, 'vectorstore.pkl')


In [25]:
def load_vectorstore(filename):
    with open(filename, 'rb') as file:
        return pickle.load(file)

# Load the vectorstore from the file
loaded_vectorstore = load_vectorstore('vectorstore.pkl')
print(loaded_vectorstore)

<langchain.vectorstores.faiss.FAISS object at 0x11b104a50>
