# Document Loader and Q&A
This script loads all PDFs from a directory, splits them into chunks, and then uploads them to a vector store.

PyPDFDirectoryLoader loads all PDFs in a folder one page at a time (e.g. a PDF with 6 pages will produce six documents, one per page).

In [None]:
# Import the necessary packages
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.document_loaders import PyMuPDFLoader
from langchain.document_loaders import UnstructuredPDFLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter

import os
from dotenv import load_dotenv

In [None]:


# load_dotenv() only copies the variables of a .env file into the process
# environment and returns a bool; its first argument is the path of the .env
# file, not the name of a variable. The key itself is read with os.getenv.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

target_directory = "/home/stubbs/Documents/pdfs/"

# Collect the full path of every PDF in the target directory. Entries without
# a .pdf extension are skipped so the PDF loaders used later are never handed
# a file they cannot parse, and the listing is sorted so that document
# numbering is the same on every run.
files = []
for pdf_file in sorted(os.listdir(target_directory)):
    if not pdf_file.lower().endswith(".pdf"):
        continue
    print(pdf_file)
    files.append(os.path.join(target_directory, pdf_file))
print(f'\nThere are {len(files)} PDF files to load.')
    
    # loader = PyMuPDFLoader(target_directory+pdf_file)
    # data = loader.load()
    # print(data[0])

## Load Data
1. Load a document
2. Check length and chunk if necessary
3. Use OpenAI Embeddings

In [None]:
# Load every PDF listed in `files` and split it into chunks that target at
# most 1000 characters (measured with len) with no overlap between chunks.
# The splitter does not depend on the document, so it is built once up front
# instead of once per file.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
    length_function=len,
)

# text_list keeps one entry per PDF; each entry is the list of chunks that
# the splitter produced for that file.
text_list = []
for count, doc in enumerate(files, start=1):
    print('--------------')
    print(f'Document #: {count}')
    loader = UnstructuredPDFLoader(doc)
    data = loader.load()
    texts = text_splitter.split_documents(data)
    text_list.append(texts)
    print(f'This document was split into {len(texts)} documents')
    print(texts)
    # print(data[0])
    print('--------------')
    print('\n')

## Create embeddings of docs to get them ready for semantic search

In [None]:
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone


In [None]:
# Embedding client that is later handed to the vector store; it is configured
# with the OPENAI_API_KEY value assigned in an earlier cell.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [None]:
# Name of the Pinecone index this notebook works against.
index_name = "langchain2"

# Set up the Pinecone client. The API key is read from the PINECONE_API_KEY
# environment variable; the environment is fixed to "us-east4-gcp".
pinecone.init(api_key=os.getenv("PINECONE_API_KEY"), environment="us-east4-gcp")

In [None]:
# Rebind text_splitter to a finer-grained configuration: chunks target 400
# characters (measured with len) with a 20-character overlap, and splitting
# is attempted on the listed separators in order.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    length_function=len,
    chunk_size=400,
    chunk_overlap=20,
)

# Open a gRPC handle on the index named by index_name and query its stats.
# The result is not stored; it is left as the final expression of the cell.
index = pinecone.GRPCIndex(index_name)
index.describe_index_stats()

# data_list=[]
# for count, doc in enumerate(files, start=1):
#     print('--------------')
#     print(f'Document #: {count}')
#     loader = UnstructuredPDFLoader(doc)
#     data = loader.load()
#     texts = text_splitter.split_documents(data)
    
    
#     data_list.append(data)

# for data in data_list:
#     texts=text_splitter.split_documents(data)
#     print(texts)
    
# print(f'This document was split into {len(texts)} documents')
# print(texts)
# Pinecone.from_texts(
#     [t.page_content for t in texts],
#     embeddings,
#     index_name=index_name
# )
# print(data[0])
# print('--------------')
# print(f'\n')

In [None]:
# text_list holds one list of chunks per PDF (each loop iteration appends the
# whole list returned by split_documents), so it has to be flattened before
# the chunk texts are read; accessing .page_content on the inner lists
# directly raises AttributeError.
chunk_texts = [
    chunk.page_content
    for doc_chunks in text_list
    for chunk in doc_chunks
]

# Embed every chunk and upsert it into the Pinecone index named by index_name.
docsearch = Pinecone.from_texts(chunk_texts, embeddings, index_name=index_name)

## Chunk the data