# Document Loader
A Document Loader is a class that loads Documents from various sources.

Listed below are some examples of Document Loaders.

- PyPDFLoader: Loads PDF files
- CSVLoader: Loads CSV files
- UnstructuredHTMLLoader: Loads HTML files
- JSONLoader: Loads JSON files
- TextLoader: Loads text files
- DirectoryLoader: Loads documents from a directory

In [None]:
!pip install langchain langchain_community pypdf

In [7]:
!pip install pypdf

Collecting pypdf
  Downloading pypdf-5.5.0-py3-none-any.whl.metadata (7.2 kB)
Downloading pypdf-5.5.0-py3-none-any.whl (303 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 303.4/303.4 kB 4.7 MB/s eta 0:00:00
Installing collected packages: pypdf
Successfully installed pypdf-5.5.0


In [2]:
from langchain_core.documents import Document

# Build a Document by hand; only the text content is supplied here.
document = Document(page_content="this is my document")

# Display every field stored on the instance.
vars(document)

{'id': None,
 'metadata': {},
 'page_content': 'this is my document',
 'type': 'Document'}

In [4]:
# Record where the text came from and which page it belongs to.
document.metadata.update({"source": "conyent/file.pdf", "page": 0})

document.metadata

{'source': 'conyent/file.pdf', 'page': 0}

## 1. load()

In [5]:
from langchain_community.document_loaders import PyPDFLoader

# Create a loader bound to this PDF path.
loader = PyPDFLoader(file_path="/content/MIASDB Excerpta Medica 1994.pdf")

In [8]:
# Load all Documents from the PDF in one call
docs = loader.load()

# Number of Documents returned
len(docs)

5

In [9]:
# Inspect up to the first 10 Documents (slicing is safe even if fewer exist)
docs[0:10]

[Document(metadata={'producer': 'Qt 5.3.0', 'creator': '', 'creationdate': '2015-08-24T13:30:22+00:00', 'moddate': '2015-08-25T13:22:46+01:00', 'title': '', 'rgid': 'PB:243788073_AS:104388743925772@1401899624762', 'source': '/content/MIASDB Excerpta Medica 1994.pdf', 'total_pages': 5, 'page': 0, 'page_label': '1'}, page_content='See\tdiscussions,\tstats,\tand\tauthor\tprofiles\tfor\tthis\tpublication\tat:\t\nhttp://www.researchgate.net/publication/243788073\nThe\tmammographic\timage\tanalysis\tsociety\ndigital\tmammogram\tdatabase.\tExerpta\tMedica\nARTICLE\n\t·\tJANUARY\t1994\nCITATIONS\n170\n3\tAUTHORS\n,\tINCLUDING:\nJohn\tSuckling\nUniversity\tof\tCambridge\n264\n\t\nPUBLICATIONS\n\t\t\t\n14,688\n\t\nCITATIONS\n\t\t\t\nSEE\tPROFILE\nAvailable\tfrom:\tJohn\tSuckling\nRetrieved\ton:\t24\tAugust\t2015'),
 Document(metadata={'producer': 'Qt 5.3.0', 'creator': '', 'creationdate': '2015-08-24T13:30:22+00:00', 'moddate': '2015-08-25T13:22:46+01:00', 'title': '', 'rgid': 'PB:243788073_AS:1

## 2. aload()
- Asynchronously loads Documents and returns them as a list[Document].

In [10]:
# Load Documents asynchronously.
# `await` at the top level works here because this is a notebook cell; in a plain
# Python script the call has to live inside an `async def` function.
docs = await loader.aload()

## 3. load_and_split()
- Loads Documents, automatically splits them into chunks using a TextSplitter, and returns them as a list[Document].

In [11]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Small, non-overlapping chunks: chunk_size caps each chunk's length and
# chunk_overlap=0 means consecutive chunks share no text.
# (Length is presumably measured in characters, the splitter's default — confirm in its docs.)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=128, chunk_overlap=0)

# Load the PDF and split it with the splitter above in a single call.
# NOTE(review): LangChain's API reference describes load_and_split() as effectively
# deprecated; text_splitter.split_documents(loader.load()) is the explicit
# equivalent — confirm against the installed version.
docs = loader.load_and_split(text_splitter)

In [12]:
# Number of chunks returned by load_and_split()
len(docs)

4

In [13]:
# Inspect up to the first 10 chunks; each one is still a Document with metadata
docs[0:10]

[Document(metadata={'producer': 'Qt 5.3.0', 'creator': '', 'creationdate': '2015-08-24T13:30:22+00:00', 'moddate': '2015-08-25T13:22:46+01:00', 'title': '', 'rgid': 'PB:243788073_AS:104388743925772@1401899624762', 'source': '/content/MIASDB Excerpta Medica 1994.pdf', 'total_pages': 5, 'page': 0, 'page_label': '1'}, page_content='See\tdiscussions,\tstats,\tand\tauthor\tprofiles\tfor\tthis\tpublication\tat:\t\nhttp://www.researchgate.net/publication/243788073'),
 Document(metadata={'producer': 'Qt 5.3.0', 'creator': '', 'creationdate': '2015-08-24T13:30:22+00:00', 'moddate': '2015-08-25T13:22:46+01:00', 'title': '', 'rgid': 'PB:243788073_AS:104388743925772@1401899624762', 'source': '/content/MIASDB Excerpta Medica 1994.pdf', 'total_pages': 5, 'page': 0, 'page_label': '1'}, page_content='The\tmammographic\timage\tanalysis\tsociety\ndigital\tmammogram\tdatabase.\tExerpta\tMedica\nARTICLE\n\t·\tJANUARY\t1994\nCITATIONS\n170'),
 Document(metadata={'producer': 'Qt 5.3.0', 'creator': '', 'crea

## 4. lazy_load()
- Loads Documents sequentially and returns them as an Iterator[Document].

In [14]:
# Calling lazy_load() only creates a generator; Documents are produced as it is iterated
loader.lazy_load()

<generator object PyPDFLoader.lazy_load at 0x790f819e8e50>

In [15]:
# Load Documents sequentially: each iteration pulls the next Document from the generator
docs = loader.lazy_load()
for doc in docs:
    print(doc.metadata)
    break  # Stop after the first Document to limit the output length

{'producer': 'Qt 5.3.0', 'creator': '', 'creationdate': '2015-08-24T13:30:22+00:00', 'moddate': '2015-08-25T13:22:46+01:00', 'title': '', 'rgid': 'PB:243788073_AS:104388743925772@1401899624762', 'source': '/content/MIASDB Excerpta Medica 1994.pdf', 'total_pages': 5, 'page': 0, 'page_label': '1'}


## 5. alazy_load()
- Asynchronously loads Documents sequentially and returns them as an AsyncIterator[Document].

In [16]:
loader.alazy_load()

# Load Documents asynchronously and sequentially
docs = loader.alazy_load()
async for doc in docs:
    print(doc.metadata)
    break  # Used to limit the output length

{'producer': 'Qt 5.3.0', 'creator': '', 'creationdate': '2015-08-24T13:30:22+00:00', 'moddate': '2015-08-25T13:22:46+01:00', 'title': '', 'rgid': 'PB:243788073_AS:104388743925772@1401899624762', 'source': '/content/MIASDB Excerpta Medica 1994.pdf', 'total_pages': 5, 'page': 0, 'page_label': '1'}
