In [None]:
import pickle
import re
import unicodedata

from bs4 import BeautifulSoup
from langchain.document_transformers import BeautifulSoupTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import RecursiveUrlLoader, WebBaseLoader
from tqdm import tqdm

In [60]:
def bs4_extractor(html: str) -> str:
    """Return the plain text contained in an HTML string.

    The markup is parsed with BeautifulSoup using the ``"html.parser"``
    backend and the result of ``get_text()`` is returned unchanged, so the
    whitespace and blank lines produced by the page layout are kept.

    Relies on ``BeautifulSoup`` being imported from ``bs4`` in the notebook's
    import cell.

    Args:
        html: Raw HTML source of one page.

    Returns:
        The text extracted from the parsed document.
    """
    return BeautifulSoup(html, "html.parser").get_text()

In [112]:
# Crawler rooted at the LangChain API reference. Every fetched page is passed
# through bs4_extractor, and max_depth=12 bounds how far the recursive loader
# descends from the root URL.
loader = RecursiveUrlLoader(
    'https://python.langchain.com/api_reference/',
    extractor=bs4_extractor,
    max_depth=12,
)

In [113]:
# Run the crawl. Despite the singular name, `page` is a collection of
# documents: it is measured with len() and iterated in the cells below.
page = loader.load()

In [114]:
# Number of documents returned by the crawl.
len(page)

5339

In [82]:
# Show each crawled document's position in `page` next to its source URL.
for index, document in enumerate(page):
    print(index, document.metadata['source'])

0 https://python.langchain.com/api_reference/
1 https://python.langchain.com/api_reference/ibm/index.html
2 https://python.langchain.com/api_reference/exa/index.html
3 https://python.langchain.com/api_reference/nomic/index.html
4 https://python.langchain.com/api_reference/index.html
5 https://python.langchain.com/api_reference/weaviate/index.html
6 https://python.langchain.com/api_reference/_static/scripts/pydata-sphinx-theme.js?digest=8878045cc6db502f8baf
7 https://python.langchain.com/api_reference/_static/styles/pydata-sphinx-theme.css?digest=8878045cc6db502f8baf
8 https://python.langchain.com/api_reference/standard_tests/index.html
9 https://python.langchain.com/api_reference/search.html
10 https://python.langchain.com/api_reference/google_community/index.html
11 https://python.langchain.com/api_reference/milvus/index.html
12 https://python.langchain.com/api_reference/_static/pygments.css?v=8f2a1f02
13 https://python.langchain.com/api_reference/neo4j/index.html
14 https://python.la

In [None]:
def clean_text(text):
    """Reduce a page title to its meaningful name.

    Processing steps, in order:

    1. Repair mojibake: a UTF-8 title that was mis-decoded as cp1252 or
       latin-1 is recovered by reversing that round trip. The repair is only
       accepted when it is lossless; text that is already correctly decoded
       (including accented letters) is left untouched instead of having its
       non-ASCII characters deleted.
    2. Apply NFKC normalisation and drop every character whose Unicode
       category starts with ``C`` (control, format, unassigned, ...).
    3. Remove a trailing ``LangChain documentation`` marker together with the
       non-ASCII decoration tokens (dash, emoji) that precede it. ASCII
       separators such as ``-`` are deliberately kept.
    4. Keep only the part before the first ``|``.

    Args:
        text: Raw title string.

    Returns:
        The cleaned title with surrounding whitespace removed.
    """
    # Try the stricter cp1252 mapping first, then latin-1. A failed encode or
    # decode means the text was not mojibake for that codec, so keep it as is.
    for codec in ('cp1252', 'latin1'):
        try:
            text = text.encode(codec).decode('utf-8')
        except UnicodeError:
            continue
        break

    text = unicodedata.normalize('NFKC', text)

    text = ''.join(c for c in text if unicodedata.category(c)[0] != 'C')

    # `(?:\s+<non-ASCII symbols>)*` eats whitespace-separated decoration tokens
    # made only of non-word, non-ASCII characters; `\s*$` tolerates trailing
    # whitespace that previously prevented the suffix from matching.
    text = re.sub(
        r"(?:\s+[^\w\s\x00-\x7f]+)*\s*LangChain\s+documentation\s*$", "", text
    ).strip()
    text = text.split("|")[0].strip()
    return text

In [116]:
# Sanity check on a garbled (mis-decoded) page title: only the package name
# and version should remain.
clean_text('langchain-fireworks: 0.3.0 â€” ðŸ¦œðŸ”— LangChain  documentation')

'langchain-fireworks: 0.3.0'

In [117]:
def remove_blank_lines(text):
    """Drop blank lines from the start and end of ``text``.

    A line counts as blank when it is empty or contains only whitespace.
    Blank lines between non-blank lines are preserved. Because the text is
    split with ``str.splitlines()`` and re-joined with ``"\\n"``, every line
    ending is normalised to ``"\\n"`` and no trailing newline is emitted.

    Args:
        text: Multi-line string to trim.

    Returns:
        The lines from the first to the last non-blank line joined with
        ``"\\n"``, or ``""`` when every line is blank.
    """
    lines = text.splitlines()

    # Move two indices inwards instead of popping from the list: pop(0)
    # shifts the whole list each time, which is quadratic on pages that start
    # with long runs of blank lines.
    start, end = 0, len(lines)
    while start < end and not lines[start].strip():
        start += 1
    while end > start and not lines[end - 1].strip():
        end -= 1

    return "\n".join(lines[start:end])

In [118]:
def extract_main_content(text, marker_leading="LangChain Python API Reference", marker_trailing="© Copyright 2025, LangChain Inc."):
    """Cut a page's text down to the part between two marker strings.

    Everything up to and including the first ``marker_leading`` is discarded
    (the text is kept whole when that marker is absent), then everything from
    the first ``marker_trailing`` onwards is discarded. The remainder is
    stripped of surrounding whitespace and passed through
    ``remove_blank_lines``.

    Args:
        text: Full text of a page.
        marker_leading: String that precedes the wanted content.
        marker_trailing: String that follows the wanted content.

    Returns:
        The trimmed text between the two markers.
    """
    _, leading_found, after_leading = text.partition(marker_leading)
    body = after_leading if leading_found else text
    body = body.partition(marker_trailing)[0]
    return remove_blank_lines(body.strip())

In [119]:
text = """





runnables — 🦜🔗 LangChain  documentation



































Skip to main content


Back to top




Ctrl+K
























    Reference










Ctrl+K







Docs










GitHub



X / Twitter








Ctrl+K















    Reference











Docs










GitHub



X / Twitter







Section Navigation
Base packages

Core
Langchain
agents
callbacks
chains
chat_models
embeddings
evaluation
globals
hub
indexes
memory
model_laboratory
output_parsers
retrievers
runnables
HubRunnable
OpenAIFunction
OpenAIFunctionsRouter


smith
storage


Text Splitters
Community
Experimental

Integrations

AI21
Anthropic
AstraDB
AWS
Azure Ai
Azure Dynamic Sessions
Cerebras
Chroma
Cohere
Deepseek
Elasticsearch
Exa
Fireworks
Google Community
Google GenAI
Google VertexAI
Groq
Huggingface
IBM
Milvus
MistralAI
MongoDB
Neo4J
Nomic
Nvidia Ai Endpoints
Ollama
OpenAI
Perplexity
Pinecone
Postgres
Prompty
Qdrant
Redis
Sema4
Snowflake
Sqlserver
Standard Tests
Tavily
Together
Unstructured
Upstage
VoyageAI
Weaviate
XAI
























LangChain Python API Reference
langchain: 0.3.25
runnables









runnables#
LangChain Runnable and the LangChain Expression Language (LCEL).
The LangChain Expression Language (LCEL) offers a declarative method to build
production-grade programs that harness the power of LLMs.
Programs created using LCEL and LangChain Runnables inherently support
synchronous, asynchronous, batch, and streaming operations.
Support for async allows servers hosting the LCEL based programs
to scale better for higher concurrent loads.
Batch operations allow for processing multiple inputs in parallel.
Streaming of intermediate outputs, as they’re being generated, allows for
creating more responsive UX.
This module contains non-core Runnable classes.
Classes


runnables.hub.HubRunnable
An instance of a runnable stored in the LangChain Hub.

runnables.openai_functions.OpenAIFunction
A function description for ChatOpenAI

runnables.openai_functions.OpenAIFunctionsRouter
A runnable that routes to the selected function.






















      © Copyright 2025, LangChain Inc.











"""

# Try the extractor on the sample page text and print its repr so that the
# newlines kept inside the result are visible.
cleaned = extract_main_content(text)

print(f"After:\n{cleaned!r}")


After:
'langchain: 0.3.25\nrunnables\n\n\n\n\n\n\n\n\n\nrunnables#\nLangChain Runnable and the LangChain Expression Language (LCEL).\nThe LangChain Expression Language (LCEL) offers a declarative method to build\nproduction-grade programs that harness the power of LLMs.\nPrograms created using LCEL and LangChain Runnables inherently support\nsynchronous, asynchronous, batch, and streaming operations.\nSupport for async allows servers hosting the LCEL based programs\nto scale better for higher concurrent loads.\nBatch operations allow for processing multiple inputs in parallel.\nStreaming of intermediate outputs, as they’re being generated, allows for\ncreating more responsive UX.\nThis module contains non-core Runnable classes.\nClasses\n\n\nrunnables.hub.HubRunnable\nAn instance of a runnable stored in the LangChain Hub.\n\nrunnables.openai_functions.OpenAIFunction\nA function description for ChatOpenAI\n\nrunnables.openai_functions.OpenAIFunctionsRouter\nA runnable that routes to the

In [None]:
# Build `documentation`: a dict mapping a normalised page name to that page's
# trimmed text.
documentation = {}

for index, document in enumerate(tqdm(page, desc="Processing documents")):
    metadata = document.metadata
    content = document.page_content

    # Nothing to store for a sourced page whose text is empty or whitespace.
    if 'source' in metadata and not content.strip():
        continue

    # Prefer the cleaned page title. Fall back to the last URL segment (or a
    # positional placeholder) when there is no title or it cleans to nothing,
    # so that an empty string is never used as a key.
    name = clean_text(metadata['title']) if 'title' in metadata else ''
    if not name:
        url = metadata.get('source', '')
        name = url.rstrip('/').split('/')[-1] or f"doc_{index}"

    name = name.strip().lower().replace(" ", "_")

    trimmed = extract_main_content(content)

    # Different pages can share a title. Keep the first page under the plain
    # name, skip exact duplicates, and store other pages under a numbered key
    # instead of silently overwriting what was collected earlier.
    if name in documentation:
        if documentation[name] == trimmed:
            continue
        suffix = 2
        while f"{name}_{suffix}" in documentation:
            suffix += 1
        name = f"{name}_{suffix}"

    documentation[name] = trimmed

Processing documents: 100%|██████████| 5339/5339 [00:00<00:00, 8196.88it/s]


In [123]:
# Preview the first 500 names stored in `documentation`.
for key in list(documentation)[:500]:
    print(key)

langchain_python_api_reference
langchain-ibm:_0.3.11
langchain-exa:_0.2.1
langchain-nomic:_0.1.4
langchain-weaviate:_0.0.4
pydata-sphinx-theme.js?digest=8878045cc6db502f8baf
pydata-sphinx-theme.css?digest=8878045cc6db502f8baf
langchain-tests:_0.3.19
search_-
langchain-google-community:_2.0.7
search
google_speech_to_text
pygments.css?v=8f2a1f02
langchain-neo4j:_0.4.0
langchain-groq:_0.3.2
langchain-anthropic:_0.3.13
langchain-redis:_0.2.1
vertex_ai_search
documentai_warehouse
gmail
langchain-huggingface:_0.2.0
langchain-ai21:_1.1.0
langchain-mistralai:_0.2.10
googlesearchrun
langchain-upstage:_0.6.0
custom.css?v=8e9fa5b3
langchain-elasticsearch:_0.3.2
bq_storage_vectorstores
langchain-azure-dynamic-sessions:_0.2.0
langchain:_0.3.25
langchain-chroma:_0.2.4
langchain-community:_0.3.24
langchain-xai:_0.2.3
langchain-ollama:_0.3.3
langchain-sqlserver:_0.1.2
vertex_check_grounding
drive
langchain-prompty:_0.1.1
langchain-unstructured:_0.1.6
calendar
langchain-voyageai:_0.1.4
langchain-astrad

In [124]:
# Spot-check the text stored for one processed entry.
print(documentation['textsplitter'])

langchain-text-splitters: 0.3.8
base
TextSplitter









TextSplitter#


class langchain_text_splitters.base.TextSplitter(chunk_size: int = 4000, chunk_overlap: int = 200, length_function: ~typing.Callable[[str], int] = <built-in function len>, keep_separator: bool | ~typing.Literal['start', 'end'] = False, add_start_index: bool = False, strip_whitespace: bool = True)[source]#
Interface for splitting text into chunks.
Create a new TextSplitter.

Parameters:

chunk_size (int) – Maximum size of chunks to return
chunk_overlap (int) – Overlap in characters between chunks
length_function (Callable[[str], int]) – Function that measures the length of given chunks
keep_separator (Union[bool, Literal['start', 'end']]) – Whether to keep the separator and where to place it
in each corresponding chunk (True=’start’)
add_start_index (bool) – If True, includes chunk’s start index in metadata
strip_whitespace (bool) – If True, strips whitespace from the start and end of
every document



Methods




In [None]:
# Persist the processed pages to the file 'LangDataset' with pickle.
# NOTE: only load this file back from a trusted location, because
# unpickling executes code embedded in the file.
try:
    # The context manager closes the file even when pickle.dump raises.
    with open('LangDataset', 'wb') as dataset:
        pickle.dump(documentation, dataset)

except Exception as error:
    # Saving stays best-effort, but the cause is reported rather than hidden,
    # and KeyboardInterrupt/SystemExit are no longer swallowed.
    print(f"Something went wrong: {error!r}")