In [None]:
from langchain_community.document_loaders import RecursiveUrlLoader, WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_transformers import BeautifulSoupTransformer
from bs4 import BeautifulSoup
import pickle
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from dotenv import load_dotenv

In [4]:
load_dotenv()

True

In [9]:
def bs4_extractor(html: str) -> str:
    soup = BeautifulSoup(html, "html.parser")
    return soup.get_text()

In [81]:
loader  = RecursiveUrlLoader('https://python.langchain.com/api_reference/',
                             max_depth=17,
                             extractor=bs4_extractor)

In [82]:
page = loader.lazy_load()

## Preprocessing Functions

In [18]:
base = ' https://python.langchain.com/api_reference/ibm/index.html'

base_class = base.split('/')[4]
base_class

'ibm'

In [50]:
import unicodedata
import re

def clean_text(text):
    text = text.encode('latin1', errors='ignore').decode('utf-8', errors='ignore')

    text = unicodedata.normalize('NFKC', text)

    text = ''.join(c for c in text if unicodedata.category(c)[0] != 'C')
    text = re.sub(r"LangChain\s+documentation$", "", text).strip()
    text = text.split("|")[0].strip()
    return text

In [51]:
clean_text('langchain-fireworks: 0.3.0 â€” ðŸ¦œðŸ”— LangChain  documentation')

'langchain-fireworks: 0.3.0'

In [53]:
def remove_blank_lines(text):
    lines = text.splitlines()
    while lines and lines[0].strip() == "":
        lines.pop(0)
    while lines and lines[-1].strip() == "":
        lines.pop()
    lines = "\n".join(lines)
    return re.sub(r'\n{3,}', '\n\n', lines.strip())

In [54]:
def extract_main_content(text, marker_leading="LangChain Python API Reference", marker_trailing="© Copyright 2025, LangChain Inc."):
    main_content = text.split(marker_leading, 1)[-1]
    main_content = main_content.split(marker_trailing, 1)[0]
    main_content = main_content.strip()
    main_content = remove_blank_lines(main_content)
    return main_content

In [55]:
text = '''





runnables — 🦜🔗 LangChain  documentation



































Skip to main content


Back to top




Ctrl+K
























    Reference










Ctrl+K







Docs










GitHub



X / Twitter








Ctrl+K















    Reference











Docs










GitHub



X / Twitter







Section Navigation
Base packages

Core
Langchain
agents
callbacks
chains
chat_models
embeddings
evaluation
globals
hub
indexes
memory
model_laboratory
output_parsers
retrievers
runnables
HubRunnable
OpenAIFunction
OpenAIFunctionsRouter


smith
storage


Text Splitters
Community
Experimental

Integrations

AI21
Anthropic
AstraDB
AWS
Azure Ai
Azure Dynamic Sessions
Cerebras
Chroma
Cohere
Deepseek
Elasticsearch
Exa
Fireworks
Google Community
Google GenAI
Google VertexAI
Groq
Huggingface
IBM
Milvus
MistralAI
MongoDB
Neo4J
Nomic
Nvidia Ai Endpoints
Ollama
OpenAI
Perplexity
Pinecone
Postgres
Prompty
Qdrant
Redis
Sema4
Snowflake
Sqlserver
Standard Tests
Tavily
Together
Unstructured
Upstage
VoyageAI
Weaviate
XAI
























LangChain Python API Reference
langchain: 0.3.25
runnables









runnables#
LangChain Runnable and the LangChain Expression Language (LCEL).
The LangChain Expression Language (LCEL) offers a declarative method to build
production-grade programs that harness the power of LLMs.
Programs created using LCEL and LangChain Runnables inherently support
synchronous, asynchronous, batch, and streaming operations.
Support for async allows servers hosting the LCEL based programs
to scale better for higher concurrent loads.
Batch operations allow for processing multiple inputs in parallel.
Streaming of intermediate outputs, as they’re being generated, allows for
creating more responsive UX.
This module contains non-core Runnable classes.
Classes


runnables.hub.HubRunnable
An instance of a runnable stored in the LangChain Hub.

runnables.openai_functions.OpenAIFunction
A function description for ChatOpenAI

runnables.openai_functions.OpenAIFunctionsRouter
A runnable that routes to the selected function.






















      © Copyright 2025, LangChain Inc.










'''

text = extract_main_content(text)

In [56]:
print(text)

langchain: 0.3.25
runnables

runnables#
LangChain Runnable and the LangChain Expression Language (LCEL).
The LangChain Expression Language (LCEL) offers a declarative method to build
production-grade programs that harness the power of LLMs.
Programs created using LCEL and LangChain Runnables inherently support
synchronous, asynchronous, batch, and streaming operations.
Support for async allows servers hosting the LCEL based programs
to scale better for higher concurrent loads.
Batch operations allow for processing multiple inputs in parallel.
Streaming of intermediate outputs, as they’re being generated, allows for
creating more responsive UX.
This module contains non-core Runnable classes.
Classes

runnables.hub.HubRunnable
An instance of a runnable stored in the LangChain Hub.

runnables.openai_functions.OpenAIFunction
A function description for ChatOpenAI

runnables.openai_functions.OpenAIFunctionsRouter
A runnable that routes to the selected function.


## Dataset Creation

In [83]:
from tqdm import tqdm
from collections import defaultdict

documentation = defaultdict(list)

for i, document in enumerate(tqdm(page, desc="Processing documents")):
    content = document.page_content
    content = extract_main_content(content)
    base_class = document.metadata['source']
    class_list = base_class.split('/')
    root_topic, topic = class_list[4], class_list[-1].replace('.html', '')
    if root_topic == '':
        root_topic = 'Langchain'
    documentation[root_topic].append(f"\n\n## Class Objects: {topic}\n{content}")

Processing documents: 775it [04:01,  3.20it/s]


In [84]:
print(len(documentation['google_genai']))

16

In [85]:
documentation = {
    topic: "\n".join(contents)
    for topic, contents in documentation.items()
}

In [86]:
print(type(documentation['google_genai']))
print(len(documentation['google_genai']))

<class 'str'>
144531


In [91]:
documentation.keys()

dict_keys(['Langchain', 'perplexity', 'google_genai', 'ollama', 'chroma', 'pinecone', 'reference.html', 'nvidia_ai_endpoints', 'upstage', 'aws', 'anthropic', 'fireworks', '_static', 'cerebras', 'search.html', 'sqlserver', 'redis', '_modules', 'prompty', 'text_splitters', 'standard_tests', 'mistralai', 'mongodb', 'together', 'groq', 'cohere', 'experimental', 'nomic', 'openai', 'azure_dynamic_sessions', 'postgres', 'milvus', 'snowflake', 'neo4j', 'xai', 'unstructured', 'qdrant', 'tavily', 'astradb', 'community', 'ibm', 'core', 'google_vertexai', 'azure_ai', 'huggingface', 'elasticsearch', 'google_community', 'langchain', 'weaviate', 'ai21', 'deepseek', 'exa', 'voyageai', 'index.html', 'sema4'])

In [95]:
documentation.pop('_static')
documentation.pop('index.html')
documentation.pop('search.html')

'\n\n## Class Objects: search\nSearch - 🦜🔗 LangChain  documentation\n\nSkip to main content\n\nBack to top\n\nCtrl+K\n\n    Reference\n  \n\nCtrl+K\n\nDocs\n\nGitHub\n\nX / Twitter\n\nCtrl+K\n\n    Reference\n  \n\nDocs\n\nGitHub\n\nX / Twitter\n\nSearch\n\nError\nPlease activate JavaScript to enable the search functionality.\n\nCtrl+K'

In [96]:
documentation.keys()

dict_keys(['Langchain', 'perplexity', 'google_genai', 'ollama', 'chroma', 'pinecone', 'reference.html', 'nvidia_ai_endpoints', 'upstage', 'aws', 'anthropic', 'fireworks', 'cerebras', 'sqlserver', 'redis', '_modules', 'prompty', 'text_splitters', 'standard_tests', 'mistralai', 'mongodb', 'together', 'groq', 'cohere', 'experimental', 'nomic', 'openai', 'azure_dynamic_sessions', 'postgres', 'milvus', 'snowflake', 'neo4j', 'xai', 'unstructured', 'qdrant', 'tavily', 'astradb', 'community', 'ibm', 'core', 'google_vertexai', 'azure_ai', 'huggingface', 'elasticsearch', 'google_community', 'langchain', 'weaviate', 'ai21', 'deepseek', 'exa', 'voyageai', 'sema4'])

## Saving Data

In [97]:
import pickle

try:
    dataset= open('LangDataset.pickle', 'wb')
    pickle.dump(documentation, dataset)
    dataset.close()

except:
    print("Something went wrong")

## Function Testing

In [41]:
from collections import defaultdict

grouped = defaultdict(list)

for base_class, content in dataset.items():
    class_list = base_class.split('/')
    root_topic, topic = class_list[4], class_list[-1]
    grouped[root_topic].append(f"\n## Class Objects: {topic}\n{content}")

In [42]:
grouped

defaultdict(list,
            {'': ['\n## Class Objects: \nabc'],
             'ibm': ['\n## Class Objects: index.html\nabc',
              '\n## Class Objects: index\nbcd']})

In [43]:
combined_dataset = {
    topic: "\n".join(contents)
    for topic, contents in grouped.items()
}

In [44]:
combined_dataset

{'': '\n## Class Objects: \nabc',
 'ibm': '\n## Class Objects: index.html\nabc\n\n## Class Objects: index\nbcd'}

In [49]:
import re

text = '''My name

is



DArshil





Pungalia
'''
print(text)
text = re.sub(r'\n{3,}', '\n\n', text.strip())

My name

is



DArshil





Pungalia



In [10]:
def loadData(file):
    dbfile = open(file, 'rb')
    db = pickle.load(dbfile)

    return db

In [11]:
data = loadData(r'datasets\LangDatasetChunked.pickle')

In [16]:
print(data[0]['text'])

### Instruction: Learn about the perplexity LangChain API.

### Part 2 - Module:index(chunk1)

langchain-perplexity: 0.1.1

langchain-perplexity: 0.1.1#
This package provides the Perplexity integration for LangChain.

chat_models#
Classes

chat_models.ChatPerplexity
Perplexity AI Chat models API.


## Instruction-Response Format

In [5]:
def loadData(file):
    dbfile = open(file, 'rb')
    db = pickle.load(dbfile)

    return db

In [31]:
data = loadData(r'datasets\LangDatasetBetter.pickle')

In [36]:
print(len(data))
print(type(data))

52
<class 'dict'>


In [45]:
data.keys()

dict_keys(['Langchain', 'perplexity', 'google_genai', 'ollama', 'chroma', 'pinecone', 'reference.html', 'nvidia_ai_endpoints', 'upstage', 'aws', 'anthropic', 'fireworks', 'cerebras', 'sqlserver', 'redis', '_modules', 'prompty', 'text_splitters', 'standard_tests', 'mistralai', 'mongodb', 'together', 'groq', 'cohere', 'experimental', 'nomic', 'openai', 'azure_dynamic_sessions', 'postgres', 'milvus', 'snowflake', 'neo4j', 'xai', 'unstructured', 'qdrant', 'tavily', 'astradb', 'community', 'ibm', 'core', 'google_vertexai', 'azure_ai', 'huggingface', 'elasticsearch', 'google_community', 'langchain', 'weaviate', 'ai21', 'deepseek', 'exa', 'voyageai', 'sema4'])

In [25]:
model = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0.3
)

parser = JsonOutputParser()

In [50]:
prompt = PromptTemplate(
    template='Generate as many unique questions and answer pairs for the given {topic} from the text and output in a JSON format in which "instuction" contains the question and "response" contains the answer, questions should include both documentation and implementation questions like code etc:\n{text}',
    input_variables=['topic', 'text'],
    partial_variables={'format_instruction': parser.get_format_instructions()})

In [51]:
prompt

PromptTemplate(input_variables=['text', 'topic'], input_types={}, partial_variables={'format_instruction': 'Return a JSON object.'}, template='Generate as many unique questions and answer pairs for the given {topic} from the text and output in a JSON format in which "instuction" contains the question and "response" contains the answer, questions should include both documentation and implementation questions like code etc:\n{text}')

In [52]:
chain = prompt | model | parser

In [53]:
from tqdm import tqdm

instruction_key_pairs = []

for i, (key, value) in enumerate(tqdm(data.items(), desc="Processing documents", total=len(data))):
    pairs = chain.invoke({'topic': key, 'text': value})
    instruction_key_pairs.append(pairs)

Processing documents: 100%|██████████| 52/52 [13:26<00:00, 15.51s/it]


In [57]:
tuning_examples = 0

for i in instruction_key_pairs:
    tuning_examples += len(i)

tuning_examples

1968

In [58]:
instruction_key_pairs_combined = [pair for topic in instruction_key_pairs for pair in topic]

In [59]:
len(instruction_key_pairs_combined)

1968

In [60]:
upload = open(r'datasets\InsReDataset.pickle', 'wb')
pickle.dump(instruction_key_pairs_combined, upload)
upload.close()