In [2]:

from langchain_community.document_loaders import ArxivLoader

arxiv_loader = ArxivLoader("1706.03762", filter=["title", "authors", "summary", "published", "updated", "categories", "doi", "journal_ref"]) 
arxiv_data = arxiv_loader.load()

print(arxiv_data)

[Document(metadata={'Published': '2023-08-02', 'Title': 'Attention Is All You Need', 'Authors': 'Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin', 'Summary': 'The dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks in an encoder-decoder configuration. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer, based\nsolely on attention mechanisms, dispensing with recurrence and convolutions\nentirely. Experiments on two machine translation tasks show these models to be\nsuperior in quality while being more parallelizable and requiring significantly\nless time to train. Our model achieves 28.4 BLEU on the WMT 2014\nEnglish-to-German translation task, improving over the existing best results,\nincluding ensembles by over 2 BLEU. On the WMT 2014 English-to-French\ntr

In [12]:
# How to recurseively split a document into 

from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)
split_docs = text_splitter.split_documents(arxiv_data)

with open("arxiv_split_docs.txt", "w") as f:
    for doc in split_docs:
        f.write(doc.page_content + "\n\n")

    
type(split_docs[0])


# Using CharacterTextSplitter
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)
split_docs_char = text_splitter.split_documents(arxiv_data)

with open("arxiv_split_docs_char.txt", "w") as f:
    for doc in split_docs_char:
        f.write(doc.page_content + "\n\n")



In [27]:
from langchain_text_splitters import HTMLHeaderTextSplitter

headers_to_split = [
    ("h1","Header 1"),
    ("h2","Header 2"),
]
html_splitter = HTMLHeaderTextSplitter(
    headers_to_split
)
# split_html_docs = html_splitter.split_text()
split_html_docs = html_splitter.split_text_from_url("https://en.wikipedia.org/wiki/Artificial_intelligence")    

for doc in split_html_docs:
    print(doc.page_content)
    print("\n---\n")


Jump to content  
Main menu  
Main menu  
move to sidebar  
hide  
Navigation  
Main page  
Contents  
Current events  
Random article  
About Wikipedia  
Contact us  
Contribute  
Help  
Learn to edit  
Community portal  
Recent changes  
Upload file  
Special pages  
Search  
Search  
Appearance  
Donate  
Create account  
Log in  
Personal tools  
Donate  
Create account  
Log in  
Pages for logged out editors  
learn more  
Contributions  
Talk  
CentralNotice

---

Contents

---

move to sidebar  
hide  
(Top)  
1  
Goals  
Toggle Goals subsection  
1.1  
Reasoning and problem-solving  
1.2  
Knowledge representation  
1.3  
Planning and decision-making  
1.4  
Learning  
1.5  
Natural language processing  
1.6  
Perception  
1.7  
Social intelligence  
1.8  
General intelligence  
2  
Techniques  
Toggle Techniques subsection  
2.1  
Search and optimization  
2.1.1  
State space search  
2.1.2  
Local search  
2.2  
Logic  
2.3  
Probabilistic methods for uncertain reasoning  
2.

In [32]:
# how to split json data 
import json
import requests

json_data= requests.get("https://api.smith.langchain.com/openapi.json").json()

from langchain_text_splitters import RecursiveJsonSplitter

json_splitter = RecursiveJsonSplitter(
    max_chunk_size=300)
split_json_docs = json_splitter.split_json(json_data)

for doc in split_json_docs[:3]:  # Display only the first 3 documents
    print(doc)
    print("\n---\n")

{'openapi': '3.1.0', 'info': {'title': 'LangSmith', 'version': '0.1.0'}, 'paths': {'/api/v1/sessions/{session_id}/dashboard': {'post': {'tags': ['tracer-sessions'], 'summary': 'Get Tracing Project Prebuilt Dashboard', 'description': 'Get a prebuilt dashboard for a tracing project.'}}}}

---

{'paths': {'/api/v1/sessions/{session_id}/dashboard': {'post': {'operationId': 'get_tracing_project_prebuilt_dashboard_api_v1_sessions__session_id__dashboard_post', 'security': [{'API Key': []}, {'Tenant ID': []}, {'Bearer Auth': []}]}}}}

---

{'paths': {'/api/v1/sessions/{session_id}/dashboard': {'post': {'parameters': [{'name': 'session_id', 'in': 'path', 'required': True, 'schema': {'type': 'string', 'format': 'uuid', 'title': 'Session Id'}}, {'name': 'accept', 'in': 'header', 'required': False, 'schema': {'anyOf': [{'type': 'string'}, {'type': 'null'}], 'title': 'Accept'}}]}}}}

---

