#### How to split JSON

This JSON splitter splits JSON data while allowing control over chunk sizes. It traverses the JSON data depth-first and builds smaller JSON chunks. It attempts to keep nested JSON objects whole but will split them if needed to keep chunks between a min_chunk_size and the max_chunk_size.

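If the value is not a nested JSON object, but rather a very large string, the string will not be split. If you need a hard cap on the chunk size, consider composing this splitter with a recursive text splitter applied to those chunks.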

In [1]:
import json
import requests

# Read the sample JSON document from disk as UTF-8 text and parse it into
# Python objects.
with open("sample.json", "r", encoding="utf-8") as sample_file:
    json_data = json.loads(sample_file.read())


# Bare last expression so the notebook displays the parsed data.
json_data

{'links': {'self': 'http://example.com/articles',
  'next': 'http://example.com/articles?page[offset]=2',
  'last': 'http://example.com/articles?page[offset]=10'},
 'data': [{'type': 'articles',
   'id': '1',
   'attributes': {'title': 'JSON:API paints my bikeshed!'},
   'relationships': {'author': {'links': {'self': 'http://example.com/articles/1/relationships/author',
      'related': 'http://example.com/articles/1/author'},
     'data': {'type': 'people', 'id': '9'}},
    'comments': {'links': {'self': 'http://example.com/articles/1/relationships/comments',
      'related': 'http://example.com/articles/1/comments'},
     'data': [{'type': 'comments', 'id': '5'},
      {'type': 'comments', 'id': '12'}]}},
   'links': {'self': 'http://example.com/articles/1'}}],
 'included': [{'type': 'people',
   'id': '9',
   'attributes': {'firstName': 'Dan',
    'lastName': 'Gebhardt',
    'twitter': 'dgeb'},
   'links': {'self': 'http://example.com/people/9'}},
  {'type': 'comments',
   'id': '5'

In [2]:
from langchain_text_splitters import RecursiveJsonSplitter

# Build a splitter configured with max_chunk_size=300, then split the parsed
# JSON into a list of smaller dict chunks.
# NOTE(review): the 'data' chunk shown in the recorded outputs is longer than
# 300 characters; presumably list values are kept whole unless
# convert_lists=True is passed to split_json -- confirm against the library docs.
json_splitter = RecursiveJsonSplitter(max_chunk_size=300)
json_chunk = json_splitter.split_json(json_data)


In [3]:
# Display the chunks returned by split_json.
json_chunk

[{'links': {'self': 'http://example.com/articles',
   'next': 'http://example.com/articles?page[offset]=2',
   'last': 'http://example.com/articles?page[offset]=10'}},
 {'data': [{'type': 'articles',
    'id': '1',
    'attributes': {'title': 'JSON:API paints my bikeshed!'},
    'relationships': {'author': {'links': {'self': 'http://example.com/articles/1/relationships/author',
       'related': 'http://example.com/articles/1/author'},
      'data': {'type': 'people', 'id': '9'}},
     'comments': {'links': {'self': 'http://example.com/articles/1/relationships/comments',
       'related': 'http://example.com/articles/1/comments'},
      'data': [{'type': 'comments', 'id': '5'},
       {'type': 'comments', 'id': '12'}]}},
    'links': {'self': 'http://example.com/articles/1'}}]},
 {'included': [{'type': 'people',
    'id': '9',
    'attributes': {'firstName': 'Dan',
     'lastName': 'Gebhardt',
     'twitter': 'dgeb'},
    'links': {'self': 'http://example.com/people/9'}},
   {'type': '

In [4]:
# Print the first three chunks, one per line.
leading_chunks = json_chunk[:3]
for piece in leading_chunks:
    print(piece)

{'links': {'self': 'http://example.com/articles', 'next': 'http://example.com/articles?page[offset]=2', 'last': 'http://example.com/articles?page[offset]=10'}}
{'data': [{'type': 'articles', 'id': '1', 'attributes': {'title': 'JSON:API paints my bikeshed!'}, 'relationships': {'author': {'links': {'self': 'http://example.com/articles/1/relationships/author', 'related': 'http://example.com/articles/1/author'}, 'data': {'type': 'people', 'id': '9'}}, 'comments': {'links': {'self': 'http://example.com/articles/1/relationships/comments', 'related': 'http://example.com/articles/1/comments'}, 'data': [{'type': 'comments', 'id': '5'}, {'type': 'comments', 'id': '12'}]}}, 'links': {'self': 'http://example.com/articles/1'}}]}
{'included': [{'type': 'people', 'id': '9', 'attributes': {'firstName': 'Dan', 'lastName': 'Gebhardt', 'twitter': 'dgeb'}, 'links': {'self': 'http://example.com/people/9'}}, {'type': 'comments', 'id': '5', 'attributes': {'body': 'First!'}, 'relationships': {'author': {'data

In [5]:
# The splitter can also output documents: create_documents takes a list of
# JSON objects via `texts`.
docs = json_splitter.create_documents(texts=[json_data])

# Print the first three documents.
leading_docs = docs[:3]
for document in leading_docs:
    print(document)

page_content='{"links": {"self": "http://example.com/articles", "next": "http://example.com/articles?page[offset]=2", "last": "http://example.com/articles?page[offset]=10"}}'
page_content='{"data": [{"type": "articles", "id": "1", "attributes": {"title": "JSON:API paints my bikeshed!"}, "relationships": {"author": {"links": {"self": "http://example.com/articles/1/relationships/author", "related": "http://example.com/articles/1/author"}, "data": {"type": "people", "id": "9"}}, "comments": {"links": {"self": "http://example.com/articles/1/relationships/comments", "related": "http://example.com/articles/1/comments"}, "data": [{"type": "comments", "id": "5"}, {"type": "comments", "id": "12"}]}}, "links": {"self": "http://example.com/articles/1"}}]}'
page_content='{"included": [{"type": "people", "id": "9", "attributes": {"firstName": "Dan", "lastName": "Gebhardt", "twitter": "dgeb"}, "links": {"self": "http://example.com/people/9"}}, {"type": "comments", "id": "5", "attributes": {"body": "

In [6]:
# split_text returns the chunks as strings; print the first two.
texts = json_splitter.split_text(json_data)
for position in (0, 1):
    print(texts[position])


{"links": {"self": "http://example.com/articles", "next": "http://example.com/articles?page[offset]=2", "last": "http://example.com/articles?page[offset]=10"}}
{"data": [{"type": "articles", "id": "1", "attributes": {"title": "JSON:API paints my bikeshed!"}, "relationships": {"author": {"links": {"self": "http://example.com/articles/1/relationships/author", "related": "http://example.com/articles/1/author"}, "data": {"type": "people", "id": "9"}}, "comments": {"links": {"self": "http://example.com/articles/1/relationships/comments", "related": "http://example.com/articles/1/comments"}, "data": [{"type": "comments", "id": "5"}, {"type": "comments", "id": "12"}]}}, "links": {"self": "http://example.com/articles/1"}}]}
