# 3. Working with JSON Data

In [50]:
import json
import pandas as pd

In [51]:
with open('../data/hn_dump.json', 'r') as f:
    data = json.load(f)

In [52]:
data[0]

{'created_at': '2015-03-10T16:51:05.000Z',
 'title': 'Goodbye MongoDB, Hello PostgreSQL',
 'url': 'http://developer.olery.com/blog/goodbye-mongodb-hello-postgresql/',
 'author': 'YorickPeterse',
 'points': 802,
 'story_text': None,
 'comment_text': None,
 'num_comments': 374,
 'story_id': None,
 'story_title': None,
 'story_url': None,
 'parent_id': None,
 'created_at_i': 1426006265,
 'relevancy_score': 5901,
 '_tags': ['story', 'author_YorickPeterse', 'story_9178773'],
 'objectID': '9178773',
 '_highlightResult': {'title': {'value': 'Goodbye <em>MongoDB</em>, Hello PostgreSQL',
   'matchLevel': 'full',
   'fullyHighlighted': False,
   'matchedWords': ['mongodb']},
  'url': {'value': 'http://developer.olery.com/blog/goodbye-<em>mongodb</em>-hello-postgresql/',
   'matchLevel': 'full',
   'fullyHighlighted': False,
   'matchedWords': ['mongodb']},
  'author': {'value': 'YorickPeterse',
   'matchLevel': 'none',
   'matchedWords': []}}}

In [53]:
from sqlite3 import connect

In [54]:
DB_PATH = '../sqlite-olt.db'

In [55]:
with connect('../sqlite-olt.db') as db:
    db.execute("create table if not exists hn_items_raw(data)")

In [56]:
%%timeit

for item in data:
    with connect(DB_PATH) as db:
        db.execute("insert into hn_items_raw(data) values (?)", (json.dumps(item),))

4.25 s ± 57.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [57]:
%%timeit
with connect(DB_PATH) as db:
    for item in data:
        db.execute("insert into hn_items_raw(data) values (?)", (json.dumps(item),))

95.7 ms ± 2.54 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### Let's tabulate the data

In [58]:
with connect(DB_PATH) as db:
    db.execute("drop view if exists hn_items_fields")
    db.execute("""
        create view if not exists hn_items_fields as
        select 
            json_extract(data, '$.created_at') as created_at,
            json_extract(data, '$.title') as title,
            json_extract(data, '$.url') as url,
            json_extract(data, '$.author') as author,
            json_extract(data, '$.points') as points,
            json_extract(data, '$.relevancy_score') as relevancy_score
        from hn_items_raw
    """)

In [59]:
with connect(DB_PATH) as db:
    print(pd.read_sql('select * from hn_items_fields', db))

                       created_at  \
0        2015-03-10T16:51:05.000Z   
1        2020-05-23T18:33:05.000Z   
2        2011-11-06T07:05:13.000Z   
3        2015-04-21T23:08:12.000Z   
4        2020-05-24T11:42:01.000Z   
...                           ...   
1339995  2019-10-08T19:24:46.000Z   
1339996  2019-10-07T16:07:13.000Z   
1339997  2019-09-16T14:04:32.000Z   
1339998  2019-09-15T17:29:17.000Z   
1339999  2019-09-12T21:55:23.000Z   

                                                     title  \
0                        Goodbye MongoDB, Hello PostgreSQL   
1        Jepsen Disputes MongoDB's Data Consistency Claims   
2                                        Don't use MongoDB   
3                       Call Me Maybe: MongoDB Stale Reads   
4                                    Jepsen: MongoDB 4.2.6   
...                                                    ...   
1339995  ReMarkable Raises 15M USD Series A from Spark ...   
1339996  Test-Driven Development for Big Data and Apach... 

In [73]:
filter_wapo_query = """
select json_extract(data, '$.created_at'), json_extract(data, '$.url')
from hn_items_raw
where json_extract(data, '$.url') like '%washingtonpost.com%'
"""

In [74]:
%%timeit
with connect(DB_PATH) as db:
    df = pd.read_sql(filter_wapo_query, db)

3.61 s ± 98 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### How can we speed this up ? 

In [77]:
create_idx_query = "create index if not exists idx_filter_wapo on hn_items_raw (json_extract(data, '$.url') like '%washingtonpost.com%')"
with connect(DB_PATH) as db:
    print(db.execute(create_idx_query))

<sqlite3.Cursor object at 0x7fbb7c1b0c00>


In [78]:
%%timeit
with connect(DB_PATH) as db:
    df = pd.read_sql(filter_wapo_query, db)

3.43 s ± 49.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
