# 3. Working with JSON Data

In [None]:
import json
import pandas as pd
from gzip import GzipFile

In [None]:
# Decompress the gzipped dump, decode it as UTF-8 and parse the JSON text into `data`.
with GzipFile('../data/hn_dump.json.gz', 'r') as fin:
    data = json.loads(fin.read().decode('utf-8'))

In [None]:
# Number of top-level entries in the parsed dump.
len(data)

In [None]:
# Peek at a single raw item to see which fields it carries.
data[100]

In [None]:
# Tabulate the parsed items, then use each item's `objectID` as the row label.
df = pd.DataFrame(data)
df = df.set_index('objectID')

In [None]:
# Print a summary of the DataFrame (columns, non-null counts, dtypes).
df.info()

In [None]:
from sqlite3 import connect

In [None]:
# SQLite database file opened by every `connect(DB_PATH)` call in this notebook.
DB_PATH = '../sqlite-olt.db'

In [None]:
# A single untyped `data` column; the table is only created if it is missing.
create_raw_table_query = "create table if not exists hn_items_raw(data)"

with connect(DB_PATH) as db:
    db.execute(create_raw_table_query)

### ~Dumping~ Writing Schemaless Data to a Relational Database

In [None]:
# Number of items (taken from the start of `data`) inserted by each timed experiment.
COUNT_ITEMS=1000

In [None]:
# Empty the raw table so the first experiment starts without leftover rows.
with connect(DB_PATH) as db:
    db.execute("DELETE FROM hn_items_raw")

### 1st Way

In [None]:
%%timeit

# A fresh connection is opened for every row, so each insert is committed
# on its own when the `with` block exits.
for item in data[:COUNT_ITEMS]:
    with connect(DB_PATH) as db:
        db.execute("insert into hn_items_raw(data) values (?)", (json.dumps(item),))

### Clearing the DB to re-run the experiment

In [None]:
# Remove the rows written by the previous experiment before timing the next one.
with connect(DB_PATH) as db:
    db.execute("DELETE FROM hn_items_raw")

### 2nd Way

In [None]:
%%timeit
with connect(DB_PATH) as db:
    for item in data[:COUNT_ITEMS]:
        db.execute("insert into hn_items_raw(data) values (?)", (json.dumps(item),))

In [None]:
# Reset the raw table again so the third experiment inserts into an empty table.
with connect(DB_PATH) as db:
    db.execute("DELETE FROM hn_items_raw")

### 3rd Way

In [None]:
%%timeit
with connect(DB_PATH) as db:
    db.executemany("insert into hn_items_raw(data) values (?)", 
                   [(json.dumps(item),) for item in data[:COUNT_ITEMS]]
                  )

In [None]:
# Clear the experiment rows; the full data set is loaded in batches afterwards.
with connect(DB_PATH) as db:
    db.execute("DELETE FROM hn_items_raw")

### It's usually smart to write data in batches

In [None]:
def make_chunks(lst, n):
    """Return a generator over successive n-sized chunks of lst.

    Args:
        lst: A sliceable sequence (anything supporting ``len()`` and slicing).
        n: Maximum chunk size; must be a positive integer.

    Returns:
        A generator yielding ``lst[i:i + n]`` slices in order. The last
        chunk holds the remainder and may be shorter than ``n``; an empty
        ``lst`` produces no chunks.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    # A negative step makes range() empty, which would silently yield no
    # chunks (i.e. drop all the data); reject it up front, along with zero.
    if n < 1:
        raise ValueError(f"chunk size must be a positive integer, got {n!r}")
    return (lst[i:i + n] for i in range(0, len(lst), n))

In [None]:
# Load the complete data set, 1000 items per connection / executemany() call.
insert_query = "insert into hn_items_raw(data) values (?)"

for chunk in make_chunks(data, 1000):
    rows = [(json.dumps(item),) for item in chunk]
    with connect(DB_PATH) as db:
        db.executemany(insert_query, rows)

### Let's tabulate (~ *normalize*) the data

```
objectID
created_at
title
url
author
points
story_text
comment_text
comment_text_length
num_comments
story_id
story_title
story_url
parent_id
relevancy_score
tags
```

In [None]:
# (Re)create a view that exposes the JSON attributes of every raw item as
# regular, named columns, in the same order as the field list above.
with connect(DB_PATH) as db:
    # Drop first so that edits to the column list below always take effect.
    db.execute("drop view if exists hn_items_fields")
    db.execute("""
        create view if not exists hn_items_fields as
        select
            json_extract(data, '$.objectID') as objectID,
            json_extract(data, '$.created_at') as created_at,
            json_extract(data, '$.title') as title,
            json_extract(data, '$.url') as url,
            json_extract(data, '$.author') as author,
            json_extract(data, '$.points') as points,
            json_extract(data, '$.story_text') as story_text,
            json_extract(data, '$.comment_text') as comment_text,
            length(json_extract(data, '$.comment_text')) as comment_text_length,
            json_extract(data, '$.num_comments') as num_comments,
            json_extract(data, '$.story_id') as story_id,
            json_extract(data, '$.story_title') as story_title,
            json_extract(data, '$.story_url') as story_url,
            json_extract(data, '$.parent_id') as parent_id,
            json_extract(data, '$.relevancy_score') as relevancy_score,
            -- the JSON key carries a leading underscore; the column does not
            json_extract(data, '$._tags') as tags
        from hn_items_raw
    """)

### Let's see what we got

In [None]:
# Load every row of the `hn_items_fields` view into a DataFrame and display it.
with connect(DB_PATH) as db:
    hn_items_fields = pd.read_sql('select * from hn_items_fields', db)
    

hn_items_fields

In [None]:
# Display the DataFrame that was loaded from the `hn_items_fields` view.
hn_items_fields

### Let's find the most frequent authors in our data

In [None]:
# Count items per author directly on the raw table: the author is pulled out
# of the JSON document with json_extract, then grouped and sorted by count.
query_1 = """
select json_extract(data, '$.author') as author, count(*) as count_author_comments
from hn_items_raw
group by author
order by count_author_comments desc
"""

with connect(DB_PATH) as db:
    frequent_authors_1 = pd.read_sql(query_1, db)
    
frequent_authors_1  

In [None]:
# Same aggregation as query_1, but expressed against the `hn_items_fields`
# view, where `author` is already exposed as a regular column.
query_2 = """
select author, count(*) as count_author_comments
from hn_items_fields
group by author
order by count_author_comments desc
"""

with connect(DB_PATH) as db:
    # Run the view-based query and keep its result separate from frequent_authors_1.
    frequent_authors_2 = pd.read_sql(query_2, db)

# Show only the 20 most frequent authors.
frequent_authors_2.head(20)

In [None]:
# Select author and objectID of every raw item whose JSON `author` is 'luu'.
# The filter is applied to the json_extract(...) expression, not to a stored column.
filter_author_query = """
select json_extract(data, '$.author'), json_extract(data, '$.objectID')
from hn_items_raw
where json_extract(data, '$.author') = 'luu'
"""

In [None]:
%%timeit
# Baseline timing, taken before the expression index is created further below.
# NOTE(review): the index is persisted in the DB file, so on a notebook re-run
# it may already exist when this cell executes.
with connect(DB_PATH) as db:
    luu_df = pd.read_sql(filter_author_query, db)

### How can we speed this up?

- We would usually create an index, but indices are defined on **columns**; where's the column here?
- There's none: the author is not stored as a column of its own; it is computed from the JSON document on every query, hence SQLite has no pre-computed values to look up
- Instead we want to "cache" the result for the computation `json_extract(data, '$.author')`. 
- This is called an expression
- In these scenarios we create an *index on expression*


Index on expression format 

`CREATE INDEX idx_name on TABLE_NAME (<expression_here>)`

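In `<expression_here>` we usually copy-paste the expression used in the predicate of the `WHERE` clause (e.g. `json_extract(data, '$.author')`, without the `= 'luu'` part); it must be written exactly as in the query for SQLite to use the index.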

In [None]:
# DDL for an index on the json_extract(data, '$.author') expression -- the same
# expression that filter_author_query uses in its WHERE clause.
create_author_idx_query = """
create index if not exists idx_author on hn_items_raw (json_extract(data, '$.author'))
"""

In [None]:
# Create the author expression index (a no-op if it already exists).
with connect(DB_PATH) as db:
    db.execute(create_author_idx_query)

In [None]:
%%timeit
# Same query as in the baseline cell, timed again now that idx_author exists.
with connect(DB_PATH) as db:
    luu_df = pd.read_sql(filter_author_query, db)

### Indices on expressions usually come in handy in time-oriented computations.

Say we want to filter comments posted on Sundays.

The query would look like

In [None]:
# Select comment text and creation time of the items created on a Sunday:
# strftime('%w', ...) yields the day of the week as a string, '0' being Sunday.
sunday_comments="""
select json_extract(data, '$.comment_text'), datetime(json_extract(data, '$.created_at'))
from hn_items_raw
where strftime('%w', datetime(json_extract(data, '$.created_at'))) = '0'
"""

In [None]:
%%timeit

# Timing of the Sunday filter before the weekday expression index is created below.
with connect(DB_PATH) as db:
    sunday_comments_df = pd.read_sql(sunday_comments, db)
    
# NOTE(review): inside a %%timeit cell this bare expression is part of the timed
# statement and is not displayed -- confirm whether a separate cell was intended.
sunday_comments_df

The predicate expression here is a little bit more complex, but certainly optimizable.

Let's create an index

In [None]:
# DDL for an expression index on the weekday ('%w') derived from each item's
# created_at -- the same expression the Sunday query filters on.
create_index_on_sunday_comments_query = """
create index if not exists 
idx_comments_on_sundays on 
hn_items_raw (strftime('%w', datetime(json_extract(data, '$.created_at'))))
"""

In [None]:
# Create the weekday expression index (a no-op if it already exists).
with connect(DB_PATH) as db:
    db.execute(create_index_on_sunday_comments_query)

In [None]:
%%timeit

# Same Sunday query, timed again now that idx_comments_on_sundays exists.
with connect(DB_PATH) as db:
    sunday_comments_df = pd.read_sql(sunday_comments, db)
    
# NOTE(review): inside a %%timeit cell this bare expression is part of the timed
# statement and is not displayed -- confirm whether a separate cell was intended.
sunday_comments_df