# 3. Working with JSON Data

In [41]:
import json
import pandas as pd
from gzip import GzipFile

In [42]:
with GzipFile('../data/hn_dump.json.gz', 'r') as fin:
    data = json.loads(fin.read().decode('utf-8'))

In [43]:
len(data)

100471

In [44]:
data[100]

{'created_at': '2012-04-03T21:25:57.000Z',
 'title': '',
 'url': '',
 'author': 'crag',
 'points': 15,
 'story_text': None,
 'comment_text': 'Let me add another database that\'s "underestimated" (by mainstream corporate America): SQLite3.<p>SQLite is fast, small, portable, easy &#38; simple to maintain and backup, AND reliable. And unless you are running a high traffic site (or application) it could handle everything a small (even medium) business would need.<p>Why small companies get talked into running MSQL or Oracle or MySQL is beyond me. And even if (and that\'s a big IF) they needed more "power", there\'s Postgres.<p>PS: Sorry for hijacking this thread. I\'m a big fan boy of both SQLite and Postgres.',
 'num_comments': None,
 'story_id': 3793973,
 'story_title': 'Postgres 9.2 will feature linear read scalability up to 64 cores',
 'story_url': 'http://rhaas.blogspot.com/2012/04/did-i-say-32-cores-how-about-64.html',
 'parent_id': 3794160,
 'created_at_i': 1333488357,
 'relevancy_sc

In [45]:
df = pd.DataFrame(data).set_index('objectID')

In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100471 entries, 4616844 to 2226207
Data columns (total 16 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   created_at        100471 non-null  object 
 1   title             91960 non-null   object 
 2   url               86089 non-null   object 
 3   author            100471 non-null  object 
 4   points            99126 non-null   float64
 5   story_text        36569 non-null   object 
 6   comment_text      21000 non-null   object 
 7   num_comments      79471 non-null   float64
 8   story_id          21003 non-null   float64
 9   story_title       20977 non-null   object 
 10  story_url         20532 non-null   object 
 11  parent_id         21000 non-null   float64
 12  created_at_i      100471 non-null  int64  
 13  relevancy_score   88902 non-null   float64
 14  _tags             100471 non-null  object 
 15  _highlightResult  100471 non-null  object 
dtypes: float64(5), int

In [47]:
from sqlite3 import connect

In [48]:
DB_PATH = '../sqlite-olt.db'

In [49]:
with connect(DB_PATH) as db:
    db.execute("create table if not exists hn_items_raw(data)")

### ~Dumping~ Writing Schemaless Data to a Relational Database

In [50]:
COUNT_ITEMS=1000

In [51]:
with connect(DB_PATH) as db:
        db.execute("DELETE FROM hn_items_raw")

### 1st Way

In [52]:
%%timeit

for item in data[:COUNT_ITEMS]:
    with connect(DB_PATH) as db:
        db.execute("insert into hn_items_raw(data) values (?)", (json.dumps(item),))

934 ms ± 18.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Clearing the DB to re-run the experiment

In [53]:
with connect(DB_PATH) as db:
        db.execute("DELETE FROM hn_items_raw")

### 2nd Way

In [54]:
%%timeit
with connect(DB_PATH) as db:
    for item in data[:COUNT_ITEMS]:
        db.execute("insert into hn_items_raw(data) values (?)", (json.dumps(item),))

34.3 ms ± 778 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [55]:
with connect(DB_PATH) as db:
        db.execute("DELETE FROM hn_items_raw")

### 3rd Way

In [56]:
%%timeit
with connect(DB_PATH) as db:
    db.executemany("insert into hn_items_raw(data) values (?)", 
                   [(json.dumps(item),) for item in data[:COUNT_ITEMS]]
                  )

31.6 ms ± 1.79 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [57]:
with connect(DB_PATH) as db:
        db.execute("DELETE FROM hn_items_raw")

### It's usually smart to write data in batches

In [59]:
def make_chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

In [60]:
for chunk in make_chunks(data, 1000):
    with connect(DB_PATH) as db:
        db.executemany("insert into hn_items_raw(data) values (?)", 
                       [(json.dumps(item),) for item in chunk]
                      )

1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
471


### Let's tabulate (~ *normalize*) the data

```
objectID
created_at
title
url
author
points
story_text
comment_text
comment_text_length
num_comments
story_id
story_title
story_url
parent_id
relevancy_score
tags
```

In [37]:
with connect(DB_PATH) as db:
    db.execute("drop view if exists hn_items_fields")
    db.execute("""
        create view if not exists hn_items_fields as
        select 
            json_extract(data, '$.created_at') as created_at,
            json_extract(data, '$.title') as title,
            json_extract(data, '$.url') as url,
            json_extract(data, '$.author') as author,
            json_extract(data, '$.points') as points,
            json_extract(data, '$.comment_text') as comment_text,
            length(json_extract(data, '$.comment_text')) as comment_text_length,
            json_extract(data, '$.story_text') as story_text,
            json_extract(data, '$.story_id') as story_id,
            json_extract(data, '$.story_title') as story_title,
            json_extract(data, '$.story_url') as story_url,
            json_extract(data, '$.story_text') as story_text,
            json_extract(data, '$.parent_id') as parent_id,
            json_extract(data, '$.relevancy_score') as relevancy_score,
            json_extract(data, '$._tags') as tags 
        from hn_items_raw
    """)

### Let's see what we got

In [38]:
with connect(DB_PATH) as db:
    hn_items_fields = pd.read_sql('select * from hn_items_fields', db)
    

hn_items_fields

Unnamed: 0,created_at,title,url,author,points,comment_text,comment_text_length,story_text,story_id,story_title,story_url,story_text:1,parent_id,relevancy_score,tags


In [39]:
hn_items_fields

Unnamed: 0,created_at,title,url,author,points,comment_text,comment_text_length,story_text,story_id,story_title,story_url,story_text:1,parent_id,relevancy_score,tags


### Let's find the most frequent authors in our data

In [None]:
query_1 = """
select json_extract(data, '$.author') as author, count(*) as count_author_comments
from hn_items_raw
group by author
order by count_author_comments desc
"""

with connect(DB_PATH) as db:
    frequent_authors_1 = pd.read_sql(query_1, db)
    
frequent_authors_1  

In [None]:
query_2 = """
select author, count(*) as count_author_comments
from hn_items_fields
group by author
order by count_author_comments desc
"""

with connect(DB_PATH) as db:
    frequent_authors_1 = pd.read_sql(query_1, db)
    
frequent_authors_1.head(20)

In [None]:
filter_author_query = """
select json_extract(data, '$.author'), json_extract(data, '$.objectID')
from hn_items_raw
where json_extract(data, '$.author') = 'luu'
"""

In [None]:
%%timeit
with connect(DB_PATH) as db:
    luu_df = pd.read_sql(filter_author_query, db)

### How can we speed this up ? 

- We would usually create an index, but indices are defined on **columns**; where's the column here?
- There's none, we haven't persisted data on disk, hence SQLite doesn't know where to look for pre-computed results
- Instead we want to "cache" the result for the computation `json_extract(data, '$.author')`. 
- This is called an expression
- In these senarios we create an *index on expression*


Index on expression format 

`CREATE INDEX idx_name on TABLE_NAME (<expression_here>)`

In `<expression_here>` we usually copy-paste the predicate from the `WHERE` clause.

In [None]:
create_author_idx_query = """
create index if not exists idx_author on hn_items_raw (json_extract(data, '$.author'))
"""

In [None]:
with connect(DB_PATH) as db:
    db.execute(create_author_idx_query)

In [None]:
%%timeit
with connect(DB_PATH) as db:
    luu_df = pd.read_sql(filter_author_query, db)

### Indices on expressions come usually handy in time-oriented computations.

Say we want to filter comments posted on Sundays.

The query would look like

In [None]:
sunday_comments="""
select json_extract(data, '$.comment_text'), datetime(json_extract(data, '$.created_at'))
from hn_items_raw
where strftime('%w', datetime(json_extract(data, '$.created_at'))) = '0'
"""

In [None]:
%%timeit

with connect(DB_PATH) as db:
    sunday_comments_df = pd.read_sql(sunday_comments, db)
    
sunday_comments_df

The predicate expression here is a little bit more complex, but certainly optimizable.

Let's create an index

In [None]:
create_index_on_sunday_comments_query =\
"""
create index if not exists 
idx_comments_on_sundays on 
hn_items_raw (strftime('%w', datetime(json_extract(data, '$.created_at'))))
"""

In [None]:
with connect(DB_PATH) as db:
    db.execute(create_index_on_sunday_comments_query)

In [None]:
%%timeit

with connect(DB_PATH) as db:
    sunday_comments_df = pd.read_sql(sunday_comments, db)
    
sunday_comments_df