In [None]:
from pathlib import Path
import sqlite3

import pandas as pd

## Load data: `metadata.csv` and find `json` files

In [None]:
# Release of the CORD-19 dataset this notebook works on; it selects the data
# directory below and appears in the progress messages.
DATASET_VERSION = 'v7'

# Root directory of the chosen dataset release.
p = Path('/raid/covid_data/data', DATASET_VERSION)

# Count matches lazily instead of materialising the full list of paths.
n_json_files = sum(1 for _ in p.glob('**/*json'))

print(f'Found {n_json_files:,d} JSON files for the CORD-19 (version {DATASET_VERSION}).')

In [None]:
# Load the article-level metadata table stored at the root of the dataset
# directory.
metadata_path = p / 'metadata.csv'
df = pd.read_csv(metadata_path)

n_articles = len(df)
print(f'Found {n_articles:,d} article entries for the CORD-19 (version {DATASET_VERSION})')

## Create associative table `article_id_2_sha`

In [None]:
# Build the article <-> SHA association with one row per (article, hash) pair.
#
# A single `sha` cell may hold several hashes joined by ';'. Each hash is
# stripped after splitting because the separator may be followed by a space
# (presumably '; ' in CORD-19 metadata -- verify against the CSV); without the
# strip, every hash after the first would carry leading whitespace and fail to
# match anything keyed by the bare hash.
# Articles without a hash are kept as a single row with a missing `sha`.
df2 = (
    df[['cord_uid', 'sha']]
    .assign(sha=lambda frame: frame['sha'].str.split(';'))
    .explode('sha')
    # Reset before the next assign so it aligns on a unique index.
    .reset_index(drop=True)
    .assign(sha=lambda frame: frame['sha'].str.strip())
    .rename(columns={'cord_uid': 'article_id'})
)

In [None]:
# Persist the article <-> SHA association in the version-specific SQLite file.
#
# `with <connection>:` only scopes a transaction (commit on success, rollback
# on error); it does not close the connection, so it is closed explicitly in
# the `finally` clause.
db = sqlite3.connect(f'cord19_{DATASET_VERSION}.db')
try:
    with db:
        # Rebuild the table from scratch: appending to a table left over from
        # a previous run of this cell would store every pair twice.
        db.execute('DROP TABLE IF EXISTS article_id_2_sha;')
        db.execute(
            """CREATE TABLE article_id_2_sha
            (
                article_id TEXT,
                sha TEXT
            );
            """
            )

        df2.to_sql(name='article_id_2_sha', con=db, index=False, if_exists='append')

    # Sanity check: show a few of the stored rows.
    display(pd.read_sql('SELECT * FROM article_id_2_sha LIMIT 5;', db))
finally:
    db.close()

## Create `articles` table

In [None]:
# Mapping from the raw metadata headers to the column names of the `articles`
# table. Headers that already have the desired name map to themselves.
articles_column_names = {
    'cord_uid': 'article_id',
    'sha': 'sha',
    'source_x': 'publisher',
    'title': 'title',
    'doi': 'doi',
    'pmcid': 'pmc_id',
    'pubmed_id': 'pm_id',
    'license': 'licence',
    'abstract': 'abstract',
    'publish_time': 'date',
    'authors': 'authors',
    'journal': 'journal',
    'Microsoft Academic Paper ID': 'microsoft_id',
    'WHO #Covidence': 'covidence_id',
    'has_pdf_parse': 'has_pdf_parse',
    'has_pmc_xml_parse': 'has_pmc_xml_parse',
    'full_text_file': 'fulltext_directory',
    'url': 'url',
}

# Renames the columns of `df` in place.
df.rename(columns=articles_column_names, inplace=True)

In [None]:
# Preview the first two rows of the metadata frame.
df.head(2)

In [None]:
# Report the number of missing values per column, computed in one pass over
# the frame.
for column_name, n_missing in df.isna().sum().items():
    print(f'column {column_name!r:>20s} has {n_missing:>15,d} NULL')

### Drop column `sha`, for which we have the associative table

In [None]:
# Remove the `sha` column from the article frame.
# errors='ignore' keeps this cell re-runnable: without it, a second execution
# raises KeyError because the column is already gone.
df.drop(columns='sha', errors='ignore', inplace=True)

### Some `article_id` values appear more than once, so keep only the first occurrence

In [None]:
# Keep only the first row encountered for each article identifier (in place).
df.drop_duplicates(subset=['article_id'], keep='first', inplace=True)

### Create table

In [None]:
# Persist the article metadata in the version-specific SQLite file.
#
# `with <connection>:` only scopes a transaction (commit on success, rollback
# on error); it does not close the connection, so it is closed explicitly in
# the `finally` clause.
db = sqlite3.connect(f'cord19_{DATASET_VERSION}.db')
try:
    with db:
        # Rebuild the table from scratch: appending the same rows to a table
        # left over from a previous run of this cell violates the PRIMARY KEY
        # on `article_id` and aborts the cell.
        db.execute('DROP TABLE IF EXISTS articles;')
        db.execute(
            """CREATE TABLE articles
            (
                article_id TEXT PRIMARY KEY, 
                publisher TEXT, 
                title TEXT, 
                doi TEXT, 
                pmc_id TEXT, 
                pm_id INTEGER, 
                licence TEXT,
                abstract TEXT, 
                date DATETIME, 
                authors TEXT, 
                journal TEXT,
                microsoft_id INTEGER, 
                covidence_id TEXT, 
                has_pdf_parse BOOLEAN,
                has_pmc_xml_parse BOOLEAN, 
                fulltext_directory TEXT, 
                url TEXT
            );
            """
            )

        df.to_sql(name='articles', con=db, index=False, if_exists='append')

    # Sanity check: show a few of the stored rows.
    display(pd.read_sql('SELECT * FROM articles LIMIT 3', db))
finally:
    db.close()

## Create `sentences` table