Fetch the source data, in this case from an openly shared Google Drive. We cache the source data in the `in` directory for further processing and to avoid re-fetching it each time this notebook is run.

Note that there appears to be a bug in rdf-tabular where the filenames of CSV files need to be in lower case.

In [None]:
import requests
from pathlib import Path

# Local cache directory for the downloaded source CSV files.
sourceFolder = Path('in')
sourceFolder.mkdir(exist_ok=True)

# (local filename, Google Drive file id) pairs, one per source CSV file.
# The filenames are deliberately kept in lower case.
sources = [
    ('cn8_2012.csv', '1P7YyFF6qXKXWVtR0Vt3kkvFPOjThMQH8'),
    ('cn8_2013.csv', '1de-Le9ungrbdoGyvWI_RwmEhNpTmR-70'),
    ('cn8_2014.csv', '1oC3jlItfsUshd54KOR7yn9NxpR83iCbC'),
    ('cn8_2015.csv', '1H54-FYrCFa1DylCBg38RAPAeCtkGq4la'),
    ('cn8_2016.csv', '11fLsnoiWzTcA1d3nSDWvyrKQEHwIf6Hz')
]

for filename, google_id in sources:
    sourceFile = sourceFolder / filename

    # Only download when there is no cached copy yet.
    if not sourceFile.is_file():
        response = requests.get(
            f'https://drive.google.com/uc?export=download&id={google_id}',
            timeout=60,  # do not hang the notebook on a stalled connection
        )
        # Fail loudly on an HTTP error instead of caching the error body as
        # if it were the CSV file (it would never be re-fetched afterwards).
        response.raise_for_status()
        with open(sourceFile, 'wb') as f:
            f.write(response.content)

This data is already in [Tidy Data format](http://vita.had.co.nz/papers/tidy-data.pdf) and so just needs a JSON metadata file for each CSV file.

We copy across the CSV files (only the first 1000 lines of each) and fill out a template JSON metadata file for each CSV file, putting the results in the `out` directory for further processing.

In [None]:
import json
from itertools import islice

# Load the shared metadata template once; it is re-used for every CSV file
# with only its 'url' entry changed. The file is opened in a context manager
# so the handle is closed straight away.
with open('metadata/cn8_template.json') as template:
    metadataTemplate = json.load(template)

# Output directory for the CSV files and their metadata files.
destinationFolder = Path('out')
destinationFolder.mkdir(exist_ok=True)

for filename, google_id in sources:
    sourceFile = sourceFolder / filename
    destFile = destinationFolder / filename
    with open(sourceFile) as src, open(destFile, 'w') as dst:
        # Copy only the first 1000 lines of each source file.
        dst.writelines(islice(src, 1000))

    # Write '<csv filename>-metadata.json' next to the CSV file, with the
    # template's 'url' set to that CSV file's name.
    metadataFile = destinationFolder / (filename + '-metadata.json')
    metadataTemplate['url'] = filename
    with open(metadataFile, 'w') as meta:
        json.dump(metadataTemplate, meta, indent=2)
