In [1]:
import pandas as pd
import io
import requests
from pathlib import Path

In [2]:
# Fixed (non-dimension) column names of the output CSV, in output order.
# The transform cell fills the first entry from the source data and creates
# every remaining entry as a blank column before populating the time columns.
# NOTE: these strings are emitted verbatim as CSV headers, so spellings
# (including "confidentuality") are kept exactly as they are.
CONST_HEADERS = [
    # observation value and its marking
    "observation", "data_marking",
    # statistical unit and measure type (English / Welsh labels)
    "statistical_unit_eng", "statistical_unit_cym",
    "measure_type_eng", "measure_type_cym",
    # observation type
    "observation_type", "empty", "obs_type_value",
    # units
    "unit_multiplier", "unit_of_measure_eng", "unit_of_measure_cym",
    "confidentuality", "empty1",
    # geography
    "geographic_area", "empty2", "empty3",
    # time dimension
    "time_dim_item_id", "time_dim_item_label_eng", "time_dim_item_label_cym",
    "time_type", "empty4",
    # statistical population
    "statistical_population_id",
    "statistical_population_label_eng",
    "statistical_population_label_cym",
    # CDID
    "cdid", "cdiddescrip",
    # trailing columns named "empty5" .. "empty12"
    "empty5", "empty6", "empty7", "empty8",
    "empty9", "empty10", "empty11", "empty12",
]

In [3]:
# Column-name prefixes for a single dimension. The transform cell appends the
# dimension's 1-based number to each prefix (e.g. "dim_id_1") to build the
# per-dimension columns of the output CSV.
CONST_TOPIC = [
    # the dimension itself
    "dim_id_", "dimension_label_eng_", "dimension_label_cym_",
    # the item within the dimension
    "dim_item_id_", "dimension_item_label_eng_", "dimension_item_label_cym_",
    # total / sub-total flags
    "is_total_", "is_sub_total_",
]

The source CSV files are stored in Google Drive. Each file has an ID that can be used to fetch its contents directly. To find the ID of a file in Google Drive, right-click the file and choose "Get shareable link". This gives you a URL such as `https://drive.google.com/open?id=11fLsnoiWzTcA1d3nSDWvyrKQEHwIf6Hz`; the ID is the part after `id=`.

We'll download each file and cache it locally in the 'in' directory. Note that if a source file is changed in Google Drive, it won't be downloaded again unless its cached copy is removed from the 'in' directory.

In [4]:
# Local cache directory for the raw source CSV files.
sourceFolder = Path('in')
sourceFolder.mkdir(exist_ok=True)

# (local filename, Google Drive file ID) pairs, one per source CSV.
sources = [
    ('CN8_Non-EU_cod_2012.csv', '1P7YyFF6qXKXWVtR0Vt3kkvFPOjThMQH8'),
    ('CN8_Non-EU_cod_2013.csv', '1de-Le9ungrbdoGyvWI_RwmEhNpTmR-70'),
    ('CN8_Non-EU_cod_2014.csv', '1oC3jlItfsUshd54KOR7yn9NxpR83iCbC'),
    ('CN8_Non-EU_cod_2015.csv', '1H54-FYrCFa1DylCBg38RAPAeCtkGq4la'),
    ('CN8_Non-EU_cod_2016.csv', '11fLsnoiWzTcA1d3nSDWvyrKQEHwIf6Hz')
]

for filename, google_id in sources:
    sourceFile = sourceFolder / filename

    # Already cached: nothing to download. (is_file() is False for a path
    # that does not exist, so no separate exists() check is needed.)
    if sourceFile.is_file():
        continue

    # A timeout stops the cell hanging forever on a stalled connection.
    response = requests.get(
        f'https://drive.google.com/uc?export=download&id={google_id}',
        timeout=60,
    )
    # Fail loudly on an HTTP error instead of caching the error body as if it
    # were the CSV; a bad cached file would otherwise be reused on every run.
    response.raise_for_status()
    # NOTE(review): a 200 response is not verified to be CSV; if Drive ever
    # answers with an HTML page instead of the file, it would still be cached.

    # Write to a temporary name and rename on success, so an interrupted
    # write never leaves a truncated file that the cache check would accept.
    partFile = sourceFile.with_name(sourceFile.name + '.part')
    with open(partFile, 'wb') as f:
        f.write(response.content)
    partFile.replace(sourceFile)

Now read each of these CSV files into a Pandas DataFrame in turn and transform into WDA style output in the 'out' directory.

In [5]:
# Directory that receives one transformed CSV per source file.
destinationFolder = Path('out')
destinationFolder.mkdir(exist_ok=True)

# Only the filename is needed here; the Drive ID was used by the download cell.
for filename, google_id in sources:
    oldDF = pd.read_csv(sourceFolder / filename)
    newDF = pd.DataFrame()

    # The first output column carries the source value; assigning a Series
    # first also gives newDF its row index, so the scalar assignments below
    # broadcast to every row.
    newDF["observation"] = oldDF["svalue"]

    # Create every remaining fixed column blank, in CONST_HEADERS order.
    for col in CONST_HEADERS[1:]:
        newDF[col] = ""

    # Fill in the time columns (they keep the positions created above).
    newDF['time_dim_item_id'] = oldDF["year"]
    newDF['time_dim_item_label_eng'] = oldDF["year"]
    newDF['time_type'] = "year"

    # Append one group of CONST_TOPIC columns per dimension, numbered from 1.
    for counter, dimension in enumerate(["flow", "comcode", "country"], 1):
        for col in CONST_TOPIC:
            newDF[col + str(counter)] = ""

        # The source column name serves as both the dimension id and label.
        newDF['dim_id_' + str(counter)] = dimension
        newDF['dimension_label_eng_' + str(counter)] = dimension

        # The source column values serve as both the item id and item label.
        newDF['dim_item_id_' + str(counter)] = oldDF[dimension]
        newDF['dimension_item_label_eng_' + str(counter)] = oldDF[dimension]

    # index=False: by default to_csv prepends the row index as an unnamed
    # first column, which would shift every column of the output layout by
    # one; the file should start with the 'observation' column.
    newDF.to_csv(destinationFolder / filename, index=False)