# BVL CSV 2 Datacube

## Read CSV

In [1]:
import pandas as pd

# Input CSV; its first column is used as the row index of the DataFrame.
path = 'data/result.csv'
dataframe = pd.read_csv(path, index_col=0)

## Transform

Currently, the transformation only works for a "matrix"-style CSV table, i.e. one whose first column holds the row labels and whose header row holds the column labels. The dimension names, labels, etc. are also static and not yet parameterized.

In [2]:
from rdflib import Namespace, Graph, RDF, RDFS, Literal, XSD
from slugify import slugify

# Namespace under which every resource created in this notebook is minted.
bvl = Namespace('http://le-online.de/')
# Namespace of the RDF Data Cube vocabulary (the qb: terms used below).
qb = Namespace('http://purl.org/linked-data/cube#')

def add_to_graph(tuples, graph):
    """Add every item of *tuples* to *graph* and return the same graph.

    tuples -- iterable of items (triples in this notebook) passed one by
              one to ``graph.add``.
    graph  -- any object exposing an ``add`` method; it is modified in place.
    """
    for triple in tuples:
        graph.add(triple)
    return graph

def instanciate_dimension_element(instance, ns):
    """Return the term of namespace *ns* named after the slug of *instance*.

    instance -- label to turn into a slug via ``slugify``.
    ns       -- namespace-like object indexed with the resulting slug.
    """
    slug = slugify(instance)
    return ns[slug]

def create_dimension_elements(mapping, ns):
    """Build the triples describing the elements of one dimension.

    mapping -- indexable whose item 0 is an iterable of labels and whose
               item 1 is the resource every element is typed with.
    ns      -- namespace in which the element resources are created.

    Returns a list holding, for each label in order, an ``rdf:type`` triple
    followed by an ``rdfs:label`` triple whose literal is tagged as German.
    """
    triples = []
    for label in mapping[0]:
        element = instanciate_dimension_element(label, ns)
        triples.extend((
            (element, RDF.type, mapping[1]),
            (element, RDFS.label, Literal(label, lang='de')),
        ))
    return triples

def create_observation(idx, value, measure, dimEls_t, dims_t, ns, ds):
    """Build the triples of a single qb:Observation.

    idx      -- identifier appended to ``'obs-'`` to name the observation.
    value    -- measured value, emitted as an ``xsd:float`` literal.
    measure  -- property linking the observation to its value.
    dimEls_t -- the two dimension elements the observation belongs to.
    dims_t   -- the two dimension properties, in the order of *dimEls_t*.
    ns       -- namespace in which the observation resource is created.
    ds       -- data set the observation is attached to via ``qb:dataSet``.

    Returns a list of five triples.
    """
    observation = ns['obs-' + str(idx)]
    triples = [
        (observation, RDF.type, qb.Observation),
        (observation, qb.dataSet, ds),
        (observation, measure, Literal(value, datatype=XSD.float)),
    ]
    # Attach the observation to its element in each of the two dimensions.
    triples.extend(
        [(observation, dims_t[pos], dimEls_t[pos]) for pos in (0, 1)]
    )
    return triples

# Local names (appended to the `bvl` namespace) of the cube's resources.
dataset_instance = 'dataset'
dsd_instance = 'dsd'
# NOTE(review): not referenced anywhere else in the visible code.
cs_instance = 'cd'
# Component specifications that attach the measure and dimensions to the DSD.
measure_count_instance_cs = 'countCS'
dimension_feature_instance_cs = 'featureCS'
dimension_category_instance_cs = 'categoryCS'
# The measure property and the two dimension properties themselves.
measure_count_instance = 'count'
dimension_feature_instance = 'feature'
dimension_category_instance = 'category'
    
# Skeleton of the cube: the data set, its data structure definition (DSD)
# and one component specification per dimension / measure.
g = Graph()
add_to_graph([
    # Data set and the structure it follows.
    (bvl[dataset_instance], RDF.type, qb.DataSet),
    (bvl[dataset_instance], qb.structure, bvl[dsd_instance]),

    # The DSD lists its three components.
    (bvl[dsd_instance], RDF.type, qb.DataStructureDefinition),
    (bvl[dsd_instance], qb.component, bvl[dimension_feature_instance_cs]),
    (bvl[dsd_instance], qb.component, bvl[dimension_category_instance_cs]),
    (bvl[dsd_instance], qb.component, bvl[measure_count_instance_cs]),

    # Each component specification points at its dimension or measure.
    (bvl[dimension_feature_instance_cs], RDF.type, qb.ComponentSpecification),
    (bvl[dimension_feature_instance_cs], qb.dimension, bvl[dimension_feature_instance]),

    (bvl[dimension_category_instance_cs], RDF.type, qb.ComponentSpecification),
    (bvl[dimension_category_instance_cs], qb.dimension, bvl[dimension_category_instance]),

    (bvl[measure_count_instance_cs], RDF.type, qb.ComponentSpecification),
    (bvl[measure_count_instance_cs], qb.measure, bvl[measure_count_instance]),
], g)

# Measure property: the value carried by every observation.
g.add((bvl[measure_count_instance], RDF.type, qb.MeasureProperty))
g.add((bvl[measure_count_instance], RDFS.label, Literal('Anzahl', lang='de')))
g.add((bvl[measure_count_instance], RDFS.label, Literal('Count', lang='en')))
# The declared range has to match the datatype of the observation literals,
# which create_observation() emits as xsd:float (not xsd:decimal).
g.add((bvl[measure_count_instance], RDFS.range, XSD.float))

# Dimension property for the table's columns.
g.add((bvl[dimension_feature_instance], RDF.type, qb.DimensionProperty))
g.add((bvl[dimension_feature_instance], RDFS.label, Literal('Merkmal', lang='de')))
g.add((bvl[dimension_feature_instance], RDFS.label, Literal('Feature', lang='en')))

# Dimension property for the table's rows.
g.add((bvl[dimension_category_instance], RDF.type, qb.DimensionProperty))
g.add((bvl[dimension_category_instance], RDFS.label, Literal('Kategorie', lang='de')))
g.add((bvl[dimension_category_instance], RDFS.label, Literal('Category', lang='en')))

# Pair each axis of the table with the dimension property its labels belong
# to: column headers are features, row labels are categories.
mappings = [
    (dataframe.columns, bvl[dimension_feature_instance]),
    (dataframe.index, bvl[dimension_category_instance])
]

# Describe every label of both axes as a dimension element in the graph.
for axis_mapping in mappings:
    add_to_graph(create_dimension_elements(axis_mapping, bvl), g)

# Emit one qb:Observation per table cell (row = category, column = feature).
# The terms below are the same for every cell, so resolve them only once.
measure = bvl[measure_count_instance]
observation_dimensions = (bvl[dimension_category_instance], bvl[dimension_feature_instance])
dataset = bvl[dataset_instance]

for idx, (row_label, row) in enumerate(dataframe.iterrows()):
    row_dimEl = instanciate_dimension_element(row_label, bvl)
    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    for idy, (column_label, value) in enumerate(row.items()):
        item_dimEl = instanciate_dimension_element(column_label, bvl)
        # Keep row and column index apart: plain concatenation would map
        # both cell (1, 11) and cell (11, 1) to the same "obs-111" resource.
        obs_id = '{}-{}'.format(idx, idy)
        ob = create_observation(obs_id, value, measure, (row_dimEl, item_dimEl), observation_dimensions, bvl, dataset)
        add_to_graph(ob, g)

## Write Datacube

In [3]:
# Write the assembled graph to disk as UTF-8 encoded Turtle.
output = 'data/datacube.ttl'
g.serialize(destination=output, format='turtle', encoding='utf-8')