# Create CDI dataset description

In [1]:
# Import necessary libraries and modules
import rdflib
from rdflib import Graph, Namespace, URIRef, Literal, BNode
from rdflib.namespace import NamespaceManager
from rdflib.namespace import RDF, RDFS, SKOS, XSD, OWL
import os
import pandas as pd


In [2]:
# Base IRI under which all example individuals in this notebook are minted.
example_namespace = "http://example1.org/"
# Prefix bound to the base IRI on the graph.
example_namespace_abbr = "ex1"

## Read CDI profile

In [3]:
# Start from the CDI profile, which is stored as Turtle.
g = Graph()
g.parse('../data/output/cdi_profile1.ttl', format='turtle')

# Namespaces used by the rest of the notebook: the DDI-CDI vocabulary and the
# example namespace configured above.
cdi = Namespace("http://ddialliance.org/Specification/DDI-CDI/1.0/RDF/")
ex1 = Namespace(example_namespace)

# Bind both prefixes on the graph.
for prefix, namespace in (("cdi", cdi), (example_namespace_abbr, ex1)):
    g.bind(prefix, namespace)

## Read dataset and code lists

In [4]:
# Load the observations from the "GDP_DATA" sheet of the example workbook.
gdp_workbook = "../data/input/dagstuhl2024/Example - GDP and Employment.xlsx"
data = pd.read_excel(gdp_workbook, sheet_name="GDP_DATA")
data

Unnamed: 0,INDICATOR,REF_AREA,EC2,TIME_PERIOD,VALUE,UNIT_MEASURE,UNIT_MULT
0,GDP,AFG,EC2_ISIC3_A01,2022,376,USD,6
1,GDP,ALB,EC2_ISIC3_A01,2022,869,USD,6
2,GDP,DZA,EC2_ISIC3_A01,2022,339,USD,6
3,GDP,ASM,EC2_ISIC3_A01,2022,479,USD,6
4,GDP,AND,EC2_ISIC3_A01,2022,668,USD,6
...,...,...,...,...,...,...,...
77,GDP,ARM,EC2_ISIC3_B05,2023,798,USD,6
78,GDP,ABW,EC2_ISIC3_B05,2023,1090,USD,6
79,GDP,AUS,EC2_ISIC3_B05,2023,1114,USD,6
80,GDP,AUT,EC2_ISIC3_B05,2023,1086,USD,6


In [5]:
# Load both code-list sheets with a single read of the workbook.
# Passing a list to `sheet_name` makes read_excel return a dict of DataFrames
# keyed by sheet name, i.e. the same {"CL_REF_AREA": ..., "CL_EC2": ...}
# mapping as before, without repeating the path or parsing the file twice.
code_list_workbook = "../data/input/dagstuhl2024/Example - GDP and Employment.xlsx"
code_lists = pd.read_excel(code_list_workbook, sheet_name=["CL_REF_AREA", "CL_EC2"])
display(code_lists["CL_REF_AREA"])
display(code_lists["CL_EC2"])

Unnamed: 0,CODE,NAME
0,AFG,Afghanistan
1,ALB,Albania
2,DZA,Algeria
3,ASM,American Samoa
4,AND,Andorra
5,AGO,Angola
6,AIA,Anguilla
7,ATG,Antigua and Barbuda
8,ARG,Argentina
9,ARM,Armenia


Unnamed: 0,CODE,NAME
0,EC2_ISIC3_A01,"01 - Agriculture, hunting and related service ..."
1,EC2_ISIC3_A02,"02 - Forestry, logging and related service act..."
2,EC2_ISIC3_B05,"05 - Fishing, aquaculture and service activiti..."


# Populate profile with individuals

In [6]:
# Column labels of the dataset, in sheet order.
variableIDs = data.columns.tolist()
variableIDs

['INDICATOR',
 'REF_AREA',
 'EC2',
 'TIME_PERIOD',
 'VALUE',
 'UNIT_MEASURE',
 'UNIT_MULT']

### 1. DimensionalDataSet

In [7]:
# Declare the dataset individual as a cdi:DimensionalDataSet.
dataset_iri = ex1['GDP_DATA_EX1']
g.add((dataset_iri, RDF.type, cdi.DimensionalDataSet))


<Graph identifier=N91f24a6c1c7447bdb399c33eea231139 (<class 'rdflib.graph.Graph'>)>

### 2. DimensionalDataStructure

In [8]:
# Declare the data structure individual as a cdi:DimensionalDataStructure.
dsd_iri = ex1['GDP/DSD']
g.add((dsd_iri, RDF.type, cdi.DimensionalDataStructure))


<Graph identifier=N91f24a6c1c7447bdb399c33eea231139 (<class 'rdflib.graph.Graph'>)>

### 3. Data structure components

In [9]:
# Components of the data structure. Every entry carries the same three keys:
#   id   - column label in `data`, also used to build the component IRIs
#   name - English label attached to the represented variable
#   cl   - code list DataFrame (CODE / NAME columns), or None when uncoded
dimensions = [
    {"id":"REF_AREA",    "name": "Reference area", "cl":code_lists["CL_REF_AREA"]},
    {"id":"EC2",         "name": "Economic activity at 2-digit level", "cl": code_lists["CL_EC2"]},
    {"id":"TIME_PERIOD", "name": "Time period", "cl": None}
]

# The measure previously had only an "id", so nothing downstream could label
# it. NOTE(review): "Observation value" is a proposed label - confirm wording.
measures = [
    {"id": "VALUE", "name": "Observation value", "cl": None}
]

attributes = [
    {"id":"INDICATOR",    "name": "Indicator", "cl": None},
    {"id":"UNIT_MEASURE", "name": "Unit of measurement", "cl": None},
    {"id":"UNIT_MULT",    "name": "Unit multiplier", "cl": None}
]



               

In [10]:
# Describe every dimension of the data structure: the component itself, its
# represented variable and, for coded dimensions, the code list behind it.

# Roles attached through cdi:DataStructureComponent-specialization, keyed by
# dimension id.
dimension_roles = {
    "REF_AREA": cdi['GeoRole'],
    "TIME_PERIOD": cdi['TimeRole'],
}

for dc in dimensions:

    component = ex1[f"DimensionComponent/{dc['id']}"]
    variable = ex1[f"RepresentedVariable/{dc['id']}"]

    g.add((component, RDF.type, cdi.DimensionComponent))

    if dc['id'] in dimension_roles:
        g.add((component,
               cdi['DataStructureComponent-specialization'],
               dimension_roles[dc['id']]))

    g.add((ex1['GDP/DSD'], cdi.hasComponent, component))

    #----- Variable cascade

    g.add((variable, RDF.type, cdi.RepresentedVariable))
    g.add((variable, RDFS['label'], Literal(dc['name'], lang='en')))

    # Link to the very IRI typed and labelled above. The former spelling
    # "RepresentedVairable/..." pointed at a different node that never
    # received a type or a label.
    g.add((component,
           cdi.DataStructureComponent_isDefinedBy_RepresentedVariable,
           variable))

    #----- Add code lists as instances of skos:ConceptScheme

    code_list = dc['cl']
    if code_list is None:
        # Uncoded dimension: nothing more to describe.
        continue

    code_list_id = f"CL_{dc['id']}"
    concept_scheme = ex1[f"ConceptScheme/{code_list_id}"]
    value_domain = ex1[f"SubstantiveValueDomain/{code_list_id}"]

    g.add((concept_scheme, RDF.type, SKOS.ConceptScheme))
    g.add((value_domain, RDF.type, cdi.SubstantiveValueDomain))
    g.add((value_domain,
           cdi.SubstantiveValueDomain_takesValuesFrom_EnumerationDomain,
           concept_scheme))
    g.add((variable,
           cdi.RepresentedVariable_takesSubstantiveValuesFrom_SubstantiveValueDomain,
           value_domain))

    #-------- Add individual codes

    for _, row in code_list.iterrows():
        concept = ex1[f"Concept/{code_list_id}/{row['CODE']}"]

        g.add((concept, RDF.type, SKOS.Concept))
        # skos:inScheme must target the concept scheme; it previously pointed
        # at the value domain, leaving the scheme without any members.
        g.add((concept, SKOS.inScheme, concept_scheme))
        g.add((concept, RDFS['label'], Literal(row['NAME'], lang='en')))



In [11]:
# Describe every attribute of the data structure: the component itself, its
# represented variable and, for coded attributes, the code list behind it.
for a in attributes:

    component = ex1[f"AttributeComponent/{a['id']}"]
    variable = ex1[f"RepresentedVariable/{a['id']}"]

    g.add((component, RDF.type, cdi.AttributeComponent))
    g.add((ex1['GDP/DSD'], cdi.hasComponent, component))

    #----- Variable cascade

    g.add((variable, RDF.type, cdi.RepresentedVariable))

    # Use the attribute's own name. The label used to be read from `dc`, the
    # variable left over from the dimensions loop, so every attribute was
    # given the label of the last dimension.
    g.add((variable, RDFS['label'], Literal(a['name'], lang='en')))

    # Link to the very IRI typed and labelled above. The former spelling
    # "RepresentedVairable/..." pointed at a different node that never
    # received a type or a label.
    g.add((component,
           cdi.DataStructureComponent_isDefinedBy_RepresentedVariable,
           variable))

    #----- Add code lists as instances of skos:ConceptScheme

    code_list = a['cl']
    if code_list is None:
        # Uncoded attribute: nothing more to describe.
        continue

    code_list_id = f"CL_{a['id']}"
    concept_scheme = ex1[f"ConceptScheme/{code_list_id}"]
    value_domain = ex1[f"SubstantiveValueDomain/{code_list_id}"]

    g.add((concept_scheme, RDF.type, SKOS.ConceptScheme))
    g.add((value_domain, RDF.type, cdi.SubstantiveValueDomain))
    g.add((value_domain,
           cdi.SubstantiveValueDomain_takesValuesFrom_EnumerationDomain,
           concept_scheme))
    g.add((variable,
           cdi.RepresentedVariable_takesSubstantiveValuesFrom_SubstantiveValueDomain,
           value_domain))

    #-------- Add individual codes

    for _, row in code_list.iterrows():
        concept = ex1[f"Concept/{code_list_id}/{row['CODE']}"]

        g.add((concept, RDF.type, SKOS.Concept))
        # skos:inScheme must target the concept scheme; it previously pointed
        # at the value domain, leaving the scheme without any members.
        g.add((concept, SKOS.inScheme, concept_scheme))
        g.add((concept, RDFS['label'], Literal(row['NAME'], lang='en')))


    

In [12]:
# Describe every measure of the data structure: the component itself and its
# represented variable.
for m in measures:

    component = ex1[f"MeasureComponent/{m['id']}"]
    variable = ex1[f"RepresentedVariable/{m['id']}"]

    g.add((component, RDF.type, cdi.MeasureComponent))
    g.add((ex1['GDP/DSD'], cdi.hasComponent, component))

    #----- Variable cascade

    g.add((variable, RDF.type, cdi.RepresentedVariable))

    # Label the measure from its own entry, falling back to its id when no
    # "name" is given. The label used to be read from `dc`, the variable left
    # over from the dimensions loop, so the measure got a dimension's label.
    g.add((variable, RDFS['label'], Literal(m.get('name', m['id']), lang='en')))

    # Link to the very IRI typed and labelled above. The former spelling
    # "RepresentedVairable/..." pointed at a different node that never
    # received a type or a label.
    g.add((component,
           cdi.DataStructureComponent_isDefinedBy_RepresentedVariable,
           variable))




## Save CDIF profile

In [13]:
# Write the populated graph to disk as Turtle.
turtle_path = "../data/output/cdif_profile_GDP.ttl"
g.serialize(destination=turtle_path, format='turtle')

<Graph identifier=N91f24a6c1c7447bdb399c33eea231139 (<class 'rdflib.graph.Graph'>)>

In [14]:
# Write the same graph to disk as JSON-LD.
jsonld_path = "../data/output/cdif_profile_GDP.jsonld"
g.serialize(destination=jsonld_path, format='json-ld')



<Graph identifier=N91f24a6c1c7447bdb399c33eea231139 (<class 'rdflib.graph.Graph'>)>