# Create CDI dataset description

In [7]:
# Import necessary libraries and modules
import rdflib
from rdflib import Graph, Namespace, URIRef, Literal, BNode
from rdflib.namespace import NamespaceManager
from rdflib.namespace import RDF, RDFS, SKOS, XSD, OWL
import os
import pandas as pd
import re


In [8]:

def clean_column_name(col_name):
    """Turn a spreadsheet column header into an upper-case identifier.

    Spaces become underscores, every remaining character that is not a
    word character (letter, digit or underscore) is dropped, and the
    result is upper-cased.

    Parameters
    ----------
    col_name : str
        Original column header.

    Returns
    -------
    str
        The normalised header, e.g. ``"Rate (percent)"`` -> ``"RATE_PERCENT"``.
    """
    underscored = col_name.replace(' ', '_')
    # \W is the complement of \w, i.e. the same class as [^\w]
    return re.sub(r'\W', '', underscored).upper()

In [9]:
# Prefix and base IRI under which every individual created in this notebook is minted
example_namespace_abbr = "ex2"
example_namespace = "http://example2.org/"

## Read CDI profile

In [10]:
# Namespaces used throughout the notebook: the DDI-CDI vocabulary and the
# example namespace configured above.
cdi = Namespace("http://ddialliance.org/Specification/DDI-CDI/1.0/RDF/")
ex2 = Namespace(example_namespace)

# Start from the previously generated CDI profile and register both prefixes
# so that serialisations use compact names.
g = Graph()
g.parse('../data/output/cdi_profile1.ttl', format='turtle')
g.bind("cdi", cdi)
g.bind(example_namespace_abbr, ex2)

## Read dataset and code lists

In [11]:
# Load the employment observations and normalise the column headers in one chain
data = pd.read_excel(
    "../data/input/dagstuhl2024/Example - GDP and Employment.xlsx",
    sheet_name="EMPLOYMENT_DATA",
).rename(columns=clean_column_name)
data

Unnamed: 0,COUNTRY,ACTIVITY,SEX,YEAR,EMPLOYMENT_RATE_PERCENT,POPULATION_MILLION
0,4,ISIC4_A01,F,2022,0.019,5.4
1,8,ISIC4_A01,F,2022,0.019,3.0
2,12,ISIC4_A01,F,2022,0.010,4.8
3,16,ISIC4_A01,F,2022,0.016,3.4
4,20,ISIC4_A01,F,2022,0.019,0.5
...,...,...,...,...,...,...
163,51,ISIC4_A03,M,2023,0.048,
164,533,ISIC4_A03,M,2023,0.039,
165,36,ISIC4_A03,M,2023,0.020,
166,40,ISIC4_A03,M,2023,0.042,


In [12]:
# One DataFrame per code-list sheet of the workbook, keyed by sheet name
code_lists = {
    sheet: pd.read_excel("../data/input/dagstuhl2024/Example - GDP and Employment.xlsx", sheet_name=sheet)
    for sheet in ("CL_COUNTRY", "CL_ACTIVITY", "CL_SEX")
}
# Show the three code lists in insertion order (country, activity, sex)
display(*code_lists.values())

Unnamed: 0,CODE,NAME
0,4,Afghanistan
1,8,Albania
2,12,Algeria
3,16,American Samoa
4,20,Andorra
5,24,Angola
6,660,Anguilla
7,28,Antigua and Barbuda
8,32,Argentina
9,51,Armenia


Unnamed: 0,CODE,NAME
0,ISIC4_A01,"01 - Crop and animal production, hunting and r..."
1,ISIC4_A02,02 - Forestry and logging
2,ISIC4_A03,03 - Fishing and aquaculture


Unnamed: 0,CODE,NAME
0,M,Male
1,F,Female
2,T,Total


# Populate profile with individuals

In [13]:
# Column labels of the dataset, in their original order
variableIDs = data.columns.tolist()
variableIDs

['COUNTRY',
 'ACTIVITY',
 'SEX',
 'YEAR',
 'EMPLOYMENT_RATE_PERCENT',
 'POPULATION_MILLION']

### 1. DimensionalDataSet

In [14]:
# Declare the dataset individual as a cdi:DimensionalDataSet
g.add((ex2["EMP_DATA_EX2"], RDF.type, cdi["DimensionalDataSet"]))


<Graph identifier=Nf0bd1b286b5d4b87a09cd6965ff9c174 (<class 'rdflib.graph.Graph'>)>

### 2. DimensionalDataStructure

In [15]:
# Declare the data structure individual as a cdi:DimensionalDataStructure
g.add((ex2["EMP/DSD"], RDF.type, cdi["DimensionalDataStructure"]))


<Graph identifier=Nf0bd1b286b5d4b87a09cd6965ff9c174 (<class 'rdflib.graph.Graph'>)>

### 3. Data structure components

In [16]:
# Descriptors of the data structure components.
# Each entry holds the column id, a human-readable English name and the
# code list DataFrame enumerating its values (None when there is no code list).
dimensions = [
    {"id": "COUNTRY",  "name": "Country or area",   "cl": code_lists["CL_COUNTRY"]},
    {"id": "ACTIVITY", "name": "Economic activity", "cl": code_lists["CL_ACTIVITY"]},
    {"id": "SEX",      "name": "Sex",               "cl": code_lists["CL_SEX"]},
    {"id": "YEAR",     "name": "Year",              "cl": None}
]

measures = [
    {"id": "EMPLOYMENT_RATE_PERCENT", "name": "Employment rate", "cl": None},
    # Name previously read "Economic activity", copied from the ACTIVITY dimension
    {"id": "POPULATION_MILLION",      "name": "Population",      "cl": None}
]

# No attribute components in this example
attributes = []

               

In [17]:
# Describe every dimension of the data structure: the DimensionComponent itself,
# its RepresentedVariable and, when a code list is supplied, the matching
# SKOS concept scheme, CDI substantive value domain and individual codes.
for dc in dimensions:

    component = ex2[f"DimensionComponent/{dc['id']}"]
    # Single IRI for the variable, reused below so that the typed/labelled node
    # and the node referenced by the component are guaranteed to be the same.
    variable = ex2[f"RepresentedVariable/{dc['id']}"]

    g.add((component, RDF.type, cdi.DimensionComponent))

    # COUNTRY and YEAR additionally carry a geographic / temporal role
    if dc['id'] == 'COUNTRY':
        g.add((component, cdi['DataStructureComponent-specialization'], cdi['GeoRole']))

    if dc['id'] == 'YEAR':
        g.add((component, cdi['DataStructureComponent-specialization'], cdi['TimeRole']))

    g.add((ex2['EMP/DSD'], cdi.hasComponent, component))

    #----- Variable cascade

    g.add((variable, RDF.type, cdi.RepresentedVariable))
    g.add((variable, RDFS['label'], Literal(dc['name'], lang='en')))
    g.add((component, cdi.DataStructureComponent_isDefinedBy_RepresentedVariable, variable))

    #----- Add code lists as instances of skos:ConceptScheme

    code_list = dc['cl']
    if code_list is None:
        # Dimension without an enumerated value set: nothing more to describe
        continue

    code_list_id = f"CL_{dc['id']}"
    concept_scheme = ex2[f"ConceptScheme/{code_list_id}"]
    value_domain = ex2[f"SubstantiveValueDomain/{code_list_id}"]

    g.add((concept_scheme, RDF.type, SKOS.ConceptScheme))
    g.add((value_domain, RDF.type, cdi.SubstantiveValueDomain))
    g.add((value_domain,
           cdi.SubstantiveValueDomain_takesValuesFrom_EnumerationDomain,
           concept_scheme))
    g.add((variable,
           cdi.RepresentedVariable_takesSubstantiveValuesFrom_SubstantiveValueDomain,
           value_domain))

    #-------- Add individual codes

    # Iterate over the two columns directly so each code keeps its own dtype
    for code, name in zip(code_list['CODE'], code_list['NAME']):
        concept = ex2[f"Concept/{code_list_id}/{code}"]
        g.add((concept, RDF.type, SKOS.Concept))
        # skos:inScheme must point at the concept scheme, not at the value domain
        g.add((concept, SKOS.inScheme, concept_scheme))
        g.add((concept, RDFS['label'], Literal(name, lang='en')))



In [18]:
# Describe every attribute of the data structure: the AttributeComponent itself,
# its RepresentedVariable and, when a code list is supplied, the matching
# SKOS concept scheme, CDI substantive value domain and individual codes.
# All individuals are minted in the ex2 namespace, and the label comes from the
# attribute descriptor itself.
for a in attributes:

    component = ex2[f"AttributeComponent/{a['id']}"]
    # Single IRI for the variable, reused below so that the typed/labelled node
    # and the node referenced by the component are guaranteed to be the same.
    variable = ex2[f"RepresentedVariable/{a['id']}"]

    g.add((component, RDF.type, cdi.AttributeComponent))
    g.add((ex2['EMP/DSD'], cdi.hasComponent, component))

    #----- Variable cascade

    g.add((variable, RDF.type, cdi.RepresentedVariable))
    g.add((variable, RDFS['label'], Literal(a['name'], lang='en')))
    g.add((component, cdi.DataStructureComponent_isDefinedBy_RepresentedVariable, variable))

    #----- Add code lists as instances of skos:ConceptScheme

    code_list = a['cl']
    if code_list is None:
        # Attribute without an enumerated value set: nothing more to describe
        continue

    code_list_id = f"CL_{a['id']}"
    concept_scheme = ex2[f"ConceptScheme/{code_list_id}"]
    value_domain = ex2[f"SubstantiveValueDomain/{code_list_id}"]

    g.add((concept_scheme, RDF.type, SKOS.ConceptScheme))
    g.add((value_domain, RDF.type, cdi.SubstantiveValueDomain))
    g.add((value_domain,
           cdi.SubstantiveValueDomain_takesValuesFrom_EnumerationDomain,
           concept_scheme))
    g.add((variable,
           cdi.RepresentedVariable_takesSubstantiveValuesFrom_SubstantiveValueDomain,
           value_domain))

    #-------- Add individual codes

    # Iterate over the two columns directly so each code keeps its own dtype
    for code, name in zip(code_list['CODE'], code_list['NAME']):
        concept = ex2[f"Concept/{code_list_id}/{code}"]
        g.add((concept, RDF.type, SKOS.Concept))
        # skos:inScheme must point at the concept scheme, not at the value domain
        g.add((concept, SKOS.inScheme, concept_scheme))
        g.add((concept, RDFS['label'], Literal(name, lang='en')))


    

In [19]:
# Describe every measure of the data structure: the MeasureComponent itself
# and the RepresentedVariable that defines it.
for m in measures:

    component = ex2[f"MeasureComponent/{m['id']}"]
    # Single IRI for the variable, reused below so that the typed/labelled node
    # and the node referenced by the component are guaranteed to be the same.
    variable = ex2[f"RepresentedVariable/{m['id']}"]

    g.add((component, RDF.type, cdi.MeasureComponent))
    g.add((ex2['EMP/DSD'], cdi.hasComponent, component))

    #----- Variable cascade

    g.add((variable, RDF.type, cdi.RepresentedVariable))
    # Label comes from the measure descriptor itself
    g.add((variable, RDFS['label'], Literal(m['name'], lang='en')))
    g.add((component, cdi.DataStructureComponent_isDefinedBy_RepresentedVariable, variable))




## Save CDIF profile

In [20]:
# Write the populated graph to disk in Turtle syntax
g.serialize(destination="../data/output/cdif_profile_EMP.ttl", format="turtle")

<Graph identifier=Nf0bd1b286b5d4b87a09cd6965ff9c174 (<class 'rdflib.graph.Graph'>)>

In [21]:
# Write the same graph to disk as JSON-LD
g.serialize(destination="../data/output/cdif_profile_EMP.jsonld", format="json-ld")



<Graph identifier=Nf0bd1b286b5d4b87a09cd6965ff9c174 (<class 'rdflib.graph.Graph'>)>