# Populating the Knowledge database with elements of the Content database

This notebook is divided into two parts:
1) Connection and import of data stored in the Content DB
2) Population of the Knowledge DB

2021-05-04

## 1) Connection and import of data stored in the Content DB

### Import libraries

In [1]:
import os 
import re
import logging
import sys
import pyodbc
import hashlib
import pandas as pd
from datetime import datetime
from SPARQLWrapper import SPARQLWrapper, POST, DIGEST, GET
from SPARQLWrapper import JSON, INSERT, DELETE
import sparql_dataframe

### Connect to the Content database and to the SPARQL endpoint of the Knowledge database

In [2]:
def connect_db(DSN, DBA, UID, PWD):
    """Open a pyodbc connection and return it together with a cursor.

    Args:
        DSN: ODBC data source name.
        DBA: value sent as the ``DBA`` attribute of the connection string.
        UID: user name.
        PWD: password.

    Returns:
        A ``(connection, cursor)`` tuple.
    """
    conn_string = 'DSN={};DBA={};UID={};PWD={}'.format(DSN, DBA, UID, PWD)
    db = pyodbc.connect(conn_string)
    return db, db.cursor()


def connect_virtuoso(DSN, UID, PWD):
    """Build a SPARQLWrapper client for the endpoint URL ``DSN``.

    The client is configured with HTTP Digest authentication using
    ``UID``/``PWD`` and with GET as its request method.

    Returns:
        The configured ``SPARQLWrapper`` instance.
    """
    client = SPARQLWrapper(DSN)
    client.setMethod(GET)
    client.setHTTPAuth(DIGEST)
    client.setCredentials(UID, PWD)
    return client


Replace "LOGIN" and "PASSWORD" below with your own credentials.

In [4]:
# Content DB (CDB): ODBC connection and cursor
connection, cursor = connect_db(DSN='VirtuosoKapcode',
                                DBA='ESTAT',
                                UID='LOGIN',
                                PWD='PASSWORD')

# Knowledge DB (KDB): client for the SPARQL endpoint
endpoint = "http://ks-devapp.ad.kappasante.lan:8890/sparql/"
sparql = connect_virtuoso(DSN=endpoint, UID='LOGIN', PWD='PASSWORD')


### Define SQL queries to get content 

In [5]:
def select_query(columns, table, conditions=None):
    """Return the text of a SELECT statement.

    Args:
        columns: column list placed after ``SELECT`` (e.g. ``'*'``).
        table: table name placed after ``FROM``.
        conditions: optional text placed after ``WHERE``; when falsy
            (``None`` or empty) no WHERE clause is emitted.

    Returns:
        The query as a multi-line string.
    """
    # Unfiltered variant: nothing to put in a WHERE clause.
    if not conditions:
        return """

        SELECT {}
        FROM {}
        """.format(columns, table)

    # Filtered variant.
    return """
            SELECT {}
            FROM {}
            WHERE {}

        """.format(columns, table, conditions)


### Apply the SQL queries 

In [6]:
# Get all elements from all tables :
def _fetch_all(table, conditions=None):
    """Load every column of ``table`` (optionally filtered) into a DataFrame."""
    return pd.read_sql(select_query('*', table, conditions), connection)


# dat_link_info is split in two on its `eurostat` flag
eurostat_links = _fetch_all('dat_link_info', 'eurostat = 1')
foreign_links = _fetch_all('dat_link_info', 'eurostat = 0')

# The remaining tables are loaded in full
concepts = _fetch_all('dat_concept')
furtherinfo = _fetch_all('dat_further_info')
relatedconcepts = _fetch_all('dat_related_concepts')
statisticaldata = _fetch_all('dat_statistical_data')
sources = _fetch_all('dat_sources')


## 2) Population of the Knowledge DB

### Define SPARQL queries from the previously loaded content to populate the KDB

In [7]:
def insert_basic_query(DB, method):  # works for eurostat links, foreign links
    """Insert one resource description per row of ``DB`` into the KDB graph.

    Each row is read positionally as ``(id, title, url, eurostat)``; any
    further columns are ignored. For every row, four triples about
    ``estat:<id>`` are sent to the module-level ``sparql`` client in a single
    INSERT request.

    Args:
        DB: pandas DataFrame whose first four columns are, in order, the
            id, the title, the URL and the Eurostat flag.
        method: HTTP method assigned to ``sparql.method`` (e.g. ``"POST"``).
    """

    def literal(value):
        # Render `value` as a single-quoted SPARQL literal. Backslash is
        # escaped first so the escapes added afterwards are not doubled.
        text = str(value)
        for raw, escaped in (("\\", "\\\\"), ("'", "\\'"),
                             ("\n", "\\n"), ("\r", "\\r")):
            text = text.replace(raw, escaped)
        return "'" + text + "'"

    # Client configuration does not depend on the row, so do it once.
    sparql.method = method
    sparql.setReturnFormat(JSON)

    # itertuples yields plain positional tuples, which avoids integer-position
    # lookups on a label-indexed Series (deprecated in recent pandas).
    for row in DB.itertuples(index=False, name=None):
        id_, title_, url_, eurostat_ = row[:4]

        # Apostrophes are dropped from the title (not escaped) so the stored
        # titles stay identical to those produced by earlier runs.
        title_ = title_.replace("'", "")

        subject = "estat:" + str(id_)
        triples = (
            (subject, "rdf:about", literal(id_)),
            (subject, "dct:source", literal(url_)),
            (subject, "dct:title", literal(title_)),
            (subject, "estat:EurostatResource", literal(eurostat_)),
        )

        query_insert_P = "\n".join([
            "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>",
            "PREFIX estat: <https://ec.europa.eu/eurostat/resource/ontology/>",
            "PREFIX dct: <http://purl.org/dc/terms/>",
            "",
            "INSERT { GRAPH <https://ec.europa.eu/eurostat/resource/ontology/> {",
            "\n".join("    {} {} {} .".format(s, p, o) for s, p, o in triples),
            "} }",
            ";",
        ])

        sparql.setQuery(query_insert_P)
        # The response body is read and discarded.
        sparql.query().response.read()


def _sparql_literal(value):
    """Return ``value`` as a single-quoted SPARQL literal with escapes applied."""
    text = str(value)
    # Backslash first, so the escapes added afterwards are not doubled.
    for raw, escaped in (("\\", "\\\\"), ("'", "\\'"),
                         ("\n", "\\n"), ("\r", "\\r")):
        text = text.replace(raw, escaped)
    return "'" + text + "'"


def _insert_triples(triples, method):
    """Send one INSERT of ``(subject, predicate, object)`` triples to the KDB graph."""
    query = "\n".join([
        "PREFIX estat: <https://ec.europa.eu/eurostat/resource/ontology/>",
        "PREFIX skos: <http://www.w3.org/2004/02/skos/core#>",
        "",
        "INSERT { GRAPH <https://ec.europa.eu/eurostat/resource/ontology/> {",
        "\n".join("    {} {} {} .".format(s, p, o) for s, p, o in triples),
        "} }",
        ";",
    ])

    sparql.setQuery(query)
    sparql.method = method
    sparql.setReturnFormat(JSON)
    # The response body is read and discarded.
    sparql.query().response.read()


def insert_queries(DB, classe, method):
    """Insert the rows of a content table into the KDB graph, one request per row.

    The table kind is recognised from its number of columns:

    * 5 columns (concept table): each row is read positionally as
      ``(id, definition, redirection, homepage, last_update)``. The URL and
      title come from the module-level ``eurostat_links`` row with the same
      id. ``classe`` is not used for this table.
    * 3 columns (further info, related concepts, statistical data, sources):
      each row is read positionally as ``(id, concept_id, link_id)`` and
      produces the triple ``estat:<concept_id> <classe> '<link_id>'``.

    Previously the request was issued once, after the loop, so only the last
    row of each table reached the endpoint; it is now issued for every row.

    Args:
        DB: pandas DataFrame with 3 or 5 columns, as described above.
        classe: prefixed predicate name (``estat:`` or ``skos:``) used for
            3-column tables.
        method: HTTP method assigned to ``sparql.method`` (e.g. ``"POST"``).

    Raises:
        ValueError: if ``DB`` has neither 3 nor 5 columns, or if a concept id
            does not match exactly one row of ``eurostat_links``.
    """
    column_count = len(DB.columns)
    if column_count not in (3, 5):
        raise ValueError(
            "insert_queries expects a table with 3 or 5 columns, got {}"
            .format(column_count)
        )

    # itertuples yields positional tuples and keeps each column's own dtype,
    # whereas iterrows coerces every row to a single dtype.
    rows = DB.itertuples(index=False, name=None)

    if column_count == 5:  # concept table
        for id_, def_, redirection_, _homepage, _last_update in rows:
            # Get info from the eurostat_links; .item() raises unless exactly
            # one link row carries this id.
            link = eurostat_links[eurostat_links['id'] == id_]
            url_ = link.url.item()
            title_ = link.title.item()

            subject = "estat:" + str(id_)
            _insert_triples(
                [
                    (subject, "skos:concept", _sparql_literal(id_)),
                    (subject, "skos:definition", _sparql_literal(def_)),
                    (subject, "estat:isRedirectedTo",
                     _sparql_literal(redirection_)),
                    (subject, "estat:linkToResource", _sparql_literal(url_)),
                    (subject, "skos:prefLabel", _sparql_literal(title_)),
                ],
                method,
            )

    else:  # links, related concept, statistical data and source
        for _row_id, concept_id_, link_id_ in rows:
            _insert_triples(
                [("estat:" + str(concept_id_), classe,
                  _sparql_literal(link_id_))],
                method,
            )


### Populate the KDB

In [8]:
# Populate
# Link resources first (Eurostat, then foreign)
for _links in (eurostat_links, foreign_links):
    insert_basic_query(_links, "POST")

# Then the concepts themselves
insert_queries(concepts, '', "POST")

# Finally the concept-to-link relation tables, each with its own predicate
for _table, _predicate in (
    (furtherinfo, 'estat:furtherInformation'),
    (relatedconcepts, 'skos:related'),
    (statisticaldata, 'estat:statisticalInformation'),
    (sources, 'estat:sourceInformation'),
):
    insert_queries(_table, _predicate, 'POST')

### See added statements 

In [9]:

# Every (subject, predicate, object) statement stored in the ontology graph
SelectAllStatements = """
SELECT * FROM <https://ec.europa.eu/eurostat/resource/ontology/>
WHERE { ?s ?p ?o

}
"""

# NOTE(review): the recorded output has exactly 10000 rows, which may be a
# server-side result cap rather than the full graph - confirm.
statements_df = sparql_dataframe.get(endpoint, SelectAllStatements)

# Show the result size, then its last rows
print(statements_df.shape, statements_df.tail(), sep="\n")


(10000, 3)
                                                      s  \
9995  https://ec.europa.eu/eurostat/resource/ontolog...   
9996  https://ec.europa.eu/eurostat/resource/ontolog...   
9997  https://ec.europa.eu/eurostat/resource/ontolog...   
9998  https://ec.europa.eu/eurostat/resource/ontolog...   
9999  https://ec.europa.eu/eurostat/resource/ontolog...   

                                                     p     o  
9995  http://www.w3.org/1999/02/22-rdf-syntax-ns#about  1461  
9996  http://www.w3.org/1999/02/22-rdf-syntax-ns#about  1462  
9997  http://www.w3.org/1999/02/22-rdf-syntax-ns#about  1463  
9998  http://www.w3.org/1999/02/22-rdf-syntax-ns#about  1464  
9999  http://www.w3.org/1999/02/22-rdf-syntax-ns#about  1465  
