In [1]:
import pandas as pd
import numpy as np
import re
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_colwidth', None)

import rdflib
from urllib.parse import quote
from rdflib import Graph, URIRef, Literal, BNode, Namespace
from rdflib.namespace import FOAF, RDF, OWL, RDFS, XSD

## 1. Load Data Sources

#### MMDS_Courses

In [2]:
# Load the MMDS course catalogue.
df_mmds_courses = pd.read_csv('../data/processed/mmds_courses.csv')

# ECTS entries that are not purely digits are replaced by '18' before the
# whole column is cast to integers.
ects_raw = df_mmds_courses['ECTS']
df_mmds_courses['ECTS'] = np.where(ects_raw.str.isdigit(), ects_raw, '18')
df_mmds_courses['ECTS'] = df_mmds_courses['ECTS'].astype(int)

display(df_mmds_courses.head())
print('Size df_mmds_courses: ', len(df_mmds_courses))

Unnamed: 0,module_code,Name of Module,Offered,Language,ECTS,moduleCategory
0,CS 450,Programming Course,HWS,E,6,Fundamentals
1,CS 460,Database Technology,FSS,E,6,Fundamentals
2,CS 470,Python for Data Scientists,FSS,E,6,Fundamentals
3,,Multivariate Analyses,HWS,E,6,Fundamentals
4,,Tutorial Multivariate Analyses,HWS,E,2,Fundamentals


Size df_mmds_courses:  48


In [3]:
# Courses that have no module code.
df_mmds_courses.loc[df_mmds_courses['module_code'].isna()]

Unnamed: 0,module_code,Name of Module,Offered,Language,ECTS,moduleCategory
3,,Multivariate Analyses,HWS,E,6,Fundamentals
4,,Tutorial Multivariate Analyses,HWS,E,2,Fundamentals
5,,Empirische Methoden der Politikwissenschaft,HWS,G/E,6,Fundamentals
46,,Legal and Ethical Aspects of Privacy,HWS,E,3,Responsible Data Science


In [4]:
# One row per module category together with the number of courses it contains.
mmds_modules = (
    df_mmds_courses['moduleCategory']
    .value_counts()
    .rename_axis('module')
    .reset_index(name="counts")
)
mmds_modules

Unnamed: 0,module,counts
0,Data Analytics,24
1,Data Management,15
2,Fundamentals,6
3,Responsible Data Science,3


#### MMDS_Courses_Data

In [5]:
# Load the long-format table holding one (course, property, value) record per row.
mmds_courses_data = pd.read_csv('../data/processed/mmds_courses_data.csv')

display(mmds_courses_data.head())
print('Size mmds_courses_data: ', len(mmds_courses_data))

Unnamed: 0,module_code,module_name,property,property_value
0,AC 651,Additional Course – Data Management,Aim of module,"The course falls into the data management area of the MMDS and covers topics related to data management, but is not di- rectly equivalent to any course in the MMDS module cata- logue. The course level equals a regular course in MMDS study program. The module can be taken either at the University of Mannheim or at any other university in Germany or abroad."
1,AC 651,Additional Course – Data Management,C 651,Additional Course – Data Management
2,AC 651,Additional Course – Data Management,Duration of module,1 Semester
3,AC 651,Additional Course – Data Management,ECTS,Max. 18
4,AC 651,Additional Course – Data Management,Form of module,Depends on course


Size mmds_courses_data:  737


In [6]:
# Number of records whose property_value is missing
int(mmds_courses_data['property_value'].isna().sum())

22

## 2. URIs Creation

#### Programs : https://www.wim.uni-mannheim.de/en/academics/programs/

        - Base URL : https://www.uni-mannheim.de/en/academics/
        - MMDS : https://www.uni-mannheim.de/en/academics/programs/mannheim-master-in-data-science/
        - BI.  : https://www.uni-mannheim.de/en/academics/programs/masters-program-in-business-informatics/
        - Courses : https://www.uni-mannheim.de/en/academics/courses/

In [7]:
# Namespaces used to mint URIs; every namespace is rooted at the academics base URL.
baseURL = Namespace('https://www.uni-mannheim.de/en/academics/')

# Program-level namespaces (the MMDS program and its modules).
mmdsURL = Namespace(baseURL['programs/mannheim-master-in-data-science/'])
mmdsModuleURL = Namespace(baseURL['programs/mannheim-master-in-data-science/modules/'])

# Namespaces for courses, properties and professors.
courseURL = Namespace(baseURL['courses/'])
propertyURL = Namespace(baseURL['property/'])
professorURL = Namespace(baseURL['professor/'])

In [8]:
def create_uri(name:str,baseURL):
    """Build a URI for ``name`` inside the namespace ``baseURL``.

    Every space in ``name`` is turned into an underscore, the result is
    percent-encoded with ``urllib.parse.quote`` and then looked up in
    ``baseURL`` via item access (``baseURL[...]``), whose result is returned.
    """
    underscored = "_".join(name.split(" "))
    return baseURL[quote(underscored)]

### 2.1 Module URIs

In [9]:
# Mint one URI per module inside the MMDS modules namespace.
mmds_modules['moduleURI'] = [
    create_uri(module_name, mmdsModuleURL) for module_name in mmds_modules['module']
]

display(mmds_modules)
print('Example: ', mmds_modules['moduleURI'][0])

Unnamed: 0,module,counts,moduleURI
0,Data Analytics,24,https://www.uni-mannheim.de/en/academics/programs/mannheim-master-in-data-science/modules/Data_Analytics
1,Data Management,15,https://www.uni-mannheim.de/en/academics/programs/mannheim-master-in-data-science/modules/Data_Management
2,Fundamentals,6,https://www.uni-mannheim.de/en/academics/programs/mannheim-master-in-data-science/modules/Fundamentals
3,Responsible Data Science,3,https://www.uni-mannheim.de/en/academics/programs/mannheim-master-in-data-science/modules/Responsible_Data_Science


Example:  https://www.uni-mannheim.de/en/academics/programs/mannheim-master-in-data-science/modules/Data_Analytics


### 2.2 Course URIs

In [10]:
# Keep only the courses that have a module code
mmds_courses = df_mmds_courses.loc[df_mmds_courses['module_code'].notna()].copy()

# Course URI = lowercase "<module code> <module name>" turned into a URI in the course namespace
mmds_courses['courseURI'] = (
    mmds_courses['module_code'].str.lower()
    + ' '
    + mmds_courses['Name of Module'].str.lower()
).map(lambda label: create_uri(label, courseURL))

display(mmds_courses.head())
print('Example: ', mmds_courses['courseURI'][0])

Unnamed: 0,module_code,Name of Module,Offered,Language,ECTS,moduleCategory,courseURI
0,CS 450,Programming Course,HWS,E,6,Fundamentals,https://www.uni-mannheim.de/en/academics/courses/cs_450_programming_course
1,CS 460,Database Technology,FSS,E,6,Fundamentals,https://www.uni-mannheim.de/en/academics/courses/cs_460_database_technology
2,CS 470,Python for Data Scientists,FSS,E,6,Fundamentals,https://www.uni-mannheim.de/en/academics/courses/cs_470_python_for_data_scientists
6,AC 651,Additional Course – Data Management,HWS/FSS,E,18,Data Management,https://www.uni-mannheim.de/en/academics/courses/ac_651_additional_course_%E2%80%93_data_management
7,CS 500,Advanced Software Engineering,HWS,E,6,Data Management,https://www.uni-mannheim.de/en/academics/courses/cs_500_advanced_software_engineering


Example:  https://www.uni-mannheim.de/en/academics/courses/cs_450_programming_course


### 2.3 Property URIs

#### Properties selection

In [11]:
# Frequency of each property label; only labels occurring more than 16 times are kept.
mmds_course_property = (
    mmds_courses_data['property']
    .value_counts()
    .rename_axis('property')
    .reset_index(name="counts")
    .loc[lambda property_counts: property_counts['counts'] > 16]
)
mmds_course_property.head()

Unnamed: 0,property,counts
0,Aim of module,32
1,ECTS,32
2,Level,32
3,Person in charge,32
4,Prerequisites,32


Selected properties

In [12]:
# Property labels that are carried over into the knowledge graph.
property_list = [
    'Duration of assessment',
    'Prerequisites',
    'Form of assessment',
    'Aim of module',
    'Workload',
    'Lecturer',
    'Duration of module',
    'Semester',
    'Methods',
    'Range of application',
    'Admission requirements',
]
print(property_list)

['Duration of assessment', 'Prerequisites', 'Form of assessment', 'Aim of module', 'Workload', 'Lecturer', 'Duration of module', 'Semester', 'Methods', 'Range of application', 'Admission requirements']


Filter the dataframe to include only these properties

In [13]:
# Independent copy restricted to the selected properties.
mmds_courses_data_ = mmds_courses_data.loc[
    mmds_courses_data['property'].isin(property_list)
].copy()
print('Size mmds_courses_data_: ', len(mmds_courses_data_))

Size mmds_courses_data_:  320


Create course URI, property URI, and professor URI

In [14]:
# Course URI: lowercase "<module code> <module name>" minted in the course namespace
mmds_courses_data_['courseURI'] = (
    mmds_courses_data_['module_code'].str.lower()
    + ' '
    + mmds_courses_data_['module_name'].str.lower()
).map(lambda label: create_uri(label, courseURL))

# Property URI: lowercase property label minted in the property namespace
mmds_courses_data_['propertyURI'] = (
    mmds_courses_data_['property'].str.lower()
    .map(lambda label: create_uri(label, propertyURL))
)

# Unique list of properties
propertyURI = mmds_courses_data_['propertyURI'].unique().tolist()
propertyURI

[rdflib.term.URIRef('https://www.uni-mannheim.de/en/academics/property/aim_of_module'),
 rdflib.term.URIRef('https://www.uni-mannheim.de/en/academics/property/duration_of_module'),
 rdflib.term.URIRef('https://www.uni-mannheim.de/en/academics/property/lecturer'),
 rdflib.term.URIRef('https://www.uni-mannheim.de/en/academics/property/prerequisites'),
 rdflib.term.URIRef('https://www.uni-mannheim.de/en/academics/property/range_of_application'),
 rdflib.term.URIRef('https://www.uni-mannheim.de/en/academics/property/semester'),
 rdflib.term.URIRef('https://www.uni-mannheim.de/en/academics/property/workload'),
 rdflib.term.URIRef('https://www.uni-mannheim.de/en/academics/property/admission_requirements'),
 rdflib.term.URIRef('https://www.uni-mannheim.de/en/academics/property/duration_of_assessment'),
 rdflib.term.URIRef('https://www.uni-mannheim.de/en/academics/property/form_of_assessment'),
 rdflib.term.URIRef('https://www.uni-mannheim.de/en/academics/property/methods')]

### 2.4 Lecturer URIs

In [15]:
#create professor URI
lecturere_mask = mmds_courses_data_.property=='Lecturer'

# Clean the raw lecturer strings with *literal* replacements, applied in order.
# regex=False is essential: with regex=True the pattern '.' is a wildcard on
# pandas >= 2.0 and would delete every character of every lecturer name.
lecturer_replacements = [
    ('.', ''),
    ('; ', ','),
    (',nan', ''),
    ('Lecturer at the host university', 'LHU'),
    ('Prof ', ''),
    ('Dr ', ''),
    ('Junior ', ''),
    ('Professor ', ''),
    ('Simone Paolo Ponzetto', 'Simone Ponzetto'),
]
lecturer_names = mmds_courses_data_.loc[lecturere_mask,'property_value']
for old_text, new_text in lecturer_replacements:
    lecturer_names = lecturer_names.str.replace(old_text, new_text, regex=False)

# One list of names per Lecturer row, then one row per individual lecturer.
mmds_courses_data_.loc[lecturere_mask,'property_value'] = lecturer_names.str.split(",")
mmds_courses_data_ = mmds_courses_data_.explode('property_value').reset_index(drop=True)

# explode() repeats index labels and changes the row count, so the index is
# rebuilt above and the mask is recomputed here instead of reusing the stale one.
lecturere_mask = mmds_courses_data_.property=='Lecturer'
mmds_courses_data_.loc[lecturere_mask,'property_value'] = mmds_courses_data_.loc[lecturere_mask,'property_value'].apply(lambda x: create_uri(x,professorURL))

print ('Size mmds_courses_data_: ',len(mmds_courses_data_))

#Unique list of Lecturers
lecturerURI = list(mmds_courses_data_.loc[lecturere_mask,'property_value'].unique())
lecturerURI

Size mmds_courses_data_:  321


  mmds_courses_data_.loc[lecturere_mask,'property_value'] = mmds_courses_data_[lecturere_mask].property_value.apply(lambda x: create_uri(x,professorURL))


[rdflib.term.URIRef('https://www.uni-mannheim.de/en/academics/professor/LHU'),
 rdflib.term.URIRef('https://www.uni-mannheim.de/en/academics/professor/Ursula_Rost'),
 rdflib.term.URIRef('https://www.uni-mannheim.de/en/academics/professor/Heiko_Paulheim'),
 rdflib.term.URIRef('https://www.uni-mannheim.de/en/academics/professor/Marcus_Kessel'),
 rdflib.term.URIRef('https://www.uni-mannheim.de/en/academics/professor/Colin_Atkinson'),
 rdflib.term.URIRef('https://www.uni-mannheim.de/en/academics/professor/Guido_Moerkotte'),
 rdflib.term.URIRef('https://www.uni-mannheim.de/en/academics/professor/Matthias_Krause'),
 rdflib.term.URIRef('https://www.uni-mannheim.de/en/academics/professor/Rainer_Gemulla'),
 rdflib.term.URIRef('https://www.uni-mannheim.de/en/academics/professor/Frederik_Armknecht'),
 rdflib.term.URIRef('https://www.uni-mannheim.de/en/academics/professor/Roland_Lei%C3%9Fa'),
 rdflib.term.URIRef('https://www.uni-mannheim.de/en/academics/professor/MKW'),
 rdflib.term.URIRef('https:

## 3. KG Building

In [16]:
# Namespaces for the knowledge graph (same values as defined in section 2).
baseURL = Namespace('https://www.uni-mannheim.de/en/academics/')

# Program-level namespaces (the MMDS program and its modules).
mmdsURL = Namespace(baseURL['programs/mannheim-master-in-data-science/'])
mmdsModuleURL = Namespace(baseURL['programs/mannheim-master-in-data-science/modules/'])

# Namespaces for courses, properties and professors.
courseURL = Namespace(baseURL['courses/'])
propertyURL = Namespace(baseURL['property/'])
professorURL = Namespace(baseURL['professor/'])

In [17]:
# Fresh graph with the prefixes used when serializing.
g = Graph()
for prefix, namespace in (
    ("foaf", FOAF),
    ("courseKG", baseURL),
    ("prop", propertyURL),
    ("owl", OWL),
    ("xsd", XSD),
):
    g.bind(prefix, namespace)

In [18]:
# Example of a term minted from the base namespace.
baseURL['programs']

rdflib.term.URIRef('https://www.uni-mannheim.de/en/academics/programs')

### 3.1 Ontology

#### Classes

In [19]:
# Class declarations and class hierarchy of the ontology.
class_triples = [
    (baseURL.programs, RDF.type, RDFS.Class),
    (baseURL.module, RDF.type, RDFS.Class),
    (baseURL.course, RDF.type, RDFS.Class),
    (baseURL.people, RDF.type, RDFS.Class),
    (baseURL.people, RDFS.subClassOf, FOAF.Person),
    (baseURL.professor, RDFS.subClassOf, baseURL.people),
]
for class_triple in class_triples:
    g.add(class_triple)
g

<Graph identifier=Nbf51302a2d714227b08ae1d6a683cde1 (<class 'rdflib.graph.Graph'>)>

#### Properties

In [20]:
# Scraped properties: declared as string-valued DatatypeProperties of a course
for prop in propertyURI:
    g.add((prop, RDF.type, OWL.DatatypeProperty))
    g.add((prop, RDFS.domain, baseURL.course))
    g.add((prop, RDFS.range, XSD.string))

# lecturer points at a professor resource, so it becomes an ObjectProperty;
# g.set replaces the type/domain/range values added for it by the loop above
for lecturer_predicate, lecturer_object in (
    (RDF.type, OWL.ObjectProperty),
    (RDFS.domain, baseURL.course),
    (RDFS.range, baseURL.professor),
):
    g.set((propertyURL.lecturer, lecturer_predicate, lecturer_object))

# Remaining properties as (local name, property type, domain, range)
declared_properties = [
    ('isCourseOf', OWL.ObjectProperty, baseURL.course, baseURL.module),
    ('isModuleOf', OWL.ObjectProperty, baseURL.module, baseURL.programs),
    ('courseCode', OWL.DatatypeProperty, baseURL.course, XSD.string),
    ('courseName', OWL.DatatypeProperty, baseURL.course, XSD.string),
    ('ects', OWL.DatatypeProperty, baseURL.course, XSD.integer),
    ('language', OWL.DatatypeProperty, baseURL.course, XSD.string),
    ('offering', OWL.DatatypeProperty, baseURL.course, XSD.string),
    ('programName', OWL.DatatypeProperty, baseURL.programs, XSD.string),
    ('moduleName', OWL.DatatypeProperty, baseURL.module, XSD.string),
    ('professorName', OWL.DatatypeProperty, baseURL.professor, XSD.string),
]
for local_name, property_type, property_domain, property_range in declared_properties:
    declared_property = propertyURL[local_name]
    g.add((declared_property, RDF.type, property_type))
    g.add((declared_property, RDFS.domain, property_domain))
    g.add((declared_property, RDFS.range, property_range))

g


<Graph identifier=Nbf51302a2d714227b08ae1d6a683cde1 (<class 'rdflib.graph.Graph'>)>

### 3.2 Assertion Box (ABox)

#### Programs

In [21]:
# The MMDS program is the single instance of the programs class.
mmds_program = URIRef(mmdsURL)
g.add((mmds_program, RDF.type, baseURL.programs))
g.add((mmds_program, propertyURL.programName, Literal('mannheim-master-in-data-science')))

<Graph identifier=Nbf51302a2d714227b08ae1d6a683cde1 (<class 'rdflib.graph.Graph'>)>

#### Modules

Define modules and connect them with literal values (moduleName)

In [22]:
# Type every module and attach its human-readable name.
for module_uri, module_name in zip(mmds_modules['moduleURI'], mmds_modules['module']):
    g.add((URIRef(module_uri), RDF.type, baseURL.module))
    g.add((module_uri, propertyURL.moduleName, Literal(module_name)))

Connect modules with programs

In [23]:
# Link every module to the MMDS program.
# Iterates the Series directly: Series.iteritems() no longer exists in pandas >= 2.0.
for module_uri in mmds_modules.moduleURI:
    g.add((module_uri, propertyURL.isModuleOf, URIRef(mmdsURL)))

#### Professors

In [24]:
# Type every lecturer as a professor; the name literal is the URI with the
# professor namespace prefix removed.
# NOTE(review): the literal therefore keeps underscores and percent-encoding — confirm this is intended.
professor_prefix = str(professorURL)
for profURI in lecturerURI:
    professor_label = str(profURI).replace(professor_prefix, '')
    g.add((profURI, RDF.type, baseURL.professor))
    g.add((profURI, propertyURL.professorName, Literal(professor_label)))

Connect to public KG

In [25]:
# OWL sameAs links to other (public) knowledge graphs
sameAs = {
    'https://www.uni-mannheim.de/en/academics/professor/Ursula_Rost' : ['https://dblp.org/pid/r/URost'],
    'https://www.uni-mannheim.de/en/academics/professor/Heiko_Paulheim': ['https://dblp.org/pid/39/4064', 'http://www.wikidata.org/entity/Q23709849'],
    'https://www.uni-mannheim.de/en/academics/professor/Marcus_Kessel' : ['https://dblp.org/pid/133/2140'],
    'https://www.uni-mannheim.de/en/academics/professor/Colin_Atkinson' : ['https://dblp.org/pid/a/ColinAtkinson', 'https://www.wikidata.org/entity/Q40100239'],
    'https://www.uni-mannheim.de/en/academics/professor/Guido_Moerkotte' : ['https://dblp.org/pid/m/GuidoMoerkotte','https://www.wikidata.org/entity/Q40472650'],
    'https://www.uni-mannheim.de/en/academics/professor/Matthias_Krause' : ['https://dblp.org/pid/90/2046-1','https://www.wikidata.org/entity/Q102110274'],
    'https://www.uni-mannheim.de/en/academics/professor/Rainer_Gemulla': ['https://dblp.org/pid/32/5357', 'http://www.wikidata.org/entity/Q39160786'],
    'https://www.uni-mannheim.de/en/academics/professor/Frederik_Armknecht' : ['https://dblp.org/pid/65/6856', 'https://www.wikidata.org/entity/Q102110273'],
    'https://www.uni-mannheim.de/en/academics/professor/Roland_Lei%C3%9Fa': ['https://dblp.org/pid/05/10957','https://www.wikidata.org/entity/Q108751937'],
    'https://www.uni-mannheim.de/en/academics/professor/Christian_Bizer': ['https://dblp.org/pid/b/ChristianBizer', 'http://www.wikidata.org/entity/Q17744291'],
    'https://www.uni-mannheim.de/en/academics/professor/Heiner_Stuckenschmidt': ['https://dblp.org/pid/50/5895', 'http://www.wikidata.org/entity/Q38721681'],
    'https://www.uni-mannheim.de/en/academics/professor/Markus_Strohmaier' : ['https://dblp.org/pid/01/6659', 'https://www.wikidata.org/entity/Q48612584'],
    'https://www.uni-mannheim.de/en/academics/professor/Simone_Ponzetto': ['https://dblp.org/pid/04/2532', 'http://www.wikidata.org/entity/Q28578432']
    }

for original, extensions in sameAs.items():
    for extension in extensions:
        g.add((URIRef(original), OWL.sameAs, URIRef(extension)))

#### Courses

In [26]:
# Declare every course URI as an instance of the course class.
for courseREF in mmds_courses['courseURI'].tolist():
    course_node = URIRef(courseREF)
    g.add((course_node, RDF.type, baseURL.course))

Connect courses with literal values and object instances

In [27]:
def add_course_details(course_data):
    """Add the course property records in ``course_data`` to the global graph ``g``.

    Parameters
    ----------
    course_data : pandas.DataFrame
        Must provide the columns ``courseURI`` (triple subject), ``propertyURI``
        (triple predicate), ``property`` (property label) and ``property_value``.

    For every row whose ``property_value`` is not null one triple is added:
    rows labelled ``'Lecturer'`` use the value itself as the object, all other
    rows use a lower-cased ``Literal`` of the value. Rows with a null value
    are skipped. Nothing is returned; ``g`` is modified in place.
    """
    needed_columns = ['courseURI', 'propertyURI', 'property', 'property_value']

    # Iterate the four columns directly instead of indexing cell by cell.
    for sub, prop, label, val in course_data[needed_columns].itertuples(index=False, name=None):
        # Records without a value carry no information for the graph.
        if pd.isnull(val):
            continue

        if label == 'Lecturer':
            # Link the lecturer object to the course.
            g.add((sub, prop, val))
        else:
            # str() keeps non-string values (e.g. numbers) from failing on .lower().
            g.add((sub, prop, Literal(str(val).lower())))

In [28]:
# Add the selected course properties (literal values and lecturer links) to the graph.
add_course_details(mmds_courses_data_)

Connect courses with modules and more literal values (courseCode, courseName)

In [29]:
def link_courses_modules(mmds_courses):
    """Link every course to its module and attach its catalogue literals in ``g``.

    Parameters
    ----------
    mmds_courses : pandas.DataFrame
        Must provide the columns ``courseURI``, ``moduleCategory``,
        ``module_code``, ``Name of Module``, ``ECTS``, ``Language`` and
        ``Offered``.

    For each row an ``isCourseOf`` triple is added that points from the course
    to the module URI derived from ``moduleCategory``, followed by the
    ``courseCode``, ``courseName``, ``ects`` (as an integer), ``language`` and
    ``offering`` literals. Null literal values are skipped, in line with
    ``add_course_details``. Nothing is returned; ``g`` is modified in place.
    """
    # Predicate local name in the property namespace -> source column.
    literal_columns = {
        "courseCode": "module_code",
        "courseName": "Name of Module",
        "ects": "ECTS",
        "language": "Language",
        "offering": "Offered",
    }

    for _, course in mmds_courses.iterrows():
        course_uri = course["courseURI"]
        module_uri = create_uri(course["moduleCategory"], mmdsModuleURL)

        # Link the course with its module.
        g.add((course_uri, propertyURL.isCourseOf, module_uri))

        for predicate_name, column in literal_columns.items():
            value = course[column]
            # A missing value would otherwise be written as a NaN literal.
            if pd.isnull(value):
                continue
            if predicate_name == "ects":
                # Plain Python int so the literal is typed as an integer.
                value = int(value)
            g.add((course_uri, propertyURL[predicate_name], Literal(value)))

In [30]:
# Attach module links and catalogue literals for every course that has a module code.
link_courses_modules(mmds_courses)

## 4. Save File I/O

In [31]:
# Write the graph to disk as Turtle ...
g.serialize(destination="../data/kg/mmds_courses_kg.ttl", format="ttl", encoding="utf-8")
# ... and keep an in-memory Turtle serialization for inspection.
s = g.serialize(format="ttl")

In [32]:
# Show the Turtle serialization of the graph.
print(s)

@prefix courseKG: <https://www.uni-mannheim.de/en/academics/> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix prop: <https://www.uni-mannheim.de/en/academics/property/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

courseKG:course a rdfs:Class .

courseKG:module a rdfs:Class .

courseKG:people a rdfs:Class ;
    rdfs:subClassOf foaf:Person .

courseKG:programs a rdfs:Class .

<https://www.uni-mannheim.de/en/academics/courses/ac_651_additional_course_%E2%80%93_data_management> a courseKG:course ;
    prop:aim_of_module "the course falls into the data management area of the mmds and covers topics related to data management, but is not di- rectly equivalent to any course in the mmds module cata- logue. the course level equals a regular course in mmds study program. the module can be taken either at the university of mannheim or at any other university in germany or abroad