In [1]:
import csv, re, requests
from rdflib import Dataset, URIRef, Literal, Namespace, RDF, RDFS, OWL, XSD
from iribaker import to_iri
from SPARQLWrapper import SPARQLWrapper, JSON
import pprint as pp

In [2]:
GROUP12_REPOSITORY_VOCAB = "http://data.krw.d2s.labs.vu.nl/group12/vocab/"

In [3]:
def getPostcodesCity():
    postcodesFile = open('postcode.csv', 'r')
    fieldnames = ['PC','PLAATS','GEMEENTE','PROVINCIE']
    postcodes = csv.DictReader(postcodesFile, delimiter=',', quotechar='"', fieldnames=fieldnames)
    headers = postcodes.next()
    postcodeDict = {}
    for row in postcodes:
        postcodeDict[row['PC']] = row['GEMEENTE']
    return postcodeDict
postcodeDict = getPostcodesCity()

In [4]:
def stringFilter(string, valid_chars):
    string = ''.join([c for c in string.replace(' ', '_') if c in valid_chars]).lower()
    return string

In [5]:
def getStreetNames():
    VALID_CHARS = list("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-")
    REPOSITORY = "http://stardog.krw.d2s.labs.vu.nl/group12"
    REPOSITORY_VOCAB = (REPOSITORY + "/vocab/").replace("stardog","data") 
    endpoint = REPOSITORY + '/query'
    sparql = SPARQLWrapper(endpoint)
    
    query = """
PREFIX kkv: <%s> 

SELECT DISTINCT ?s ?streetName WHERE {
  ?s a kkv:Street;
      rdfs:label ?streetName.
}LIMIT %d
 OFFSET %d"""

    limit = 1000
    offset = 0
    sparql.setQuery(query%(REPOSITORY_VOCAB, limit, offset))
    print query%(REPOSITORY_VOCAB, limit, offset)
    sparql.setReturnFormat(JSON)
    sparql.addParameter('Accept','application/sparql-results+json')
    results = sparql.query().convert()
    numResults = len(results['results']['bindings'])
    streetNames = {}
    while numResults > 0:
        for object in results['results']['bindings']:
            name = object['streetName']['value']
            s = object['s']['value']
            newName = stringFilter(name, VALID_CHARS)
            streetNames[newName] = s
        offset += limit
        print offset
        sparql.setQuery(query%(REPOSITORY_VOCAB, limit, offset))
        results = sparql.query().convert()
        numResults = len(results['results']['bindings'])
            
    return streetNames
streetNames = getStreetNames()


PREFIX kkv: <http://data.krw.d2s.labs.vu.nl/group12/vocab/> 

SELECT DISTINCT ?s ?streetName WHERE {
  ?s a kkv:Street;
      rdfs:label ?streetName.
}LIMIT 1000
 OFFSET 0
1000
2000
3000
4000
5000
6000


In [22]:
def checkEndpoint(repository, streetNames, postcodeDict, out):
    postcodeRegex = '^[1-9][0-9]{3} ?(?!sa|sd|ss)[a-z]{2}'
    
    VALID_CHARS = list("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-")
    REPOSITORY_VOCAB = (repository + "/vocab/").replace("stardog","data") 
    endpoint = repository + '/query'
    sparql12 = SPARQLWrapper("http://stardog.krw.d2s.labs.vu.nl/group12/query")
    sparql12.setReturnFormat(JSON)
    sparql12.addParameter('Accept','application/sparql-results+json')
    
    sparql = SPARQLWrapper(endpoint)
    sparql.setReturnFormat(JSON)
    sparql.addParameter('Accept','application/sparql-results+json')
    
    overallQuery = """
PREFIX kkv: <%s> 

SELECT DISTINCT ?s ?o WHERE {
  ?s ?p ?o.
}LIMIT %d
 OFFSET %d"""
    
    objectQuery = """
PREFIX kkv: <%s> 

SELECT DISTINCT ?o WHERE {
  <%s> ?p ?o.
}LIMIT %d  
    """
    
    roadsectionQuery = """
PREFIX kkv: <%s> 

SELECT DISTINCT ?s ?e_hnr_lnks ?l_hnr_lnks ?e_hnr_rhts ?l_hnr_rhts ?hnrstrlnks ?hnrstrrhts WHERE {
  ?s ?p <%s>.
  ?s a kkv:RoadSection;
       kkv:e_hnr_lnks	?e_hnr_lnks;
       kkv:l_hnr_lnks	?l_hnr_lnks;
       kkv:e_hnr_rhts	?e_hnr_rhts;
       kkv:l_hnr_rhts	?l_hnr_rhts;
       kkv:hnrstrlnks	?hnrstrlnks;
       kkv:hnrstrrhts	?hnrstrrhts.
}LIMIT %d  
    """
    
    limit = 5000
    offset = 0
    sparql.setQuery(overallQuery%(REPOSITORY_VOCAB, limit, offset))

    results = sparql.query().convert()
    numResults = len(results['results']['bindings'])
    for object in results['results']['bindings']:
        name = object['o']['value'].strip()
        subject = object['s']['value'].strip()
        newName = stringFilter(name, VALID_CHARS)
        #It looks like we found a street
        if newName and newName in streetNames:
            #maybe we also have the housenumber
            splitName = name.split()
            if splitName[-1].isdigit():
                straat = " ".join(splitName[0:-1])
                huisnummer = splitName[-1]
            else:
                straat = name
                huisnummer = 0
            
            sparql.setQuery(objectQuery%(REPOSITORY_VOCAB, subject,limit))
            results = sparql.query().convert()
            stad = None
            postcode = None
            for objectResult in results['results']['bindings']:
                objectName = objectResult['o']['value']
                if stringFilter(objectName, VALID_CHARS) == "amsterdam":
                    stad = objectName
            
                regexed = re.search(postcodeRegex, objectName, flags=re.IGNORECASE)
                if regexed:
                    postcode = regexed.group(0)
                    postNumber = re.search('[1-9][0-9]{3}', objectName, flags=re.IGNORECASE)
                    stad = postcodeDict[postNumber.group(0)]

            # We make this assumption to ensure we can match with at least 2 group ontologies
            if '13' in repository:
                stad = 'Amsterdam'
            
            if huisnummer and stad == "Amsterdam":
                #print "\nstraat:%s huisnummer:%s\npostcode:%s stad:%s\n%s"%(straat, huisnummer, postcode, stad, streetNames[newName])
                out.write("\nstraat:%s\nhuisnummer:%s\npostcode:%s\nstad:%s\n%s\n"%(straat, huisnummer, postcode, stad, streetNames[newName]))
                sparql12.setQuery(roadsectionQuery%(GROUP12_REPOSITORY_VOCAB, streetNames[newName],limit))
                results = sparql12.query().convert()
                try:
                    huisnummer = int(huisnummer)
                    even = True if not huisnummer%2 else False
                    #print 'even' if even else 'odd'
                except:
                    print "Something went wrong, skip"
                for objectResult in results['results']['bindings']:
                    #Check left
                    leven = True if objectResult['hnrstrlnks']['value'] == "E" or objectResult['hnrstrlnks']['value'] == "B" else False
                    if (even and leven and 
                        (int(objectResult['e_hnr_lnks']['value']) <= huisnummer <= int(objectResult['l_hnr_lnks']['value']) or
                         int(objectResult['l_hnr_lnks']['value']) <= huisnummer <= int(objectResult['e_hnr_lnks']['value']))
                        ):
                        #print objectResult['s']['value']
                        out.write(objectResult['s']['value'] + '\n')
                    if (not even and not leven and 
                        (int(objectResult['e_hnr_lnks']['value']) <= huisnummer <= int(objectResult['l_hnr_lnks']['value']) or
                         int(objectResult['l_hnr_lnks']['value']) <= huisnummer <= int(objectResult['e_hnr_lnks']['value']))
                        ):
                        #print objectResult['s']['value']
                        out.write(objectResult['s']['value'] + '\n')

                    #check right
                    reven = True if objectResult['hnrstrrhts']['value'] == "E" or objectResult['hnrstrrhts']['value'] == "B" else False
                    if (even and reven and 
                        (int(objectResult['e_hnr_rhts']['value']) <= huisnummer <= int(objectResult['l_hnr_rhts']['value']) or
                         int(objectResult['l_hnr_rhts']['value']) <= huisnummer <= int(objectResult['e_hnr_rhts']['value']))
                        ):
                        #print objectResult['s']['value']
                        out.write(objectResult['s']['value'] + '\n')
                    if (not even and not reven and 
                        (int(objectResult['e_hnr_rhts']['value']) <= huisnummer <= int(objectResult['l_hnr_rhts']['value']) or
                         int(objectResult['l_hnr_rhts']['value']) <= huisnummer <= int(objectResult['e_hnr_rhts']['value']))
                        ):
                        #print objectResult['s']['value']
                        out.write(objectResult['s']['value'] + '\n')
            
#             print name, newName
for i in [13, 15]:
    print "Group%d"%(i)
    with open('Group%d-MatchingResults.txt'%(i), 'w') as out:
        checkEndpoint("http://stardog.krw.d2s.labs.vu.nl/group%d"%(i), streetNames, postcodeDict, out)


Group13
Group15
