SIC 2007 to SKOS concept scheme

In [1]:
import requests
from pathlib import Path
from io import BytesIO
from cachecontrol import CacheControl
from cachecontrol.caches.file_cache import FileCache
from cachecontrol.heuristics import LastModified

# HTTP session backed by an on-disk cache ('.cache') so that re-running the
# notebook can reuse a previously downloaded copy of the workbook.
session = CacheControl(requests.Session(),
                       cache=FileCache('.cache'),
                       heuristic=LastModified())

# Download the ONS "SIC 2007 summary of structure" workbook.
response = session.get('https://www.ons.gov.uk/file?uri=/methodology/classificationsandstandards/'
                       'ukstandardindustrialclassificationofeconomicactivities/uksic2007/'
                       'sic2007summaryofstructurtcm6.xls')
# Fail loudly on an HTTP error rather than saving an error page as data.xls
# and handing it to xlrd, which would only produce a confusing parse error.
response.raise_for_status()
xl_file = BytesIO(response.content)

import xlrd
# Keep a local copy of the downloaded workbook next to the notebook.
with open('data.xls', 'wb') as f:
    f.write(xl_file.getvalue())
# Parse the workbook straight from the in-memory bytes.
book = xlrd.open_workbook(file_contents=xl_file.getvalue())
book

<xlrd.book.Book at 0x7f7a05775c88>

In [2]:
from rdflib import Graph, Namespace, URIRef, Literal, BNode
from rdflib.namespace import SKOS, RDF, RDFS, OWL, XSD
from rdflib.collection import Collection
from titlecase import titlecase

# Graph that accumulates the whole concept scheme.
g = Graph()
SIC = Namespace('http://www.ons.gov.uk/def/sic2007#')

g.bind('skos', SKOS)
g.bind('owl', OWL)
g.bind('sic', SIC)

# The concept scheme itself; every concept is linked to it via skos:inScheme.
cs = URIRef('http://www.ons.gov.uk/def/sic2007')
g.add((cs, RDF.type, SKOS.ConceptScheme))
g.add((cs, RDFS.label, Literal("SIC 2007")))
g.add((cs, RDFS.comment, Literal("UK Standard Industrial Classification of Economic Activities", "en")))

# One custom datatype per level of the classification. Each is declared as a
# restriction of xsd:string by a regular-expression pattern and is later used
# as the datatype of the corresponding skos:notation literals.
# Raw strings keep the backslashes in the patterns without relying on Python
# passing through unrecognised escape sequences such as '\.'.
for (typ, label, pattern) in [
    ('Section', 'Section', r'[A-U]'),
    ('Division', 'Division', r'[0-9]{2}'),
    ('Group', 'Group', r'[0-9]{2}\.[0-9]'),
    ('Class', 'Class', r'[0-9]{2}\.[0-9]{2}'),
    ('SubClass', 'Sub class', r'[0-9]{2}\.[0-9]{2}/[0-9]')]:
    g.add((SIC[typ], RDF.type, RDFS.Datatype))
    g.add((SIC[typ], RDFS.label, Literal(label, "en")))
    # The XML Schema datatype is xsd:string (lower case); "xsd:String" is not
    # a defined datatype.
    g.add((SIC[typ], OWL.onDatatype, XSD.string))
    pattern_node = BNode()
    pattern_list = BNode()
    # Constructing the Collection writes the RDF list (rooted at pattern_list
    # and containing pattern_node) into g; the object itself is not needed.
    Collection(g, pattern_list, [pattern_node])
    g.add((SIC[typ], OWL.withRestrictions, pattern_list))
    g.add((pattern_node, XSD.pattern, Literal(pattern)))

def addConcept(c, label, notation, typ, parent):
    """Add a SKOS concept to the module-level graph ``g`` and return it.

    Args:
        c: Node identifying the concept.
        label: Text stored as the concept's English ``rdfs:label``.
        notation: Code stored as ``skos:notation``.
        typ: Datatype given to the notation literal.
        parent: Broader concept; when falsy the concept is instead marked as
            a top concept of the scheme ``cs``.

    Returns:
        The node ``c`` that was passed in.
    """
    statements = [
        (c, RDF.type, SKOS.Concept),
        (c, RDFS.label, Literal(label, "en")),
        (c, SKOS.notation, Literal(notation, datatype=typ)),
        (c, SKOS.inScheme, cs),
    ]
    # A concept hangs either under its parent or directly under the scheme.
    hierarchy_link = (c, SKOS.broader, parent) if parent else (c, SKOS.topConceptOf, cs)
    statements.append(hierarchy_link)
    for statement in statements:
        g.add(statement)
    return c

sheet = book.sheets()[0]
# Walk the first worksheet starting at row index 3 (presumably skipping header
# rows -- verify against the workbook). The columns are read as code /
# description pairs, one pair per level:
#   0/1 section, 2/3 division, 4/5 group, 6/7 class, 8/9 sub class.
# A row with a description but no code in its pair is treated as an extra
# comment for the most recently seen concept of that level.
for i in range(3, sheet.nrows):
    row = sheet.row_values(i, 0)
    if row[0] != '': # Section
        # Strip the label like every other level before title-casing it.
        parentSection = addConcept(SIC[row[0].strip()], titlecase(row[1].strip()), row[0].strip(), SIC.Section, None)
        g.add((cs, SKOS.hasTopConcept, parentSection))
    elif row[1] != '': # extra comment for section
        g.add((parentSection, RDFS.comment, Literal(row[1].strip(), "en")))
    elif row[2] != '': # Division
        parentDivision = addConcept(SIC[row[2].strip()], row[3].strip(), row[2].strip(), SIC.Division, parentSection)
    elif row[3] != '': # extra comment for division
        g.add((parentDivision, RDFS.comment, Literal(row[3].strip(), "en")))
    elif row[4] != '': # Group
        parentGroup = addConcept(SIC[row[4].strip()], row[5].strip(), row[4].strip(), SIC.Group, parentDivision)
    elif row[5] != '': # extra comment for group
        g.add((parentGroup, RDFS.comment, Literal(row[5].strip(), "en")))
    elif row[6] != '': # Class
        parentClass = addConcept(SIC[row[6].strip()], row[7].strip(), row[6].strip(), SIC.Class, parentGroup)
    elif row[7] != '': # extra comment for class
        # Attach to the class itself, not to its parent group.
        g.add((parentClass, RDFS.comment, Literal(row[7].strip(), "en")))
    elif row[8] != '': # Sub Class
        addConcept(SIC[row[8].strip()], row[9].strip(), row[8].strip(), SIC.SubClass, parentClass)

# Serialise the graph as Turtle relative to the scheme IRI and save it.
base_iri = 'http://www.ons.gov.uk/def/sic2007'
serialized = g.serialize(format='turtle', base=base_iri)
# Older rdflib releases return bytes from serialize(); rdflib 6+ returns str.
# Normalise to bytes so the concatenation and the binary write work on both.
if isinstance(serialized, str):
    serialized = serialized.encode('utf-8')
# Prepend the @base directive only when the serializer has not already
# written one, so the file never ends up with a duplicate directive.
if serialized.lstrip().startswith(b'@base'):
    turtle = serialized
else:
    turtle = ('@base <%s> .\n' % base_iri).encode('utf-8') + serialized
# Preview the beginning of the document.
print(turtle.decode('utf-8')[:1000])
with open('sic2007.ttl', 'wb') as f:
    f.write(turtle)

@base <http://www.ons.gov.uk/def/sic2007> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix sic: <http://www.ons.gov.uk/def/sic2007#> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<#01.11> a skos:Concept ;
    rdfs:label "Growing of cereals (except rice), leguminous crops and oil seeds"@en ;
    skos:broader <#01.1> ;
    skos:inScheme <> ;
    skos:notation "01.11"^^sic:Class .

<#01.12> a skos:Concept ;
    rdfs:label "Growing of rice"@en ;
    skos:broader <#01.1> ;
    skos:inScheme <> ;
    skos:notation "01.12"^^sic:Class .

<#01.13> a skos:Concept ;
    rdfs:label "Growing of vegetables and melons, roots and tubers"@en ;
    skos:broader <#01.1> ;
    skos:inScheme <> ;
    skos:notation "01.13"^^sic:Class .

<#01.14> a skos:Concept ;
    rd