## Northern Ireland Statistics and Research Agency

[2017 Mid Year Population Estimates for Northern Ireland (NEW FORMAT TABLES)](https://www.nisra.gov.uk/publications/2017-mid-year-population-estimates-northern-ireland-new-format-tables)

In [1]:
from databaker.framework import *
import pandas as pd

from rdflib import Namespace
from rdflib.namespace import DCTERMS, VOID

# RDF namespaces: DCAT and GDP supply metadata property terms, GOVORG is the
# base URI for government organisations (used for the publisher).
DCAT = Namespace('http://www.w3.org/ns/dcat#')
GDP = Namespace('http://gss-data.org.uk/def/gdp#')
GOVORG = Namespace('https://www.gov.uk/government/organisations/')

def extractMetadata(sheet):
    """Collect dataset metadata from the rows of a 'Metadata' sheet.

    Each row is indexed positionally: ``row[1]`` is read as a label and
    ``row[2]`` as the associated value. A recognised label selects the RDF
    property that subsequent non-empty values are stored under; values for
    the same property are joined with a single space. Once a row labelled
    'Description of Data' is seen, ``row[1]`` of every following row is
    collected as a line of the description.

    Args:
        sheet: iterable of rows (presumably a worksheet of string cells --
            confirm against the workbook library in use).

    Returns:
        dict mapping RDF property terms to strings; always contains
        ``DCTERMS.description``.
    """
    label_to_property = {
        'National Statistics Theme:': DCAT.theme,
        'Data Subset:': DCAT.theme,
        'Dataset Title:': DCTERMS.title,
        'Coverage:': DCTERMS.spatial,
        'Source:': DCTERMS.publisher,
        'Contact:': DCAT.contactPoint,
        'National Statistics Data?': GDP.nationalStatistics,
        'Responsible Statistician:': DCAT.contactPoint
    }
    collected = {}
    description_lines = []
    current_property = None
    in_description = False

    for row in sheet:
        # Everything after the 'Description of Data' marker is free text.
        if in_description:
            description_lines.append(row[1])
            continue

        label = row[1]
        if label in label_to_property:
            current_property = label_to_property[label]
        elif label == 'Description of Data':
            # The marker row itself contributes nothing.
            in_description = True
            continue

        value = row[2]
        # Skip empty cells and values seen before any recognised label.
        if len(value) == 0 or not current_property:
            continue

        if current_property in collected:
            # Continuation rows (or repeated labels) extend the existing value.
            collected[current_property] = collected[current_property] + " " + value
        else:
            collected[current_property] = value.strip()

    collected[DCTERMS.description] = '\n'.join(description_lines).strip()
    return collected

In [2]:
# Each %run executes a sibling notebook; the code below relies on that
# notebook leaving `tidy` (its tidied table) and `book` (its source workbook)
# defined in this namespace.
%run "NISRA Migration MEY17CoC.ipynb"
all_tidy = tidy.copy()
md1 = extractMetadata(book['Metadata'])

# For every further table: append its rows to the combined frame and extract
# the metadata from its own 'Metadata' sheet.
%run "NISRA Migration MYE17 NETMIG AGE BANDS Gender.ipynb"
all_tidy = pd.concat([all_tidy, tidy])
md2 = extractMetadata(book['Metadata'])

%run "NISRA Migration MYE17 NETMIG AGE.ipynb"
all_tidy = pd.concat([all_tidy, tidy])
md3 = extractMetadata(book['Metadata'])

%run "NISRA Migration MYE17 NETMIG FLOW.ipynb"
all_tidy = pd.concat([all_tidy, tidy])
md4 = extractMetadata(book['Metadata'])

We're merging the data from these spreadsheets into a single cube by making implicit dimensions explicit. The metadata should be the same for each spreadsheet, but it differs slightly: the description for the first table is a bit less verbose, and the reference area for the first is a specialization of that of the rest. We therefore use the metadata from the last spreadsheet for the combined dataset.

In [3]:
# Use the metadata extracted from the last spreadsheet for the combined
# dataset. NOTE: this binds the same dict object as md4, so entries added to
# `metadata` later are visible through md4 as well.
metadata = md4
md4

{rdflib.term.URIRef('http://www.w3.org/ns/dcat#theme'): 'Population Population and Migration',
 rdflib.term.URIRef('http://purl.org/dc/terms/title'): 'Mid-Year Population Estimates',
 rdflib.term.URIRef('http://purl.org/dc/terms/spatial'): 'Northern Ireland',
 rdflib.term.URIRef('http://purl.org/dc/terms/publisher'): 'NISRA',
 rdflib.term.URIRef('http://www.w3.org/ns/dcat#contactPoint'): 'Customer Services; 02890 255156;  census@nisra.gov.uk Brian Green - Head of Demographic Statistics',
 rdflib.term.URIRef('http://gss-data.org.uk/def/gdp#nationalStatistics'): 'Yes',
 rdflib.term.URIRef('http://purl.org/dc/terms/description'): 'Mid-2017 population estimates for Northern Ireland were published on 28 June 2018. Migration is one of the components of population change; its estimates are provided to enable understanding of the mid-year population estimates and to inform comment. \n\nNotes:\n1. The estimates are produced using a variety of data sources and statistical models.  Therefore smal

In [4]:
# Sanity check: number of non-null values in each column of the combined table.
all_tidy.count()

Mid Year                       19580
Area                           19580
Age                            19580
Sex                            19580
Population Change Component    19580
Measure Type                   19580
Value                          19580
Unit                           19580
dtype: int64

In [5]:
# Rename the 'Area' column in place; the bare expression then displays the frame.
all_tidy.rename(columns={'Area': 'Area of Destination or Origin'}, inplace=True)
all_tidy

Unnamed: 0,Mid Year,Area of Destination or Origin,Age,Sex,Population Change Component,Measure Type,Value,Unit
0,2001-06-30T00:00:00/P1Y,N92000002,all,T,Starting population,Count,1688838,People
1,2001-06-30T00:00:00/P1Y,N92000002,all,T,Births,Count,21460,People
2,2001-06-30T00:00:00/P1Y,N92000002,all,T,Deaths,Count,14432,People
3,2001-06-30T00:00:00/P1Y,N92000002,all,T,Natural Change,Count,7028,People
4,2001-06-30T00:00:00/P1Y,N92000002,all,T,Internal Inflows,Count,0,People
5,2001-06-30T00:00:00/P1Y,N92000002,all,T,Internal Outflows,Count,0,People
6,2001-06-30T00:00:00/P1Y,N92000002,all,T,Internal Net,Count,0,People
7,2001-06-30T00:00:00/P1Y,N92000002,all,T,United Kingdom Inflows,Count,12510,People
8,2001-06-30T00:00:00/P1Y,N92000002,all,T,United Kingdom Outflows,Count,11589,People
9,2001-06-30T00:00:00/P1Y,N92000002,all,T,United Kingdom Net,Count,921,People


In [6]:
# Write the combined observations to out/migration_nisra.csv, creating the
# output directory if it does not exist; index=False leaves the DataFrame
# index out of the file.
# NOTE(review): Path is not imported in the visible cells -- presumably it
# comes from the databaker star import or one of the %run notebooks; confirm.
out = Path('out')
out.mkdir(exist_ok=True)
all_tidy.to_csv(out / 'migration_nisra.csv', index = False)

Some of the metadata is in the spreadsheets (extracted above), while the rest is on the web page at https://www.nisra.gov.uk/publications/2017-mid-year-population-estimates-northern-ireland-new-format-tables

In [7]:
from lxml import html
import os
from rdflib import URIRef, Literal, Dataset, RDFS, RDF
from rdflib.namespace import VOID
import re

# Fetch the publication page and read the publication date from it.
# NOTE(review): `session` is not defined in the visible cells -- presumably an
# HTTP session left in the namespace by one of the %run notebooks; confirm.
pageURL = 'https://www.nisra.gov.uk/publications/2017-mid-year-population-estimates-northern-ireland-new-format-tables'
page = session.get(pageURL)
tree = html.fromstring(page.text)
# Take the first text node of the element(s) following the <span> whose text
# starts with 'Date published', parse it as a timestamp, and keep only the
# calendar date. Raises IndexError if the page has no such element.
published = pd.to_datetime(
    tree.xpath("//span[starts-with(text(), 'Date published')]/following-sibling::*/text()")[0].strip()
).tz_localize('Europe/London').date()

# Record the publication date as dct:issued (already wrapped as an RDF Literal).
metadata[DCTERMS.issued] = Literal(published)

def pathify(label):
    """Turn a label into a lower-case, hyphen-separated path fragment.

    The label is lower-cased, every character that is neither a word
    character nor '/' becomes a hyphen, runs of hyphens collapse to one,
    and a single trailing hyphen is removed. Leading hyphens are kept.

    Args:
        label: the text to convert (str).

    Returns:
        The path fragment as a str; '' for an empty label.
    """
    # Raw strings keep the regex escapes intact and avoid invalid-escape
    # warnings from the Python compiler.
    slug = re.sub(r'[^\w/]', '-', label.lower())
    slug = re.sub(r'-+', '-', slug)
    # '$' must be an unescaped anchor here: an escaped '\$' would look for a
    # literal "-$", which cannot occur once '$' has been replaced above, so
    # the trailing hyphen would never be stripped.
    return re.sub(r'-$', '', slug)

# Base URI and dataset path; the path is derived from the JOB_NAME environment
# variable when it is set, otherwise from the default name given here.
base = 'http://gss-data.org.uk'
datasetPath = pathify(os.getenv('JOB_NAME', 'NISRA-NI-migration-estimates'))
# URI of the named graph holding the metadata, and of the dataset resource itself.
metadataURI = URIRef(f'{base}/graph/{datasetPath}/metadata')
datasetURI = URIRef(f'{base}/data/{datasetPath}')

# Further vocabularies used below. GDP resolves to the same namespace URI as
# the GDP defined in the first cell; here it is rebuilt from `base`.
PMD = Namespace('http://publishmydata.com/def/dataset#')
QB = Namespace('http://purl.org/linked-data/cube#')
GDP = Namespace(f'{base}/def/gdp#')

def propObject(prop, s):
    """Return the RDF object to store for metadata property *prop*.

    Args:
        prop: the RDF property the value belongs to.
        s: the value, either a plain string taken from the spreadsheet or an
            RDF term that has already been built (URIRef or Literal).

    Returns:
        * ``s`` unchanged when it is already a URIRef or Literal;
        * for ``DCTERMS.publisher``, the organisation URI for a known
          publisher name, otherwise an English-tagged literal;
        * for ``GDP.nationalStatistics``, a boolean literal that is true
          only when ``s`` is exactly 'Yes';
        * otherwise an English-tagged literal of ``s``.
    """
    # Values such as the dct:issued date are stored in `metadata` as Literals
    # already; wrapping them again would attach an 'en' language tag to a
    # typed literal, so pass existing RDF terms through untouched.
    if isinstance(s, (URIRef, Literal)):
        return s
    if prop == DCTERMS.publisher:
        # Known publisher names map to their gov.uk organisation URI.
        publishers = {
            'NISRA': GOVORG['northern-ireland-statistics-and-research-agency'],
        }
        return publishers.get(s, Literal(s, 'en'))
    if prop == GDP.nationalStatistics:
        return Literal(s == 'Yes')
    # All remaining properties (theme, title, spatial, description,
    # contact point, ...) are plain English text.
    return Literal(s, 'en')

# Build an RDF dataset with prefix bindings for the vocabularies in use.
quads = Dataset()
quads.bind('pmd', PMD)
quads.bind('qb', QB)
quads.bind('dct', DCTERMS)
quads.bind('void', VOID)
quads.bind('gdp', GDP)
quads.bind('dcat', DCAT)

# All metadata statements go into the named graph identified by metadataURI.
mdgraph = quads.graph(metadataURI)
# Type the dataset resource.
mdgraph.add((datasetURI, RDF.type, PMD.LinkedDataset))
mdgraph.add((datasetURI, RDF.type, PMD.Dataset))
mdgraph.add((datasetURI, RDF.type, QB.DataSet))

# One statement per collected metadata entry, with the value converted to a
# suitable RDF object by propObject.
for k, v in metadata.items():
    mdgraph.add((datasetURI, k, propObject(k, v)))

# Fixed statements: SPARQL endpoint, the graph named after the dataset path,
# and the dataset family.
mdgraph.add((datasetURI, VOID.sparqlEndpoint, URIRef(f'{base}/sparql')))
mdgraph.add((datasetURI, PMD.graph, URIRef(f'{base}/graph/{datasetPath}')))
mdgraph.add((datasetURI, GDP.family, GDP.migration))

# Serialise the whole dataset as TriG to out/dataset.trig (binary mode).
with open(out / 'dataset.trig', 'wb') as f:
    quads.serialize(destination=f, format='trig')