For more details see https://skeptric.com/sparql-job-country/

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import gzip
import rdflib
import urllib
from urllib.request import urlretrieve
from pathlib import Path

import dateutil
import datetime

import logging

from tqdm.notebook import tqdm

In [3]:
sys.path.insert(0, '../src')

In [4]:
from lib.rdftool import *

Data From http://webdatacommons.org/structureddata/2019-12/stats/schema_org_subsets.html

Download both the microdata (1.9GB) and the JSON-LD (700MB)

In [5]:
DEST_DIR = Path('..') / 'data' / 'webcommons'
DEST_DIR.mkdir(parents=True, exist_ok=True)

In [6]:
class TqdmUpTo(tqdm):
    def update_to(self, b=1, bsize=1, tsize=None):
        if tsize is not None:
            self.total = tsize
        self.update(b * bsize - self.n)  # will also set self.n = b * bsize

def download(url, filename, overwrite=False):
    filename = Path(filename)
    if (not filename.exists()) or overwrite:
        with TqdmUpTo(unit = 'B', unit_scale = True, unit_divisor = 1024, miniters = 1, desc = Path(filename).name) as t:
            urlretrieve(url, filename = filename, reporthook = t.update_to)

In [7]:
JOBS_JSON_2019 = DEST_DIR / '2019-12_json_JobPosting.gz'

In [8]:
JOBS_MD_2019 = DEST_DIR / '2019-12_md_JobPosting.gz'

In [9]:
download('http://data.dws.informatik.uni-mannheim.de/structureddata/2019-12/quads/classspecific/json/schema_JobPosting.gz',
         JOBS_JSON_2019)

In [10]:
download('http://data.dws.informatik.uni-mannheim.de/structureddata/2019-12/quads/classspecific/md/schema_JobPosting.gz',
         JOBS_MD_2019)

# JSON

In [11]:
def get_domain(url):
    return urllib.parse.urlparse(url).netloc

In [12]:
def parse_nquads_distinct(lines, total=None):
    seen = set()
    for group, quad_lines in tqdm(groupby(lines, get_quad_label), total=total):
        domain = get_domain(group)
        if domain in seen:
            continue
        graph = rdflib.Graph(identifier=group)
        try:
            graph.parse(data=''.join(quad_lines), format='nquads')
        except rdflib.plugins.parsers.ntriples.ParseError as e:
            msg = str(e)
            if len(msg) > 255:
                msg = msg[:255] + '...'
            logging.error(str(msg))
            continue
        if list(get_job_postings(graph)):
            seen.add(domain)
            yield graph

In [13]:
rdflib.term.bind(rdflib.term.URIRef("http://schema.org/Date"), datetime.datetime, dateutil.parser.parse, lambda dt: dt.isoformat())
rdflib.term.bind(rdflib.term.URIRef("http://schema.org/DateTime"), datetime.datetime, dateutil.parser.parse, lambda dt: dt.isoformat())
rdflib.term.bind(rdflib.term.URIRef("https://schema.org/Date"), datetime.datetime, dateutil.parser.parse, lambda dt: dt.isoformat())
rdflib.term.bind(rdflib.term.URIRef("https://schema.org/DateTime"), datetime.datetime, dateutil.parser.parse, lambda dt: dt.isoformat())

In [14]:
json_graphs = list(parse_nquads_distinct(gzip.open(JOBS_JSON_2019, 'rt'), 460_000))

HBox(children=(IntProgress(value=0, max=460000), HTML(value='')))

skype:raloffice?call|skype:raloffice?chat does not look like a valid URI, trying to serialize this will break.
https://www.accenture.com/us-en?c=us_us_brand_10460943&amp;n=psgs_brand_1218&amp;c=ad_usadfy17_10000001&amp;n=psgs_Brand-|-US-|-Exact_accenture&amp;gclid=EAIaIQobChMIpKXKyq2o5AIVksDACh36_QtlEAAYASAAEgLCbPD_BwE does not look like a valid URI, trying to serialize this will break.





In [15]:
md_graphs = list(parse_nquads_distinct(gzip.open(JOBS_MD_2019, 'rt'), 680_000))

HBox(children=(IntProgress(value=0, max=680000), HTML(value='')))

http://chart.apis.google.com/chart?chs=155x155&cht=qr&chl=http%3A%2F%2Fwww%2Eemprega%2Einfo%2FVA%2D93231%2Demprego%2Dde%2DScrum%2DMaster%2Dem%2DCampinas%2DSP&chld=|0 does not look like a valid URI, trying to serialize this will break.
https://www.flexibleworks.co.uk/homepage.php?employerid=16&company=Citizen\ does not look like a valid URI, trying to serialize this will break.
https://www.diversitylink.co.uk/homepage.php?employerid=122&company=St-Andrew\ does not look like a valid URI, trying to serialize this will break.
https://cbb.de/karriere/stellenboerse/detail/s-51-h-entwicklungsingenieur-entwicklungsingenieur-projektleiter-dipl-ing-elektrotechnik-dipl/javascript:linkTo_UnCryptMailto('nbjmup+kpctAdcc\/ef'); does not look like a valid URI, trying to serialize this will break.
ERROR:root:Invalid line (Failed to eat <([^:]+:[^\s"<>]*)> at <http://schema.org/JobPosting/\"description\"> "\n\t\t\t\t\u0420\u0430\u0431\u043E\u0442\u0430 \u043D\u0430 \u0437\u0430\u0432\u043E\u0434\u0435 \

ERROR:root:Invalid line (Failed to eat <([^:]+:[^\s"<>]*)> at <http://schema.org/JobPosting/\"description\"> "\n\t\t\t\t\u0420\u0410\u0411\u041E\u0422\u0410 \u0412 \u0427\u0415\u0425\u0418\u0418 \u041D\u0410 \u0428\u041E\u041A\u041E\u041B\u0410\u0414\u041D\u041E\u04...
ERROR:root:Invalid line (Failed to eat <([^:]+:[^\s"<>]*)> at <http://schema.org/JobPosting/\"description\"> "\n\t\t\t\t\u0421\u043E\u0441\u0442\u0430\u0432\u043B\u0435\u043D\u0438\u044F \u0438 \u043A\u043E\u043C\u043F\u043B\u0435\u043A\u0442\u0430\u0446\u0438\u044F...
ERROR:root:Invalid line (Failed to eat <([^:]+:[^\s"<>]*)> at <http://schema.org/JobPosting/\"description\"> "\n\t\t\t\t\u0420\u0430\u0431\u043E\u0442\u0430 \u0432 \u0427\u0435\u0445\u0438\u0438 \u0422\u0440\u0430\u043A\u0442\u043E\u0440\u0438\u0441\u0442\u043E\u043...
ERROR:root:Invalid line (Failed to eat <([^:]+:[^\s"<>]*)> at <http://schema.org/JobPosting/\"description\"> "\n\t\t\t\t\u0418\u0437\u0433\u043E\u0442\u043E\u0432\u043B\u0435\u043D\u0438\u04




In [25]:
len(json_graphs)

5376

In [17]:
len(md_graphs)

10087

In [255]:
dataset = rdflib.Dataset()
for graph in json_graphs + md_graphs:
    dataset.add_graph(graph)

# Attempt 1: Using the URL

Australian job ads are likely to be posted from a `.au` domain.

In [253]:
def show(results):
    for row in results:
        print([field.toPython() if field else field for field in row])

We can extract the source URL from common code from the graph identifier

In [256]:
%%time
results = dataset.query(r'''
PREFIX sdo: <http://schema.org/>

SELECT ?src
WHERE {
    GRAPH ?src
    {[] a sdo:JobPosting .}
}
LIMIT 10
''')


show(results)

['https://werkeninleisure.nl/o/bijbaan-groen-en-technisch-onderhoud-de-schatberg']
['https://jobs.wcbradley.com/TIKI%C2%AE%20Brand/job/Menomonee-Falls-Financial-Analyst-WI-53051/545158300/']
['http://www.ifallschamber.com/jobs/info/administrative-administrative-assistant-191']
['https://www.jobs-bielefeld.org/stellenanzeigen-schalten']
['http://class1personnel.com/job/sheltered-housing-team-leader/']
['https://headhunter.ge/vacancy/32999959']
['https://careers.thesustainablerecruiter.com/careers/4565-General/jobs/543352-Marketing-and-Communicatie-Stage-nog-1-plek-beschikbaar?host=careers.thesustainablerecruiter.com']
['https://tire-factory.hiringthing.com/job/17125/warehouse-associate']
['http://jobbank.com.mm/jobs/detail/receptionist-japanese-speaker-1823.html?lang=japanese']
['https://www.cannarecruiter.com/job/cannacity-dispensary-full-time-entry-level-dispensary-openings/']
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 7.5 ms


We can extract the domain with [replace function](https://www.w3.org/TR/sparql11-query/#func-replace)

In [257]:
%%time
results = dataset.query(r'''
PREFIX sdo: <http://schema.org/>

SELECT ?domain ?src
WHERE {
    GRAPH ?src
    {[] a sdo:JobPosting .}
    BIND (replace(str(?src), 'https?://([^?/]+).*', '\\1') AS ?domain)

}
LIMIT 10
''')


show(results)

['werkeninleisure.nl', 'https://werkeninleisure.nl/o/bijbaan-groen-en-technisch-onderhoud-de-schatberg']
['jobs.wcbradley.com', 'https://jobs.wcbradley.com/TIKI%C2%AE%20Brand/job/Menomonee-Falls-Financial-Analyst-WI-53051/545158300/']
['www.ifallschamber.com', 'http://www.ifallschamber.com/jobs/info/administrative-administrative-assistant-191']
['www.jobs-bielefeld.org', 'https://www.jobs-bielefeld.org/stellenanzeigen-schalten']
['class1personnel.com', 'http://class1personnel.com/job/sheltered-housing-team-leader/']
['headhunter.ge', 'https://headhunter.ge/vacancy/32999959']
['careers.thesustainablerecruiter.com', 'https://careers.thesustainablerecruiter.com/careers/4565-General/jobs/543352-Marketing-and-Communicatie-Stage-nog-1-plek-beschikbaar?host=careers.thesustainablerecruiter.com']
['tire-factory.hiringthing.com', 'https://tire-factory.hiringthing.com/job/17125/warehouse-associate']
['jobbank.com.mm', 'http://jobbank.com.mm/jobs/detail/receptionist-japanese-speaker-1823.html?lang

And then filter on the domain with [strends](https://www.w3.org/TR/sparql11-query/#func-strends)

In [258]:
%%time
results = dataset.query(r'''
PREFIX sdo: <http://schema.org/>

SELECT ?domain ?src
WHERE {
    GRAPH ?src
    {[] a sdo:JobPosting .}
    BIND (replace(str(?src), 'https?://([^?/]+).*', '\\1') AS ?domain)
    FILTER (strends(?domain, '.au'))
}
LIMIT 10
''')


show(results)

['www.ramrecruitment.com.au', 'https://www.ramrecruitment.com.au/job/security-shifts-this-weekend-next/']
['www.businessservicesjobs.com.au', 'https://www.businessservicesjobs.com.au/job/manager-business-services-achieve-partnership-australia-vic-melbourne-191108/214764']
['www.client-server.com.au', 'https://www.client-server.com.au/job/data-engineer-1/']
['www.nursingjobs.com.au', 'https://www.nursingjobs.com.au/job/12836/nurse-unit-manager/']
['www.saulrecruitment.com.au', 'https://www.saulrecruitment.com.au/job/senior-systems-administrator-client-site-for-a-boutique-msp/']
['www.shk.com.au', 'https://www.shk.com.au/job/executive-director-freight-victoria-1']
['careers.sportaus.gov.au', 'http://careers.sportaus.gov.au/casual-job-details/query/7762978/']
['volunteeringqld.org.au', 'https://volunteeringqld.org.au/roles/138232']
['www.iscd.edu.au', 'https://www.iscd.edu.au/job/designer-sales-consultant-2/']
['careers.swire.com.au', 'https://careers.swire.com.au/job/Saraji-%28HSE_LOC_00

In [259]:
%%time
result = dataset.query(r'''
PREFIX sdo: <http://schema.org/>

SELECT DISTINCT ?src
WHERE {
    GRAPH ?src
    {[] a sdo:JobPosting .}
    BIND (replace(str(?src), 'https?://([^?/]+).*', '\\1') AS ?domain)
    FILTER (strends(?domain, '.au'))
}
''')

au_domains = [x[0] for x in result]

CPU times: user 7.75 s, sys: 172 ms, total: 7.92 s
Wall time: 8.42 s


In [260]:
au_domains[:5]

[rdflib.term.URIRef('https://www.ramrecruitment.com.au/job/security-shifts-this-weekend-next/'),
 rdflib.term.URIRef('https://www.businessservicesjobs.com.au/job/manager-business-services-achieve-partnership-australia-vic-melbourne-191108/214764'),
 rdflib.term.URIRef('https://www.client-server.com.au/job/data-engineer-1/'),
 rdflib.term.URIRef('https://www.nursingjobs.com.au/job/12836/nurse-unit-manager/'),
 rdflib.term.URIRef('https://www.saulrecruitment.com.au/job/senior-systems-administrator-client-site-for-a-boutique-msp/')]

This gives 233 domains

In [261]:
len(au_domains)

233

# Extracting country from schema

Extracting it from a Country name

In [263]:
%%time
results = dataset.query('''
PREFIX sdo: <http://schema.org/>


SELECT ?country (COUNT(distinct ?src) as ?count)
WHERE {
    GRAPH ?src
    {[] a sdo:JobPosting; sdo:jobLocation/sdo:address/sdo:addressCountry/sdo:name ?country .}
}
GROUP BY ?country
ORDER BY DESC(?count)
LIMIT 15
''')

show(results)

['US', 127]
['CA', 20]
['DE', 20]
['GB', 18]
['IL', 14]
['IN', 7]
['worldwide', 6]
['FR', 6]
['BR', 5]
['MY', 5]
['AU', 5]
['DK', 4]
['Vietnam', 4]
['NL', 4]
[rdflib.term.Literal(''), 4]
CPU times: user 2.38 s, sys: 203 ms, total: 2.58 s
Wall time: 2.59 s


308 jobs with countries

In [264]:
%%time
results = dataset.query('''
PREFIX sdo: <http://schema.org/>


SELECT (count(distinct ?src) as ?count)
WHERE {
    GRAPH ?src
    {[] a sdo:JobPosting; sdo:jobLocation/sdo:address/sdo:addressCountry/sdo:name ?country .}
}
''')

show(results)

[308]
CPU times: user 2.34 s, sys: 141 ms, total: 2.48 s
Wall time: 2.59 s


Sometimes the addressCountry is just text, and sometimes a blank node

In [265]:
%%time
results = dataset.query('''
PREFIX sdo: <http://schema.org/>


SELECT ?country ?src
WHERE {
    GRAPH ?src
    {[] a sdo:JobPosting; sdo:jobLocation/sdo:address/sdo:addressCountry ?country .}
}
LIMIT 15
''')

show(results)

['NL', 'https://werkeninleisure.nl/o/bijbaan-groen-en-technisch-onderhoud-de-schatberg']
['Deutschland', 'https://www.jobs-bielefeld.org/stellenanzeigen-schalten']
['GB', 'http://class1personnel.com/job/sheltered-housing-team-leader/']
['GB', 'https://www.spheredigitalrecruitment.com/job/graduate-campaign-executive-12/']
['ZA', 'https://www.graftonrecruitment.com/job/diesel-fitter-8/']
['Uganda', 'https://jobopenings.co.ug/job/5793/technical-specialist-knowledge-management-and-communications-specialist-uganda-ican-61696/']
['NL', 'https://werkenbij.vakmedianet.nl/o/database-marketeer']
['Ireland', 'https://www.nursebank.ie/jobs/practice-nurse-full-time-north-dublin-irl/546-1/']
['Australia', 'https://www.businessservicesjobs.com.au/job/manager-business-services-achieve-partnership-australia-vic-melbourne-191108/214764']
['Afghanistan', 'https://maihoangphotocopy.com/tuyen-ky-thuat-may-photocopy/']
['United States', 'https://www.adeccousa.com/jobs/order-picker-new-albany-ohio/?ID=US_EN_

We can just get the countries with isLiteral

In [266]:
%%time
results = dataset.query('''
PREFIX sdo: <http://schema.org/>


SELECT ?country (COUNT(distinct ?src) AS ?count)
WHERE {
    GRAPH ?src
    {[] a sdo:JobPosting; sdo:jobLocation/sdo:address/sdo:addressCountry ?country .}
    FILTER (isLiteral(?country))
}
GROUP BY ?country
ORDER BY DESC(?count)
LIMIT 10
''')

show(results)

['United States', 385]
['JP', 358]
['GB', 345]
['US', 320]
['DE', 270]
['NL', 253]
['Deutschland', 179]
['United Kingdom', 139]
['FR', 110]
['AU', 86]
CPU times: user 3 s, sys: 125 ms, total: 3.12 s
Wall time: 3.23 s


This gives over 3700 results

In [267]:
%%time
results = dataset.query('''
PREFIX sdo: <http://schema.org/>


SELECT (count(distinct ?src) as ?total)
WHERE {
    GRAPH ?src
    {[] a sdo:JobPosting; sdo:jobLocation/sdo:address/sdo:addressCountry ?country .}
    FILTER (isLiteral(?country))
}
''')

show(results)

[3763]
CPU times: user 2.95 s, sys: 93.8 ms, total: 3.05 s
Wall time: 3.17 s


We can combine the two; for some reason I don't understand we get URIs in here like `schema.org/JobPosting`

In [372]:
%%time
results = dataset.query('''
PREFIX sdo: <http://schema.org/>


SELECT ?country (count(distinct ?src) as ?total)
WHERE {
    GRAPH ?src
    {_:j a sdo:JobPosting; sdo:jobLocation/sdo:address/sdo:addressCountry/(sdo:name?) ?country .}
}
GROUP BY ?country
ORDER BY DESC(?total)
LIMIT 15
''')

show(results)

['US', 447]
['United States', 385]
['GB', 363]
['JP', 359]
['DE', 290]
['NL', 257]
['Deutschland', 179]
['United Kingdom', 140]
['FR', 116]
['AU', 91]
['CA', 81]
['India', 60]
[rdflib.term.Literal(''), 60]
['http://schema.org/JobPosting', 56]
['http://schema.org/Place', 56]
CPU times: user 3.89 s, sys: 156 ms, total: 4.05 s
Wall time: 4.46 s


But not if we do it this way:

In [269]:
%%time
results = dataset.query('''
PREFIX sdo: <http://schema.org/>


SELECT ?country (count(?src) as ?total)
WHERE {
    GRAPH ?src
    {_:j a sdo:JobPosting; sdo:jobLocation/sdo:address/(sdo:addressCountry|sdo:addressCountry/sdo:name) ?country .}
}
GROUP BY ?country
ORDER BY DESC(?total)
LIMIT 15
''')

show(results)

['US', 609]
['JP', 444]
['United States', 395]
['GB', 363]
['DE', 304]
['NL', 258]
['Deutschland', 179]
['United Kingdom', 140]
['FR', 116]
['AU', 91]
['CA', 84]
['India', 70]
['Canada', 66]
[rdflib.term.Literal(''), 60]
['IN', 57]
CPU times: user 2.84 s, sys: 219 ms, total: 3.06 s
Wall time: 3.2 s


However if we filter the nodes to literals we get the same results (after deduplicating)

In [375]:
%%time
results = dataset.query('''
PREFIX sdo: <http://schema.org/>


SELECT ?country (count(distinct ?src) as ?total)
WHERE {
    GRAPH ?src
    {_:j a sdo:JobPosting; sdo:jobLocation/sdo:address/(sdo:addressCountry|sdo:addressCountry/sdo:name) ?country .}
    FILTER (!isblank(?country))
}
GROUP BY ?country
ORDER BY DESC(?total)
limit 20
''')

show(results)

['US', 447]
['United States', 385]
['GB', 363]
['JP', 359]
['DE', 290]
['NL', 257]
['Deutschland', 179]
['United Kingdom', 140]
['FR', 116]
['AU', 91]
['CA', 81]
['India', 60]
[rdflib.term.Literal(''), 60]
['IN', 56]
['Canada', 44]
['BE', 41]
['USA', 39]
['PL', 38]
['BR', 38]
['ES', 38]
CPU times: user 3.59 s, sys: 78.1 ms, total: 3.67 s
Wall time: 3.78 s


In [376]:
%%time
results = dataset.query('''
PREFIX sdo: <http://schema.org/>


SELECT ?country (count(distinct ?src) as ?total)
WHERE {
    GRAPH ?src
    {_:j a sdo:JobPosting; sdo:jobLocation/sdo:address/sdo:addressCountry/(sdo:name?) ?country .}
    FILTER (isliteral(?country))
}
GROUP BY ?country
ORDER BY DESC(?total)
limit 20
''')

show(results)

['US', 447]
['United States', 385]
['GB', 363]
['JP', 359]
['DE', 290]
['NL', 257]
['Deutschland', 179]
['United Kingdom', 140]
['FR', 116]
['AU', 91]
['CA', 81]
['India', 60]
[rdflib.term.Literal(''), 60]
['IN', 56]
['Canada', 44]
['USA', 41]
['BE', 41]
['PL', 38]
['BR', 38]
['ES', 38]
CPU times: user 3.42 s, sys: 62.5 ms, total: 3.48 s
Wall time: 3.55 s


We can also get results from the fully qualified version of the schema (this tends to be used in Microdata)

In [287]:
%%time
results = dataset.query('''
PREFIX sdo: <http://schema.org/>
PREFIX sdo_jp: <http://schema.org/JobPosting/>
PREFIX sdo_pl: <http://schema.org/Place/>
PREFIX sdo_pa: <http://schema.org/PostalAddress/>
PREFIX sdo_co: <http://schema.org/Country/>

SELECT ?relation (count(distinct ?src) as ?count)
WHERE {
    GRAPH ?src
    {[] a sdo:JobPosting ;
         sdo_jp:jobLocation/sdo_pl:address/sdo_pa:addressCountry/(sdo_co:name?) ?country .
         [] ?relation ?country .
         FILTER (isliteral(?country) &&
                (lcase(str(?country)) not in ('', 'na', 'n/a', 'unavailable', ' ', 'null')))
         }
}
GROUP BY ?relation
ORDER BY DESC(?count)
LIMIT 10
''')

show(results)

['http://schema.org/PostalAddress/addressCountry', 1351]
['http://schema.org/Country/name', 4]
['http://schema.org/PostalAddress/addressLocality', 3]
['http://schema.org/PostalAddress/streetAddress', 1]
['http://schema.org/PostalAddress/addressRegion', 1]
CPU times: user 5.59 s, sys: 172 ms, total: 5.77 s
Wall time: 5.96 s


And we can combine the results together

In [308]:
%%time
results = dataset.query('''
PREFIX sdo: <http://schema.org/>
PREFIX sdo_jp: <http://schema.org/JobPosting/>
PREFIX sdo_pl: <http://schema.org/Place/>
PREFIX sdo_pa: <http://schema.org/PostalAddress/>
PREFIX sdo_co: <http://schema.org/Country/>

SELECT ?country (count(distinct ?src) as ?count)
WHERE {
    GRAPH ?src
    {[] a sdo:JobPosting ;
         (sdo:jobLocation|sdo_jp:jobLocation)/(sdo:address|sdo_pl:address)/(sdo:addressCountry|sdo_pa:addressCountry)/((sdo:name|sdo_co:name)?) ?country .
         FILTER (isliteral(?country))
         }
}
GROUP BY ?country
ORDER BY DESC(?count)
LIMIT 10
''')

show(results)

['United States', 469]
['US', 463]
['United States', 393]
['GB', 365]
['JP', 359]
['DE', 295]
['RU', 283]
['NL', 257]
['Deutschland', 186]
['United Kingdom', 142]
CPU times: user 5.27 s, sys: 188 ms, total: 5.45 s
Wall time: 5.56 s


Notice the duplicates are due to language tags

In [309]:
[r[0] for r in results]

[rdflib.term.Literal('United States', lang='en'),
 rdflib.term.Literal('US'),
 rdflib.term.Literal('United States'),
 rdflib.term.Literal('GB'),
 rdflib.term.Literal('JP'),
 rdflib.term.Literal('DE'),
 rdflib.term.Literal('RU'),
 rdflib.term.Literal('NL'),
 rdflib.term.Literal('Deutschland'),
 rdflib.term.Literal('United Kingdom')]

We can combine them using str, and strip away surrounding whitespace.

There are still many varians: 'United States', 'US', 'United States of America' and 'us' are all the same.

In [379]:
%%time
results = dataset.query(r'''
PREFIX sdo: <http://schema.org/>
PREFIX sdo_jp: <http://schema.org/JobPosting/>
PREFIX sdo_pl: <http://schema.org/Place/>
PREFIX sdo_pa: <http://schema.org/PostalAddress/>
PREFIX sdo_co: <http://schema.org/Country/>

SELECT ?countryplain (count(distinct ?src) as ?count)
WHERE {
    GRAPH ?src
    {[] a sdo:JobPosting ;
         (sdo:jobLocation|sdo_jp:jobLocation)/(sdo:address|sdo_pl:address)/(sdo:addressCountry|sdo_pa:addressCountry)/((sdo:name|sdo_co:name)?) ?country .
         FILTER (isliteral(?country))
         BIND (replace(str(?country), '[ \n\t]*(.*)[ \n\t]*', '\\1') as ?countryplain)
         }
}
GROUP BY ?countryplain
ORDER BY DESC(?count)
LIMIT 50
''')

show(results)

['United States', 863]
['US', 496]
['GB', 381]
['JP', 362]
['DE', 355]
['RU', 287]
['NL', 264]
['Deutschland', 192]
['United Kingdom', 175]
['FR', 128]
['AU', 96]
['CA', 88]
['India', 65]
['Canada', 61]
[rdflib.term.Literal(''), 60]
['IN', 59]
['Germany', 50]
['USA', 50]
['BE', 45]
['ES', 43]
['PL', 39]
['BR', 39]
['BY', 37]
['UK', 34]
['Brazil', 32]
['Null', 30]
['AT', 30]
['UA', 30]
['CH', 29]
['Australia', 28]
['SG', 26]
['China', 24]
['KZ', 22]
['VN', 22]
['Nederland', 21]
['Schweiz', 21]
['HK', 21]
['IE', 19]
['PT', 19]
['ZA', 18]
['France', 18]
['NZ', 18]
['United States of America', 18]
['IT', 18]
['AF', 17]
['MY', 17]
['IL', 16]
['AE', 16]
['Ireland', 14]
['SE', 14]
CPU times: user 6.59 s, sys: 172 ms, total: 6.77 s
Wall time: 7.06 s


Extracting Australian job ads

In [313]:
%%time
results = dataset.query(r'''
PREFIX sdo: <http://schema.org/>
PREFIX sdo_jp: <http://schema.org/JobPosting/>
PREFIX sdo_pl: <http://schema.org/Place/>
PREFIX sdo_pa: <http://schema.org/PostalAddress/>
PREFIX sdo_co: <http://schema.org/Country/>

SELECT DISTINCT ?src
WHERE {
    GRAPH ?src
    {[] a sdo:JobPosting ;
         (sdo:jobLocation|sdo_jp:jobLocation)/(sdo:address|sdo_pl:address)/(sdo:addressCountry|sdo_pa:addressCountry)/((sdo:name|sdo_co:name)?) ?country .
         FILTER (isliteral(?country) && lcase(replace(str(?country), '[ \n\t]*(.*)[ \n\t]*', '\\1')) in ('au', 'australia'))
         }
}
''')

au_country = [row[0] for row in results]

CPU times: user 7.33 s, sys: 141 ms, total: 7.47 s
Wall time: 7.75 s


In [314]:
len(au_country), au_country[:5]

(124,
 [rdflib.term.URIRef('https://www.businessservicesjobs.com.au/job/manager-business-services-achieve-partnership-australia-vic-melbourne-191108/214764'),
  rdflib.term.URIRef('https://www.finxl.co.nz/job/opentext-developer/'),
  rdflib.term.URIRef('https://www.client-server.com.au/job/data-engineer-1/'),
  rdflib.term.URIRef('https://www.nursingjobs.com.au/job/12836/nurse-unit-manager/'),
  rdflib.term.URIRef('https://www.shk.com.au/job/executive-director-freight-victoria-1')])

We got an extra 40 urls not ending in .au

In [320]:
len(au_domains), len(au_country), len(set(au_domains + au_country))

(233, 124, 273)

Some notable additions:

* Many NZ job sites
* Some global companies/recruiters: aecom, aerotek
* Some talent platforms like breezy.hr, gosnaphop, recruitee
* Devonport in Tasmania (pop c. 30k) has its own website: jobsindevonport.com

In [323]:
sorted(term.toPython() for term in au_country if term not in set(au_domains))

['http://careers.sanchurro.com/p/7ad99d50189b-duty-manager-midland-gate',
 'http://jobs.functionn.io/job/32248/full-stack-engineer-melbourne',
 'https://airteam.breezy.hr/p/b2f30a8e060b-senior-front-end-developer',
 'https://angel.co/company/liven/jobs/581117-ios-developer',
 'https://australianz.aecom.jobs/fortitude-valley-aus/professional-highways-engineer/ED6A0076CE97414296B9F7C36F155980/job/',
 'https://big-fat-smile-group-ltd.breezy.hr/p/9253fd6759cc-early-childhood-teacher',
 'https://careers.engagesq.com/o/change-and-adoption-senior-consultant',
 'https://cgh.vincere.io/careers/job/45425/environmental-planning-manager',
 'https://costco.gosnaphop.com/jsYqsU/administration-clerk-administration-bundamba-qld-australia-au2830w000100',
 'https://cturtle.vincere.io/careers/job/32943/advertising-executives',
 'https://cummins-australia.jobs/mildura-aus/resident-field-service-diesel-mechanic-technician-broken-hill/452E7BEF97EB43048CA60EFC3B0072DB/job/',
 'https://cummins-technician.jobs

# Extracting currency

Australia has a unique currency: the Australian dollar (AUD). Jobs with Australian dollar are very likely to be Australian (unlike Euro or US dollar used in many regions)

In [329]:
%%time
results = dataset.query('''
PREFIX sdo: <http://schema.org/>
PREFIX sdo_jp: <http://schema.org/JobPosting/>

SELECT ?curr (COUNT(distinct ?src) as ?count)
WHERE {
    GRAPH ?src
    {[] a sdo:JobPosting ;
        sdo:salaryCurrency ?currency .
    }
    BIND (str(?currency) as ?curr)
}
GROUP BY ?curr
ORDER BY DESC(?count)
LIMIT 20
''')

show(results)

['GBP', 179]
['EUR', 93]
['USD', 69]
['€', 58]
['AUD', 41]
['JPY', 27]
[rdflib.term.Literal(''), 13]
['円', 8]
['INR', 7]
['HKD', 7]
['CAD', 6]
['SGD', 6]
['THB', 5]
['NZD', 3]
['AFN', 3]
['MYR', 3]
['BRL', 3]
['Rs', 3]
['VND', 2]
['Euro', 1]
CPU times: user 2.02 s, sys: 109 ms, total: 2.12 s
Wall time: 2.45 s


Using the prefixed path

In [331]:
%%time
results = dataset.query('''
PREFIX sdo: <http://schema.org/>
PREFIX sdo_jp: <http://schema.org/JobPosting/>

SELECT ?curr (COUNT(distinct ?src) as ?count)
WHERE {
    GRAPH ?src
    {[] a sdo:JobPosting ;
        sdo_jp:salaryCurrency ?currency .
    }
    BIND (str(?currency) as ?curr)
}
GROUP BY ?curr
ORDER BY DESC(?count)
LIMIT 20
''')

show(results)

['AUD', 60]
['CZK', 55]
['RUB', 49]
['USD', 46]
['GBP', 43]
['RUR', 36]
['EUR', 19]
['\n\nGBP\n', 14]
['Null', 13]
['руб.', 8]
['€', 6]
['INR', 5]
['Kč', 3]
['£', 3]
['PLN', 3]
['USD ', 2]
['≪昇給・賞与≫', 2]
['月給 月給25万円+通勤手当 ≪月給内訳≫ 基本給23万円+手当 ≪手当詳細≫', 2]
['JPY', 2]
['KZT', 2]
CPU times: user 2.08 s, sys: 188 ms, total: 2.27 s
Wall time: 2.46 s


The currency can also be in the baseSalary

In [381]:
%%time
results = dataset.query('''
PREFIX sdo: <http://schema.org/>

SELECT ?curr (COUNT(distinct ?src) AS ?count)
WHERE {
    GRAPH ?src
    {[] a sdo:JobPosting ;
        sdo:baseSalary/sdo:currency ?currency .
    }
    BIND (str(?currency) as ?curr)
}
GROUP BY ?curr
ORDER BY DESC(?count)
LIMIT 10
''')

show(results)

['GBP', 314]
['JPY', 261]
['USD', 234]
['EUR', 211]
[rdflib.term.Literal(''), 117]
['INR', 102]
['JPN', 93]
['€', 62]
['AUD', 54]
['AFA', 23]
CPU times: user 2.58 s, sys: 46.9 ms, total: 2.62 s
Wall time: 2.85 s


And adding the fully qualified schema

In [334]:
%%time
results = dataset.query('''
PREFIX sdo: <http://schema.org/>
PREFIX sdo_jp: <http://schema.org/JobPosting/>
PREFIX sdo_mv: <http://schema.org/MonetaryValue/>

SELECT ?curr (COUNT(distinct ?src) AS ?count)
WHERE {
    GRAPH ?src
    {[] a sdo:JobPosting ;
        (sdo:baseSalary|sdo_jp:baseSalary)/(sdo:currency|sdo_mv:currency) ?currency .
    }
    BIND (str(?currency) as ?curr)
}
GROUP BY ?curr
ORDER BY DESC(?count)
LIMIT 20
''')

show(results)

['GBP', 314]
['JPY', 261]
['USD', 234]
['EUR', 211]
[rdflib.term.Literal(''), 117]
['INR', 102]
['JPN', 93]
['€', 62]
['AUD', 54]
['AFA', 23]
['VND', 18]
['CAD', 16]
['HKD', 14]
['RUR', 13]
['£', 11]
['SEK', 11]
['PKR', 9]
['SGD', 9]
['THB', 9]
['MYR', 9]
CPU times: user 3.09 s, sys: 109 ms, total: 3.2 s
Wall time: 3.51 s


Putting it all together - let's check we're getting something from each path

In [336]:
%%time
results = dataset.query(r'''
PREFIX sdo: <http://schema.org/>
PREFIX sdo_jp: <http://schema.org/JobPosting/>
PREFIX sdo_mv: <http://schema.org/MonetaryValue/>


SELECT ?prefix (COUNT(distinct ?src) as ?count)
WHERE {
    GRAPH ?src
    {[] a sdo:JobPosting ;
        ((sdo:salaryCurrency|sdo_jp:salaryCurrency)|
         (sdo:baseSalary|sdo_jp:baseSalary)/(sdo:currency|sdo_mv:currency)) ?currency .
     [] ?prefix ?currency .
    }
    BIND (replace(str(?currency), '[ \n\t]+', '') as ?curr)
    FILTER (!(lcase(?curr) in ('', 'null', 'na', 'n/a', 'unavailable')))
}
GROUP BY ?prefix
ORDER BY DESC(?count)
LIMIT 20
''')

show(results)

['http://schema.org/currency', 1611]
['http://schema.org/salaryCurrency', 540]
['http://schema.org/JobPosting/salaryCurrency', 397]
['https://schema.org/MonetaryAmount/currency', 38]
['http://schema.org/MonetaryAmount/currency', 26]
['http://schema.org/priceCurrency', 8]
['http://schema.org/currenciesAccepted', 3]
['http://schema.org/addressCountry', 2]
['http://schema.org/JobPosting/experienceRequirements', 1]
['http://schema.org/JobPosting/datePosted', 1]
['http://schema.org/JobPosting/url', 1]
['http://schema.org/JobPosting/baseSalary', 1]
CPU times: user 24 s, sys: 344 ms, total: 24.3 s
Wall time: 25.1 s


We could still do some normalisation: e.g. € = EUR, £ = GBP, ...

In [338]:
%%time
results = dataset.query(r'''
PREFIX sdo: <http://schema.org/>
PREFIX sdo_jp: <http://schema.org/JobPosting/>
PREFIX sdo_mv: <http://schema.org/MonetaryValue/>
PREFIX sdos_mv: <https://schema.org/MonetaryValue/>


SELECT ?curr (COUNT(distinct ?src) as ?count)
WHERE {
    GRAPH ?src
    {[] a sdo:JobPosting ;
        ((sdo:salaryCurrency|sdo_jp:salaryCurrency)|
         (sdo:baseSalary|sdo_jp:baseSalary)/(sdo:currency|sdo_mv:currency|sdos_mv:currency)) ?currency .
    }
    BIND (replace(str(?currency), '[ \n\t]+', '') as ?curr)
    FILTER (!(lcase(?curr) in ('', 'null', 'na', 'n/a', 'unavailable')))
}
GROUP BY ?curr
ORDER BY DESC(?count)
LIMIT 20
''')

show(results)

['GBP', 392]
['USD', 302]
['EUR', 295]
['JPY', 266]
['AUD', 114]
['INR', 108]
['JPN', 93]
['€', 68]
['CZK', 57]
['RUB', 50]
['RUR', 49]
['AFA', 23]
['CAD', 19]
['VND', 18]
['HKD', 14]
['£', 14]
['BRL', 12]
['SEK', 11]
['PKR', 10]
['THB', 10]
CPU times: user 5.17 s, sys: 141 ms, total: 5.31 s
Wall time: 5.41 s


Getting jobs paying in AUD AU jobs

In [342]:
%%time
results = dataset.query(r'''
PREFIX sdo: <http://schema.org/>
PREFIX sdo_jp: <http://schema.org/JobPosting/>
PREFIX sdo_mv: <http://schema.org/MonetaryValue/>
PREFIX sdos_mv: <https://schema.org/MonetaryValue/>


SELECT distinct ?src
WHERE {
    GRAPH ?src
    {[] a sdo:JobPosting ;
        ((sdo:salaryCurrency|sdo_jp:salaryCurrency)|
         (sdo:baseSalary|sdo_jp:baseSalary)/(sdo:currency|sdo_mv:currency|sdos_mv:currency)) ?currency .
    }
    BIND (replace(str(?currency), '[ \n\t]+', '') as ?curr)
    FILTER (lcase(?curr) = 'aud')
}
''')

au_salary = [row[0] for row in results]

CPU times: user 4.59 s, sys: 156 ms, total: 4.75 s
Wall time: 4.96 s


Get 114 URLs

In [343]:
len(au_salary), au_salary[:5]

(114,
 [rdflib.term.URIRef('https://www.businessservicesjobs.com.au/job/manager-business-services-achieve-partnership-australia-vic-melbourne-191108/214764'),
  rdflib.term.URIRef('https://www.globalmedics.com/jobs/vac46758-emergency-department-registrar?keywords=registrar+paediatrics'),
  rdflib.term.URIRef('https://www.nursingjobs.com.au/job/12836/nurse-unit-manager/'),
  rdflib.term.URIRef('http://careers.sportaus.gov.au/casual-job-details/query/7762978/'),
  rdflib.term.URIRef('http://jobboards.adlogic.com.au/oscarwylee/job-details/query/Optometrist+-+Queensland+%28Relocation+Package%21%29/in/New+South+Wales/7712328/')])

There are 18 jobs not in AU domains

In [346]:
len(au_domains), len(au_salary), len(set(au_domains + au_salary))

(233, 114, 251)

In [347]:
new_jobs = [url.toPython() for url in au_salary if url not in set(au_domains + au_country)]

While a couple of these are false positives (particularly NZ jobs), some are definitely correct

In [348]:
new_jobs

['https://www.globalmedics.com/jobs/vac46758-emergency-department-registrar?keywords=registrar+paediatrics',
 'https://www.awf.co.nz/job/specialist-water-and-sand-blaster-slash-spray-painter/',
 'https://www.cartermurray.com.sg/job/marketing-manager-46/',
 'https://myrecruitmentplus.com/job-details/query/inside-saas-software-sales-perfect-for-a-recruiter-s80k-base-and-s180k-ote/in/sydney/7760726/',
 'http://careers.centreport.co.nz/job-details/query/Cargo+Handler+-+Break+Bulk/in/Wellington/7762814/',
 'https://www.dimplecare.com/job-details/query/team-leader/in/victoria/7672086/',
 'https://www.ethosbc.com/job/100858132055947/operational-risk-manager/',
 'https://au.talentinternational.com/jobs/executive-assistant-27',
 'https://brannigans.co.nz/job-details/query/chief-executive-officer/in/canterbury/7710393/',
 'https://greenstone-recruitment.co.nz/job-details/query/farm-assistant-868984-bay-of-plenty/in/bay-of-plenty/7774644/',
 'https://www.taylorroot.de/job/partner-competition-melb

Get total of 285 domains that contain some jobs that are likely to be Australian

In [349]:
len(set(au_domains + au_country + au_salary))

285

In a single query

In [370]:
%%time
results = dataset.query(r'''
PREFIX sdo: <http://schema.org/>
PREFIX sdo_jp: <http://schema.org/JobPosting/>
PREFIX sdo_pl: <http://schema.org/Place/>
PREFIX sdo_pa: <http://schema.org/PostalAddress/>
PREFIX sdo_co: <http://schema.org/Country/>
PREFIX sdo_mv: <http://schema.org/MonetaryValue/>
PREFIX sdos_mv: <https://schema.org/MonetaryValue/>


SELECT (COUNT(distinct ?src) AS ?count)
WHERE {
 { 
  GRAPH ?src
  {[] a sdo:JobPosting .}
  BIND (replace(str(?src), 
                'https?://([^?/]+).*',
                '\\1') AS ?domain)
    FILTER (strends(?domain, '.au'))
 }
 UNION
 {
  GRAPH ?src
  {
    {[] a sdo:JobPosting ;
         (sdo:jobLocation|sdo_jp:jobLocation)/
         (sdo:address|sdo_pl:address)/
         (sdo:addressCountry|sdo_pa:addressCountry)/
         ((sdo:name|sdo_co:name)?) ?country .
         FILTER (isliteral(?country) && 
                 lcase(replace(str(?country),
                               '[ \n\t]*(.*)[ \n\t]*',
                               '\\1')) in ('au', 'australia'))
    }
    UNION
    {[] a sdo:JobPosting ;
        ((sdo:salaryCurrency|sdo_jp:salaryCurrency)|
         (sdo:baseSalary|sdo_jp:baseSalary)/
         (sdo:currency|sdo_mv:currency|sdos_mv:currency)) ?currency .
    BIND (replace(str(?currency), '[ \n\t]+', '') as ?curr)
    FILTER (lcase(?curr) = 'aud')}
  }
 }
}
''')

show(results)

[285]
CPU times: user 20.4 s, sys: 422 ms, total: 20.8 s
Wall time: 22.4 s
