In [1]:
import time
import math
import random
import requests
import pandas as pd
from urllib.parse import urlparse, parse_qs

BASE = "https://ec.europa.eu/esco/api"
SESSION = requests.Session()
SESSION.headers.update({"Accept": "application/json"})

def normalize_esco_uri(u: str | None) -> str | None:
    if not u:
        return None
    if "data.europa.eu/esco/" in u and "/resource/" not in u:
        return u
    qs = parse_qs(urlparse(u).query)
    if "uri" in qs and qs["uri"]:
        return qs["uri"][0]
    return u

def extract_concept_uri_from_search_item(item) -> str | None:
    emb = item.get("_embedded", {}) if isinstance(item, dict) else {}
    res = emb.get("resource", {}) if isinstance(emb, dict) else {}
    if isinstance(res, dict) and res.get("uri"):
        return normalize_esco_uri(res["uri"])
    links = item.get("_links", {}) if isinstance(item, dict) else {}
    for k in ("resource", "self"):
        href = links.get(k, {}).get("href")
        if href:
            return normalize_esco_uri(href)
    return None

def safe_json(resp: requests.Response):
    # Some ESCO error responses are HTML; guard json() with content-type check
    ctype = resp.headers.get("Content-Type", "")
    if "application/json" not in ctype.lower():
        raise RuntimeError(f"Non-JSON response (status {resp.status_code}): {resp.text[:300]}...")
    return resp.json()

def esco_search(text, type_, language="en", limit=50, offset=0, full=False, retries=3, backoff=1.2):
    params = {
        "text": text,
        "type": type_,
        "language": language,
        "limit": min(int(limit), 50),   # be nice to the API
        "offset": max(int(offset), 0),
        "full": str(bool(full)).lower(),
    }
    for attempt in range(retries):
        r = SESSION.get(f"{BASE}/search", params=params, timeout=30)
        if r.status_code in (429, 502, 503, 504):
            time.sleep((backoff ** attempt) + random.random() * 0.2)
            continue
        r.raise_for_status()
        return safe_json(r)
    # last try (raise if still failing)
    r.raise_for_status()
    return safe_json(r)

def fetch_all_by_seeds(type_, language="en", limit=50, seeds=None, max_pages_per_seed=None):
    """
    Crawl ESCO by iterating seed queries (to avoid empty-text pitfalls) and de-duplicate by URI.
    """
    if seeds is None:
        # broad coverage: a-z, 0-9 and some common bigrams
        letters = [chr(i) for i in range(ord('a'), ord('z')+1)]
        digits = [str(i) for i in range(10)]
        bigrams = ["ch", "sh", "th", "ph", "st", "pr", "gr", "data", "info", "tech"]
        seeds = letters + digits + bigrams

    seen = set()
    rows = []

    for seed in seeds:
        offset = 0
        pages = 0
        while True:
            data = esco_search(seed, type_, language=language, limit=limit, offset=offset, full=False)
            results = data.get("_embedded", {}).get("results", [])
            if not results:
                break

            for item in results:
                uri = extract_concept_uri_from_search_item(item)
                if not uri or uri in seen:
                    continue
                lbl = item.get("preferredLabel") or item.get("title") or uri
                if isinstance(lbl, dict):
                    lbl = lbl.get(language) or next(iter(lbl.values()), uri)
                rows.append((lbl, uri))
                seen.add(uri)

            if len(results) < limit:
                break
            offset += limit
            pages += 1
            if max_pages_per_seed and pages >= max_pages_per_seed:
                break

    df = pd.DataFrame(rows, columns=["Label", "URI"])
    df.drop_duplicates(subset="URI", inplace=True)
    df.sort_values("Label", key=lambda s: s.astype(str).str.lower(), inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

def _label(value, language="en", fallback=""):
    """Return a readable label from ESCO fields that may be str or dict."""
    if isinstance(value, dict):
        return value.get(language) or next(iter(value.values()), fallback)
    return value or fallback

# Public functions:

def get_all_occupations(language="en", limit=50, max_pages_per_seed=None) -> pd.DataFrame:
    return fetch_all_by_seeds("occupation", language=language, limit=limit, max_pages_per_seed=max_pages_per_seed)

def get_all_skills(language="en", limit=50, max_pages_per_seed=None) -> pd.DataFrame:
    return fetch_all_by_seeds("skill", language=language, limit=limit, max_pages_per_seed=max_pages_per_seed)

def get_necessary_skills_for_occupation(occupation: str, language="en") -> pd.DataFrame:
    """
    Return a DataFrame of all ESCO 'essential' (i.e., necessary) skills for an occupation.
    `occupation` can be a canonical ESCO URI or a plain-text name.
    Columns: ['Occupation', 'Occupation URI', 'Skill', 'Skill URI'].
    """
    # Resolve occupation to a canonical URI (if a name is provided)
    if "data.europa.eu/esco/occupation/" not in occupation:
      # treat as a search query and take the first matching occupation
      hits = esco_search(occupation, type_="occupation", language=language, limit=1)
      items = hits.get("_embedded", {}).get("results", [])
      if not items:
          raise ValueError(f"No occupation found for query: {occupation!r}")
      occ_uri = extract_concept_uri_from_search_item(items[0])
      if not occ_uri:
          raise ValueError(f"Could not resolve occupation URI for query: {occupation!r}")
    else:
      occ_uri = normalize_esco_uri(occupation)

    # Fetch occupation resource
    occ = esco_get_resource("occupation", occ_uri, language=language)
    occ_label = _label(occ.get("preferredLabel"), language, fallback=occ_uri)

    # Follow essential skills links
    essential = occ.get("_links", {}).get("hasEssentialSkill", [])
    if isinstance(essential, dict):
        essential = [essential]

    rows = []
    for link in essential:
        href = link.get("href")
        sk_uri = normalize_esco_uri(href)
        if not sk_uri:
            continue
        sk = esco_get_resource("skill", sk_uri, language=language)
        sk_label = _label(sk.get("preferredLabel"), language, fallback=sk_uri)
        rows.append({
            "Occupation": occ_label,
            "Occupation URI": occ_uri,
            "Skill": sk_label,
            "Skill URI": sk_uri
        })

    df = pd.DataFrame(rows).sort_values("Skill", key=lambda s: s.astype(str).str.lower()).reset_index(drop=True)
    return df

def esco_get_resource(type_: str, uri: str, language: str = "en", retries: int = 3, backoff: float = 1.2) -> dict:
    """
    Fetch a single ESCO resource (occupation/skill) by its canonical ESCO URI or by an API href.
    Handles both:
      - canonical URIs like "https://data.europa.eu/esco/occupation/..."
      - API hrefs like "https://ec.europa.eu/esco/api/resource/occupation?uri=...&language=en"

    Returns the JSON dict for the resource (unwrapped from _embedded.resource if present).
    Raises for HTTP errors and non-JSON responses.
    """
    if not uri:
        raise ValueError("Missing URI")

    norm_uri = normalize_esco_uri(uri)

    # If the provided value already points to the API /resource endpoint, use it as-is.
    if isinstance(norm_uri, str) and norm_uri.startswith(f"{BASE}/resource/"):
        url = norm_uri
        params = {"language": language}
    else:
        # Otherwise, call the /resource/{type} endpoint with ?uri=<canonical esco uri>
        url = f"{BASE}/resource/{type_}"
        params = {"uri": norm_uri, "language": language}

    last_resp = None
    for attempt in range(retries):
        r = SESSION.get(url, params=params, timeout=30)
        last_resp = r
        if r.status_code in (429, 502, 503, 504):
            # exponential backoff with a touch of jitter
            time.sleep((backoff ** attempt) + random.random() * 0.2)
            continue
        r.raise_for_status()
        data = safe_json(r)
        # Some ESCO responses wrap the resource
        if isinstance(data, dict) and "_embedded" in data:
            emb = data.get("_embedded", {})
            if isinstance(emb, dict) and "resource" in emb and isinstance(emb["resource"], dict):
                return emb["resource"]
        return data

    # If we get here, retries were exhausted; raise for the last response
    assert last_resp is not None
    last_resp.raise_for_status()
    return safe_json(last_resp)


In [2]:
df_occupations = get_all_occupations()

In [3]:
print(df_occupations.info())
print(df_occupations['Label'])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 774 entries, 0 to 773
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Label   774 non-null    object
 1   URI     774 non-null    object
dtypes: object(2)
memory usage: 12.2+ KB
None
0                   3D animator
1                   3D modeller
2        3D printing technician
3      academic support officer
4         accommodation manager
                 ...           
769                zoo educator
770               zoo registrar
771          zoo section leader
772                   zookeeper
773          zoology technician
Name: Label, Length: 774, dtype: object


In [4]:
for s in df_occupations['Label']:
    if "data" in s:
        print(s)

aviation data communications manager
big data archive librarian
chief data officer
data analyst
data centre operator
data entry clerk
data entry supervisor
data quality specialist
data scientist
data warehouse designer
database administrator
database designer
database developer
database integrator


In [5]:
df_skills = get_all_skills()
print(df_skills.head())

                 Label                                                URI
0          3D lighting  http://data.europa.eu/esco/skill/6e53fd99-b646...
1         3D modelling  http://data.europa.eu/esco/skill/97965983-0da4...
2  3D printing process  http://data.europa.eu/esco/skill/2afb2b59-c9a3...
3         3D texturing  http://data.europa.eu/esco/skill/3e7516dc-0f7c...
4     ABBYY FineReader  http://data.europa.eu/esco/skill/9a35cb50-8106...


In [6]:
print(df_skills.info())
print(df_skills['Label'])
df_skills.to_json("Data/skills.json", orient = "records", lines = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Label   2919 non-null   object
 1   URI     2919 non-null   object
dtypes: object(2)
memory usage: 45.7+ KB
None
0               3D lighting
1              3D modelling
2       3D printing process
3              3D texturing
4          ABBYY FineReader
               ...         
2914           zoning codes
2915          zoo community
2916     zoo exhibit design
2917        zoo regulations
2918      zoonotic diseases
Name: Label, Length: 2919, dtype: object


OSError: Cannot save file into a non-existent directory: 'Data'

In [None]:
for s in df_skills['Label']:
    print(s)

3D lighting
3D modelling
3D printing process
3D texturing
ABBYY FineReader
abide by regulations on banned materials
Absorb (learning management systems)
accept feedback on artistic performance
accept own accountability
accounting entries
acquire licences for the use of weapons
act as a watchstander
act discreetly
adapt fighting techniques for performance
adapt instruction to labour market
adapt teaching to student's capabilities
adapt text culturally
adapt to change
adapt to change in marketing
adapt to different roles
adapt to different weather conditions
adapt to new design materials
adapt to new technology used in cars
add chemicals to drilling fluid
address gender-related issues in family planning counselling
address problems critically
adhere to OHSAS 18001
adhere to questionnaires
adhere to traffic regulations on inland waterways
adjust drying process to goods
adjust envelope cutting settings
adjust fold plates
adjust fuel prices in line with company procedures
adjust hearing aid

In [None]:
df = get_necessary_skills_for_occupation("data scientist", language="en")
print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23 entries, 0 to 22
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Occupation      23 non-null     object
 1   Occupation URI  23 non-null     object
 2   Skill           23 non-null     object
 3   Skill URI       23 non-null     object
dtypes: object(4)
memory usage: 868.0+ bytes
None
['build recommender systems', 'collect ICT data', 'data mining', 'data models', 'deliver visual presentation of data', 'design database scheme', 'develop data processing applications', 'establish data processes', 'execute analytical mathematical calculations', 'handle data samples', 'implement data quality processes', 'information categorisation', 'information extraction', 'interpret current data', 'manage data collection systems', 'normalise data', 'online analytical processing', 'perform data cleansing', 'query languages', 'report analysis results', 'resource description fram

In [None]:
print(list(df["Skill"]))
print(len(df["Skill"]))

['build recommender systems', 'collect ICT data', 'data mining', 'data models', 'deliver visual presentation of data', 'design database scheme', 'develop data processing applications', 'establish data processes', 'execute analytical mathematical calculations', 'handle data samples', 'implement data quality processes', 'information categorisation', 'information extraction', 'interpret current data', 'manage data collection systems', 'normalise data', 'online analytical processing', 'perform data cleansing', 'query languages', 'report analysis results', 'resource description framework query language', 'statistics', 'visual presentation techniques']
23


# Qualifications

In [None]:
import requests
import pandas as pd
from urllib.parse import urlparse, parse_qs

BASE = "https://ec.europa.eu/esco/api"
SESSION = requests.Session()
SESSION.headers.update({"Accept": "application/json"})

# --- small helpers (reuse your existing ones if you already defined them) ---

def normalize_esco_uri(u: str | None) -> str | None:
    if not u:
        return None
    if "data.europa.eu/esco/" in u and "/resource/" not in u:
        return u
    qs = parse_qs(urlparse(u).query)
    if "uri" in qs and qs["uri"]:
        return qs["uri"][0]
    return u

def esco_search(text, type_="occupation", language="en", limit=1):
    r = SESSION.get(f"{BASE}/search", params={
        "text": text, "type": type_, "language": language,
        "limit": limit, "full": "false"
    }, timeout=30)
    r.raise_for_status()
    return r.json()

def extract_concept_uri_from_search_item(item) -> str | None:
    emb = item.get("_embedded", {}) if isinstance(item, dict) else {}
    res = emb.get("resource", {}) if isinstance(emb, dict) else {}
    if isinstance(res, dict) and res.get("uri"):
        return normalize_esco_uri(res["uri"])
    links = item.get("_links", {}) if isinstance(item, dict) else {}
    for k in ("resource", "self"):
        href = links.get(k, {}).get("href")
        if href:
            return normalize_esco_uri(href)
    return None

def esco_get_resource(resource_type, concept_uri, language="en"):
    concept_uri = normalize_esco_uri(concept_uri)
    r = SESSION.get(f"{BASE}/resource/{resource_type}",
                    params={"uri": concept_uri, "language": language},
                    timeout=30)
    r.raise_for_status()
    return r.json()

def _label(value, language="en", fallback=""):
    if isinstance(value, dict):
        return value.get(language) or next(iter(value.values()), fallback)
    return value or fallback

# --- main: qualifications only ---

def get_qualifications_for_occupation(occupation: str, language="en") -> pd.DataFrame:
    """
    Return all ESCO qualifications linked to an occupation.
    `occupation` can be a canonical ESCO occupation URI or a plain-text name.
    Columns: ['Occupation', 'Occupation URI', 'Qualification', 'Qualification URI', 'Relation']
    """

    # 1) Resolve occupation to canonical URI
    if "data.europa.eu/esco/occupation/" not in occupation:
        hits = esco_search(occupation, type_="occupation", language=language, limit=1)
        items = hits.get("_embedded", {}).get("results", [])
        if not items:
            raise ValueError(f"No occupation found for query: {occupation!r}")
        occ_uri = extract_concept_uri_from_search_item(items[0])
        if not occ_uri:
            raise ValueError(f"Could not resolve occupation URI for query: {occupation!r}")
    else:
        occ_uri = normalize_esco_uri(occupation)

    # 2) Fetch occupation resource + label
    occ = esco_get_resource("occupation", occ_uri, language=language)
    occ_label = _label(occ.get("preferredLabel"), language, fallback=occ_uri)

    # 3) Collect qualification links (names vary; be defensive)
    links = occ.get("_links", {})
    qual_link_names = [
        "requiresQualification",
        "hasQualification",
        "hasEssentialQualification",
        "hasOptionalQualification",
        "requiresFormalQualification",
        "hasRelatedQualification",   # rare / future-proof
    ]

    qual_refs = []  # (uri, relation)
    for rel in qual_link_names:
        rel_items = links.get(rel, [])
        if isinstance(rel_items, dict):
            rel_items = [rel_items]
        for it in rel_items:
            href = normalize_esco_uri(it.get("href"))
            if href:
                qual_refs.append((href, rel))

    # 4) De-duplicate by URI, but keep the first seen relation as a tag
    seen = set()
    rows = []
    for quri, rel in qual_refs:
        if quri in seen:
            continue
        seen.add(quri)
        q = esco_get_resource("qualification", quri, language=language)
        q_label = _label(q.get("preferredLabel"), language, fallback=quri)
        rows.append({
            "Occupation": occ_label,
            "Occupation URI": occ_uri,
            "Qualification": q_label,
            "Qualification URI": quri,
            "Relation": rel
        })

    df = pd.DataFrame(rows)
    if not df.empty:
        df.sort_values(["Qualification"], key=lambda s: s.astype(str).str.lower(), inplace=True)
        df.reset_index(drop=True, inplace=True)
    return df


In [None]:
# By name:
q_df = get_qualifications_for_occupation("nurse", language="en")
q_df.head()

# Or by canonical ESCO occupation URI:
# q_df = get_qualifications_for_occupation("http://data.europa.eu/esco/occupation/<UUID>", language="en")


In [None]:
q_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Empty DataFrame
