## **Quality Detector**

The following is the implementation of Task 1 and Task 2. To execute it, you need a valid OpenAI API key.

### **Task 1**

Import required Python libraries and download the Wordnet dictionary

In [None]:
# Importing some packages
from SPARQLWrapper import SPARQLWrapper, JSON
from openai import OpenAI
import nltk
from nltk.corpus import wordnet as wn
import spacy
import pandas as pd
from IPython.display import display
import numpy as np

# Download the WordNet corpus that `wn.synsets` reads in the dictionary approach
nltk.download("wordnet")

# Load the `en_core_web_sm` spaCy pipeline; it is used below to tokenize class/property names
model = spacy.load("en_core_web_sm")

Define relevant functions

In [None]:
def check_ambiguity_llm_approach(name, openai_key):
    """
    Ask the gpt-4o-mini chat model whether a class or property name is ambiguous.

    :param name: The term (class or property name) to assess
    :param openai_key: Object exposing `chat.completions.create`; despite its name it is
        used as the API client, not as the raw key string
    :return: The text of the model's reply; the prompt asks for "Yes" or "No" together
        with a sentence explaining the verdict
    """

    # 'system' role: what counts as an ambiguous term and the expected shape of the answer
    system_content = """
    You are an system that helps assess the ambiguity of a term. Your job is to determine whether a given term is ambiguous or not.\n\
    \n\
    Some cases to help you understand when a term is ambiguous or not:\n\
        - A term is ambiguous when it has multiple meanings or it is uncertain (e.g., "bat" can refer to an animal or a piece of sports equipment.).\n\
        - A term is ambiguous when its meaning changes depending on the context in which it is used (e.g., "scale" can mean a musical scale in music, a device for weighing in measurement, or even a pattern on fish skin in biology.).\n\
        - A term is ambiguous when it has different meanings in different fields or industries (e.g., In sports, "run" refers to moving quickly on foot and in programming, "run" means to execute a program or script.). 
    \n\
    Important Instructions:\n\
        - Return the word "Yes" if the term is ambiguous or the word "No" if it is not and also a sentence explaining why it is ambiguous or not and always include the term to know what it refers to.
    """

    # 'user' role: the term under assessment
    user_content = f"""
        Term: {name}\n\
        Is the given term ambiguous or not?
    """

    conversation = [
        {"role": "system", "content": system_content},
        {"role": "user", "content": user_content},
    ]

    # Every request is sent with temperature=0
    reply = openai_key.chat.completions.create(
        model="gpt-4o-mini",
        messages=conversation,
        temperature=0,
    )

    # Only the text of the first choice is handed back
    return reply.choices[0].message.content


def _ambiguity_label(sense_count):
    """Return 'Non-Ambiguous' for exactly one sense and 'Ambiguous' for any other count."""
    return "Non-Ambiguous" if sense_count == 1 else "Ambiguous"


def check_ambiguity_dictionary_approach(name):
    """
    Decide whether a class or property name is ambiguous by counting its WordNet senses.

    A name that tokenizes to a single token is looked up as given. Otherwise the name is
    tried as one entry (tokens joined by underscores, then by spaces) and, if neither form
    has any sense, each token is looked up on its own.

    Note that only a count of exactly one is reported as 'Non-Ambiguous'; a count of
    zero (nothing found in WordNet) is therefore reported as 'Ambiguous' as well.

    :param name: The name to check
    :return: A tuple of (dictionary mapping the looked-up term(s) to their number of senses,
        the word 'Ambiguous' or 'Non-Ambiguous')
    """

    doc = model(name)

    # Single-token names are looked up directly
    if len(doc) == 1:
        sense_count = len(wn.synsets(name))
        return {name: sense_count}, _ambiguity_label(sense_count)

    # Keep alphabetic or numeric tokens only, in lower case
    words = [token.lower_ for token in doc if token.is_alpha or token.is_digit]

    # Try the whole name as a single entry: underscore-joined first, space-joined second
    phrase_counts = [len(wn.synsets(separator.join(words))) for separator in ("_", " ")]
    for phrase_count in phrase_counts:
        if phrase_count > 0:
            return {name: phrase_count}, _ambiguity_label(phrase_count)

    # Neither joined form is known: count the senses of every word separately
    per_word_counts = {word: len(wn.synsets(word)) for word in words}

    # The name is non-ambiguous only when every word has exactly one sense
    if all(count == 1 for count in per_word_counts.values()):
        return per_word_counts, "Non-Ambiguous"
    return per_word_counts, "Ambiguous"


def get_class_names(sparql, limit=50):
    """
    Fetch English class labels from the knowledge graph behind `sparql`.

    The query matches resources typed `owl:Class` and reads their `rdfs:label` inside an
    OPTIONAL pattern, so a class without an English label yields a result row in which
    `className` is unbound. Such rows are dropped here, which lets callers index every
    returned binding with `["className"]["value"]` without a KeyError.

    :param sparql: 'SPARQLWrapper' object (its current query is replaced by this call)
    :param limit: Maximum number of rows requested from the endpoint (default 50); the
        returned list can be shorter because rows without a label are removed
    :return: List of result bindings, each one containing a `className` entry
    """

    # Setting up the query
    sparql.setQuery(
        f"""
        SELECT
            DISTINCT (STR(?name) AS ?className)
        WHERE {{
            ?classId rdf:type owl:Class .
            OPTIONAL {{
                ?classId rdfs:label ?name .
                FILTER(LANG(?name) = "en")
            }}
        }}
        LIMIT {int(limit)}
    """
    )

    # Executing the query
    query_results = sparql.query().convert()

    # Keep only the rows in which the label is actually bound
    return [
        binding
        for binding in query_results["results"]["bindings"]
        if "className" in binding
    ]


def get_property_names(sparql, limit=50):
    """
    Fetch English property labels from the knowledge graph behind `sparql`.

    The query matches resources typed `rdf:Property` and reads their `rdfs:label` inside an
    OPTIONAL pattern, so a property without an English label yields a result row in which
    `propertyName` is unbound. Such rows are dropped here, which lets callers index every
    returned binding with `["propertyName"]["value"]` without a KeyError.

    :param sparql: 'SPARQLWrapper' object (its current query is replaced by this call)
    :param limit: Maximum number of rows requested from the endpoint (default 50); the
        returned list can be shorter because rows without a label are removed
    :return: List of result bindings, each one containing a `propertyName` entry
    """

    # Setting up the query
    sparql.setQuery(
        f"""
        SELECT
            DISTINCT (STR(?name) AS ?propertyName)
        WHERE {{
            ?propertyId rdf:type rdf:Property .
            OPTIONAL {{
                ?propertyId rdfs:label ?name .
                FILTER(LANG(?name) = "en")
            }}
        }}
        LIMIT {int(limit)}
    """
    )

    # Executing the query
    query_results = sparql.query().convert()

    # Keep only the rows in which the label is actually bound
    return [
        binding
        for binding in query_results["results"]["bindings"]
        if "propertyName" in binding
    ]

Connect to the OpenAI API

In [None]:
# Connecting to OpenAI API
openai_key = OpenAI(
    api_key="Here place your OpenAI API key"
)

Access the DBPedia endpoint

In [4]:
# SPARQL endpoint of DBPedia (public endpoint URL; queries below are sent here)
sparql_endpoint = "https://dbpedia.org/sparql"

# Setting up the 'SPARQLWrapper' object based on the DBPedia endpoint.
# This single object is shared by every query function defined in this notebook.
sparql = SPARQLWrapper(sparql_endpoint)

# Defining the format of the queries results: JSON, so that `.convert()` yields the
# nested structure that the query functions index with ["results"]["bindings"]
sparql.setReturnFormat(JSON)

- #### **Quality Assessment on Class Names**

Obtain the class names

In [5]:
# Fetch the class labels returned by the DBPedia query
class_names = get_class_names(sparql)

# One empty record per class label; each approach later stores its verdict in it
class_ambiguity_comparison = dict(
    (binding["className"]["value"], {}) for binding in class_names
)

Check which class names are ambiguous or not using ChatGPT

In [6]:
# Iterating through the names
for class_name in class_names:

    class_label = class_name["className"]["value"]

    # Requesting
    response = check_ambiguity_llm_approach(class_label, openai_key)

    # The prompt asks for a reply that starts with "Yes" or "No", so only the leading word
    # is inspected. A plain substring test would also fire on a "No" reply whose explanation
    # merely contains "Yes" (for instance when the assessed term is "Yesterday").
    reply_words = (response or "").split()
    verdict = reply_words[0].strip("\"'.,;:!*").lower() if reply_words else ""

    # Updating
    class_ambiguity_comparison[class_label]["llm"] = (
        "Ambiguous" if verdict == "yes" else "Non-Ambiguous"
    )

    print(response)

Yes, the term "company" is ambiguous because it can refer to a business organization in a commercial context, or it can mean companionship or a group of people gathered together in a social context.
Yes, the term "activity" is ambiguous because it can refer to various types of actions or tasks in different contexts, such as physical exercise, a specific event or project in a business setting, or even a recreational pastime.
Yes, the term "name" is ambiguous because it can refer to a person's name, a brand name, a title of a work, or even a designation for a specific object or concept, depending on the context in which it is used.
No, the term "person" is not ambiguous because it generally refers to an individual human being and does not have multiple meanings or interpretations in common usage.
Yes, the term "actor" is ambiguous because it can refer to a person who performs in films, television, or theater, but it can also refer to a participant in a particular role within a system or 

Check which class names are ambiguous or not using the WordNet dictionary

In [14]:
# Iterating through the names
for class_name in class_names:

    # The label is read once into a variable: indexing with double quotes inside a
    # double-quoted f-string only parses on Python 3.12 and newer.
    class_label = class_name["className"]["value"]

    # Requesting
    ambiguity_dict, ambiguity_type = check_ambiguity_dictionary_approach(class_label)

    # Updating
    class_ambiguity_comparison[class_label]["dictionary"] = ambiguity_type

    # Printing according to the return values
    print(f"Class name '{class_label}':\n{ambiguity_dict} -> is {ambiguity_type}")

Class name 'company':
{'company': 10} -> is Ambiguous
Class name 'activity':
{'activity': 6} -> is Ambiguous
Class name 'name':
{'name': 15} -> is Ambiguous
Class name 'person':
{'person': 3} -> is Ambiguous
Class name 'actor':
{'actor': 2} -> is Ambiguous
Class name 'place':
{'place': 32} -> is Ambiguous
Class name 'publisher':
{'publisher': 3} -> is Ambiguous
Class name 'genre':
{'genre': 4} -> is Ambiguous
Class name 'language':
{'language': 6} -> is Ambiguous
Class name 'department':
{'department': 3} -> is Ambiguous
Class name 'software':
{'software': 1} -> is Non-Ambiguous
Class name 'school':
{'school': 10} -> is Ambiguous
Class name 'type':
{'type': 8} -> is Ambiguous
Class name 'non-profit organisation':
{'non': 1, 'profit': 4, 'organisation': 7} -> is Ambiguous
Class name 'agent':
{'agent': 6} -> is Ambiguous
Class name 'Biomolecule':
{'Biomolecule': 0} -> is Ambiguous
Class name 'broadcaster':
{'broadcaster': 2} -> is Ambiguous
Class name 'cartoon':
{'cartoon': 3} -> is Ambi

In [16]:
# Converting the dictionary to a dataframe. The index (the class names) is named before
# `reset_index` and the verdict columns are selected by name, so 'llm' and 'dictionary'
# keep the labels they were stored under regardless of the order in which the two
# approaches were run (assigning the column names by position would swap them).
class_ambiguity_comparison_df = (
    pd.DataFrame.from_dict(class_ambiguity_comparison, orient="index")
    .rename_axis("class_name")
    .reset_index()[["class_name", "llm", "dictionary"]]
)

# Counting the class names on which both approaches give the same verdict
class_agreement = (
    class_ambiguity_comparison_df["llm"] == class_ambiguity_comparison_df["dictionary"]
).sum()

print(f"The two approaches agree in {class_agreement} class names")

The two approaches agree in 36 class names


Show cases where the two approaches agree on the ambiguity of the class names

In [17]:
print("Cases where the two approaches agree:")
# Rows in which the LLM verdict equals the dictionary verdict
display(
    class_ambiguity_comparison_df.loc[
        class_ambiguity_comparison_df["llm"].eq(class_ambiguity_comparison_df["dictionary"])
    ]
)

Cases where the two approaches agree:


Unnamed: 0,class_name,llm,dictionary
0,company,Ambiguous,Ambiguous
1,activity,Ambiguous,Ambiguous
2,name,Ambiguous,Ambiguous
4,actor,Ambiguous,Ambiguous
5,place,Ambiguous,Ambiguous
6,publisher,Ambiguous,Ambiguous
7,genre,Ambiguous,Ambiguous
8,language,Ambiguous,Ambiguous
9,department,Ambiguous,Ambiguous
10,software,Non-Ambiguous,Non-Ambiguous


Show cases where the two approaches do not agree on the ambiguity of the class names

In [18]:
print("Cases where the two approaches do not agree:")
# Rows in which the LLM verdict differs from the dictionary verdict
display(
    class_ambiguity_comparison_df.loc[
        class_ambiguity_comparison_df["llm"].ne(class_ambiguity_comparison_df["dictionary"])
    ]
)

Cases where the two approaches do not agree:


Unnamed: 0,class_name,llm,dictionary
3,person,Non-Ambiguous,Ambiguous
13,non-profit organisation,Non-Ambiguous,Ambiguous
15,Biomolecule,Non-Ambiguous,Ambiguous
19,celestial body,Ambiguous,Non-Ambiguous
22,gene,Ambiguous,Non-Ambiguous
23,gridiron football player,Non-Ambiguous,Ambiguous
25,identifier,Ambiguous,Non-Ambiguous
29,motorsport racer,Non-Ambiguous,Ambiguous
32,Organisation member,Non-Ambiguous,Ambiguous
33,periodical literature,Non-Ambiguous,Ambiguous


- #### **Quality Assessment on Property Names**

Obtain the property names

In [12]:
# Fetch the property labels returned by the DBPedia query
property_names = get_property_names(sparql)

# One empty record per property label; each approach later stores its verdict in it
property_ambiguity_comparison = dict(
    (binding["propertyName"]["value"], {}) for binding in property_names
)

Check which property names are ambiguous or not using ChatGPT

In [13]:
# Iterating through the names
for property_name in property_names:

    property_label = property_name["propertyName"]["value"]

    # Requesting
    response = check_ambiguity_llm_approach(property_label, openai_key)

    # The prompt asks for a reply that starts with "Yes" or "No", so only the leading word
    # is inspected. A plain substring test would also fire on a "No" reply whose explanation
    # merely contains "Yes" (for instance when the assessed term is "Yesterday").
    reply_words = (response or "").split()
    verdict = reply_words[0].strip("\"'.,;:!*").lower() if reply_words else ""

    # Updating
    property_ambiguity_comparison[property_label]["llm"] = (
        "Ambiguous" if verdict == "yes" else "Non-Ambiguous"
    )

    print(response)

Yes, the term "name" is ambiguous because it can refer to a person's name, a brand name, a title of a work, or even a designation for a specific object or concept, depending on the context in which it is used.
No, the term "sec cik" is not ambiguous. It appears to be a specific term that does not have widely recognized multiple meanings or interpretations in different contexts or fields.
Yes, the term "location" is ambiguous because it can refer to a physical place, a position in a geographical context, or even a specific point in a digital context, such as a location in a database or a file system.
No, the term "relates an entity to the populated place in which it is located" is not ambiguous because it has a specific meaning in the context of geography or data representation, referring to the relationship between an entity (like a business or landmark) and its geographical location.
No, the term "birth place" is not ambiguous because it specifically refers to the location where a per

Check which property names are ambiguous or not using the WordNet dictionary

In [19]:
# Iterating through the names
for property_name in property_names:

    # The label is read once into a variable: indexing with double quotes inside a
    # double-quoted f-string only parses on Python 3.12 and newer.
    property_label = property_name["propertyName"]["value"]

    # Requesting
    ambiguity_dict, ambiguity_type = check_ambiguity_dictionary_approach(property_label)

    # Updating
    property_ambiguity_comparison[property_label]["dictionary"] = ambiguity_type

    # Printing according to the return values
    print(f"Property name '{property_label}':\n{ambiguity_dict} -> is {ambiguity_type}")

Property name 'name':
{'name': 15} -> is Ambiguous
Property name 'sec cik':
{'sec': 4, 'cik': 0} -> is Ambiguous
Property name 'location':
{'location': 4} -> is Ambiguous
Property name 'Relates an entity to the populated place in which it is located.':
{'relates': 5, 'an': 1, 'entity': 1, 'to': 0, 'the': 0, 'populated': 3, 'place': 32, 'in': 7, 'which': 0, 'it': 1, 'is': 13, 'located': 5} -> is Ambiguous
Property name 'birth place':
{'birth': 6, 'place': 32} -> is Ambiguous
Property name 'death place':
{'death': 8, 'place': 32} -> is Ambiguous
Property name 'death date':
{'death': 8, 'date': 13} -> is Ambiguous
Property name 'death':
{'death': 8} -> is Ambiguous
Property name 'birth date':
{'birth': 6, 'date': 13} -> is Ambiguous
Property name 'birth':
{'birth': 6} -> is Ambiguous
Property name 'Wikipage page ID':
{'wikipage': 0, 'page': 9, 'id': 3} -> is Ambiguous
Property name 'Wikipage revision ID':
{'wikipage': 0, 'revision': 3, 'id': 3} -> is Ambiguous
Property name 'Link from a W

In [20]:
# Converting the dictionary to a dataframe. The index (the property names) is named before
# `reset_index` and the verdict columns are selected by name, so 'llm' and 'dictionary'
# keep the labels they were stored under regardless of the order in which the two
# approaches were run (assigning the column names by position would swap them).
property_ambiguity_comparison_df = (
    pd.DataFrame.from_dict(property_ambiguity_comparison, orient="index")
    .rename_axis("property_name")
    .reset_index()[["property_name", "llm", "dictionary"]]
)

# Counting the property names on which both approaches give the same verdict
property_agreement = (
    property_ambiguity_comparison_df["llm"] == property_ambiguity_comparison_df["dictionary"]
).sum()

# The message names properties: this cell reports on property names, not class names
print(f"The two approaches agree in {property_agreement} property names")

The two approaches agree in 22 class names


Show cases where the two approaches agree on the ambiguity of the property names

In [21]:
print("Cases where the two approaches agree:")
# Rows in which the LLM verdict equals the dictionary verdict
display(
    property_ambiguity_comparison_df.loc[
        property_ambiguity_comparison_df["llm"].eq(property_ambiguity_comparison_df["dictionary"])
    ]
)

Cases where the two approaches agree:


Unnamed: 0,property_name,llm,dictionary
0,name,Ambiguous,Ambiguous
2,location,Ambiguous,Ambiguous
5,death place,Ambiguous,Ambiguous
15,reference,Ambiguous,Ambiguous
18,Wikipage disambiguates,Ambiguous,Ambiguous
19,selection year,Ambiguous,Ambiguous
20,selector,Ambiguous,Ambiguous
22,Selenium iu,Ambiguous,Ambiguous
23,selenium mg,Ambiguous,Ambiguous
24,selenium ug,Ambiguous,Ambiguous


Show cases where the two approaches do not agree on the ambiguity of the property names

In [22]:
print("Cases where the two approaches do not agree:")
# Rows in which the LLM verdict differs from the dictionary verdict
display(
    property_ambiguity_comparison_df.loc[
        property_ambiguity_comparison_df["llm"].ne(property_ambiguity_comparison_df["dictionary"])
    ]
)

Cases where the two approaches do not agree:


Unnamed: 0,property_name,llm,dictionary
1,sec cik,Non-Ambiguous,Ambiguous
3,Relates an entity to the populated place in wh...,Non-Ambiguous,Ambiguous
4,birth place,Non-Ambiguous,Ambiguous
6,death date,Non-Ambiguous,Ambiguous
7,death,Non-Ambiguous,Ambiguous
8,birth date,Non-Ambiguous,Ambiguous
9,birth,Non-Ambiguous,Ambiguous
10,Wikipage page ID,Non-Ambiguous,Ambiguous
11,Wikipage revision ID,Non-Ambiguous,Ambiguous
12,Link from a Wikipage to another Wikipage,Non-Ambiguous,Ambiguous


### **Task 2**

Define functions related to finding the equivalent ESCO entities in DBPedia and their associated class

In [None]:
def read_file(file_path):
    """
    Load the ESCO skills CSV and derive the label variants used for matching.

    The file must provide the columns 'preferredLabel' and 'altLabels'.

    :param file_path: The path of the CSV file
    :return: A dataframe with the columns 'ESCO_label' (the preferred label),
        'ESCO_label_Capita_first' (`str.capitalize` of the label),
        'ESCO_label_Capital_all' (`str.title` of the label) and
        'ESCO_alt_labels' (list of alternative labels, empty when none are given)
    """

    raw_skills = pd.read_csv(file_path)

    # Alternative labels are stored one per line in a single cell; missing cells become []
    alt_labels = raw_skills["altLabels"].map(
        lambda cell: [] if pd.isnull(cell) else cell.split("\n")
    )

    skills = raw_skills.rename(columns={"preferredLabel": "ESCO_label"})
    labels = skills["ESCO_label"]

    # Note: both `capitalize` and `title` also lower-case the remaining characters
    skills = skills.assign(
        ESCO_alt_labels=alt_labels,
        ESCO_label_Capita_first=labels.map(lambda text: text.capitalize()),
        ESCO_label_Capital_all=labels.map(lambda text: text.title()),
    )

    wanted_columns = [
        "ESCO_label",
        "ESCO_label_Capita_first",
        "ESCO_label_Capital_all",
        "ESCO_alt_labels",
    ]
    return skills[wanted_columns]
    

def find_equivelant_entity(labels):
    """
    Return the first label that matches an entity, together with that entity's class names.

    The labels are tried in iteration order through `match_entity`; the search stops at
    the first label for which at least one result row comes back.

    :param labels: Iterable of candidate labels for one entity
    :return: A tuple (matching label, list of class names), or (nan, nan) when no
        label produces a match
    """

    for candidate in labels:

        bindings = match_entity(candidate)

        # Nothing found under this spelling: try the next one
        if not bindings:
            continue

        # Collect the class name of every result row
        return candidate, [binding["className"]["value"] for binding in bindings]

    return np.nan, np.nan


def match_entity(label):
    """
    Look up the DBPedia resource carrying the given English label and return its classes.

    The query only considers entities under http://dbpedia.org/resource/ and classes under
    http://dbpedia.org/ontology/, and it discards a class when the entity also has a type
    that is declared a subclass of it, i.e. only the most specific classes are kept.
    The query is sent through the notebook-level `sparql` object.

    :param label: The label to search for (matched exactly, as an English literal)
    :return: The list of result bindings; each one holds the English class label
        under the key 'className'. The list is empty when nothing matches.
    """

    # Escape the characters that would end or corrupt a double-quoted SPARQL string
    # literal, so that a label containing quotes, backslashes or line breaks can neither
    # make the query malformed nor add patterns to it.
    escaped_label = (
        str(label)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )

    # Setting up the query
    sparql.setQuery(
        f"""
        SELECT DISTINCT (STR(?classLabel) AS ?className) WHERE {{
        ?entity rdfs:label "{escaped_label}"@en .
        ?entity rdf:type ?class .
        ?class rdfs:label ?classLabel .
        FILTER (LANG(?classLabel) = "en")
        FILTER (STRSTARTS(STR(?entity), "http://dbpedia.org/resource/"))
        FILTER (STRSTARTS(STR(?class), "http://dbpedia.org/ontology/"))
        FILTER NOT EXISTS {{ ?superclass rdfs:subClassOf ?class . ?entity rdf:type ?superclass . }}
    }}
    """
    )

    # Executing the query
    query_results = sparql.query().convert()

    return query_results["results"]["bindings"]

Read the CSV file containing the ESCO skill entities

In [26]:
# Loading data: the CSV path is relative to the notebook's working directory
esco_skills_df = read_file("esco_skills_en.csv")

# Preview of the first 10 rows (labels, their capitalised variants and alternative labels)
esco_skills_df.head(10)

Unnamed: 0,ESCO_label,ESCO_label_Capita_first,ESCO_label_Capital_all,ESCO_alt_labels
0,Haskell,Haskell,Haskell,[]
1,sport and exercise medicine,Sport and exercise medicine,Sport And Exercise Medicine,"[sports injury treatment, sports medicine, exe..."
2,Incremental development,Incremental development,Incremental Development,[]
3,use of special equipment for daily activities,Use of special equipment for daily activities,Use Of Special Equipment For Daily Activities,[]
4,sawing techniques,Sawing techniques,Sawing Techniques,"[sawing technologies, sawing methods, sawing t..."
5,cold vulcanisation,Cold vulcanisation,Cold Vulcanisation,"[using vulcanising solution, tyre repair metho..."
6,types of barley,Types of barley,Types Of Barley,"[type of barley, a type of barley, barley type..."
7,KDevelop,Kdevelop,Kdevelop,"[KDevelop 4.7.0, KDevelop 4.6.0, KDevelop 4.0...."
8,Absorb (learning management systems),Absorb (learning management systems),Absorb (Learning Management Systems),[]
9,cosmetics ingredients,Cosmetics ingredients,Cosmetics Ingredients,"[cosmetics additives, cosmetics elements, a co..."


Find the equivalent DBPedia entities and their classes

In [None]:
# Finding the equivalent entities and their classes.
# The three label variants are de-duplicated with `dict.fromkeys`, which keeps their order:
# original label, first letter capitalised, every word capitalised. A `set` of strings has
# no stable iteration order across kernel restarts, so the variant that is tried first (and
# therefore the reported "DBPedia_entity") could otherwise change from run to run.
esco_skills_df[["DBPedia_entity", "DBPedia_class"]] = esco_skills_df.apply(
    lambda row: pd.Series(
        find_equivelant_entity(
            list(
                dict.fromkeys(
                    [
                        row["ESCO_label"],
                        row["ESCO_label_Capita_first"],
                        row["ESCO_label_Capital_all"],
                    ]
                )
            )
        )
    ),
    axis=1,
)

In [44]:
# Preview of the first 10 rows, now including the "DBPedia_entity" and "DBPedia_class" columns
esco_skills_df.head(10)

Unnamed: 0,ESCO_label,ESCO_label_Capita_first,ESCO_label_Capital_all,ESCO_alt_labels,DBPedia_entity,DBPedia_class
0,Haskell,Haskell,Haskell,[],Haskell,[programming language]
1,sport and exercise medicine,Sport and exercise medicine,Sport And Exercise Medicine,"[sports injury treatment, sports medicine, exe...",,
2,Incremental development,Incremental development,Incremental Development,[],,
3,use of special equipment for daily activities,Use of special equipment for daily activities,Use Of Special Equipment For Daily Activities,[],,
4,sawing techniques,Sawing techniques,Sawing Techniques,"[sawing technologies, sawing methods, sawing t...",,
5,cold vulcanisation,Cold vulcanisation,Cold Vulcanisation,"[using vulcanising solution, tyre repair metho...",,
6,types of barley,Types of barley,Types Of Barley,"[type of barley, a type of barley, barley type...",,
7,KDevelop,Kdevelop,Kdevelop,"[KDevelop 4.7.0, KDevelop 4.6.0, KDevelop 4.0....",KDevelop,[software]
8,Absorb (learning management systems),Absorb (learning management systems),Absorb (Learning Management Systems),[],,
9,cosmetics ingredients,Cosmetics ingredients,Cosmetics Ingredients,"[cosmetics additives, cosmetics elements, a co...",,


Define a function asking ChatGPT to judge the accuracy of each entity-class pair 

In [31]:
def evaluate_entity_class_pair(entity, enity_class):
    """
    Ask the gpt-4o-mini chat model whether a class accurately describes an entity.

    The request is sent through the notebook-level `openai_key` client. When the reply
    starts with "yes" or "no" (in any letter case, optionally wrapped in quotes or followed
    by punctuation or an explanation) it is normalised to exactly "Yes" or "No", so
    that exact comparisons on the result do not miss replies such as "Yes." or "no".
    Any other reply is returned as received, stripped of surrounding whitespace.

    :param entity: The entity of the pair
    :param enity_class: The class of the pair
    :return: "Yes" or "No", or the stripped raw reply when it contains no such verdict
    """

    # Setting up the content of the 'system' role
    system_content = f"""
    You are an system that judge if a entity-class pair in a knowledge graph is accurate or not. Your job is to determine if the given class accurately describe the given entity.\n
    Please return "Yes" or "No"
    """

    # Setting up the content of the 'user' role
    user_content = f"""
    Entity: "{entity}"\n\
    Proposed Class: "{enity_class}"
    Question: Does the proposed class "{enity_class}" accurately describe the entity "{entity}"?
    """

    # Requesting
    completion = openai_key.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0,
        messages=[
            {"role": "system", "content": system_content},
            {"role": "user", "content": user_content},
        ],
    )

    # Normalise the verdict carried by the first word of the reply
    answer = (completion.choices[0].message.content or "").strip()
    answer_words = answer.split()
    first_word = answer_words[0].strip("\"'.,;:!*").lower() if answer_words else ""

    if first_word == "yes":
        return "Yes"
    if first_word == "no":
        return "No"

    return answer

Request for each entity-class pair

In [None]:
# Keep the entity/class columns of the rows for which a DBPedia entity was found,
# then unfold each list of classes into one row per (entity, class) pair
matched_esco_skills_df = (
    esco_skills_df.loc[
        esco_skills_df["DBPedia_entity"].notna(), ["DBPedia_entity", "DBPedia_class"]
    ]
    .explode("DBPedia_class")
    .reset_index(drop=True)
)

# Ask the LLM to judge every entity-class pair (one request per row)
matched_esco_skills_df["is_accurate"] = matched_esco_skills_df.apply(
    lambda pair: evaluate_entity_class_pair(pair["DBPedia_entity"], pair["DBPedia_class"]),
    axis=1,
)

matched_esco_skills_df.head(10)

Unnamed: 0,DBPedia_entity,DBPedia_class,is_accurate
0,Haskell,programming language,Yes
1,KDevelop,software,Yes
2,Geomatics,sport,No
3,Maltego,software,Yes
4,Cardiovascular system,anatomical structure,Yes
5,Criminology,book,No
6,Criminology,university,No
7,Photogrammetry,work,No
8,Photogrammetry,music genre,No
9,Forensic pathology,agent,No


Show how many entities matched

In [39]:
# Number of distinct matched entities (missing values, if any, count as one value)
print(matched_esco_skills_df["DBPedia_entity"].nunique(dropna=False))

590


Show how many entity-class pairs are judged to be accurate or inaccurate

In [43]:
# Single quotes are used inside the double-quoted f-strings: reusing the enclosing quote
# character inside the braces only parses on Python 3.12 and newer.
print(f"Accurate: {len(matched_esco_skills_df[matched_esco_skills_df['is_accurate'] == 'Yes'])}")

print(f"Inaccurate: {len(matched_esco_skills_df[matched_esco_skills_df['is_accurate'] == 'No'])}")

Accurate: 110
Inaccurate: 668


Show the classes with the most entity-class pairs judged accurate (i.e., the classes that tend to have the fewest mistakes)

In [34]:
# Rank the classes by how many of their entity-class pairs were judged accurate ("Yes").
# The count column is named after what it holds: accurate pairs, not mistakes.
matched_esco_skills_df[matched_esco_skills_df["is_accurate"] == "Yes"].groupby(
    "DBPedia_class"
).agg(Total_Accurate=("DBPedia_entity", "count")).reset_index().sort_values(
    by="Total_Accurate", ascending=False
)

Unnamed: 0,DBPedia_class,Total_Mistakes
13,software,45
12,programming language,16
10,medical specialty,13
9,language,12
5,company,4
14,sport,4
15,topical concept,4
16,website,2
6,disease,2
0,academic journal,1


Show classes that tend to have the most mistakes

In [33]:
# Count, per class, the pairs judged inaccurate ("No") and list the classes from the
# highest to the lowest number of mistakes
(
    matched_esco_skills_df.loc[matched_esco_skills_df["is_accurate"].eq("No")]
    .groupby("DBPedia_class")["DBPedia_entity"]
    .count()
    .reset_index(name="Total_Mistakes")
    .sort_values(by="Total_Mistakes", ascending=False)
)

Unnamed: 0,DBPedia_class,Total_Mistakes
31,person function,145
27,music genre,115
12,book,71
29,organisation,71
50,university,30
53,work,24
2,Election,23
16,company,22
44,sport,22
42,software,16
