General approach:
1. For each PK cell in the table, get the top 3 entities from Wikidata.
2. For each other cell in the table, get the relations and objects for each of the top 3 entities.
3. For each column, get the most common relation.
4. For each column, get the object that has the most common relation with the PK cell.
5. For the PK column, get the most common type.

The code is a bit messy with a lot of loop, however, it does work. The results are fine, but there sure are improvements to be made.

The is likely some errors in the TP, FP, FN calculations, but the correct/incorrect counts are correct.

In [13]:
import pandas as pd
import os
from lookup import WikidataAPI
from endpoints import WikidataEndpoint

In [2]:
# Query to return matches for object and relation given value from a QID
def get_relation_object_query(QID, value):
    if isinstance(value, str):
        filter_string = f'FILTER(?object = "{value}").'
    else:
        filter_string = f'FILTER(?object = {value}).'
    
    query = f"""
    SELECT ?relation ?object
    WHERE {{
      {{
        wd:{QID} ?relation ?object.
        ?object rdfs:label "{value}"@en.
      }}
      UNION
      {{
        wd:{QID} ?relation ?object.
        {filter_string}
      }}
    }}
    """
    return query


class CSV2KG:
    def __init__(self, df):
        self.wikidata = WikidataAPI()
        self.ep = WikidataEndpoint()
        self.df = df
        self.df_candidates = None
        self.df_relations = None
        self.df_entities = None
        self.df_classes = None
        
    def get_candidates(self):
        dataframe = pd.DataFrame(columns=self.df.columns, dtype=object)#self.df.copy()
        
        for idx, row in self.df.iterrows():
            search_string = row.iloc[0]
            
            # Entities
            entities = self.wikidata.getKGEntities(query=search_string, 
                                      limit=3, 
                                      type="item")
            
            dataframe.at[idx, self.df.columns[0]] = entities
            
                
            for search_idx in row.index[1:]:
                search_obj_dict = {}
                search_obj = row[search_idx]
                
                if search_obj == search_obj:  # Check if it is not NaN
                
                    for entity in entities:
                        QID = entity.getId().split("/")[-1]
                        
                        query = get_relation_object_query(QID, search_obj)
                        results = self.ep.getQueryResults(query)
                        
                        rel_obj_list = []
                        if len(results["results"]["bindings"]) > 0:
                            for res in results["results"]["bindings"]:
                                rel = res["relation"]["value"]
                                obj = res["object"]["value"]
                                rel_obj_list.append((rel, obj))
                                
                        search_obj_dict[entity.getId()] = rel_obj_list

                dataframe.at[idx, search_idx] = search_obj_dict
        
        self.df_candidates = dataframe
    
    
    def get_relations(self):
        counts = {}
        for col in self.df_candidates.columns[1:]:
            relation_counts = {}
            for idx, row in self.df_candidates.iterrows():
                for entity, search_obj_dict in row[col].items():
                    for rel, obj in search_obj_dict:
                        relation_counts[rel] = relation_counts.get(rel, 0) + 1
            counts[col] = relation_counts
            
        self.df_relations = pd.DataFrame(columns=self.df.columns)
        for col, rel_counts in counts.items():
            relation = max(rel_counts, key=rel_counts.get)
            self.df_relations.at[0, col] = relation
            
    def get_entities(self):
        self.df_entities = pd.DataFrame(columns=self.df.columns, index=self.df.index)
        for idx, row in self.df_candidates.iterrows():
            for col in self.df_candidates.columns[1:]:
                for entity, search_obj_dict in row[col].items():
                    if self.df_relations.at[0, col] in [rel for rel, obj in search_obj_dict]:
                        if self.df_entities.at[idx, self.df_entities.columns[0]] != self.df_entities.at[idx, self.df_entities.columns[0]]:
                            self.df_entities.at[idx, self.df_entities.columns[0]] = entity
        
        for idx, row in self.df_entities.iterrows():
            for col in self.df_entities.columns[1:]:
                subject = self.df_entities.at[idx, self.df_entities.columns[0]]
                
                if subject == subject:
                    for prop, obj in self.df_candidates.at[idx, col].get(subject, []):
                        if prop == self.df_relations.at[0, col]:
                            self.df_entities.at[idx, col] = obj
                            
    def get_types(self):
        self.df_classes = pd.DataFrame(columns=self.df.columns)
        
        types_dct = {}
        for idx, row in self.df_candidates.iterrows():
            entites = row.iloc[0]
            for entity in entites:
                types = self.ep.getTypesForEntity(entity.getId())
                for t in types:
                    types_dct[t] = types_dct.get(t, 0) + 1
            
        type_entity = max(types_dct, key=types_dct.get)
        self.df_classes.at[0, self.df_classes.columns[0]] = type_entity
        
        for col in self.df_entities.columns[1:]:
            types_dct = {}
            for idx, row in self.df_entities.iterrows():
                entity = row[col]
                if entity == entity:
                    types = self.ep.getTypesForEntity(entity.split("/")[-1])
                    for t in types:
                        types_dct[t] = types_dct.get(t, 0) + 1
            
            if types_dct:
                type_entity = max(types_dct, key=types_dct.get)
                self.df_classes.at[0, col] = type_entity    
        
    
    def run_alignment(self):
        self.get_candidates()
        self.get_relations()
        self.get_entities()
        self.get_types()


# Run for 5 tables

In [3]:
# Download the data from https://github.com/sem-tab-challenge/2024/blob/main/data/WikidataTables2024R1.tar.gz
path_to_tables = './data/WikidataTables2024R1/DataSets/Valid/tables/'
file_names = os.listdir(path_to_tables)

In [4]:
results = []
for file_name in file_names[:10]:
    df = pd.read_csv(path_to_tables + file_name)
    mappings = CSV2KG(df)
    mappings.run_alignment()
    results.append([file_name, mappings])
    print(file_name)

Y3OHOKFF.csv
Y4OS3SBS.csv
RQT6VSWL.csv
NGT7C6EO.csv
BAJ5LMX3.csv
VQHPBH3L.csv
BDWQF2CN.csv
KPFFDB6X.csv
4OH908JW.csv
OJHKRXI7.csv


# CEA (Cell-Entity Annotation)

In [5]:
cea = pd.read_csv("./data/WikidataTables2024R1/DataSets/Valid/gt/cea_gt.csv", header=None)

In [6]:
for res_no in range(len(results)):
    correct = 0
    incorrect = 0
    precision_reduction = 0
    subset_df = cea[cea[0] == results[res_no][0].removesuffix('.csv')]
    for idx, row in subset_df.iterrows():
        mapping = results[res_no][1].df_entities.iloc[row[1] - 1, row[2]]
        if mapping == row[3]:
            correct += 1
        else:
            if mapping == mapping:
                precision_reduction += 1
            incorrect += 1
    print(f"{results[res_no][0]}\n  Correct: {correct}, Incorrect: {incorrect}")
    precision = correct / (correct + precision_reduction)
    recall = correct / len(subset_df)
    f1 = 2 * (precision * recall) / (precision + recall)
    print(f"  Precision: {precision},  Recall: {recall},  F1: {f1}")

Y3OHOKFF.csv
  Correct: 2, Incorrect: 4
  Precision: 1.0,  Recall: 0.3333333333333333,  F1: 0.5
Y4OS3SBS.csv
  Correct: 3, Incorrect: 4
  Precision: 1.0,  Recall: 0.42857142857142855,  F1: 0.6
RQT6VSWL.csv
  Correct: 7, Incorrect: 0
  Precision: 1.0,  Recall: 1.0,  F1: 1.0
NGT7C6EO.csv
  Correct: 6, Incorrect: 3
  Precision: 1.0,  Recall: 0.6666666666666666,  F1: 0.8
BAJ5LMX3.csv
  Correct: 10, Incorrect: 0
  Precision: 1.0,  Recall: 1.0,  F1: 1.0
VQHPBH3L.csv
  Correct: 8, Incorrect: 6
  Precision: 0.5714285714285714,  Recall: 0.5714285714285714,  F1: 0.5714285714285714
BDWQF2CN.csv
  Correct: 2, Incorrect: 1
  Precision: 1.0,  Recall: 0.6666666666666666,  F1: 0.8
KPFFDB6X.csv
  Correct: 7, Incorrect: 0
  Precision: 1.0,  Recall: 1.0,  F1: 1.0
4OH908JW.csv
  Correct: 2, Incorrect: 0
  Precision: 1.0,  Recall: 1.0,  F1: 1.0
OJHKRXI7.csv
  Correct: 1, Incorrect: 2
  Precision: 1.0,  Recall: 0.3333333333333333,  F1: 0.5


# CTA (Column-Type Annotation)

In [7]:
cta = pd.read_csv("./data/WikidataTables2024R1/DataSets/Valid/gt/cta_gt.csv", header=None)

In [8]:
cta[cta[0] == results[0][0].removesuffix('.csv')]

Unnamed: 0,0,1,2
76,Y3OHOKFF,0,http://www.wikidata.org/entity/Q25110269
77,Y3OHOKFF,1,http://www.wikidata.org/entity/Q20970434


In [9]:
results[0][1].df_classes

Unnamed: 0,col0,col1,col2
0,http://www.wikidata.org/entity/Q7725634,,


In [10]:
for res_no in range(len(results)):
    correct = 0
    incorrect = 0
    precision_reduction = 0
    subset_df = cta[cta[0] == results[res_no][0].removesuffix('.csv')]
    for idx, row in subset_df.iterrows():
        mapping = results[res_no][1].df_classes.iloc[0, row[1]]
        if mapping == row[2]:
            correct += 1
        else:
            if mapping == mapping:
                precision_reduction += 1
            incorrect += 1
    print(f"{results[res_no][0]}\n  Correct: {correct}, Incorrect: {incorrect}")
    precision = correct / (correct + precision_reduction)
    recall = correct / len(subset_df)
    f1 = 2 * (precision * recall) / (precision + recall+ 1e-12)
    print(f"  Precision: {precision},  Recall: {recall},  F1: {f1}")

Y3OHOKFF.csv
  Correct: 0, Incorrect: 2
  Precision: 0.0,  Recall: 0.0,  F1: 0.0
Y4OS3SBS.csv
  Correct: 0, Incorrect: 1
  Precision: 0.0,  Recall: 0.0,  F1: 0.0
RQT6VSWL.csv
  Correct: 1, Incorrect: 0
  Precision: 1.0,  Recall: 1.0,  F1: 0.9999999999995
NGT7C6EO.csv
  Correct: 1, Incorrect: 0
  Precision: 1.0,  Recall: 1.0,  F1: 0.9999999999995
BAJ5LMX3.csv
  Correct: 1, Incorrect: 0
  Precision: 1.0,  Recall: 1.0,  F1: 0.9999999999995
VQHPBH3L.csv
  Correct: 0, Incorrect: 2
  Precision: 0.0,  Recall: 0.0,  F1: 0.0
BDWQF2CN.csv
  Correct: 1, Incorrect: 1
  Precision: 1.0,  Recall: 0.5,  F1: 0.6666666666662222
KPFFDB6X.csv
  Correct: 0, Incorrect: 3
  Precision: 0.0,  Recall: 0.0,  F1: 0.0
4OH908JW.csv
  Correct: 1, Incorrect: 0
  Precision: 1.0,  Recall: 1.0,  F1: 0.9999999999995
OJHKRXI7.csv
  Correct: 1, Incorrect: 0
  Precision: 1.0,  Recall: 1.0,  F1: 0.9999999999995


# CPA (Columns-Property Annotation)

In [11]:
cpa = pd.read_csv("./data/WikidataTables2024R1/DataSets/Valid/gt/cpa_gt.csv", header=None)

In [12]:
for res_no in range(len(results)):
    correct = 0
    incorrect = 0
    precision_reduction = 0
    subset_df = cpa[cpa[0] == results[res_no][0].removesuffix('.csv')]
    for idx, row in subset_df.iterrows():
        mapping = results[res_no][1].df_relations.iloc[row[1], row[2]]
        if mapping == row[3]:
            correct += 1
        else:
            if mapping == mapping:
                precision_reduction += 1
            incorrect += 1
    print(f"{results[res_no][0]}\n  Correct: {correct}, Incorrect: {incorrect}")
    precision = correct / (correct + precision_reduction)
    recall = correct / len(subset_df)
    f1 = 2 * (precision * recall) / (precision + recall+ 1e-12)
    print(f"  Precision: {precision},  Recall: {recall},  F1: {f1}")

Y3OHOKFF.csv
  Correct: 1, Incorrect: 1
  Precision: 0.5,  Recall: 0.5,  F1: 0.49999999999949996
Y4OS3SBS.csv
  Correct: 1, Incorrect: 0
  Precision: 1.0,  Recall: 1.0,  F1: 0.9999999999995
RQT6VSWL.csv
  Correct: 1, Incorrect: 0
  Precision: 1.0,  Recall: 1.0,  F1: 0.9999999999995
NGT7C6EO.csv
  Correct: 1, Incorrect: 0
  Precision: 1.0,  Recall: 1.0,  F1: 0.9999999999995
BAJ5LMX3.csv
  Correct: 1, Incorrect: 0
  Precision: 1.0,  Recall: 1.0,  F1: 0.9999999999995
VQHPBH3L.csv
  Correct: 1, Incorrect: 0
  Precision: 1.0,  Recall: 1.0,  F1: 0.9999999999995
BDWQF2CN.csv
  Correct: 1, Incorrect: 0
  Precision: 1.0,  Recall: 1.0,  F1: 0.9999999999995
KPFFDB6X.csv
  Correct: 2, Incorrect: 0
  Precision: 1.0,  Recall: 1.0,  F1: 0.9999999999995
4OH908JW.csv
  Correct: 2, Incorrect: 0
  Precision: 1.0,  Recall: 1.0,  F1: 0.9999999999995
OJHKRXI7.csv
  Correct: 0, Incorrect: 0


ZeroDivisionError: division by zero