### Filtering DBpedia to get a reference dataset

#### Steps 
- *Download the raw DBpedia object mapping dataset from [here](https://databus.dbpedia.org/dbpedia/mappings/mappingbased-objects/2022.12.01/mappingbased-objects_lang=en.ttl.bz2)* 
- Extract unique subjects and objects from the provided train and test datasets
- Keep only the triples from the reference set whose subject or object is an entity present in the train or test sets.
- Save.

#### Other
- A reference dataset maps the given train & test datasets to a vector space
- Triples are filtered based on the provided entities (subjects and objects)
- Execution time: 30-60 sec
- The reference dataset was downloaded using the following script: `data/extract_dbpedia_dataset.sh`

In [72]:
from rdflib import Graph, RDF
from pprint import pprint
import os

In [73]:
# Paths of the provided train and test splits; both are parsed as N-Triples
# by get_unique_entities below.
TRAIN_FILE = "data/KG-2022-train.nt.txt"
TEST_FILE = "data/KG-2022-test.nt.txt"

def get_unique_entities(filename):
    """Collect the distinct terms used by the reified statements in a file.

    The file is parsed as N-Triples. Every node typed ``rdf:Statement`` is
    visited and its ``rdf:subject``, ``rdf:predicate`` and ``rdf:object``
    values are recorded as plain strings.

    Args:
        filename: Path of the N-Triples file to read.

    Returns:
        A 5-tuple of sets, in this order: subjects, predicates, objects,
        (subject, predicate) pairs and (predicate, object) pairs.
    """
    graph = Graph()
    graph.parse(filename, format='nt')

    subjects, predicates, objects = set(), set(), set()
    subject_predicate_pairs, predicate_object_pairs = set(), set()

    for statement in graph.subjects(RDF.type, RDF.Statement):
        # Stringify each term once and reuse it for the single-term sets
        # and for the pair sets.
        pred = str(graph.value(statement, RDF.predicate))
        subj = str(graph.value(statement, RDF.subject))
        obj = str(graph.value(statement, RDF.object))

        subjects.add(subj)
        predicates.add(pred)
        objects.add(obj)
        subject_predicate_pairs.add((subj, pred))
        predicate_object_pairs.add((pred, obj))

    return (
        subjects,
        predicates,
        objects,
        subject_predicate_pairs,
        predicate_object_pairs,
    )

# Distinct terms and term pairs of each split.
train_s, train_p, train_o, train_sp, train_po = get_unique_entities(TRAIN_FILE)
test_s, test_p, test_o, test_sp, test_po = get_unique_entities(TEST_FILE)

# Merge both splits: a term counts as known if it occurs in either of them.
u_s = train_s | test_s
u_p = train_p | test_p
u_o = train_o | test_o
u_sp = train_sp | test_sp
u_po = train_po | test_po

# Previously recorded: Subjects: 1174, Predicates: 711, Objects: 1284
# NOTE(review): the recorded output of the summary cell below reports
# different predicate and object counts - confirm which figures are current.


In [74]:
# Report how many distinct terms and term pairs were collected over both splits.
print(f"""
Subjects: {len(u_s)}, 
Predicates: {len(u_p)}, 
Objects: {len(u_o)}, 
Subject Predicate: {len(u_sp)}, 
Predicate Object: {len(u_po)}""")



Subjects: 1174, 
Predicates: 9, 
Objcts: 1020, 
Subject Predicate: 1440, 
Predicate Object: 1319


In [None]:
from urllib.parse import unquote

def normalize(uri: str) -> str:
    """Return *uri* without surrounding angle brackets and percent-decoded.

    Any leading or trailing ``<`` / ``>`` characters are removed first, then
    percent-escapes (for example ``%27``) are decoded.
    """
    bare_uri = uri.strip("<>")
    return unquote(bare_uri)


def filter_reference_kg(reference_file, output_file, u_s, u_p, u_o, u_sp, u_po):
    """Write the reference triples that mention a known entity to a new file.

    ``reference_file`` is streamed line by line; each line is expected to
    hold one ``<subject> <predicate> <object> .`` statement. A triple is kept
    when its subject is in ``u_s`` or its object is in ``u_o``. Kept triples
    are written to ``output_file`` one per line, and a one-line summary is
    printed when the pass is finished.

    Args:
        reference_file: Path of the N-Triples file to filter.
        output_file: Path of the file that receives the kept triples; it is
            overwritten if it already exists.
        u_s: Set of subject IRIs, as plain strings without angle brackets.
        u_p: Not used by the filter; accepted so existing callers keep working.
        u_o: Set of object IRIs, as plain strings without angle brackets.
        u_sp: Not used by the filter; accepted so existing callers keep working.
        u_po: Not used by the filter; accepted so existing callers keep working.
    """
    count_kept = 0
    total = 0

    with open(reference_file, "r", encoding="utf-8") as fin, \
         open(output_file, "w", encoding="utf-8") as fout:

        for line in fin:
            line = line.strip()
            if not line:
                continue

            parts = line.split(" ", 3)
            if len(parts) != 4:
                continue  # malformed line, not counted in the total

            s_term, p_term, o_term, _ = parts
            total += 1

            # Terms in the file are wrapped in angle brackets while the
            # entity sets hold bare IRIs, so the brackets are removed before
            # the membership test. Percent-escapes are left untouched: the
            # sets must use the same encoding as the reference file.
            if s_term.strip("<>") in u_s or o_term.strip("<>") in u_o:
                # Write the terms exactly as read so they keep a single pair
                # of angle brackets.
                fout.write(f"{s_term} {p_term} {o_term} .\n")
                count_kept += 1

    # An empty (or entirely malformed) input must not divide by zero.
    kept_pct = 100 * count_kept / total if total else 0.0
    print(
        f"Filtering complete. Kept {count_kept:,} triples "
        f"({kept_pct:.2f}%)."
    )


In [None]:
# Input: the uncompressed DBpedia dump. Output: the filtered reference graph.
RAW_FILE = "data/mappingbased-objects_lang=en.nt"
REFERENCE_FILE = "data/dbpedia-reference-kg.nt"

# Only run the filter when no filtered file is present yet.
if not os.path.exists(REFERENCE_FILE):
    filter_reference_kg(RAW_FILE, REFERENCE_FILE, u_s, u_p, u_o, u_sp, u_po)
else:
    print("Filtered dataset exists !")

# Results recorded from earlier runs:
# (data/dbpedia-reference-kg-1.nt) Filtering complete. Kept 1,139,536 triples (5.00%).
# Filtering complete. Kept 1,139,532 triples (5.00%).

Filtering complete. Kept 1,139,532 triples (5.00%).
