### Training the veracity prediction model

#### Steps
- Load the reference DBpedia graph
- Load the TransE embedding model
- Load train and test knowledge graphs
- Perform feature engineering
    - Look up the TransE embedding vector of each entity and relation
    - Compute numeric features from the embeddings and the reference graph

- Split training data into train and validation sets
- Choose a model and train
    - Logistic regression
    - RandomForest
    - Ensemble

- Make predictions on the test data
- Save the predictions to `result.ttl`


In [None]:
# !pip install pykeen rdflib torch --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.9/85.9 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m730.3/730.3 kB[0m [31m50.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.2/587.2 kB[0m [31m50.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m35.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.4/46.4 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.0/51.0 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# !unzip /content/trans-e-embeddings.zip -d trans-e-embeddings

Archive:  /content/trans-e-embeddings.zip
   creating: trans-e-embeddings/training_triples/
  inflating: trans-e-embeddings/results.json  
  inflating: trans-e-embeddings/trained_model.pkl  
  inflating: trans-e-embeddings/metadata.json  
  inflating: trans-e-embeddings/training_triples/relation_to_id.tsv.gz  
  inflating: trans-e-embeddings/training_triples/numeric_triples.tsv.gz  
  inflating: trans-e-embeddings/training_triples/base.pth  
  inflating: trans-e-embeddings/training_triples/entity_to_id.tsv.gz  


In [5]:
import torch
import numpy as np
from rdflib import Graph, URIRef, RDF
from pykeen.triples import TriplesFactory
from pykeen.pipeline import pipeline

INFO:pykeen.utils:Using opt_einsum


In [None]:
# Location of the reference DBpedia knowledge graph (the ".nt" suffix suggests N-Triples).
REFERENCE_FILE = "data/dbpedia-reference-kg.nt"
# Parse the file into an in-memory rdflib graph. No format is passed, so the
# parser choice is left to rdflib (presumably inferred from the file suffix).
reference_knowledge_graph = Graph()
reference_knowledge_graph.parse(REFERENCE_FILE)

<Graph identifier=N8b6c177b25014850928961b3dd80d156 (<class 'rdflib.graph.Graph'>)>

In [None]:
# Load the triples factory stored next to the trained model; its entity_to_id /
# relation_to_id mappings are used later to index into the embeddings.
reference_data_pykeen = TriplesFactory.from_path_binary("trans-e-embeddings/training_triples")


INFO:pykeen.triples.triples_factory:Loading from file:///content/fokg_mini_project/trans-e-embeddings/training_triples


In [None]:
# Display the factory's repr as a quick sanity check of what was loaded.
reference_data_pykeen

TriplesFactory(num_entities=897222, num_relations=343, create_inverse_triples=True, num_triples=1139536)

In [None]:
# Load the trained embedding model onto the CPU.
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary Python
# objects, so this must only ever be pointed at a trusted model file.
model = torch.load("trans-e-embeddings/trained_model.pkl", map_location=torch.device('cpu'), weights_only=False)


In [26]:
# Keep only statements whose subject and object are both URIs, as plain
# (head, relation, tail) string tuples.
reference_triples = [
    (str(s), str(p), str(o))
    for s, p, o in reference_knowledge_graph.triples((None, None, None))
    if isinstance(s, URIRef) and isinstance(o, URIRef)
]

# Same triples as a set, for constant-time membership tests.
triple_lookup = set(reference_triples)

# Label -> integer id mappings taken from the pykeen triples factory.
relation_to_id = reference_data_pykeen.relation_to_id
entity_to_id = reference_data_pykeen.entity_to_id

def get_embeddings(subj, pred, obj, emb_dim=120):
    """Look up the embedding vectors of a (subject, predicate, object) triple.

    Parameters
    ----------
    subj, pred, obj : str
        Labels looked up in ``entity_to_id`` (subject, object) and
        ``relation_to_id`` (predicate).
    emb_dim : int, default 120
        Length of the zero vectors returned when a label is unknown.
        NOTE(review): presumably this should equal the embedding size of the
        loaded model -- confirm before using a different model.

    Returns
    -------
    tuple of numpy.ndarray
        ``(subject, predicate, object)`` vectors. If any of the three labels is
        missing from the id mappings, three all-zero vectors of length
        ``emb_dim`` are returned instead.
    """
    is_known = (
        subj in entity_to_id
        and obj in entity_to_id
        and pred in relation_to_id
    )
    if not is_known:
        return np.zeros(emb_dim), np.zeros(emb_dim), np.zeros(emb_dim)

    entity_repr = model.entity_representations[0]
    relation_repr = model.relation_representations[0]

    # Pure lookup: no gradients are needed.
    with torch.no_grad():
        s_emb = entity_repr(indices=torch.tensor([entity_to_id[subj]]))
        p_emb = relation_repr(indices=torch.tensor([relation_to_id[pred]]))
        o_emb = entity_repr(indices=torch.tensor([entity_to_id[obj]]))

    # Each lookup was done with a single index; take that one row.
    return (s_emb[0].cpu().numpy(),
            p_emb[0].cpu().numpy(),
            o_emb[0].cpu().numpy())


# Cache of the degree tables; rebuilt whenever `reference_triples` is replaced or resized.
_degree_cache = {"key": None, "out": None, "in": None}


def _degree_tables():
    """Return (out_degree, in_degree) Counters over ``reference_triples``, built once per list."""
    from collections import Counter

    key = (id(reference_triples), len(reference_triples))
    if _degree_cache["key"] != key:
        _degree_cache["out"] = Counter(head for head, _, _ in reference_triples)
        _degree_cache["in"] = Counter(tail for _, _, tail in reference_triples)
        _degree_cache["key"] = key
    return _degree_cache["out"], _degree_cache["in"]


def build_feature_vector(h, r, t):
    """Build the numeric feature vector describing the triple ``(h, r, t)``.

    Parameters
    ----------
    h, r, t : str
        Head entity, relation and tail entity labels, passed on to
        ``get_embeddings`` and checked against the reference triples.

    Returns
    -------
    numpy.ndarray
        One-dimensional ``float32`` array of 29 features, in this order:
        3 norms of ``h + r - t``; 2 head-tail distances; 3 cosine
        similarities; 3 log-norms and 3 norms; 3 mean absolute values;
        3 max absolute values; 3 dot products; 2 higher-order interactions;
        2 reference-graph membership flags; 2 log-degree features.
    """
    h_emb, r_emb, t_emb = get_embeddings(h, r, t)
    eps = 1e-8  # keeps the divisions and logs finite for all-zero vectors

    # Vector norms
    h_n = np.linalg.norm(h_emb) + eps
    r_n = np.linalg.norm(r_emb) + eps
    t_n = np.linalg.norm(t_emb) + eps

    # Normalized vectors
    h_u = h_emb / h_n
    r_u = r_emb / r_n
    t_u = t_emb / t_n

    features = []

    # TransE translation errors
    translation = h_emb + r_emb - t_emb
    features.append(np.linalg.norm(translation, ord=1))
    features.append(np.linalg.norm(translation, ord=2))
    features.append(np.linalg.norm(translation, ord=np.inf))

    # Head-tail distances
    features.append(np.linalg.norm(h_emb - t_emb, ord=1))
    features.append(np.linalg.norm(h_emb - t_emb, ord=2))

    # Cosine similarities
    features.append(np.dot(h_u, t_u))
    features.append(np.dot(h_u, r_u))
    features.append(np.dot(r_u, t_u))

    # Magnitude statistics
    features.extend([
        np.log(h_n), np.log(r_n), np.log(t_n),
        h_n, r_n, t_n
    ])

    # Mean absolute values
    features.extend([
        np.mean(np.abs(h_emb)),
        np.mean(np.abs(r_emb)),
        np.mean(np.abs(t_emb)),
    ])

    # Maximum absolute values
    features.extend([
        np.max(np.abs(h_emb)),
        np.max(np.abs(r_emb)),
        np.max(np.abs(t_emb)),
    ])

    # Dot products
    features.append(np.dot(h_emb, r_emb))
    features.append(np.dot(r_emb, t_emb))
    features.append(np.dot(h_emb, t_emb))

    # Higher-order interactions
    features.append(np.sum(h_emb * r_emb * t_emb))
    features.append(np.sum((h_emb + r_emb) * t_emb))

    # Structural indicators: is the triple (or its reverse) in the reference graph?
    features.append(1.0 if (h, r, t) in triple_lookup else 0.0)
    features.append(1.0 if (t, r, h) in triple_lookup else 0.0)

    # Degree features: how often h occurs as a head and t as a tail in the
    # reference triples. Looked up in cached tables instead of rescanning the
    # whole triple list on every call; the counts are the same.
    out_degree, in_degree = _degree_tables()
    h_deg = out_degree[h]
    t_deg = in_degree[t]

    features.append(np.log(h_deg + 1))
    features.append(np.log(t_deg + 1))

    return np.asarray(features, dtype=np.float32)




In [None]:
from urllib.parse import unquote

from sklearn.model_selection import train_test_split

# Predicate that carries the truth value of a reified statement.
TRUTH_VALUE = URIRef("http://swc2017.aksw.org/hasTruthValue")


def clean_uri(u: str) -> str:
    """Strip surrounding angle brackets from a URI string and percent-decode it."""
    return unquote(u.strip("<>"))


training_graph = Graph()
# The file name ends in ".txt", so name the N-Triples format explicitly
# instead of depending on rdflib's format guessing / fallback.
training_graph.parse("data/KG-2022-train.nt.txt", format="nt")

X_train = []
y_train = []

# Each training fact is a reified rdf:Statement with subject / predicate /
# object and a truth value attached.
for statement in training_graph.subjects(RDF.type, RDF.Statement):

    subject = training_graph.value(statement, RDF.subject)
    predicate = training_graph.value(statement, RDF.predicate)
    obj = training_graph.value(statement, RDF.object)
    val = training_graph.value(statement, TRUTH_VALUE)

    # Graph.value returns None for a missing property; skip incomplete statements.
    if subject is None or predicate is None or obj is None or val is None:
        continue

    s = clean_uri(subject.n3())
    p = clean_uri(predicate.n3())
    o = clean_uri(obj.n3())

    # Only keep facts whose terms all have a trained embedding.
    if (
        s not in reference_data_pykeen.entity_to_id
        or o not in reference_data_pykeen.entity_to_id
        or p not in reference_data_pykeen.relation_to_id
    ):
        continue

    X_train.append(build_feature_vector(s, p, o))
    # Store the label as a plain float rather than an rdflib Literal object.
    y_train.append(float(val))


X_train = np.array(X_train)
y_train = np.array(y_train)

# Hold out 20% of the labelled facts. The held-out part is named *_test_np
# but is used as the validation set when the models are evaluated.
X_train_np, X_test_np, y_train_np, y_test_np = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [28]:
# Show the dimensions of the training split produced by train_test_split.
X_train_np.shape

(884, 29)

In [None]:
# Preview the first few reference triples instead of dumping the whole list into the output.
reference_triples[:5]

[tensor([[ 0.0956, -0.0728,  0.0666, -0.0813,  0.0144, -0.0103,  0.2118, -0.1911,
          -0.0181,  0.0414, -0.0644,  0.0719, -0.1192, -0.1504,  0.0580,  0.0277,
           0.2171, -0.0295, -0.1733,  0.1091, -0.0041,  0.1586, -0.0670, -0.1008,
           0.3378, -0.1190, -0.1890,  0.0868, -0.0212,  0.1334,  0.1003,  0.1182,
          -0.1779,  0.0255, -0.0671, -0.0898, -0.0803, -0.0473, -0.0185,  0.2489,
          -0.2844, -0.0921, -0.1388,  0.2095,  0.1688,  0.0720,  0.2741,  0.2517,
           0.2354,  0.0305, -0.1960,  0.1013, -0.1812,  0.0305, -0.1271,  0.2643,
          -0.0777, -0.1153,  0.1283,  0.0300,  0.1256, -0.2023,  0.1582,  0.1061,
           0.1145, -0.0206, -0.1550,  0.0302,  0.0060, -0.1934,  0.1558,  0.1532,
           0.3047,  0.0287, -0.1112, -0.0163, -0.1886,  0.1064, -0.0969, -0.0115,
          -0.1778,  0.1793,  0.1289,  0.2544,  0.1638, -0.0116, -0.1558,  0.0323,
           0.1755,  0.2164,  0.1232, -0.0890,  0.0359, -0.1282,  0.1176, -0.0393,
           0.163

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score


def _score_on_holdout(clf):
    """Evaluate a fitted classifier on the held-out split.

    Returns (probabilities of the second class, ROC AUC, predicted labels, accuracy).
    """
    probs = clf.predict_proba(X_test_np)[:, 1]
    auc = roc_auc_score(y_test_np, probs)
    pred = clf.predict(X_test_np)
    acc = accuracy_score(y_test_np, pred)
    return probs, auc, pred, acc


# Base learners
rf = RandomForestClassifier(n_estimators=200, max_depth=None, random_state=42, n_jobs=-1)
lr = LogisticRegression(solver="lbfgs", max_iter=1000, random_state=42)

for base_model in (rf, lr):
    base_model.fit(X_train_np, y_train_np)

# Random forest on its own
rf_probs, rf_auc, rf_pred, rf_acc = _score_on_holdout(rf)
print(f"Random Forest ROC AUC: {rf_auc:.4f}")
print(f"Random Forest Accuracy: {rf_acc:.4f}")

# Logistic regression on its own
lr_probs, lr_auc, lr_pred, lr_acc = _score_on_holdout(lr)
print(f"Logistic Regression ROC AUC: {lr_auc:.4f}")
print(f"Logistic Regression Accuracy: {lr_acc:.4f}")

# Soft-voting ensemble of the two learners, fitted on the same training split
ensemble = VotingClassifier(estimators=[("rf", rf), ("lr", lr)], voting="soft")
ensemble.fit(X_train_np, y_train_np)

ens_probs, ens_auc, ens_pred, ens_acc = _score_on_holdout(ensemble)
print(f"Ensemble ROC AUC: {ens_auc:.4f}")
print(f"Ensemble Accuracy: {ens_acc:.4f}")


Random Forest ROC AUC: 0.8251
Random Forest Accuracy: 0.7387
Logistic Regression ROC AUC: 0.8772
Logistic Regression Accuracy: 0.7613
Ensemble ROC AUC: 0.8630
Ensemble Accuracy: 0.7793


In [None]:
testing_graph = Graph()
# The file name ends in ".txt", so name the N-Triples format explicitly
# instead of depending on rdflib's format guessing / fallback.
testing_graph.parse("data/KG-2022-test.nt.txt", format="nt")

X_test = []            # one feature vector per usable test statement
test_triple_iris = []  # statement IRIs, index-aligned with X_test

# Test facts are reified rdf:Statements without a truth value.
for statement in testing_graph.subjects(RDF.type, RDF.Statement):

    subject = testing_graph.value(statement, RDF.subject)
    predicate = testing_graph.value(statement, RDF.predicate)
    obj = testing_graph.value(statement, RDF.object)

    # Graph.value returns None for a missing property; skip incomplete statements.
    if subject is None or predicate is None or obj is None:
        continue

    s = clean_uri(subject.n3())
    p = clean_uri(predicate.n3())
    o = clean_uri(obj.n3())

    # NOTE(review): statements with a term that has no embedding are skipped,
    # so they get no line in the result file -- confirm this is acceptable.
    if (
        s not in reference_data_pykeen.entity_to_id
        or o not in reference_data_pykeen.entity_to_id
        or p not in reference_data_pykeen.relation_to_id
    ):
        continue

    X_test.append(build_feature_vector(s, p, o))
    test_triple_iris.append(statement)




In [31]:
# Predict with the fitted ensemble for every feature vector collected in X_test.
test_pred = ensemble.predict(X_test)


In [32]:
# save predictions to file

# Convert the numpy scalars to plain Python values before formatting them.
test_pred_as_list = [x.item() for x in list(test_pred)]

# Predictions and statement IRIs are paired by position, so a length mismatch
# (e.g. from cells run out of order) would write wrong pairs.
if len(test_pred_as_list) != len(test_triple_iris):
    raise ValueError(
        f"{len(test_pred_as_list)} predictions but {len(test_triple_iris)} test statements"
    )

# The context manager closes the file even if writing fails; the explicit
# encoding keeps non-ASCII IRIs independent of the platform default.
with open('result.ttl', 'w', encoding='utf-8') as result_file:
    for statement_iri, truth_value in zip(test_triple_iris, test_pred_as_list):
        print(f"<{statement_iri}> <http://swc2017.aksw.org/hasTruthValue> \"{truth_value}\"^^<http://www.w3.org/2001/XMLSchema#double> .", file=result_file)

In [None]:
# from google.colab import files
# files.download('result.ttl')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>