## Feature Engineering.

Feature Engineering. Handle datasets in order to make sure test sets do have new relations/entities

## ## 1. Environment and GPU sanity check

In [6]:
# GPU check for future pipeline
import torch

print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("CUDA Version:", torch.version.cuda)

if not torch.cuda.is_available():
  print("------------No GPU. Set Runtime → Change runtime type → GPU------------")

try:
    import torch_geometric
    print("Torch Geometric:", torch_geometric.__version__)
except ModuleNotFoundError:
    print("Torch Geometric not found. Installing")
    torch_version = torch.__version__.split("+")[0]
    cuda_version = torch.version.cuda.replace(".", "")

    !pip install -q pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv \
        -f https://data.pyg.org/whl/torch-{torch_version}+cu{cuda_version}.html

    !pip install -q torch-geometric

Torch: 2.9.0+cu126
CUDA available: True
CUDA Version: 12.6
Torch Geometric: 2.7.0


## 2. Dataset download and normalization

In [7]:
#!rm -r data

In [8]:
# =========================
# Dataset download & normalization
# =========================

from pathlib import Path
import requests
import pandas as pd
from torch_geometric.datasets import WordNet18RR, FB15k_237

# -------------------------
# Paths
# -------------------------
RAW_DIR  = Path("./raw_data")  # Raw / potentially dirty datasets
DATA_DIR = Path("./data")      # Normalized datasets (h, r, t)

RAW_DIR.mkdir(exist_ok=True)
DATA_DIR.mkdir(exist_ok=True)

print(f"RAW_DIR : {RAW_DIR.resolve()}")
print(f"DATA_DIR: {DATA_DIR.resolve()}")

# -------------------------
# Helpers
# -------------------------
def normalize_to_txt(src_path: Path, dst_path: Path):
    """
    Read raw KG triple file src_path and saves first 3 columns
    as head<TAB>rel<TAB>tail into dst_path.
    """
    df = pd.read_csv(
        src_path,
        sep=None,
        engine="python",
        header=None,
        on_bad_lines="skip"
    )

    if df.shape[1] < 3:
        raise ValueError(
            f"[FORMAT ERROR] Invalid KG triple file: {src_path}\n"
            f"Detected columns: {df.shape[1]}\n"
            "Expected format: head, relation, tail, [optional extra columns]"
        )

    df.iloc[:, :3].to_csv(dst_path, sep="\t", index=False, header=False)


def pyg_dataset_to_standard(pyg_dataset, name: str):
    """
    Normalize (tab) PyG raw files from raw
    and saves as data/name/{train,valid,test}.txt
    into data/name
    """
    raw_dir = Path(pyg_dataset.raw_dir)
    out_dir = DATA_DIR / name
    out_dir.mkdir(exist_ok=True)

    print(f"\nProcessing PyG dataset: {name}")

    file_map = {
        "train": ["train.txt"],
        "valid": ["valid.txt", "valid.csv"],
        "test":  ["test.txt"]
    }

    for split, candidates in file_map.items():
        for fname in candidates:
            src = raw_dir / fname
            if src.exists():
                dst = out_dir / f"{split}.txt"
                normalize_to_txt(src, dst)
                print(f"  -> {split}.txt")
                break
        else:
            print(f"  [!] Missing split: {split}")


def download_file(url: str, dst: Path):
    if dst.exists():
        return
    print(f"Downloading {dst.name}...")
    r = requests.get(url)
    r.raise_for_status()
    dst.write_bytes(r.content)

# -------------------------
# PyG datasets
# -------------------------
print("\n--- Downloading PyG datasets ---")

wn18rr = WordNet18RR(root=RAW_DIR / "WordNet18RR")
pyg_dataset_to_standard(wn18rr, "WN18RR")

fb237 = FB15k_237(root=RAW_DIR / "FB15k-237")
pyg_dataset_to_standard(fb237, "FB15k-237")

# -------------------------
# External datasets
# -------------------------
print("\n--- Downloading external datasets ---")

EXTERNAL_DATASETS = {
    "CoDEx-M": "https://raw.githubusercontent.com/tsafavi/codex/master/data/triples/codex-m/",
    "WN11":    "https://raw.githubusercontent.com/KGCompletion/TransL/master/WN11/",
    "FB13":    "https://raw.githubusercontent.com/KGCompletion/TransL/master/FB13/",
}

for name, base_url in EXTERNAL_DATASETS.items():
    raw_out = RAW_DIR / name
    data_out = DATA_DIR / name
    raw_out.mkdir(exist_ok=True)
    data_out.mkdir(exist_ok=True)

    print(f"\n{name}")
    for split in ["train", "valid", "test"]:
        url = f"{base_url}{split}.txt"
        raw_path = raw_out / f"{split}.txt"
        data_path = data_out / f"{split}.txt"

        download_file(url, raw_path)
        normalize_to_txt(raw_path, data_path)
        print(f"  -> {split}.txt")

print("\n[DONE] All datasets downloaded and normalized.")


RAW_DIR : /content/raw_data
DATA_DIR: /content/data

--- Downloading PyG datasets ---

Processing PyG dataset: WN18RR
  -> train.txt
  -> valid.txt
  -> test.txt

Processing PyG dataset: FB15k-237
  -> train.txt
  -> valid.txt
  -> test.txt

--- Downloading external datasets ---

CoDEx-M
  -> train.txt
  -> valid.txt
  -> test.txt

WN11
  -> train.txt
  -> valid.txt
  -> test.txt

FB13
  -> train.txt
  -> valid.txt
  -> test.txt

[DONE] All datasets downloaded and normalized.


## 3. Inductive relation-based splits (InGram setup)

In [9]:
# =========================
# Inductive relation-based splits (NL-*)
# =========================

from pathlib import Path
import random
from collections import defaultdict

# -------------------------
# Config
# -------------------------
SEED = 42

ALPHAS = {
    "NL-25": 0.25,
    "NL-50": 0.50,
    "NL-75": 0.75,
    "NL-100": 1.00,
}

random.seed(SEED)

# -------------------------
# IO helpers
# -------------------------
def read_triples(path: Path):
    triples = []
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            h, r, t = line.rstrip("\n").split("\t")
            triples.append((h, r, t))
    return triples


def write_triples(path: Path, triples):
    with path.open("w", encoding="utf-8") as f:
        for h, r, t in triples:
            f.write(f"{h}\t{r}\t{t}\n")


# -------------------------
# Core logic
# -------------------------
def generate_inductive_splits(dataset_dir: Path):
    """
    Generate inductive relation-based splits (NL-*) for a dataset directory.

    The input directory must contain:
        train.txt
        valid.txt
        test.txt

    The function creates, inside the same directory:
        NL-25/, NL-50/, NL-75/, NL-100/
    each containing train/valid/test splits where relations in valid/test
    are completely unseen during training.

    Parameters
    ----------
    dataset_dir : Path
        Path to a dataset directory under BASE_DATA_DIR.
    """
    train_path = dataset_dir / "train.txt"
    valid_path = dataset_dir / "valid.txt"
    test_path  = dataset_dir / "test.txt"

    if not (train_path.exists() and valid_path.exists() and test_path.exists()):
        print(f"[SKIP] {dataset_dir.name}: missing train/valid/test files")
        return

    print(f"\n[DATASET] {dataset_dir.name}")

    train = read_triples(train_path)
    valid = read_triples(valid_path)
    test  = read_triples(test_path)

    all_triples = train + valid + test

    # Group triples by relation
    rel2triples = defaultdict(list)
    for h, r, t in all_triples:
        rel2triples[r].append((h, r, t))

    relations = list(rel2triples.keys())
    num_relations = len(relations)

    print(f"  Total relations : {num_relations}")
    print(f"  Total triples   : {len(all_triples)}")

    for split_name, alpha in ALPHAS.items():
        n_new = int(round(num_relations * alpha))

        shuffled = relations[:]
        random.shuffle(shuffled)

        new_rels = set(shuffled[:n_new])
        old_rels = set(shuffled[n_new:])

        train_split = []
        for r in old_rels:
            train_split.extend(rel2triples[r])

        new_triples = []
        for r in new_rels:
            new_triples.extend(rel2triples[r])

        random.shuffle(new_triples)
        mid = len(new_triples) // 2
        valid_split = new_triples[:mid]
        test_split  = new_triples[mid:]

        # Safety checks
        assert {r for _, r, _ in train_split}.isdisjoint(new_rels)
        assert {r for _, r, _ in valid_split}.issubset(new_rels)
        assert {r for _, r, _ in test_split}.issubset(new_rels)

        out_dir = dataset_dir / split_name
        out_dir.mkdir(exist_ok=True)

        write_triples(out_dir / "train.txt", train_split)
        write_triples(out_dir / "valid.txt", valid_split)
        write_triples(out_dir / "test.txt",  test_split)

        print(
            f"  [{split_name}] "
            f"new_rel={len(new_rels)} | "
            f"train={len(train_split)} | "
            f"valid={len(valid_split)} | "
            f"test={len(test_split)}"
        )



In [10]:
# -------------------------
# Run for all datasets
# -------------------------
print("\n=== Generating inductive splits for all datasets ===")

for dataset_dir in DATA_DIR.iterdir():
    if dataset_dir.is_dir():
        generate_inductive_splits(dataset_dir)

print("\n[DONE] All NL-* splits generated.")


=== Generating inductive splits for all datasets ===

[DATASET] CoDEx-M
  Total relations : 51
  Total triples   : 206205
  [NL-25] new_rel=13 | train=173839 | valid=16183 | test=16183
  [NL-50] new_rel=26 | train=137278 | valid=34463 | test=34464
  [NL-75] new_rel=38 | train=97895 | valid=54155 | test=54155
  [NL-100] new_rel=51 | train=0 | valid=103102 | test=103103

[DATASET] FB15k-237
  Total relations : 237
  Total triples   : 310116
  [NL-25] new_rel=59 | train=220712 | valid=44702 | test=44702
  [NL-50] new_rel=118 | train=126564 | valid=91776 | test=91776
  [NL-75] new_rel=178 | train=75589 | valid=117263 | test=117264
  [NL-100] new_rel=237 | train=0 | valid=155058 | test=155058

[DATASET] FB13
  Total relations : 13
  Total triples   : 375514
  [NL-25] new_rel=3 | train=317509 | valid=29002 | test=29003
  [NL-50] new_rel=6 | train=159199 | valid=108157 | test=108158
  [NL-75] new_rel=10 | train=135632 | valid=119941 | test=119941
  [NL-100] new_rel=13 | train=0 | valid=18775