## Feature Engineering.

Feature Engineering. Handle datasets in order to make sure test sets do have new relations/entities

## ## 1. Environment and GPU sanity check

In [4]:
# GPU check for future pipeline
import torch

print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("CUDA Version:", torch.version.cuda)

if not torch.cuda.is_available():
  print("------------No GPU. Set Runtime → Change runtime type → GPU------------")

try:
    import torch_geometric
    print("Torch Geometric:", torch_geometric.__version__)
except ModuleNotFoundError:
    print("Torch Geometric not found. Installing")
    torch_version = torch.__version__.split("+")[0]
    cuda_version = torch.version.cuda.replace(".", "")

    !pip install -q pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv \
        -f https://data.pyg.org/whl/torch-{torch_version}+cu{cuda_version}.html

    !pip install -q torch-geometric

Torch: 2.9.0+cu126
CUDA available: True
CUDA Version: 12.6
Torch Geometric: 2.7.0


## 2. Dataset download and normalization

In [2]:
# Datasets download

from pathlib import Path

# Rutas base
PYG_DIR = Path("./pyg_temp")     # Carpeta temporal para descargas de PyG
FINAL_DATA_DIR = Path("./data")  # Carpeta de datos principal

# Crear carpetas si no existen (idempotente)
for directory in [PYG_DIR, FINAL_DATA_DIR]:
    directory.mkdir(parents=True, exist_ok=True)

print("Estructura de carpetas verificada:")
print(f"- PYG_DIR: {PYG_DIR.resolve()}")
print(f"- FINAL_DATA_DIR: {FINAL_DATA_DIR.resolve()}")

import os
import shutil
import pandas as pd
from torch_geometric.datasets import WordNet18RR, FB15k_237

# --- CONFIGURACIÓN ---
PYG_DIR = "./pyg_temp"  # Carpeta temporal para descargas de PyG
FINAL_DATA_DIR = "./data" # Tu carpeta de datos principal

def standard_to_txt(pyg_dataset, dataset_name):
    """
    Toma los archivos raw descargados por PyG y los mueve a tu carpeta ./data
    en formato limpio (head, relation, tail).
    """
    raw_dir = pyg_dataset.raw_dir
    target_dir = os.path.join(FINAL_DATA_DIR, dataset_name)
    os.makedirs(target_dir, exist_ok=True)

    print(f"\nProcesando {dataset_name} desde {raw_dir}...")

    # Mapeo de nombres de archivos de PyG a nombres estándar
    # PyG a veces usa 'train.txt', a veces otros nombres.
    files_map = {
        'train': ['train.txt'],
        'valid': ['valid.txt', 'valid.csv'],
        'test': ['test.txt']
    }

    for split, possible_names in files_map.items():
        found = False
        for fname in possible_names:
            src_path = os.path.join(raw_dir, fname)
            if os.path.exists(src_path):
                found = True
                dst_path = os.path.join(target_dir, f"{split}.txt")

                # Leemos con Pandas para asegurarnos de limpiar headers o índices extra
                try:
                    # FB15k-237 y WN18RR raw suelen venir separados por tabs o espacios
                    df = pd.read_csv(src_path, sep=None, engine='python', header=None, on_bad_lines='skip')

                    # Quedarnos con las primeras 3 columnas (Head, Rel, Tail)
                    # OJO: FB15k-237 a veces viene como (Head, Tail, Rel) o (Head, Rel, Tail)
                    # En los raw files de PyG estándar suele ser: Head, Relation, Tail (Strings)
                    if df.shape[1] >= 3:
                        df = df.iloc[:, :3]

                        # Guardamos en formato limpio separado por comas o tabs
                        df.to_csv(dst_path, sep=',', index=False, header=False)
                        print(f"  -> {split}.txt guardado en {target_dir} ({len(df)} filas)")
                    else:
                        print(f"  [!] Estructura extraña en {fname}: {df.shape}")

                except Exception as e:
                    print(f"  [Error] al procesar {fname}: {e}")
                break

        if not found:
            print(f"  [X] No se encontró archivo para split '{split}'")

# --- 1. DESCARGAR WN18RR ---
print("--- Descargando WordNet18RR usando PyG ---")
# Esto descargará automáticamente los archivos a ./pyg_temp/WordNet18RR/raw
dataset_wn = WordNet18RR(root=os.path.join(PYG_DIR, "WordNet18RR"))
standard_to_txt(dataset_wn, "WN18RR")

# --- 2. DESCARGAR FB15k-237 ---
print("--- Descargando FB15k-237 usando PyG ---")
# Esto descargará automáticamente los archivos a ./pyg_temp/FB15k_237/raw
dataset_fb = FB15k_237(root=os.path.join(PYG_DIR, "FB15k-237"))
standard_to_txt(dataset_fb, "FB15k-237")

print("\n--- ¡LISTO! ---")
print(f"Ahora tienes WN18RR y FB15k-237 en la carpeta '{FINAL_DATA_DIR}' con el mismo formato que el resto.")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import requests

# Set style for plots
sns.set_theme(style="whitegrid")
plt.rcParams['figure.figsize'] = [10, 6]

# --- CONFIGURATION (CORRECTED URLS) ---
DATA_DIR = "./data"

DATASETS = {
    "CoDEx-M": {
        "base_url": "https://raw.githubusercontent.com/tsafavi/codex/master/data/triples/codex-m/",
        "files": ["train.txt", "valid.txt", "test.txt"]
    },
    "WN11": {
        "base_url": "https://raw.githubusercontent.com/KGCompletion/TransL/master/WN11/",
        "files": ["train.txt", "valid.txt", "test.txt"]
    },
    "FB13": {
        "base_url": "https://raw.githubusercontent.com/KGCompletion/TransL/master/FB13/",
        "files": ["train.txt", "valid.txt", "test.txt"]
    }
}


# --- HELPER FUNCTIONS ---
def download_file(url, save_path):
    """Downloads a file from a URL if it doesn't exist."""
    if not os.path.exists(save_path):
        print(f"Downloading {save_path}...")
        try:
            r = requests.get(url)
            r.raise_for_status()
            with open(save_path, 'wb') as f:
                f.write(r.content)
            print(f"  -> Success!")
        except Exception as e:
            print(f"  -> Failed to download {url}: {e}")
    else:
        print(f"Found local: {save_path}")

def load_dataset(name, config):
    """Loads train/valid/test into a dictionary of DataFrames."""
    path = os.path.join(DATA_DIR, name)
    os.makedirs(path, exist_ok=True)

    dfs = {}
    for file_name in config["files"]:
        # 1. Download
        url = config["base_url"] + file_name
        local_path = os.path.join(path, file_name)
        download_file(url, local_path)

        # 2. Load to Pandas
        # Try-catch to handle potential parsing issues
        try:
            # We use engine='python' and sep=None to auto-detect tab or space separators
            # This makes it robust against different format standards
            df = pd.read_csv(local_path, sep=None, engine='python',
                             names=['head', 'relation', 'tail'],
                             on_bad_lines='skip')

            # Clean up: Sometimes the last column has a 4th value (label 1/-1) in older datasets
            # For EDA we usually just want the triplet. Let's check shape.
            if df.shape[1] > 3:
                df = df.iloc[:, :3]
                df.columns = ['head', 'relation', 'tail']

            dfs[file_name.replace('.txt', '')] = df
        except Exception as e:
            print(f"Error reading {file_name}: {e}")

    return dfs

# --- EXECUTION ---
print("--- STARTING DATA LOADING ---")
kg_data = {}

for ds_name, config in DATASETS.items():
    print(f"\nProcessing {ds_name}...")
    kg_data[ds_name] = load_dataset(ds_name, config)

    # Quick Sanity Check
    if 'train' in kg_data[ds_name]:
        count = len(kg_data[ds_name]['train'])
        print(f"  -> Loaded {ds_name} Train: {count} triplets")
        print(f"  -> Sample: {kg_data[ds_name]['train'].iloc[0].values}")


Estructura de carpetas verificada:
- PYG_DIR: /content/pyg_temp
- FINAL_DATA_DIR: /content/data
--- Descargando WordNet18RR usando PyG ---

Procesando WN18RR desde pyg_temp/WordNet18RR/raw...
  -> train.txt guardado en ./data/WN18RR (86835 filas)
  -> valid.txt guardado en ./data/WN18RR (3034 filas)
  -> test.txt guardado en ./data/WN18RR (3134 filas)
--- Descargando FB15k-237 usando PyG ---

Procesando FB15k-237 desde pyg_temp/FB15k-237/raw...
  -> train.txt guardado en ./data/FB15k-237 (272115 filas)
  -> valid.txt guardado en ./data/FB15k-237 (17535 filas)
  -> test.txt guardado en ./data/FB15k-237 (20466 filas)

--- ¡LISTO! ---
Ahora tienes WN18RR y FB15k-237 en la carpeta './data' con el mismo formato que el resto.
--- STARTING DATA LOADING ---

Processing CoDEx-M...
Found local: ./data/CoDEx-M/train.txt
Found local: ./data/CoDEx-M/valid.txt
Found local: ./data/CoDEx-M/test.txt
  -> Loaded CoDEx-M Train: 185584 triplets
  -> Sample: ['Q108946' 'P161' 'Q39792']

Processing WN11...


## 3. Inductive relation-based splits (InGram setup)

In [3]:
# Inductive relation-based partitioning for Knowledge Graph datasets

"""
Inductive relation-based partitioning for Knowledge Graph datasets.
Generates NL-25 / NL-50 / NL-75 / NL-100 splits where relations are unseen in train.

Assumptions:
- Input datasets follow:
  data/DATASET/train.txt
  data/DATASET/valid.txt
  data/DATASET/test.txt
- Triplets are tab-separated: h \t r \t t
- Entities may appear in any split (allowed)
- Relations selected as "new" do NOT appear in train
- valid and test contain ONLY new relations
- Reproducible via fixed seed
- No external deps beyond Python stdlib (+ optional numpy, not required)

Author: you + ChatGPT
"""

import os
import random
from collections import defaultdict

# =========================
# CONFIG
# =========================
BASE_DATA_DIR = "data"
DATASET_NAME = "WN18RR"      # change if needed
SEED = 42

ALPHAS = {
    "NL-25": 0.25,
    "NL-50": 0.50,
    "NL-75": 0.75,
    "NL-100": 1.00,
}

# =========================
# UTILS
# =========================
def read_triples(path):
    triples = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            h, r, t = line.split("\t")
            triples.append((h, r, t))
    return triples


def write_triples(path, triples):
    with open(path, "w", encoding="utf-8") as f:
        for h, r, t in triples:
            f.write(f"{h}\t{r}\t{t}\n")


def ensure_dir(path):
    os.makedirs(path, exist_ok=True)


# =========================
# MAIN LOGIC
# =========================
def main():
    random.seed(SEED)

    dataset_dir = os.path.join(BASE_DATA_DIR, DATASET_NAME)
    train_path = os.path.join(dataset_dir, "train.txt")
    valid_path = os.path.join(dataset_dir, "valid.txt")
    test_path  = os.path.join(dataset_dir, "test.txt")

    print(f"\n[INFO] Loading base dataset: {DATASET_NAME}")

    train_triples = read_triples(train_path)
    valid_triples = read_triples(valid_path)
    test_triples  = read_triples(test_path)

    all_triples = train_triples + valid_triples + test_triples

    # Group triples by relation
    rel2triples = defaultdict(list)
    for h, r, t in all_triples:
        rel2triples[r].append((h, r, t))

    all_relations = sorted(rel2triples.keys())
    num_relations = len(all_relations)

    print(f"[STATS] Total triples      : {len(all_triples)}")
    print(f"[STATS] Total relations    : {num_relations}")

    for split_name, alpha in ALPHAS.items():
        print("\n" + "=" * 60)
        print(f"[SPLIT] Generating {split_name} (alpha={alpha})")

        num_new_rel = int(round(num_relations * alpha))

        if num_new_rel > num_relations:
            num_new_rel = num_relations

        shuffled_relations = all_relations[:]
        random.shuffle(shuffled_relations)

        new_relations = set(shuffled_relations[:num_new_rel])
        old_relations = set(shuffled_relations[num_new_rel:])

        # Build splits
        train_split = []
        valid_split = []
        test_split  = []

        for r in old_relations:
            train_split.extend(rel2triples[r])

        new_relation_triples = []
        for r in new_relations:
            new_relation_triples.extend(rel2triples[r])

        # Split new-relation triples into valid/test (50/50)
        random.shuffle(new_relation_triples)
        mid = len(new_relation_triples) // 2
        valid_split = new_relation_triples[:mid]
        test_split  = new_relation_triples[mid:]

        # Safety checks
        train_rels = {r for _, r, _ in train_split}
        valid_rels = {r for _, r, _ in valid_split}
        test_rels  = {r for _, r, _ in test_split}

        assert train_rels.isdisjoint(new_relations), "Leakage: new relations in train!"
        assert valid_rels.issubset(new_relations), "Invalid relation in valid!"
        assert test_rels.issubset(new_relations), "Invalid relation in test!"

        # Output directory
        out_dir = os.path.join(dataset_dir, split_name)
        ensure_dir(out_dir)

        write_triples(os.path.join(out_dir, "train.txt"), train_split)
        write_triples(os.path.join(out_dir, "valid.txt"), valid_split)
        write_triples(os.path.join(out_dir, "test.txt"),  test_split)

        # Report
        print(f"[STATS] #new relations     : {len(new_relations)}")
        print(f"[STATS] train triples     : {len(train_split)}")
        print(f"[STATS] valid triples     : {len(valid_split)}")
        print(f"[STATS] test  triples     : {len(test_split)}")
        print(f"[PATH ] Written to        : {out_dir}")

    print("\n[DONE] All NL-* splits generated successfully.")


if __name__ == "__main__":
    main()



[INFO] Loading base dataset: WN18RR


ValueError: not enough values to unpack (expected 3, got 1)