In [5]:
# # Extract Skill Subgraph from RDF File
# # Extract triples where subject or object is at least one skill from data/esco-v1.2.0.rdf

# import pandas as pd
# from pathlib import Path
# from rdflib import Graph

# print("=" * 70)
# print("Extract Skill Subgraph from RDF File")
# print("=" * 70)

# # File paths
# rdf_file = "escodata/esco-v1.2.0.rdf"  # RDF/XML format
# output_dir = Path("out_esco")
# output_file = output_dir / "skill_subgraph_from_rdf.tsv"  # New filename, does not overwrite skill_subgraph_triples.tsv

# # Skill URI prefix
# skill_uri_prefix = "http://data.europa.eu/esco/skill/"

# print(f"\nInput file: {rdf_file}")
# print(f"Output file: {output_file}")
# print(f"Skill URI prefix: {skill_uri_prefix}")

# # Check if input file exists
# if not Path(rdf_file).exists():
#     print(f"\n⚠️  Error: File {rdf_file} does not exist")
#     print("  Please ensure the file path is correct")
#     raise FileNotFoundError(f"File not found: {rdf_file}")

# # Create output directory
# output_dir.mkdir(parents=True, exist_ok=True)

# print("\n" + "=" * 70)
# print("Step 1: Load RDF File")
# print("=" * 70)
# print("Loading RDF file (this may take a few minutes)...")

# # Load RDF file
# graph = Graph()
# graph.parse(rdf_file, format="xml")

# print(f"✓ RDF file loaded successfully")
# print(f"  Total triples: {len(graph):,}")

# print("\n" + "=" * 70)
# print("Step 2: Extract Subgraph Containing Skills")
# print("=" * 70)

# # Helper function: Convert URI to string uniformly
# def uri_to_string(uri):
#     """Convert rdflib URI object to string"""
#     if isinstance(uri, str):
#         return uri
#     elif hasattr(uri, 'toPython'):
#         return str(uri.toPython())
#     else:
#         return str(uri)

# # Helper function: Check if it's a literal
# def is_literal(uri_obj):
#     """
#     Check if it's a literal
#     In rdflib, Literal objects have a specific type
#     """
#     from rdflib import Literal
#     return isinstance(uri_obj, Literal)

# # Extract all triples containing skills (subject or object is skill)
# # Only keep URI-to-URI triples (can be directly used for PyKEEN)
# skill_subgraph_triples = []
# literal_count = 0

# print("Extracting triples containing Skills (only URI-to-URI, filtering literals)...")
# for idx, (subj, pred, obj) in enumerate(graph, 1):
#     if idx % 100000 == 0:
#         print(f"  Processed {idx:,} triples, extracted {len(skill_subgraph_triples):,} relevant triples (filtered {literal_count:,} literals)...")
    
#     # Check if object is a literal (if so, skip this triple)
#     if is_literal(obj):
#         literal_count += 1
#         continue
    
#     # Convert to string uniformly
#     subj_str = uri_to_string(subj)
#     pred_str = uri_to_string(pred)
#     obj_str = uri_to_string(obj)
    
#     # Check if subject or object is a skill
#     is_subj_skill = subj_str.startswith(skill_uri_prefix)
#     is_obj_skill = obj_str.startswith(skill_uri_prefix)
    
#     # If subject or object is a skill, keep this triple
#     if is_subj_skill or is_obj_skill:
#         skill_subgraph_triples.append([subj_str, pred_str, obj_str])

# print(f"\n✓ Extracted {len(skill_subgraph_triples):,} triples containing Skills (URI-to-URI, can be directly used for PyKEEN)")
# print(f"  Original triples: {len(graph):,}")
# print(f"  Subgraph triples: {len(skill_subgraph_triples):,}")
# print(f"  Filtered literal triples: {literal_count:,}")
# if len(graph) > 0:
#     print(f"  Retention ratio: {len(skill_subgraph_triples)/len(graph)*100:.2f}%")

# print("\n" + "=" * 70)
# print("Step 3: Save Subgraph to TSV File")
# print("=" * 70)

# # Save as TSV format
# subgraph_df = pd.DataFrame(skill_subgraph_triples, columns=['subject', 'predicate', 'object'])
# subgraph_df.to_csv(output_file, sep='\t', index=False, header=False, encoding='utf-8')

# print(f"✓ Subgraph saved to: {output_file}")
# print(f"  File size: {Path(output_file).stat().st_size / (1024**2):.2f} MB")
# print(f"  Number of rows: {len(subgraph_df):,}")

# print("\n" + "=" * 70)
# print("Extraction Complete!")
# print("=" * 70)
# print(f"\nExtraction Principle:")
# print(f"  1. Read all triples from RDF/XML file")
# print(f"  2. Identify all Skill URIs (by prefix matching: {skill_uri_prefix})")
# print(f"  3. Filter out triples containing literals, only keep URI-to-URI triples")
# print(f"  4. Keep all triples containing Skills (subject or object is Skill)")
# print(f"  5. Save as TSV format for subsequent use")
# print(f"\nThe extracted subgraph contains:")
# print(f"  - All Skill entities")
# print(f"  - Relationships between Skills (e.g., broader/narrower)")
# print(f"  - Associations between Skills and other entities (e.g., occupation, concept, etc.)")
# print(f"  - Does not contain literals (e.g., label, description and other text attributes)")
# print(f"\n✓ All triples are URI-to-URI and can be directly used for PyKEEN training")


In [7]:

# Check for duplicates in extracted triples

import pandas as pd
from pathlib import Path

print("=" * 70)
print("Check for Duplicates in Triples")
print("=" * 70)

# File path
tsv_file = "skill_subgraph_from_sparql.tsv"

print(f"\nChecking file: {tsv_file}")

# Load file
print("\nLoading file...")
df = pd.read_csv(
    tsv_file,
    sep='\t',
    header=None,
    names=['subject', 'predicate', 'object'],
    encoding='utf-8'
)
print(f"✓ Loaded successfully: {len(df):,} rows")

print("\n" + "=" * 70)
print("Checking for Duplicates")
print("=" * 70)

# Check for completely duplicate triples (subject, predicate, object are identical)
print("\n1. Checking for completely duplicate triples...")
duplicated_rows = df.duplicated(keep=False)  # keep=False marks all duplicates (including first occurrence)
num_duplicated = duplicated_rows.sum()
num_unique = df.drop_duplicates().shape[0]

print(f"  Total rows: {len(df):,}")
print(f"  Unique triples: {num_unique:,}")
print(f"  Duplicate rows: {num_duplicated:,}")

if num_duplicated > 0:
    print(f"  ⚠️  Found {num_duplicated:,} duplicate rows")
    print(f"  Duplicate ratio: {num_duplicated/len(df)*100:.2f}%")
    
    # Display statistics of duplicates
    duplicated_df = df[duplicated_rows]
    duplicate_counts = duplicated_df.groupby(['subject', 'predicate', 'object']).size().reset_index(name='count')
    duplicate_counts = duplicate_counts[duplicate_counts['count'] > 1].sort_values('count', ascending=False)
    
    print(f"\n  Duplicate triple details:")
    print(f"    {len(duplicate_counts):,} different triples appear multiple times")
    print(f"    Maximum duplicate count: {duplicate_counts['count'].max()}")
    print(f"    Average duplicate count: {duplicate_counts['count'].mean():.2f}")
    
    # Display top 10 most duplicated triples
    print(f"\n  Top 10 most duplicated triples:")
    for idx, row in duplicate_counts.head(10).iterrows():
        print(f"\n    {idx+1}. Duplicated {int(row['count'])} times:")
        print(f"       Subject: {row['subject'][:80]}...")
        print(f"       Predicate: {row['predicate'][:80]}...")
        print(f"       Object: {row['object'][:80]}...")
else:
    print(f"  ✓ No completely duplicate triples found")

# Check for duplicate (subject, predicate) pairs (different objects)
print("\n" + "=" * 70)
print("2. Checking (subject, predicate) pair duplicates")
print("=" * 70)

subject_predicate_duplicates = df.groupby(['subject', 'predicate']).size()
multiple_objects = subject_predicate_duplicates[subject_predicate_duplicates > 1]

if len(multiple_objects) > 0:
    print(f"  Found {len(multiple_objects):,} (subject, predicate) pairs with multiple different objects")
    print(f"  This is normal (one entity can be related to multiple entities through the same relation)")
    print(f"\n  Examples (top 5):")
    for (subj, pred), count in multiple_objects.head(5).items():
        print(f"\n    Subject: {subj[:60]}...")
        print(f"    Predicate: {pred[:60]}...")
        print(f"    Has {count} different objects:")
        objects = df[(df['subject'] == subj) & (df['predicate'] == pred)]['object'].unique()
        for i, obj in enumerate(objects[:3], 1):  # Only show first 3
            print(f"      {i}. {obj[:60]}...")
        if len(objects) > 3:
            print(f"      ... and {len(objects) - 3} more")
else:
    print(f"  ✓ No (subject, predicate) pair duplicates found")

# Check for duplicate (subject, object) pairs (different predicates)
print("\n" + "=" * 70)
print("3. Checking (subject, object) pair duplicates")
print("=" * 70)

subject_object_duplicates = df.groupby(['subject', 'object']).size()
multiple_predicates = subject_object_duplicates[subject_object_duplicates > 1]

if len(multiple_predicates) > 0:
    print(f"  Found {len(multiple_predicates):,} (subject, object) pairs with multiple different predicates")
    print(f"  This is normal (two entities can have multiple relationships)")
    print(f"\n  Examples (top 5):")
    for (subj, obj), count in multiple_predicates.head(5).items():
        print(f"\n    Subject: {subj[:60]}...")
        print(f"    Object: {obj[:60]}...")
        print(f"    Has {count} different predicates:")
        predicates = df[(df['subject'] == subj) & (df['object'] == obj)]['predicate'].unique()
        for i, pred in enumerate(predicates, 1):
            print(f"      {i}. {pred[:60]}...")
else:
    print(f"  ✓ No (subject, object) pair duplicates found")

# Summary
print("\n" + "=" * 70)
print("Check Complete")
print("=" * 70)

if num_duplicated > 0:
    print(f"\n⚠️  Found {num_duplicated:,} completely duplicate triples")
    print(f"  Suggestion: If deduplication is needed, use df.drop_duplicates()")
else:
    print(f"\n✓ No completely duplicate triples found, data quality is good")


Check for Duplicates in Triples

Checking file: skill_subgraph_from_sparql.tsv

Loading file...
✓ Loaded successfully: 1,814,404 rows

Checking for Duplicates

1. Checking for completely duplicate triples...
  Total rows: 1,814,404
  Unique triples: 1,780,467
  Duplicate rows: 67,874
  ⚠️  Found 67,874 duplicate rows
  Duplicate ratio: 3.74%

  Duplicate triple details:
    33,937 different triples appear multiple times
    Maximum duplicate count: 2
    Average duplicate count: 2.00

  Top 10 most duplicated triples:

    1. Duplicated 2 times:
       Subject: <http://data.europa.eu/esco/skill/0005c151-5b5a-4a66-8aac-60e734beb1ab>...
       Predicate: <http://www.w3.org/2004/02/skos/core#broader>...
       Object: <http://data.europa.eu/esco/skill/339ac029-066a-4985-9f9d-b3d7c8fea0bb>...

    22621. Duplicated 2 times:
       Subject: <http://data.europa.eu/esco/skill/a79f7982-36b8-44e7-b19a-9b7366ca2e87>...
       Predicate: <http://www.w3.org/2004/02/skos/core#broaderTransitive>...
