# PrimeKG Processing and Validation Scripts
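To make it easier to load PrimeKG into Neo4j, I wrote some scripts that split the knowledge graph into separate node and edge CSV files. I have also verified node uniqueness and made sure there are no dangling edges (edges whose source or target is missing from the node file). Nodes are uniquely identified by their `index` column.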

In [None]:
import pandas as pd

In [None]:
# Split the PrimeKG edge list (data/kg.csv) into an edge file and a node file.

# Load the CSV data.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
csv_file_path = 'data/kg.csv'
data = pd.read_csv(csv_file_path, low_memory=False)

# Edge table: the relation labels plus the two endpoint indices.
# NOTE: no de-duplication is applied here; every row of kg.csv is kept.
unique_edges = data[['relation', 'display_relation', 'x_index', 'y_index']].copy()
unique_edges_file_path = 'data/unique_edges.csv'
unique_edges.to_csv(unique_edges_file_path, index=False)

# Node table: nodes appear on both sides of an edge, so collect the x side and
# the y side separately under a shared set of column names.
node_columns = ['index', 'node_id', 'node_type', 'node_name']

# For x nodes
x_nodes = data[['x_index', 'x_id', 'x_type', 'x_name']].drop_duplicates(subset=['x_index'])
x_nodes.columns = node_columns

# For y nodes
y_nodes = data[['y_index', 'y_id', 'y_type', 'y_name']].drop_duplicates(subset=['y_index'])
y_nodes.columns = node_columns

# Combine x and y nodes and keep one row per node index.
# Edges reference nodes by index, so the index is the de-duplication key:
# keying on (node_id, node_type, node_name) instead could drop a node whose
# index is still used by an edge, leaving that edge dangling.
unique_nodes = pd.concat([x_nodes, y_nodes])
unique_nodes = unique_nodes.drop_duplicates(subset=['index'])
unique_nodes_file_path = 'data/unique_nodes.csv'
unique_nodes.to_csv(unique_nodes_file_path, index=False)

# Print statistics
print(f'Total relationships: {len(unique_edges)}')
print(f'Total unique nodes: {len(unique_nodes)}')
print(f'Unique node types: {unique_nodes["node_type"].unique()}')

# Verification
print('\nVerification counts by node type:')
print(unique_nodes['node_type'].value_counts())

In [None]:
# Summarise how often each relation type occurs and save the summary as CSV.

# Load the unique edges data
edges_file_path = 'data/unique_edges.csv'
edges_data = pd.read_csv(edges_file_path)

# Load the unique nodes data
nodes_file_path = 'data/unique_nodes.csv'
nodes_data = pd.read_csv(nodes_file_path)

# Get and display relationship statistics
print("\nRelationship Type Analysis:")
print("-" * 50)

# Count unique relationships
relationship_counts = edges_data['relation'].value_counts()
# Not used further in this cell; kept so it stays available in the notebook
# namespace.
display_relation_counts = edges_data['display_relation'].value_counts()

# First display_relation encountered for each relation, built once up front so
# the loop below does not rescan the whole edge table for every relation type.
first_display_relation = (
    edges_data
    .drop_duplicates(subset=['relation'])
    .set_index('relation')['display_relation']
)

print(f"\nTotal unique relationship types: {len(relationship_counts)}")
print("\nRelationship counts:")
print("-" * 50)
for rel, count in relationship_counts.items():
    display_rel = first_display_relation.loc[rel]
    print(f"{rel} ({display_rel}): {count} occurrences")

# Save relationship statistics to CSV
# NOTE(review): if a relation maps to more than one display_relation, the merge
# yields one row per (relation, display_relation) pair and each of those rows
# carries the relation's total count, not a per-pair count.
relationship_stats = pd.DataFrame({
    'relation': relationship_counts.index,
    'count': relationship_counts.values
}).merge(
    edges_data[['relation', 'display_relation']].drop_duplicates(),
    on='relation'
)

stats_file_path = 'data/relationship_stats.csv'
relationship_stats.to_csv(stats_file_path, index=False)
print(f"\nDetailed statistics saved to: {stats_file_path}")

In [None]:
# Validation section
# Checks the node table for duplicates and the edge table for endpoints that
# are missing from the node table. Operates on the `nodes_data` and
# `edges_data` DataFrames already loaded in the notebook namespace.
print("\n" + "="*50)
print("KNOWLEDGE GRAPH VALIDATION")
print("="*50)

# 1. Check for duplicate nodes
print("\n1. Checking for duplicate nodes...")
duplicate_nodes = nodes_data[nodes_data.duplicated(subset=['index'])]
if not duplicate_nodes.empty:
    print(f"WARNING: Found {len(duplicate_nodes)} duplicate node indices!")
    print(duplicate_nodes)

    # Remove duplicate nodes (the first occurrence of each index is kept)
    print("\nRemoving duplicate node indices...")
    original_count = len(nodes_data)
    nodes_data = nodes_data.drop_duplicates(subset=['index'])
    print(f"✓ Removed {original_count - len(nodes_data)} duplicate node indices.")
else:
    print("✓ No duplicate node indices found.")

# Also check for duplicate node IDs.
# A node ID is compared together with its node type: keying on node_id alone
# would flag (and then delete) nodes of different types that merely share an
# ID string, which in turn would leave their edges dangling.
# NOTE(review): this presumes node_id is only meant to be unique within a
# node_type — confirm against the data source.
node_id_key = ['node_id', 'node_type']
duplicate_node_ids = nodes_data[nodes_data.duplicated(subset=node_id_key)]
if not duplicate_node_ids.empty:
    print(f"WARNING: Found {len(duplicate_node_ids)} duplicate node IDs!")
    print(duplicate_node_ids)

    # Remove duplicate node IDs (the first occurrence of each key is kept)
    print("\nRemoving duplicate node IDs...")
    original_count = len(nodes_data)
    nodes_data = nodes_data.drop_duplicates(subset=node_id_key)
    print(f"✓ Removed {original_count - len(nodes_data)} duplicate node IDs.")
else:
    print("✓ No duplicate node IDs found.")

# 2. Verify that all relationship endpoints exist in the nodes dataset
# The check runs against the node table *after* the removals above, so it also
# reveals edges that would dangle if the cleaned node file were adopted.
print("\n2. Verifying relationship endpoints...")
node_indices = set(nodes_data['index'].values)

# Check x_index (source nodes)
missing_source_nodes = edges_data[~edges_data['x_index'].isin(node_indices)]
if not missing_source_nodes.empty:
    print(f"WARNING: Found {len(missing_source_nodes)} relationships with missing source nodes!")
    print(missing_source_nodes.head(10))  # Show first 10 examples
else:
    print("✓ All relationship source nodes exist in the nodes dataset.")

# Check y_index (target nodes)
missing_target_nodes = edges_data[~edges_data['y_index'].isin(node_indices)]
if not missing_target_nodes.empty:
    print(f"WARNING: Found {len(missing_target_nodes)} relationships with missing target nodes!")
    print(missing_target_nodes.head(10))  # Show first 10 examples
else:
    print("✓ All relationship target nodes exist in the nodes dataset.")

# 3. Summary
# One counter per check; the same mapping is written to the results file below.
validation_results = {
    'duplicate_node_indices': len(duplicate_nodes),
    'duplicate_node_ids': len(duplicate_node_ids),
    'missing_source_nodes': len(missing_source_nodes),
    'missing_target_nodes': len(missing_target_nodes)
}

print("\nValidation Summary:")
total_issues = sum(validation_results.values())
if total_issues == 0:
    print("✓ Knowledge graph validation passed. No issues found.")
else:
    print(f"⚠ Knowledge graph validation found {total_issues} issues that should be addressed.")

# Save the cleaned nodes data
# Only written when rows were actually removed; the original file is left
# untouched so the cleaned version can be reviewed first.
if not duplicate_nodes.empty or not duplicate_node_ids.empty:
    cleaned_nodes_file_path = 'data/unique_nodes_cleaned.csv'
    nodes_data.to_csv(cleaned_nodes_file_path, index=False)
    print(f"\nCleaned nodes data saved to: {cleaned_nodes_file_path}")
    print("NOTE: Please review the cleaned data and replace 'data/unique_nodes.csv' with it if satisfied.")

# Optional: Save validation results to a file (a single-row CSV)
validation_df = pd.DataFrame([validation_results])
validation_file_path = 'data/kg_validation_results.csv'
validation_df.to_csv(validation_file_path, index=False)
print(f"\nDetailed validation results saved to: {validation_file_path}")

In [None]:
# Write the distinct node types found in the node file to a text file.

# Load the unique nodes data
nodes_file_path = 'data/unique_nodes.csv'
nodes_data = pd.read_csv(nodes_file_path)

# The node file is written with a plain 'node_type' header, so that is the
# column to read. The Neo4j-import style header 'node_type:string' is accepted
# as a fallback in case the file's headers were rewritten for bulk import.
node_type_column = (
    'node_type' if 'node_type' in nodes_data.columns else 'node_type:string'
)

# Extract unique node types and save to text file
unique_node_types = nodes_data[node_type_column].unique()
node_types_file_path = 'data/unique_node_types.txt'

# Write the unique node types to a text file, one per line
with open(node_types_file_path, 'w', encoding='utf-8') as f:
    f.writelines(f"{node_type}\n" for node_type in unique_node_types)

print(f"\nUnique node types saved to: {node_types_file_path}")