# Construct Knowledge Graph

### Import required libraries

In [20]:
import pandas as pd
import networkx as nx
from pyvis.network import Network
import matplotlib.pyplot as plt
import json
from collections import Counter

### Load data

In [21]:
# Read the extracted entity and relation tables from the data directory.
print("Loading extracted data...")

entities_df, relations_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/entities_extracted.csv", "../data/relations.csv")
)

# Report how many rows each table contains.
entity_total, relation_total = len(entities_df), len(relations_df)
print(f"Loaded {entity_total} entities and {relation_total} relations")

Loading extracted data...
Loaded 109 entities and 80 relations


### Initialize directed graph

In [22]:
# Empty directed graph that the following cells populate with entity nodes
# and relation edges.
# NOTE(review): a DiGraph holds at most one edge per ordered (source, target)
# pair, so a repeated pair in the relations table overwrites the earlier
# edge's attributes -- confirm this is intended (nx.MultiDiGraph keeps all).
G = nx.DiGraph()

### Add nodes

In [23]:
print("\nAdding nodes to graph...")

# One node per entity row, keyed by the entity name. A name that appears more
# than once maps to the same node, so later rows update its attributes.
entity_rows = zip(
    entities_df['entity'],
    entities_df['entity_type'],
    entities_df['id'],
)
for entity_name, entity_kind, entity_id in entity_rows:
    # The row's 'id' column is stored under the node attribute 'doi'.
    G.add_node(entity_name, entity_type=entity_kind, doi=entity_id)


Adding nodes to graph...


### Add edges

In [24]:
print("Adding edges to graph...")


def _value_or_default(row, column, default):
    """Return ``row[column]``, or ``default`` if the column is absent or the cell is empty.

    ``row.get(column, default)`` on its own only falls back when the column
    does not exist; a blank CSV cell is read by pandas as NaN and would
    otherwise be stored on the graph unchanged.
    """
    value = row.get(column, default)
    return default if pd.isna(value) else value


skipped_relations = 0
for _, row in relations_df.iterrows():
    source = row['source']
    target = row['target']

    # A relation without both endpoints cannot form a meaningful edge; a NaN
    # endpoint would otherwise be inserted as a graph node of its own.
    if pd.isna(source) or pd.isna(target):
        skipped_relations += 1
        continue

    # Ensure both nodes exist (add if missing from entities)
    if not G.has_node(source):
        source_type = _value_or_default(row, 'source_type', 'Unknown')
        G.add_node(source, entity_type=source_type, doi=row['doi'])

    if not G.has_node(target):
        target_type = _value_or_default(row, 'target_type', 'Unknown')
        G.add_node(target, entity_type=target_type, doi=row['doi'])

    # Add edge with attributes
    G.add_edge(
        source,
        target,
        relation=row['relation'],
        confidence=_value_or_default(row, 'confidence', 0.5),
        sentence=row['sentence'],
        doi=row['doi']
    )

# Only mention skipped rows when there were any, so clean data prints as before.
if skipped_relations:
    print(f"Skipped {skipped_relations} relations with a missing source or target")

print(f"\n Graph built: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")

Adding edges to graph...

 Graph built: 91 nodes, 75 edges


### Graph Statistics

In [25]:
def _print_counts(title, counts):
    """Print a heading followed by one '  - label: count' line per entry."""
    print(title)
    for label, total in counts.items():
        print(f"  - {label}: {total}")


# How many nodes there are of each entity type
node_types = [attrs['entity_type'] for _, attrs in G.nodes(data=True)]
type_counts = Counter(node_types)
_print_counts("\nNode Types:", type_counts)

# How many edges there are of each relation label
relation_types = [attrs['relation'] for _, _, attrs in G.edges(data=True)]
relation_counts = Counter(relation_types)
_print_counts("\nRelation Types:", relation_counts)

# Whole-graph metrics; components are counted ignoring edge direction
graph_density = nx.density(G)
component_total = nx.number_weakly_connected_components(G)
print("\nNetwork Metrics:")
print(f"  - Graph Density: {graph_density:.4f}")
print(f"  - Number of Connected Components: {component_total}")

# Ten nodes with the highest degree centrality, highest first
degree_cent = nx.degree_centrality(G)
top_nodes = sorted(
    degree_cent.items(),
    key=lambda pair: pair[1],
    reverse=True,
)[:10]
print("\nTop 10 Most Connected Entities:")
for rank, (name, centrality) in enumerate(top_nodes, start=1):
    kind = G.nodes[name]['entity_type']
    connections = G.degree(name)
    print(f"  {rank}. {name} ({kind}) - Degree: {connections}, Centrality: {centrality:.3f}")


Node Types:
  - Disease: 54
  - Drug: 37

Relation Types:
  - associated_with: 60
  - causes: 9
  - treated_by: 3
  - treats: 3

Network Metrics:
  - Graph Density: 0.0092
  - Number of Connected Components: 40

Top 10 Most Connected Entities:
  1. infection (Disease) - Degree: 20, Centrality: 0.222
  2. inflammation (Disease) - Degree: 7, Centrality: 0.078
  3. SARS-CoV-2 infection (Disease) - Degree: 6, Centrality: 0.067
  4. PASC (Drug) - Degree: 5, Centrality: 0.056
  5. neuroinflammation (Disease) - Degree: 5, Centrality: 0.056
  6. zinc (Drug) - Degree: 5, Centrality: 0.056
  7. chronic disease (Disease) - Degree: 4, Centrality: 0.044
  8. iron (Drug) - Degree: 4, Centrality: 0.044
  9. NTM (Drug) - Degree: 4, Centrality: 0.044
  10. AXiAEC (Drug) - Degree: 3, Centrality: 0.033


## Visualization

In [26]:
# Render G as an interactive PyVis network and save it as a standalone HTML file.

# Define color scheme for node types
color_scheme = {
    'Drug': '#3498db',      # Blue
    'Disease': '#e74c3c',   # Red
    'Symptom': '#f39c12',   # Orange
    'Unknown': '#95a5a6'    # Gray
}

# Define color scheme for relations
relation_colors = {
    'treats': '#2ecc71',         # Green
    'treated_by': '#27ae60',     # Dark Green
    'causes': '#e67e22',         # Orange
    'associated_with': '#95a5a6', # Gray
    'prevents': '#9b59b6'        # Purple
}


def _text_or_na(value, fallback='N/A'):
    """Return ``value`` as display text, or ``fallback`` when it is missing (None/NaN).

    Graph attributes loaded from CSV can be NaN floats; passing those straight
    into ``len()`` or a tooltip either raises or renders the text 'nan'.
    """
    if isinstance(value, str):
        return value
    return fallback if pd.isna(value) else str(value)


# Initialize PyVis network
net = Network(
    height="900px",
    width="100%",
    notebook=False,
    directed=True,
    bgcolor="#ffffff",
    font_color="black"
)

# Configure physics for better layout
net.barnes_hut(
    gravity=-8000,
    central_gravity=0.3,
    spring_length=200,
    spring_strength=0.001,
    damping=0.09,
    overlap=0
)

# Add nodes with styling
print("Adding nodes to interactive visualization...")
for node, data in G.nodes(data=True):
    node_type = _text_or_na(data.get('entity_type', 'Unknown'), fallback='Unknown')
    node_color = color_scheme.get(node_type, '#95a5a6')

    # Calculate node size based on degree
    degree = G.degree(node)
    node_size = 15 + (degree * 3)  # Scale size by connections

    # Create hover information
    in_degree = G.in_degree(node)
    out_degree = G.out_degree(node)
    node_doi = _text_or_na(data.get('doi', 'N/A'))
    hover_info = f"""
    <b>{node}</b><br>
    Type: {node_type}<br>
    Total Connections: {degree}<br>
    Incoming: {in_degree}<br>
    Outgoing: {out_degree}<br>
    DOI: {node_doi}
    """

    net.add_node(
        node,
        label=node,
        title=hover_info.strip(),
        color=node_color,
        size=node_size,
        borderWidth=2,
        borderWidthSelected=4,
        font={'size': 12, 'face': 'arial', 'color': 'black'}
    )

# Add edges with styling
print("Adding edges to interactive visualization...")
for source, target, data in G.edges(data=True):
    relation = _text_or_na(data.get('relation', 'unknown'), fallback='unknown')
    edge_color = relation_colors.get(relation, '#95a5a6')

    # A missing confidence falls back to the same default used when the
    # attribute is absent; float() keeps the width a plain Python number.
    confidence = data.get('confidence', 0.5)
    if pd.isna(confidence):
        confidence = 0.5
    confidence = float(confidence)

    # Edge width based on confidence
    edge_width = 1 + (confidence * 3)

    # Create hover information (sentence is normalised to text first so that
    # a missing value cannot break the length check below)
    sentence = _text_or_na(data.get('sentence', 'N/A'))
    if len(sentence) > 200:
        sentence = sentence[:197] + "..."

    edge_doi = _text_or_na(data.get('doi', 'N/A'))
    hover_info = f"""
    <b>Relation: {relation}</b><br>
    Confidence: {confidence:.2f}<br>
    Context: {sentence}<br>
    DOI: {edge_doi}
    """

    net.add_edge(
        source,
        target,
        label=relation,
        title=hover_info.strip(),
        color=edge_color,
        width=edge_width,
        arrows='to',
        arrowStrikethrough=False
    )

# Enable physics controls
net.show_buttons(filter_=['physics'])

# Save the interactive visualization
output_file = "knowledge_graph_interactive.html"
net.write_html(output_file, open_browser=True)
print(f"\n Interactive graph saved to: {output_file}")

Adding nodes to interactive visualization...
Adding edges to interactive visualization...

 Interactive graph saved to: knowledge_graph_interactive.html
