# Graph Explorer - Parquet Visualization Example

This notebook demonstrates how to use the Graph Explorer Python helper
to visualize relationship data from Parquet files.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import os

# Import graph explorer helpers
from graph_explorer import process_parquet_for_graph, visualize_graph

## 1. Create Sample Parquet Data

For this example, we'll create a synthetic dataset of relationships between entities.

In [None]:
# Build a synthetic relationship dataset.
# With real data you would skip this cell and read your own Parquet file instead.

# People that can appear on either end of a relationship
entities = [
    "Alice", "Bob", "Charlie", "Diana", "Evan", "Fiona", "George",
    "Hannah", "Ian", "Julia", "Kevin", "Laura", "Michael", "Natalie"
]

# Labels a relationship (edge) can carry
rel_types = ["friend", "colleague", "family", "classmate", "neighbor", "partner"]

# Seed the global NumPy RNG so every run yields the same rows
np.random.seed(42)

rows = []
while len(rows) < 100:  # 100 random relationships in total
    # NOTE: every draw advances the shared RNG stream, so the order of the
    # five draws below must not change or the generated data will differ.

    # Two distinct endpoints (replace=False rules out self-relationships)
    source, target = np.random.choice(entities, size=2, replace=False)

    # Edge label
    kind = np.random.choice(rel_types)

    # Strength score in [0, 1), rounded to two decimals
    score = round(np.random.random(), 2)

    # Frequency in 1..49 (upper bound of randint is exclusive)
    count = np.random.randint(1, 50)

    # Roughly 80% of relationships are flagged active
    is_active = np.random.choice([True, False], p=[0.8, 0.2])

    record = dict(
        entity1=source,
        entity2=target,
        relationship=kind,
        strength=score,
        frequency=count,
        active=is_active,
    )
    rows.append(record)

# One DataFrame built from all records at once
df = pd.DataFrame(rows)

# Preview the first few rows
df.head()

In [None]:
# Save as Parquet file
# The path is relative, so the file is written to the current working directory.
# `parquet_path` is reused by the loading cell in the next section.
parquet_path = "sample_relationships.parquet"
df.to_parquet(parquet_path)

print(f"Saved data to {parquet_path}")

## 2. Load and Analyze the Parquet Data

In a real-world scenario, you'd load your existing Parquet file and perform some analysis.

In [None]:
# Read the relationships back from disk
df = pd.read_parquet(parquet_path)

# Quick profile of the frame: size and schema first
print(f"Dataset has {len(df)} rows and {len(df.columns)} columns")
print(f"Columns: {', '.join(df.columns)}")

# How often each relationship label occurs
relationship_counts = df["relationship"].value_counts()
print(f"\nRelationship types:")
print(relationship_counts)

# An entity can sit on either end of an edge, so stack both endpoint
# columns before counting; keep only the ten most frequent entities
all_endpoints = pd.concat([df["entity1"], df["entity2"]])
entities_count = all_endpoints.value_counts().head(10)
print(f"\nEntity counts:")
print(entities_count)

## 3. Filter a Subset for Visualization

Let's extract a subset of the data that we want to visualize.

In [None]:
# Example 1: Extract and visualize all active relationships
# The keyword arguments name the columns holding the two edge endpoints and
# the edge label; `filters` is passed to the helper unchanged.
# NOTE(review): presumably {"active": True} keeps only rows whose `active`
# column equals True — confirm against process_parquet_for_graph.
active_df = process_parquet_for_graph(
    df,
    source_col="entity1",
    target_col="entity2",
    edge_type_col="relationship",
    filters={"active": True}
)

print(f"Processed {len(active_df)} active relationships for visualization")
active_df.head()

In [None]:
# Example 2: Extract only strong relationships (strength > 0.7)
# Two filters are supplied: a plain value for `active` and an
# operator/value mapping for `strength`.
# NOTE(review): presumably the filters are combined with AND, so only rows
# that are both active and stronger than 0.7 remain — confirm against
# process_parquet_for_graph.
strong_df = process_parquet_for_graph(
    df,
    source_col="entity1",
    target_col="entity2",
    edge_type_col="relationship",
    filters={
        "active": True,
        "strength": {"operator": ">", "value": 0.7}
    }
)

print(f"Processed {len(strong_df)} strong relationships for visualization")
strong_df.head()

In [None]:
# Example 3: Extract relationships involving a specific entity
# Alice may appear on either end of an edge, so test both endpoint columns
alice_is_source = df["entity1"] == "Alice"
alice_is_target = df["entity2"] == "Alice"
alice_df = df[alice_is_source | alice_is_target]

# No `filters` here: the subset was already selected with pandas above
alice_graph_df = process_parquet_for_graph(
    alice_df,
    source_col="entity1",
    target_col="entity2",
    edge_type_col="relationship"
)

print(f"Found {len(alice_graph_df)} relationships involving Alice")
alice_graph_df

## 4. Visualize the Data

Now let's visualize these different subsets in the Graph Explorer application.

In [None]:
# Make sure the Graph Explorer app is running at http://localhost:3000
# before executing this cell

# Visualize all active relationships (the frame built in Example 1)
# NOTE(review): what method="url" does is defined by visualize_graph, which
# is not shown here — confirm against the helper.
visualize_graph(active_df, method="url")

In [None]:
# Visualize only strong relationships (the frame built in Example 2)
# Like the previous cell, this expects the Graph Explorer app to be running.
visualize_graph(strong_df, method="url")

In [None]:
# Visualize Alice's relationship network (the frame built in Example 3)
# Like the previous cells, this expects the Graph Explorer app to be running.
visualize_graph(alice_graph_df, method="url")

## 5. Advanced Example: Two-hop Neighborhood

Let's extract a more complex subgraph: every relationship that touches a given entity or one of its direct neighbors (the entity's two-hop neighborhood).

In [None]:
def get_two_hop_network(df, entity, source_col="entity1", target_col="entity2"):
    """Return every relationship within two hops of ``entity``.

    A row is selected when it touches ``entity`` directly (first hop) or
    touches any entity that is directly connected to ``entity`` (second hop).
    Edges are treated as undirected: an entity may appear in either column.

    Rows are selected with a single boolean mask, so each row of ``df``
    appears at most once, in its original order and with its original index.
    Distinct rows that happen to hold identical values (parallel edges) are
    all kept; they are not collapsed by value-based de-duplication.

    Args:
        df: Edge list as a DataFrame, one relationship per row.
        entity: The entity whose neighborhood is extracted.
        source_col: Name of the column holding one endpoint of each edge.
        target_col: Name of the column holding the other endpoint.

    Returns:
        A new DataFrame with the selected rows of ``df``; empty when
        ``entity`` does not occur in either endpoint column.

    Raises:
        KeyError: If ``source_col`` or ``target_col`` is not a column of ``df``.
    """
    is_source = df[source_col] == entity
    is_target = df[target_col] == entity

    # First hop: the entity sits on one end, the neighbor on the other
    neighbors = set(df.loc[is_source, target_col].tolist())
    neighbors.update(df.loc[is_target, source_col].tolist())

    # Second hop: any edge that touches a direct neighbor. Edges that also
    # touch `entity` are already covered by the first-hop masks, so OR-ing
    # everything together selects each row exactly once.
    touches_neighbor = df[source_col].isin(neighbors) | df[target_col].isin(neighbors)

    return df[is_source | is_target | touches_neighbor]

# Get Kevin's two-hop network
# (default column names "entity1"/"entity2" match this dataset)
kevin_network = get_two_hop_network(df, "Kevin")
print(f"Kevin's network has {len(kevin_network)} relationships")

# Process for visualization
# No `filters` argument: the subset was already selected above
kevin_graph = process_parquet_for_graph(
    kevin_network,
    source_col="entity1",
    target_col="entity2",
    edge_type_col="relationship"
)

# Visualize
# As with the earlier visualization cells, the Graph Explorer app is
# expected to be running at http://localhost:3000
visualize_graph(kevin_graph, method="url")

## 6. Real-world Application: Working with Large Parquet Files

In a real-world scenario with very large Parquet files, you'd typically:

1. Load the Parquet file with filters to reduce memory usage
2. Perform exploratory analysis to identify interesting subgraphs
3. Extract the relevant subset for visualization

The code below shows a sketch of this workflow.

In [None]:
# Pseudocode for working with large parquet files
# (This is not meant to be executed)
# The sketch is kept inside a string literal so running the cell does no work.

'''
# 1. Load only the necessary columns and with filters
import pyarrow.parquet as pq

# Define columns to read
columns = ["source_id", "target_id", "relationship_type", "weight"]

# Read with predicate pushdown (filtering at file read time)
# This is much more efficient than loading the entire file.
# pq.read_table accepts `filters`; pq.ParquetFile.read() does not.
df = pq.read_table(
    "very_large_dataset.parquet",
    columns=columns,
    filters=[("date", "=", "2023-01-01")],
).to_pandas()

# 2. Find an interesting subset through analysis
# For example, finding the most frequently occurring nodes:
from collections import Counter
node_counts = Counter(df["source_id"].tolist() + df["target_id"].tolist())
top_nodes = [node for node, count in node_counts.most_common(10)]

# 3. Extract the subgraph for one of these top nodes (with some maximum size limit)
# get_k_hop_neighborhood is a hypothetical helper you would write yourself
central_node = top_nodes[0]
k_hop_neighborhood = get_k_hop_neighborhood(df, central_node, k=2, max_edges=1000)

# 4. Process and visualize
graph_df = process_parquet_for_graph(
    k_hop_neighborhood,
    source_col="source_id",
    target_col="target_id",
    edge_type_col="relationship_type",
    filters={"weight": {"operator": ">", "value": 0.5}},
    max_records=500
)

visualize_graph(graph_df)
'''

## Conclusion

This notebook demonstrated how to:

1. Work with Parquet data in Python
2. Extract meaningful relationship subsets
3. Process the data for network visualization
4. Send the data directly to the Graph Explorer web application

With these tools, you can easily explore network relationships in large datasets.