## Graph Analytics with ArangoDB (Python-Arango Library) (Part 4)

### Install Required Libraries

In [1]:
!pip3 install --upgrade pip

In [2]:
%pip install python-arango
%pip install networkx
%pip install numpy
%pip install scipy
%pip install tabulate
%pip install geopandas

### Library Imports

In [3]:
import sys, csv
from statistics import mode, StatisticsError
from typing import Dict

from arango import ArangoClient
import networkx as nx
import pkg_resources

import numpy as np
import scipy
from tabulate import tabulate
import geopandas as gpd

### Library Versions

In [4]:
l = 15
r = 14

arango_version = pkg_resources.get_distribution("python-arango").version

print("Software & Library Versions".center(l+r))
print('-'* (l + 1) + '|' + '-' * (r - 2))
print('Python'.rjust(l), '|', sys.version[0:6])
print('Arango Client'.rjust(l), '|', arango_version)
print('NetworkX'.rjust(l), '|', nx.__version__)
print('NumPy'.rjust(l), '|', np.__version__)
print('SciPy'.rjust(l), '|', scipy.__version__)

 Software & Library Versions 
----------------|------------
         Python | 3.11.4
  Arango Client | 8.1.2
       NetworkX | 3.4.2
          NumPy | 2.1.3
          SciPy | 1.14.1


### Import Dataset

In [5]:
# Connect to ArangoDB
client = ArangoClient()
db = client.db('_system', username='root', password='testpassword')

# Access collections
nodes = db.collection('airports')
edges = db.collection('flights')

# Fetch graph data
graph_data = {
    'nodes': list(nodes.all()),
    'edges': list(edges.all())
}

flightGraph = nx.MultiDiGraph()

# Add nodes
for node in graph_data['nodes']:
    flightGraph.add_node(node['_key'], **node)

# Add edges
for edge in graph_data['edges']:
    flightGraph.add_edge(
        edge['_from'].split('/')[-1],
        edge['_to'].split('/')[-1],
        **edge
    )

### Some Introductory Commands

#### Show Number of Airports

In [6]:
query = """
FOR airport IN airports
    COLLECT WITH COUNT INTO numAirports
RETURN { NumberOfAirports: numAirports }
"""
result = db.aql.execute(query)
print(list(result))

[{'NumberOfAirports': 365}]


#### Show Number of Flights

In [7]:
query = """
FOR flight IN flights
    COLLECT WITH COUNT INTO numFlights
RETURN { NumberOfFlights: numFlights }
"""
result = db.aql.execute(query)

print(list(result))

[{'NumberOfFlights': 992298}]


#### Add Airline Names for Each Airline Code

In [8]:
# Load CSV file
with open('../import/airlines.csv', 'r') as file:
    reader = csv.DictReader(file)
    for row in reader:
        query = """
        FOR flight IN flights
            FILTER flight.airline_id == @airline_id
            UPDATE flight WITH {
                airline_name: @airline_name,
                airline_code: @airline_code
            } IN flights
        """
        db.aql.execute(query, bind_vars={
            'airline_id': row['airline_id'],
            'airline_name': row['airline_name'],
            'airline_code': row['airline_code']
        })

#### Add Airport Name, City, & State

In [9]:
with open('../import/airports.csv', 'r') as file:
    reader = csv.DictReader(file)
    for row in reader:
        query = """
        FOR airport IN airports
            FILTER airport.unique_id == @unique_id
            UPDATE airport WITH {
                airport_code: @airport_code,
                airport_name: @airport_name,
                airport_city: @city_name,
                airportState_code: @state,
                airportState_name: @state_name
            } IN airports
        """
        db.aql.execute(query, bind_vars={
            'unique_id': row['unique_id'],
            'airport_code': row['airport_code'],
            'airport_name': row['airport_name'],
            'city_name': row['city_name'],
            'state': row['state'],
            'state_name': row['state_name']
        })

#### Retrieve Number of Distinct States

In [10]:
query = """
FOR flight IN flights
    COLLECT state_name = flight.state_name
    WITH COUNT INTO distinct_state_names
RETURN distinct_state_names
"""
result = db.aql.execute(query)

print(list(result))

[992298]


### Utility Functions

### Functions to Create Maps Between Ids & Attribute Values

In [11]:
def create_airport_code_dict(G):
    """
    Extracts a dictionary mapping node IDs to their respective airport_code attributes from a NetworkX graph.

    Args:
        G (networkx.Graph): A NetworkX graph with nodes containing an 'airport_code' attribute.

    Returns:
        dict: A dictionary with node IDs as keys and airport_code as values.
    """
    airport_code_dict = {}

    for node, attributes in G.nodes(data=True):
        airport_code = attributes.get('airport_code')
        if airport_code:
            airport_code_dict[node] = airport_code

    return airport_code_dict

### --------------------------------------------------------------------
# Example output:

# {
#     1: "JFK",
#     2: "LAX"
# }
### --------------------------------------------------------------------

id_to_airport_code_mapping_dict = create_airport_code_dict(flightGraph)

print(id_to_airport_code_mapping_dict)
print(len(id_to_airport_code_mapping_dict))

{'189': 'FWA', '191': 'BZN', '193': 'RDM', '195': 'BKG', '197': 'MQT', '199': 'JAC', '201': 'MSP', '203': 'HIB', '205': 'UCA', '207': 'AUS', '209': 'HDN', '211': 'MAZ', '213': 'ANI', '215': 'CDC', '217': 'AMA', '219': 'PSC', '221': 'PVD', '223': 'AGS', '225': 'AVL', '227': 'CVG', '229': 'PLN', '231': 'PBI', '233': 'LMT', '235': 'BTM', '237': 'CPR', '239': 'ITH', '241': 'UTM', '243': 'CRW', '245': 'YUM', '247': 'DHN', '249': 'ATL', '251': 'MKG', '253': 'JAN', '255': 'LCH', '257': 'DIK', '259': 'LNY', '261': 'EWN', '263': 'ROW', '265': 'ITO', '267': 'ELM', '269': 'MWH', '271': 'HOB', '273': 'MCO', '275': 'EGE', '277': 'SLC', '279': 'CHO', '281': 'ECP', '283': 'ORD', '285': 'ISN', '287': 'SAN', '289': 'PDX', '291': 'GSO', '293': 'PIH', '295': 'MCN', '297': 'BNA', '299': 'SEA', '301': 'SAT', '303': 'SMX', '305': 'FSD', '307': 'BRD', '309': 'HOU', '311': 'EKO', '313': 'ERI', '315': 'GRR', '317': 'BFL', '319': 'SPN', '321': 'TUL', '323': 'INL', '325': 'DUT', '327': 'ATW', '329': 'HRL', '331'

### Apply Dictionary to Keys in Dictionary Results

In [12]:
def convert_output_keys_to_airport_codes(
    nx_output_dict, 
    node_to_airport_code_dict=id_to_airport_code_mapping_dict
    ):
    
    """
    Converts the keys of a NetworkX output dictionary from node IDs to airport codes.

    Args:
        nx_output_dict (dict): A dictionary output from NetworkX with node IDs as keys.
        node_to_airport_code_dict (dict): A dictionary mapping node IDs to airport codes.

    Returns:
        dict: A dictionary with airport codes as keys and the original values preserved.
    """
    converted_dict = {}

    for node_id, value in nx_output_dict.items():
        airport_code = node_to_airport_code_dict.get(node_id, f"({node_id})")
        converted_dict[airport_code] = value

    return converted_dict

# Example Use:
# converted_dict = convert_output_keys_to_airport_codes(nx_output_dict, id_to_airport_code_mapping_dict)

# Example Output:
# {
#     "JFK": 3,
#     "LAX": 5,
#     "ORD": 2
# }

#### Create Function to Return Statisics About Path Lengths (Dictionary Input)

In [13]:
def analysis_all_flights_all_orig_all_dest(
    cleaned_data: Dict[str, Dict[str, int]],
    node_to_airport_code_dict: Dict[int, str]
):
    """
    Analyze the values in a dictionary of path lengths.

    Parameters:
        cleaned_data (dict): A dictionary of dictionaries. The keys in the outer 
            dictionaries are the origin airport codes. The inner dictionary is 
            the destination airports and their respective path lengths.
        node_to_airport_code_dict (dict): A dictionary mapping node IDs to airport codes.

    Returns:
        None
    """
    metrics = {
        "average_min_length": None,
        "average_max_length": None,
        "max_length": None,
        "mean_per_key": {},
        "average_of_means": None,
        "std_dev_per_key": {},
        "average_std_dev": None,
        "mode_per_key": {},
        "average_mode": None,
    }

    min_lengths = []
    max_lengths = []
    means = []
    std_devs = []
    modes = []

    for outer_key, inner_dict in cleaned_data.items():
        values = list(inner_dict.values())
        if not values:
            continue

        min_lengths.append(min(values))
        max_lengths.append(max(values))
        means.append(np.mean(values))
        std_devs.append(np.std(values))

        try:
            mode_value = mode(values)
        except StatisticsError:
            mode_value = None
        if mode_value is not None:
            modes.append(mode_value)
            metrics["mode_per_key"][outer_key] = mode_value

        metrics["mean_per_key"][outer_key] = np.mean(values)
        metrics["std_dev_per_key"][outer_key] = np.std(values)

    metrics["average_min_length"] = np.mean(min_lengths)
    metrics["average_max_length"] = np.mean(max_lengths)
    metrics["max_length"] = max(max_lengths)
    metrics["average_of_means"] = np.mean(means)
    metrics["average_std_dev"] = np.mean(std_devs)
    metrics["average_mode"] = np.mean(modes) if modes else None

    # Convert dictionary keys to airport codes
    metrics["mean_per_key"] = convert_output_keys_to_airport_codes(
        metrics["mean_per_key"], node_to_airport_code_dict
    )
    metrics["std_dev_per_key"] = convert_output_keys_to_airport_codes(
        metrics["std_dev_per_key"], node_to_airport_code_dict
    )
    metrics["mode_per_key"] = convert_output_keys_to_airport_codes(
        metrics["mode_per_key"], node_to_airport_code_dict
    )

    # Print the results
    for metric, value in metrics.items():
        print(f"{metric}: {value}")


# The above function requires that the data input has the following applied to it:
# cleaned_data = {
#     outer_key: {inner_key: value for inner_key, value in inner_dict.items() if inner_key != outer_key}
#     for outer_key, inner_dict in unw_dijk_all_path_len.items()
# }

#### Return Both the number of States Inlcuded in Dataset & Their Names

In [14]:
# Return the names of all of the states

query = """
LET uniqueStates = UNIQUE(
    FOR airport IN airports
        SORT airport.airportState_name
        RETURN airport.airportState_name
)
RETURN {
    uniqueStates: uniqueStates,
    count: LENGTH(uniqueStates)
}
"""
result = db.aql.execute(query)
output = list(result)[0]
print(f"Number of unique states: {output['count']}")
print("States in alphabetical order:")
for state in output['uniqueStates']:
    print(state)


Number of unique states: 52
States in alphabetical order:

Alabama
Alaska
Arizona
Arkansas
California
Colorado
Connecticut
Delaware
Florida
Georgia
Hawaii
Idaho
Illinois
Indiana
Iowa
Kentucky
Louisiana
Maine
Maryland
Massachusetts
Michigan
Minnesota
Mississippi
Missouri
Montana
Nebraska
Nevada
New Hampshire
New Jersey
New Mexico
New York
North Carolina
North Dakota
Ohio
Oregon
Pennsylvania
Puerto Rico
Rhode Island
South Carolina
South Dakota
Tennessee
Texas
U.S. Pacific Trust Territories and Possessions
U.S. Virgin Islands
Utah
Vermont
Virginia
Washington
West Virginia
Wisconsin
Wyoming


#### Results for Source Target Dijkstra's Unweighted Shortest Path Algorithm

In [15]:
# Unweighted dijkstra
weighted_field = None
method = "dijkstra"

unw_dijk_all_path_len = dict(nx.shortest_path_length(flightGraph, weight=weighted_field, method=method))

# print(unw_dijk_all_path_len)
unw_dijk_all_path_len_prepped = {
    outer_key: {inner_key: value for inner_key, value in inner_dict.items() if inner_key != outer_key}
    for outer_key, inner_dict in unw_dijk_all_path_len.items()
}

print(f"Analysis for Unweighted {method.title()} Shortest Path")
analysis_all_flights_all_orig_all_dest(unw_dijk_all_path_len_prepped, id_to_airport_code_mapping_dict)

Analysis for Unweighted Dijkstra Shortest Path
average_min_length: 1.0
average_max_length: 3.98
max_length: 5
mean_per_key: {'FWA': np.float64(2.2464183381088825), 'BZN': np.float64(2.128939828080229), 'RDM': np.float64(2.504297994269341), 'BKG': np.float64(2.340974212034384), 'MQT': np.float64(2.495702005730659), 'JAC': np.float64(2.1174785100286533), 'MSP': np.float64(1.6361031518624642), 'HIB': np.float64(2.6303724928366763), 'AUS': np.float64(1.9312320916905443), 'HDN': np.float64(2.1002865329512894), 'MAZ': np.float64(3.0888252148997135), 'ANI': np.float64(3.9312320916905446), 'CDC': np.float64(2.7392550143266474), 'AMA': np.float64(2.3810888252148996), 'PSC': np.float64(2.4011461318051577), 'PVD': np.float64(2.0773638968481376), 'AGS': np.float64(2.372492836676218), 'AVL': np.float64(2.2091690544412605), 'CVG': np.float64(1.6962750716332378), 'PLN': np.float64(2.6303724928366763), 'PBI': np.float64(2.0744985673352434), 'LMT': np.float64(2.7650429799426934), 'BTM': np.float64(2.74

#### Results for Source Target Bellman-Ford's Unweighted Shortest Path Algorithm

In [16]:
# Unweighted bellman-ford
weighted_field = None
method = "bellman-ford"

unw_bf_all_path_len = dict(nx.shortest_path_length(flightGraph, weight=weighted_field, method=method))

unw_bf_all_path_len_prepped = {
    outer_key: {inner_key: value for inner_key, value in inner_dict.items() if inner_key != outer_key}
    for outer_key, inner_dict in unw_bf_all_path_len.items()
}

print(f"Analysis for Unweighted {method.title()} Shortest Path")
analysis_all_flights_all_orig_all_dest(unw_bf_all_path_len_prepped, id_to_airport_code_mapping_dict)

Analysis for Unweighted Bellman-Ford Shortest Path
average_min_length: 1.0
average_max_length: 3.98
max_length: 5
mean_per_key: {'FWA': np.float64(2.2464183381088825), 'BZN': np.float64(2.128939828080229), 'RDM': np.float64(2.504297994269341), 'BKG': np.float64(2.340974212034384), 'MQT': np.float64(2.495702005730659), 'JAC': np.float64(2.1174785100286533), 'MSP': np.float64(1.6361031518624642), 'HIB': np.float64(2.6303724928366763), 'AUS': np.float64(1.9312320916905443), 'HDN': np.float64(2.1002865329512894), 'MAZ': np.float64(3.0888252148997135), 'ANI': np.float64(3.9312320916905446), 'CDC': np.float64(2.7392550143266474), 'AMA': np.float64(2.3810888252148996), 'PSC': np.float64(2.4011461318051577), 'PVD': np.float64(2.0773638968481376), 'AGS': np.float64(2.372492836676218), 'AVL': np.float64(2.2091690544412605), 'CVG': np.float64(1.6962750716332378), 'PLN': np.float64(2.6303724928366763), 'PBI': np.float64(2.0744985673352434), 'LMT': np.float64(2.7650429799426934), 'BTM': np.float64(

### Convert edge property (flight_distance) from string to float

In [17]:
# Convert edge property (flight_distance) from string to float

for u, v, data in flightGraph.edges(data=True):
    if 'flight_distance' in data:
        try:
            data['flight_distance'] = float(data['flight_distance'])
        except ValueError:
            print(f"Invalid flight_distance value on edge ({u}, {v}): {data['flight_distance']}")

#### Results for Source Target Dijkstra's Weighted Shortest Path Algorithm

In [18]:
# Weighted dijkstra
weighted_field = 'flight_distance'
method = "dijkstra"

w_dijk_all_path_len = dict(
    nx.shortest_path_length(
        flightGraph, 
        weight=weighted_field, 
        method=method
        )
    )

w_dijk_all_path_len_prepped = {
    outer_key: {inner_key: value for inner_key, value in inner_dict.items() if inner_key != outer_key}
    for outer_key, inner_dict in w_dijk_all_path_len.items()
}

w_dijk_all_path_len_prepped

# Print Results
print(f"Analysis for Unweighted {method.title()} Shortest Path")
analysis_all_flights_all_orig_all_dest(w_dijk_all_path_len_prepped, id_to_airport_code_mapping_dict)

Analysis for Unweighted Dijkstra Shortest Path
average_min_length: 188.22285714285715
average_max_length: 7787.705714285714
max_length: 9935.0
mean_per_key: {'FWA': np.float64(1229.7392550143265), 'BZN': np.float64(1487.3295128939828), 'RDM': np.float64(1776.1919770773638), 'BKG': np.float64(1247.7048710601719), 'MQT': np.float64(1369.4555873925501), 'JAC': np.float64(1453.888252148997), 'MSP': np.float64(1210.054441260745), 'HIB': np.float64(1382.432664756447), 'AUS': np.float64(1329.5845272206304), 'HDN': np.float64(1315.3724928366762), 'MAZ': np.float64(2697.8997134670485), 'ANI': np.float64(3466.223495702006), 'CDC': np.float64(1584.6332378223497), 'AMA': np.float64(1308.6389684813753), 'PSC': np.float64(1714.0601719197707), 'PVD': np.float64(1686.9856733524355), 'AGS': np.float64(1380.5100286532952), 'AVL': np.float64(1330.2521489971348), 'CVG': np.float64(1215.5558739255014), 'PLN': np.float64(1432.0315186246419), 'PBI': np.float64(1678.9942693409741), 'LMT': np.float64(1967.9283

#### Results for Source Target Bellman-Ford's Weighted Shortest Path Algorithm

In [19]:
# Weighted bellman-ford
weighted_field = "flight_distance"
method = "bellman-ford"

w_bf_all_path_len = dict(nx.shortest_path_length(flightGraph, source=None, target=None, weight=weighted_field, method=method))

w_bf_all_path_len_prepped = {
    outer_key: {inner_key: value for inner_key, value in inner_dict.items() if inner_key != outer_key}
    for outer_key, inner_dict in w_bf_all_path_len.items()
}

# Print Results
print(f"Analysis for Unweighted {method.title()} Shortest Path")
analysis_all_flights_all_orig_all_dest(w_bf_all_path_len_prepped, id_to_airport_code_mapping_dict)

Analysis for Unweighted Bellman-Ford Shortest Path
average_min_length: 188.22285714285715
average_max_length: 7787.705714285714
max_length: 9935.0
mean_per_key: {'FWA': np.float64(1229.7392550143265), 'BZN': np.float64(1487.3295128939828), 'RDM': np.float64(1776.1919770773638), 'BKG': np.float64(1247.7048710601719), 'MQT': np.float64(1369.4555873925501), 'JAC': np.float64(1453.888252148997), 'MSP': np.float64(1210.054441260745), 'HIB': np.float64(1382.432664756447), 'AUS': np.float64(1329.5845272206304), 'HDN': np.float64(1315.3724928366762), 'MAZ': np.float64(2697.8997134670485), 'ANI': np.float64(3466.223495702006), 'CDC': np.float64(1584.6332378223497), 'AMA': np.float64(1308.6389684813753), 'PSC': np.float64(1714.0601719197707), 'PVD': np.float64(1686.9856733524355), 'AGS': np.float64(1380.5100286532952), 'AVL': np.float64(1330.2521489971348), 'CVG': np.float64(1215.5558739255014), 'PLN': np.float64(1432.0315186246419), 'PBI': np.float64(1678.9942693409741), 'LMT': np.float64(1967.