This notebook calculates the geographic distances between stations that are located 800 meters or less from one another. It produces per-station summary statistics and then extends them to include nearby stations of different modes in the same system, nearby stations in different systems, and the number of adjacent systems.

## **Calculate Distances Between Stations within 800 Meters from Other Stations**

In [7]:
import pandas as pd
import re
import numpy as np
from sklearn.neighbors import BallTree
from geopy.distance import geodesic

# Load the station CSV into a DataFrame.
# The path below is where the file sits in this Colab session; replace it with your own file path.
station_csv_path = '/content/Final__NTD_Station_Data_REDUCEDv2 (8).csv'
data_new = pd.read_csv(station_csv_path)

# Function to convert coordinate strings to numpy array of floats
def parse_coordinates_np(coord_str):
    """Parse a coordinate string such as "(40.7, -74.0)" into two floats.

    Args:
        coord_str: Text holding two comma-separated numbers wrapped in
            parentheses. Whitespace around either number is tolerated, so
            "(1.0,2.0)" and "(1.0, 2.0)" parse the same way.

    Returns:
        A two-element list of floats in the order they appear in the string,
        or ``[np.nan, np.nan]`` when the value is not a string (for example a
        missing cell, which pandas reads as a float NaN), does not match the
        parenthesised pattern, or contains non-numeric text.
    """
    # Missing cells arrive as float NaN / None; re.match would raise TypeError on them.
    if not isinstance(coord_str, str):
        return [np.nan, np.nan]

    # float() ignores surrounding whitespace, so the pattern only needs to
    # split on the comma rather than require an exact ", " separator.
    match = re.match(r"\s*\((.*),(.*)\)", coord_str)
    if not match:
        return [np.nan, np.nan]

    try:
        return [float(match.group(1)), float(match.group(2))]
    except ValueError:
        # Pattern matched but the pieces are not numbers (e.g. "(abc, def)").
        return [np.nan, np.nan]

# Parse every coordinate string into an (n, 2) float array; rows that cannot be
# parsed come back from parse_coordinates_np as [nan, nan].
coordinates = np.array(data_new['coordinates'].apply(parse_coordinates_np).tolist())

# Rows without two finite values cannot be indexed spatially (recent
# scikit-learn versions reject NaN input outright), so only usable rows go into
# the tree. valid_positions maps tree positions back to row positions in data_new.
valid_positions = np.flatnonzero(np.isfinite(coordinates).all(axis=1))
valid_coordinates_rad = np.deg2rad(coordinates[valid_positions])

# Creating a BallTree for efficient spatial queries (haversine works in radians)
tree = BallTree(valid_coordinates_rad, metric='haversine')

# Query radius: 800 meters, converted to an angle on a sphere of Earth's radius
EARTH_RADIUS = 6371000  # Earth's radius in meters
radius = 800 / EARTH_RADIUS

# Performing radius query: one array of neighbour positions per usable row
indices = tree.query_radius(valid_coordinates_rad, r=radius)

# Additional columns to retrieve for each facility
additional_columns = [
    'NTD ID', 'Agency Name', 'Primary Mode Served', 'Facility Type',
    'Facility Name', 'City', 'full_address', 'Category', 'Definition', 'ZIP Code'
]

# Pull the per-facility attributes out of the DataFrame once, instead of
# materialising a row with .iloc for every pair inside the loop.
facility_ids = data_new['Facility ID'].tolist()
facility_info = data_new[additional_columns].to_dict('records')

# Fixed output column order, so the CSV keeps its header even when no pairs are found
result_columns = ['Facility ID 1', 'Facility ID 2', 'Distance']
for col in additional_columns:
    result_columns += [f'{col} 1', f'{col} 2']

# Processing results to find facilities within 800 meters of each other.
# Each pair is emitted in both directions (A -> B and B -> A).
# The neighbour search uses the spherical haversine radius, while the reported
# distance is geopy's geodesic, so a value may sit marginally on either side of 800 m.
nearby_facilities_optimized = []
for position_1, neighbors in zip(valid_positions, indices):
    info_1 = facility_info[position_1]
    for position_2 in valid_positions[neighbors]:
        if position_1 == position_2:
            continue  # a facility is always returned as its own neighbour
        info_2 = facility_info[position_2]
        record = {
            'Facility ID 1': facility_ids[position_1],
            'Facility ID 2': facility_ids[position_2],
            'Distance': geodesic(coordinates[position_1], coordinates[position_2]).meters
        }
        for col in additional_columns:
            record[f'{col} 1'] = info_1[col]
            record[f'{col} 2'] = info_2[col]
        nearby_facilities_optimized.append(record)

# Creating a DataFrame from the results
results_df = pd.DataFrame(nearby_facilities_optimized, columns=result_columns)

# Exporting the results to a CSV file
results_df.to_csv('/content/nearby_facilities_optimized.csv', index=False)

# Optionally, display a message
print("Results exported to nearby_facilities_optimized.csv")


Results exported to nearby_facilities_optimized.csv


In [None]:
from google.colab import files
# Download from the same absolute path the export step wrote to, so this cell
# does not depend on the current working directory being /content.
files.download('/content/nearby_facilities_optimized.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## **Aggregate Station Data into Summary Statistics**

In [8]:
import pandas as pd
import numpy as np
from geopy.distance import geodesic
from sklearn.neighbors import BallTree

# Assuming 'data_new' DataFrame is already loaded and contains the required data

# Convert coordinate strings to numpy array of floats
def parse_coordinates_np(coord_str):
    """Parse a coordinate string such as "(40.7, -74.0)" into ``[lat, lon]``.

    Args:
        coord_str: Text holding exactly two comma-separated numbers,
            optionally wrapped in parentheses.

    Returns:
        A two-element list of floats in the order they appear in the string,
        or ``[np.nan, np.nan]`` when the value cannot be parsed: non-numeric
        text, the wrong number of comma-separated parts, or a non-string value
        such as the float NaN pandas uses for a missing cell.
    """
    try:
        lat, lon = map(float, coord_str.strip().strip("()").split(","))
    except (ValueError, AttributeError):
        # ValueError: bad number or wrong part count.
        # AttributeError: coord_str is not a string (e.g. NaN / None).
        return [np.nan, np.nan]
    return [lat, lon]

# Turn each coordinate string into a [lat, lon] pair and stack them into one array
coordinates = np.array([parse_coordinates_np(raw_value) for raw_value in data_new['coordinates']])

# 800 meters expressed as an angle (radians) on a sphere with Earth's radius in meters
EARTH_RADIUS = 6371000
radius = 800 / EARTH_RADIUS

# The haversine BallTree works in radians, so convert once and reuse the result
coordinates_rad = np.deg2rad(coordinates)
tree = BallTree(coordinates_rad, metric='haversine')

# For every station, collect the positions of all stations inside the radius
# (the station itself is part of its own result)
indices = tree.query_radius(coordinates_rad, r=radius)

# Function to calculate average distance
def calculate_average_distance(facility_index, neighbors_indices):
    """Mean geodesic distance in meters from one facility to its neighbours.

    Args:
        facility_index: Row position of the facility in the module-level
            ``coordinates`` array.
        neighbors_indices: Row positions of nearby facilities; an entry equal
            to ``facility_index`` is skipped.

    Returns:
        The mean of the geodesic distances to the remaining neighbours, or 0
        when no other neighbour is left after skipping the facility itself.
    """
    other_indices = [j for j in neighbors_indices if facility_index != j]
    if not other_indices:
        return 0

    origin = coordinates[facility_index]
    distances_m = [geodesic(origin, coordinates[j]).meters for j in other_indices]
    return np.mean(distances_m)

# One (facility id, neighbour count, mean neighbour distance) tuple per station.
# The radius query returns the station itself, hence the "- 1" on the count.
summary_stats = [
    (
        data_new.iloc[i]['Facility ID'],
        len(neighbors) - 1,
        calculate_average_distance(i, neighbors),
    )
    for i, neighbors in enumerate(indices)
]

# Tabulate the tuples under their output column names
summary_columns = ['Facility ID 1', 'Proximate Stations', 'Average Distance']
summary_df = pd.DataFrame(summary_stats, columns=summary_columns)

# Write the table to the Colab workspace
summary_df.to_csv('/content/summary_statistics.csv', index=False)

# Confirm the export, then preview the first few rows as the cell output
print("Summary statistics exported to summary_statistics.csv")
summary_df.head()


Summary statistics exported to summary_statistics.csv


Unnamed: 0,Facility ID 1,Proximate Stations,Average Distance
0,1849,1,624.322212
1,1850,0,0.0
2,1851,5,525.241985
3,1852,0,0.0
4,1853,2,529.066906


## **Update Summary Statistics to Include Modal Crossover, System Crossover, and Number of Adjacent Systems**

In [11]:
# Import necessary libraries
import pandas as pd
import numpy as np
from geopy.distance import geodesic
from sklearn.neighbors import BallTree
from google.colab import files

# Upload a file and read it into a DataFrame named 'data_new'
uploaded = files.upload()  # This will prompt you to upload a file from your computer

# A cancelled or empty upload would otherwise surface as a bare StopIteration below
if not uploaded:
    raise ValueError("No file was uploaded; upload the station CSV to continue.")

file_name = next(iter(uploaded))  # Assuming you're uploading a single file; the first name is used
data_new = pd.read_csv(file_name)

# Helper function to parse coordinates
def parse_coordinates_np(coord_str):
    """Parse a coordinate string such as "(40.7, -74.0)" into ``[lat, lon]``.

    Args:
        coord_str: Text holding exactly two comma-separated numbers,
            optionally wrapped in parentheses.

    Returns:
        A two-element list of floats in the order they appear in the string,
        or ``[np.nan, np.nan]`` when the value cannot be parsed: non-numeric
        text, the wrong number of comma-separated parts, or a non-string value
        such as the float NaN pandas uses for a missing cell.
    """
    try:
        lat, lon = map(float, coord_str.strip().strip("()").split(","))
    except (ValueError, AttributeError):
        # ValueError: bad number or wrong part count.
        # AttributeError: coord_str is not a string (e.g. NaN / None).
        return [np.nan, np.nan]
    return [lat, lon]

# Parse and prepare coordinates
coordinates = np.array(list(map(parse_coordinates_np, data_new['coordinates'])))

# Keep a row only when BOTH values are finite. Checking the first column alone
# would let rows such as "(5, nan)" or any "inf" value through to the tree,
# because float() accepts those spellings.
valid_coords_indices = np.isfinite(coordinates).all(axis=1)

# Reset index of data_new so row positions line up with coordinates_filtered
data_new_filtered = data_new[valid_coords_indices].reset_index(drop=True)
coordinates_filtered = coordinates[valid_coords_indices]

# Query parameters
EARTH_RADIUS = 6371000  # meters
radius = 800 / EARTH_RADIUS  # 800 meters in radians

# Create BallTree for spatial queries (haversine works in radians; convert once)
coordinates_filtered_rad = np.deg2rad(coordinates_filtered)
tree = BallTree(coordinates_filtered_rad, metric='haversine')

# Performing spatial query: one array of neighbour positions per remaining row
indices = tree.query_radius(coordinates_filtered_rad, r=radius)

# Update the calculate_crossovers function to include total systems and total systems adjacent calculations
def calculate_crossovers_and_counts(index, neighbors_indices):
    """Summarise how a facility's neighbours differ from it by mode and NTD ID.

    Reads the module-level ``data_new_filtered`` DataFrame by row position.

    Args:
        index: Row position of the facility being summarised.
        neighbors_indices: Row positions of nearby facilities; an entry equal
            to ``index`` is ignored.

    Returns:
        A 4-tuple ``(modal_crossover, system_crossover, total_systems,
        total_systems_adjacent)``:
          * modal_crossover: 1 if any neighbour's 'Primary Mode Served' differs
            from the facility's own, else 0.
          * system_crossover: 1 if any neighbour's 'NTD ID' differs from the
            facility's own, else 0.
          * total_systems: number of distinct 'Primary Mode Served' values
            among the neighbours.
          * total_systems_adjacent: number of distinct 'NTD ID' values among
            the neighbours (the facility's own NTD ID is counted when a
            neighbour shares it).

    NOTE(review): ``total_systems`` counts distinct modes, not systems, despite
    its name -- confirm this is the intended meaning of the 'Total Systems' column.
    """
    other_indices = [i for i in neighbors_indices if i != index]
    if not other_indices:
        # No neighbours besides the facility itself: nothing to compare.
        return 0, 0, 0, 0

    # Fetch the neighbours' rows and the facility's own row once, instead of
    # rebuilding a row with .iloc for every comparison.
    neighbor_rows = data_new_filtered.iloc[other_indices]
    neighbor_modes = neighbor_rows['Primary Mode Served'].tolist()
    neighbor_ntd_ids = neighbor_rows['NTD ID'].tolist()

    own_row = data_new_filtered.iloc[index]
    own_mode = own_row['Primary Mode Served']
    own_ntd_id = own_row['NTD ID']

    modal_crossover = int(any(mode != own_mode for mode in neighbor_modes))
    system_crossover = int(any(ntd_id != own_ntd_id for ntd_id in neighbor_ntd_ids))
    total_systems = len(set(neighbor_modes))
    total_systems_adjacent = len(set(neighbor_ntd_ids))
    return modal_crossover, system_crossover, total_systems, total_systems_adjacent

# Build one summary tuple per facility that has valid coordinates
summary_stats = []
for index, neighbors in enumerate(indices):
    # The radius query returns the facility itself, so more than one hit is
    # needed before there is any neighbour distance to average.
    if len(neighbors) > 1:
        origin = coordinates_filtered[index]
        average_distance = np.mean([
            geodesic(origin, coordinates_filtered[j]).meters
            for j in neighbors
            if index != j
        ])
    else:
        average_distance = 0

    crossover_stats = calculate_crossovers_and_counts(index, neighbors)

    summary_stats.append((
        data_new_filtered.iloc[index]['Facility ID'],
        len(neighbors) - 1,  # neighbour count without the facility itself
        average_distance,
        *crossover_stats,
    ))

# Tabulate the tuples under their output column names
summary_columns = [
    'Facility ID',
    'Proximate Stations',
    'Average Distance',
    'Modal Crossover (y/n)',
    'System Crossover (y/n)',
    'Total Systems',
    'Total Systems Adjacent',
]
summary_df = pd.DataFrame(summary_stats, columns=summary_columns)

# Write the table to the Colab workspace, then hand it to the browser as a download
summary_df.to_csv('/content/facility_summary.csv', index=False)
files.download('/content/facility_summary.csv')

print("Facility summary statistics exported to facility_summary.csv")


Saving Final__NTD_Station_Data_REDUCEDv2 (8).csv to Final__NTD_Station_Data_REDUCEDv2 (8).csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Facility summary statistics exported to facility_summary.csv
