# Pre-processing: Replacing '-' with ';' in 'line' column in a shapefile

In [33]:
# import geopandas as gpd

# def modify_number_format(shapefile_path, column_name, output_path=None):
#     # Read the shapefile
#     gdf = gpd.read_file(shapefile_path)
    
#     # Check if column exists
#     if column_name not in gdf.columns:
#         print(f"Error: Column '{column_name}' not found in shapefile")
#         print("Available columns:", list(gdf.columns))
#         return
    
#     # Modify the specified column
#     gdf[column_name] = gdf[column_name].astype(str).str.replace('-', ';')
    
#     # Save the results
#     if output_path is None:
#         output_path = shapefile_path
    
#     gdf.to_file(output_path, driver='ESRI Shapefile')
#     print(f"Successfully modified and saved to {output_path}")

# # Example usage
# if __name__ == "__main__":
#     input_shp = r"E:\Freelancing\P_01_6.4.2025\data\gis/brt_stations.shp"
#     column_to_modify = "LineStatio"
#     output_shp = r"E:\Freelancing\P_01_6.4.2025\data\gis/brt_stations_new.shp"

#     # input_shp = r"E:\Freelancing\P_01_6.4.2025\data\gis/metro_stations.shp"
#     # column_to_modify = "LineStatio"
#     # output_shp = r"E:\Freelancing\P_01_6.4.2025\data\gis/metro_stations_new.shp"
    
#     modify_number_format(input_shp, column_to_modify, output_shp)

# 1. Creating a network graph from transit lines and stations, then generating adjacency matrices that capture connectivity between stations:
# * Individual Metro & BRT Adjacency Matrices
# * Combined Adjacency Matrix (Without Bridge Edges)

In [41]:
from pathlib import Path
import geopandas as gpd
import networkx as nx
import pandas as pd
import warnings
warnings.simplefilter("ignore")

# Create a network graph from transit lines and stations
def create_transit_network(lines_gdf: gpd.GeoDataFrame, stations_gdf: gpd.GeoDataFrame, 
                         buffer_distance: int = 50, line_id_col: str = 'Origin', 
                         station_id_col: str = 'Name') -> nx.Graph:
    G = nx.Graph()
    for _, station in stations_gdf.iterrows():
        G.add_node(station[station_id_col], name=station[station_id_col], geometry=station['geometry'])
    
    processed_lines = 0
    skipped_lines = 0
    for _, line in lines_gdf.iterrows():
        line_geom = line['geometry']
        current_line_id = line.get(line_id_col)
        buffered_line = line_geom.buffer(buffer_distance)
        stations_on_line = stations_gdf[stations_gdf.geometry.within(buffered_line)].copy()
        
        if len(stations_on_line) < 2:
            skipped_lines += 1
            continue
        processed_lines += 1
        
        stations_on_line['proj_dist'] = stations_on_line.geometry.apply(
            lambda x: line_geom.project(x))
        stations_sorted = stations_on_line.sort_values('proj_dist')
        station_ids = stations_sorted[station_id_col].tolist()
        for i in range(len(station_ids) - 1):
            G.add_edge(station_ids[i], station_ids[i + 1], 
                      line_id=current_line_id,
                      length=line_geom.project(stations_sorted.iloc[i + 1].geometry) - 
                          line_geom.project(stations_sorted.iloc[i].geometry))
    
    print(f"Processed {processed_lines} lines, skipped {skipped_lines} lines")
    return G

# Create a combined network from metro and BRT
def create_combined_network(metro_lines: gpd.GeoDataFrame, metro_stations: gpd.GeoDataFrame,
                           brt_lines: gpd.GeoDataFrame, brt_stations: gpd.GeoDataFrame,
                           metro_buffer: int = 50, brt_buffer: int = 150,
                           line_id_col: str = 'Origin', station_id_col: str = 'Name') -> nx.Graph:
    metro_network = create_transit_network(metro_lines, metro_stations, metro_buffer, line_id_col, station_id_col)
    brt_network = create_transit_network(brt_lines, brt_stations, brt_buffer, line_id_col, station_id_col)
    G_combined = nx.Graph()
    for node, data in metro_network.nodes(data=True):
        G_combined.add_node(node, **data)
    for node, data in brt_network.nodes(data=True):
        if node not in G_combined:
            G_combined.add_node(node, **data)
    for u, v, data in metro_network.edges(data=True):
        G_combined.add_edge(u, v, **data)
    for u, v, data in brt_network.edges(data=True):
        G_combined.add_edge(u, v, **data)
    
    print(f"\nCombined network: {G_combined.number_of_nodes()} nodes, {G_combined.number_of_edges()} edges")
    print(f"Number of isolated nodes in combined network: {len(list(nx.isolates(G_combined)))}")
    return G_combined

# Save the adjacency matrix of a network graph to a CSV file
def save_adjacency_matrix(G: nx.Graph, output_path: Path, filename: str) -> None:
    adj_matrix = nx.adjacency_matrix(G).todense()
    station_ids = list(G.nodes)
    print(f"Adjacency matrix shape: {adj_matrix.shape}")
    adj_df = pd.DataFrame(adj_matrix, index=station_ids, columns=station_ids)
    output_path.mkdir(parents=True, exist_ok=True)
    full_path = output_path / filename
    adj_df.to_csv(full_path, encoding='utf-8-sig')
    print(f"Adjacency matrix saved to {full_path}")

# New function to show adjacency list with distances
def show_adjacency_list_with_distances(G: nx.Graph, output_path: Path, filename: str) -> None:
    adjacency_data = []
    # print(f"\nAdjacency list with distances for graph:")
    for node in G.nodes:
        neighbors = list(G.neighbors(node))
        if neighbors:  # Only include nodes with neighbors
            for neighbor in neighbors:
                distance = G[node][neighbor].get('length', 0.0)  # Get edge length, default to 0 if missing
                # print(f"Station {node} -> Neighbor {neighbor}, Distance: {distance:.2f} meters")
                adjacency_data.append({
                    'Station': node,
                    'Neighbor': neighbor,
                    'Distance_m': distance
                })
    
    # Convert to DataFrame and save to CSV
    if adjacency_data:
        adj_df = pd.DataFrame(adjacency_data)
        output_path.mkdir(parents=True, exist_ok=True)
        full_path = output_path / filename
        adj_df.to_csv(full_path, index=False, encoding='utf-8-sig')
        print(f"Adjacency list with distances saved to {full_path}")
    else:
        print("No adjacency data to save (graph may have no edges).")

# Main execution
if __name__ == "__main__":
    data_path = r"E:\Freelancing\P_01_6.4.2025\data\gis"
    output_path = Path(r"E:\Freelancing\P_01_6.4.2025\output")
    
    try:
        metro_lines = gpd.read_file(f'{data_path}/metro_lines.shp')
        metro_stations = gpd.read_file(f'{data_path}/metro_stations.shp')
        brt_lines = gpd.read_file(f'{data_path}/brt_lines.shp')
        brt_stations = gpd.read_file(f'{data_path}/brt_stations.shp')
        traffic_zones = gpd.read_file(f'{data_path}/traffic_zones_utm.shp')
    except Exception as e:
        print(f"Error reading shapefiles: {e}")
        exit()

    station_id_col = 'Name'
    for stations_gdf, name in [(metro_stations, 'Metro'), (brt_stations, 'BRT')]:
        if station_id_col not in stations_gdf.columns:
            print(f"Error: '{station_id_col}' column not found in {name} stations")
            print(f"Available columns: {stations_gdf.columns.tolist()}")
            exit()

    for stations_gdf, name in [(metro_stations, 'Metro'), (brt_stations, 'BRT')]:
        duplicate_stations = stations_gdf[stations_gdf[station_id_col].duplicated()]
        if not duplicate_stations.empty:
            print(f"Duplicate {name} station names found: {duplicate_stations[station_id_col].tolist()}")

    shared_stations = set(metro_stations[station_id_col]) & set(brt_stations[station_id_col])
    print(f"\nShared stations between Metro and BRT: {len(shared_stations)}")
    if shared_stations:
        print(f"Shared station names: {sorted(shared_stations)}")

    print(f"\nMetro lines: {len(metro_lines)} rows")
    print(f"Metro stations: {len(metro_stations)} rows")
    print(f"Unique Metro station names: {metro_stations[station_id_col].nunique()}")
    print(f"Missing Metro station names: {metro_stations[station_id_col].isna().sum()}")
    print(f"\nBRT lines: {len(brt_lines)} rows")
    print(f"BRT stations: {len(brt_stations)} rows")
    print(f"Unique BRT station names: {brt_stations[station_id_col].nunique()}")
    print(f"Missing BRT station names: {brt_stations[station_id_col].isna().sum()}")

    print("\nMetro lines columns:", metro_lines.columns.tolist())
    print("Metro stations columns:", metro_stations.columns.tolist())
    print("BRT lines columns:", brt_lines.columns.tolist())
    print("BRT stations columns:", brt_stations.columns.tolist())

    metro_lines = metro_lines[metro_lines.geometry.is_valid]
    metro_stations = metro_stations[metro_stations.geometry.is_valid]
    brt_lines = brt_lines[brt_lines.geometry.is_valid]
    brt_stations = brt_stations[brt_stations.geometry.is_valid]
    
    print(f"\nMetro lines after geometry check: {len(metro_lines)}")
    print(f"Metro stations after geometry check: {len(metro_stations)}")
    print(f"BRT lines after geometry check: {len(brt_lines)}")
    print(f"BRT stations after geometry check: {len(brt_stations)}")

    metro_stations = metro_stations.to_crs(epsg=32639)
    metro_lines = metro_lines.to_crs(epsg=32639)
    brt_stations = brt_stations.to_crs(epsg=32639)
    brt_lines = brt_lines.to_crs(epsg=32639)
    traffic_zones = traffic_zones.to_crs(epsg=32639)

    # Create individual networks with 200-meter buffer
    metro_network = create_transit_network(
        lines_gdf=metro_lines,
        stations_gdf=metro_stations,
        buffer_distance=200
    )
    print(f"\nNumber of nodes in Metro network: {metro_network.number_of_nodes()}")
    print(f"Number of isolated nodes in Metro network: {len(list(nx.isolates(metro_network)))}")
    save_adjacency_matrix(metro_network, output_path, "metro_adjacency_matrix.csv")
    show_adjacency_list_with_distances(metro_network, output_path, "metro_adjacency_list.csv")

    brt_network = create_transit_network(
        lines_gdf=brt_lines,
        stations_gdf=brt_stations,
        buffer_distance=300
    )
    print(f"\nNumber of nodes in BRT network: {brt_network.number_of_nodes()}")
    print(f"Number of isolated nodes in BRT network: {len(list(nx.isolates(brt_network)))}")
    save_adjacency_matrix(brt_network, output_path, "brt_adjacency_matrix.csv")
    show_adjacency_list_with_distances(brt_network, output_path, "brt_adjacency_list.csv")

    combined_network = create_combined_network(
        metro_lines=metro_lines,
        metro_stations=metro_stations,
        brt_lines=brt_lines,
        brt_stations=brt_stations,
        metro_buffer=200,
        brt_buffer=300,
        line_id_col='Origin',
        station_id_col='Name'
    )
    save_adjacency_matrix(combined_network, output_path, "combined_adjacency_matrix.csv")
    show_adjacency_list_with_distances(combined_network, output_path, "combined_adjacency_list.csv")

Duplicate BRT station names found: ['شهید رجایی', 'وحدت اسلامی', 'چهارراه مولوی', 'هفده شهریور', 'چهارراه ولیعصرعجل الله تعالی فرجه شریف', 'شهید کشاورز', 'سبلان', 'سازمان آب', 'مخابرات', 'الوند', 'بوعلی', 'عباسی', 'امام خمینی رضوان الله تعالی علیه', 'دانشگاه تهران', 'نبوت', 'بیست متری افسریه', 'نیروی هوایی', 'شهید مخبری', 'شهید امامی', 'پونک', 'میدان امام حسین علیه السلام', 'پل', 'پل خاقانی', 'شهید دکترآیت', 'بهبودی', 'وحیدیه', 'دروازه دولت', 'شهید صفدری', 'شهید رحیمی', 'باغ فیض', 'سرسبز', 'داریوش', 'پیروزی', 'ائمه اطهار س', 'ولیعصرعجل الله تعالی فرجه شریف', 'سعادت', 'آیت الله طالقانی', 'شهید رحمانی', 'قالیشویی', 'آذربایجان', 'میدان فردوسی', 'میدان توحید', 'پل جوادیه', 'شمشیری', 'شهید آیت الله اشرفی اصفهانی', 'ابوریحان', 'فرهنگسرای قرآن', 'تیراژه', 'شهید منتظری', 'فرودگاه', 'هلال احمر', 'ولیعصرعجل الله تعالی فرجه شریف', 'آزادی', 'شهید آبشناسان', 'شهید نواب صفوی', 'باسکول', 'اتابک', 'مسجد جامع', 'مرزداران', 'بلوارمعلم', 'آموزش و پرورش', 'شهرداری منطقه هجده', 'کوی نصر', 'مترو آزادگان', '

# 2. Creating Combined Adjacency Matrix (With Bridge Edges)

In [44]:
from pathlib import Path
import geopandas as gpd
import networkx as nx
import pandas as pd
import warnings
warnings.simplefilter("ignore")

# Create a network graph from transit lines and stations
def create_transit_network(lines_gdf: gpd.GeoDataFrame, stations_gdf: gpd.GeoDataFrame, 
                         buffer_distance: int = 200, line_id_col: str = 'Origin', 
                         station_id_col: str = 'Name') -> nx.Graph:
    G = nx.Graph()
    for _, station in stations_gdf.iterrows():
        G.add_node(station[station_id_col], name=station[station_id_col], geometry=station['geometry'])
    
    processed_lines = 0
    skipped_lines = 0
    for _, line in lines_gdf.iterrows():
        line_geom = line['geometry']
        current_line_id = line.get(line_id_col)
        buffered_line = line_geom.buffer(buffer_distance)
        stations_on_line = stations_gdf[stations_gdf.geometry.within(buffered_line)].copy()
        
        if len(stations_on_line) < 2:
            skipped_lines += 1
            continue
        processed_lines += 1
        
        stations_on_line['distance_to_line'] = stations_on_line.geometry.apply(
            lambda x: x.distance(line_geom))
        # print(f"Line {current_line_id}:")
        # print(stations_on_line[[station_id_col, 'distance_to_line']])
        if stations_on_line['distance_to_line'].max() > buffer_distance:
            print(f"Warning: Stations farther than {buffer_distance}m from line {current_line_id}")
        
        stations_on_line['proj_dist'] = stations_on_line.geometry.apply(
            lambda x: line_geom.project(x))
        stations_sorted = stations_on_line.sort_values('proj_dist')
        station_ids = stations_sorted[station_id_col].tolist()
        for i in range(len(station_ids) - 1):
            G.add_edge(station_ids[i], station_ids[i + 1], 
                      line_id=current_line_id,
                      length=line_geom.project(stations_sorted.iloc[i + 1].geometry) - 
                          line_geom.project(stations_sorted.iloc[i].geometry),
                      edge_type='regular')
    
    print(f"Processed {processed_lines} lines, skipped {skipped_lines} lines")
    return G

# Create a combined network from metro and BRT with bridge edges
def create_combined_network(metro_lines: gpd.GeoDataFrame, metro_stations: gpd.GeoDataFrame,
                           brt_lines: gpd.GeoDataFrame, brt_stations: gpd.GeoDataFrame,
                           metro_buffer: int = 200, brt_buffer: int = 300,
                           bridge_distance: int = 300, line_id_col: str = 'Origin',
                           station_id_col: str = 'Name') -> nx.Graph:
    metro_network = create_transit_network(metro_lines, metro_stations, metro_buffer, line_id_col, station_id_col)
    brt_network = create_transit_network(brt_lines, brt_stations, brt_buffer, line_id_col, station_id_col)
    G_combined = nx.Graph()
    
    for node, data in metro_network.nodes(data=True):
        G_combined.add_node(node, **data, network='metro')
    for node, data in brt_network.nodes(data=True):
        if node not in G_combined:
            G_combined.add_node(node, **data, network='brt')
    
    for u, v, data in metro_network.edges(data=True):
        G_combined.add_edge(u, v, **data)
    for u, v, data in brt_network.edges(data=True):
        G_combined.add_edge(u, v, **data)
    
    bridge_edges = []
    for metro_node, metro_data in G_combined.nodes(data=True):
        if metro_data.get('network') != 'metro':
            continue
        metro_geom = metro_data['geometry']
        for brt_node, brt_data in G_combined.nodes(data=True):
            if brt_data.get('network') != 'brt':
                continue
            brt_geom = brt_data['geometry']
            distance = metro_geom.distance(brt_geom)
            if distance <= bridge_distance and metro_node != brt_node:
                G_combined.add_edge(metro_node, brt_node, edge_type='bridge', length=distance)
                bridge_edges.append((metro_node, brt_node, distance))
    
    print(f"\nCombined network: {G_combined.number_of_nodes()} nodes, {G_combined.number_of_edges()} edges")
    print(f"Number of isolated nodes in combined network: {len(list(nx.isolates(G_combined)))}")
    print(f"Number of bridge edges: {len(bridge_edges)}")
    if bridge_edges:
        print("Bridge edges (Metro -> BRT, distance):")
        for u, v, dist in bridge_edges:
            print(f"  {u} -> {v}: {dist:.2f} meters")
    
    return G_combined

# Save the adjacency matrix with bridge edges distinguished
def save_adjacency_matrix(G: nx.Graph, output_path: Path, filename: str) -> None:
    station_ids = list(G.nodes)
    n = len(station_ids)
    adj_matrix = nx.to_numpy_array(G, nodelist=station_ids, weight=None)
    
    for u, v, data in G.edges(data=True):
        i = station_ids.index(u)
        j = station_ids.index(v)
        edge_type = data.get('edge_type', 'regular')
        adj_matrix[i, j] = 2 if edge_type == 'bridge' else 1
        adj_matrix[j, i] = 2 if edge_type == 'bridge' else 1
    
    print(f"Adjacency matrix shape: {adj_matrix.shape}")
    adj_df = pd.DataFrame(adj_matrix, index=station_ids, columns=station_ids)
    output_path.mkdir(parents=True, exist_ok=True)
    full_path = output_path / filename
    adj_df.to_csv(full_path, encoding='utf-8-sig')
    print(f"Adjacency matrix saved to {full_path}")

# New function to show and save adjacency list with distances
def show_adjacency_list_with_distances(G: nx.Graph, output_path: Path, filename: str, stations_gdf: gpd.GeoDataFrame, lines_gdf: gpd.GeoDataFrame, line_id_col: str = 'Origin') -> None:
    adjacency_data = []
    # print(f"\nAdjacency list with distances for graph:")
    for node in G.nodes:
        neighbors = list(G.neighbors(node))
        if neighbors:
            station_geom = stations_gdf[stations_gdf['Name'] == node].geometry.iloc[0]
            min_distance = float('inf')
            associated_line = None
            for _, line in lines_gdf.iterrows():
                dist = station_geom.distance(line.geometry)
                if dist < min_distance:
                    min_distance = dist
                    associated_line = line[line_id_col]
            for neighbor in neighbors:
                edge_data = G[node][neighbor]
                distance = edge_data.get('length', 0.0)
                edge_type = edge_data.get('edge_type', 'regular')
                line_id = edge_data.get('line_id', 'N/A') if edge_type == 'regular' else 'Bridge'
                # print(f"Station {node} -> Neighbor {neighbor}, Distance: {distance:.2f}m, Edge Type: {edge_type}, Line: {line_id}, Perpendicular Distance to Line: {min_distance:.2f}m")
                adjacency_data.append({
                    'Station': node,
                    'Neighbor': neighbor,
                    'Distance_m': distance,
                    'Edge_Type': edge_type,
                    'Line_ID': line_id,
                    'Perpendicular_Distance_m': min_distance
                })
    
    if adjacency_data:
        adj_df = pd.DataFrame(adjacency_data)
        output_path.mkdir(parents=True, exist_ok=True)
        full_path = output_path / filename
        adj_df.to_csv(full_path, index=False, encoding='utf-8-sig')
        print(f"Adjacency list with distances saved to {full_path}")
    else:
        print("No adjacency data to save (graph may have no edges).")

# Main execution
if __name__ == "__main__":
    data_path = r"E:\Freelancing\P_01_6.4.2025\data\gis"
    output_path = Path(r"E:\Freelancing\P_01_6.4.2025\output")
    
    try:
        metro_lines = gpd.read_file(f'{data_path}/metro_lines.shp')
        metro_stations = gpd.read_file(f'{data_path}/metro_stations.shp')
        brt_lines = gpd.read_file(f'{data_path}/brt_lines.shp')
        brt_stations = gpd.read_file(f'{data_path}/brt_stations.shp')
        traffic_zones = gpd.read_file(f'{data_path}/traffic_zones_utm.shp')
    except Exception as e:
        print(f"Error reading shapefiles: {e}")
        exit()

    station_id_col = 'Name'
    for stations_gdf, name in [(metro_stations, 'Metro'), (brt_stations, 'BRT')]:
        if station_id_col not in stations_gdf.columns:
            print(f"Error: '{station_id_col}' column not found in {name} stations")
            print(f"Available columns: {stations_gdf.columns.tolist()}")
            exit()

    for stations_gdf, name in [(metro_stations, 'Metro'), (brt_stations, 'BRT')]:
        duplicate_stations = stations_gdf[stations_gdf[station_id_col].duplicated()]
        if not duplicate_stations.empty:
            print(f"Duplicate {name} station names found: {duplicate_stations[station_id_col].tolist()}")

    shared_stations = set(metro_stations[station_id_col]) & set(brt_stations[station_id_col])
    print(f"\nShared stations between Metro and BRT: {len(shared_stations)}")
    if shared_stations:
        print(f"Shared station names: {sorted(shared_stations)}")

    print(f"\nMetro lines: {len(metro_lines)} rows")
    print(f"Metro stations: {len(metro_stations)} rows")
    print(f"Unique Metro station names: {metro_stations[station_id_col].nunique()}")
    print(f"Missing Metro station names: {metro_stations[station_id_col].isna().sum()}")
    print(f"\nBRT lines: {len(brt_lines)} rows")
    print(f"BRT stations: {len(brt_stations)} rows")
    print(f"Unique BRT station names: {brt_stations[station_id_col].nunique()}")
    print(f"Missing BRT station names: {brt_stations[station_id_col].isna().sum()}")

    print("\nMetro lines columns:", metro_lines.columns.tolist())
    print("Metro stations columns:", metro_stations.columns.tolist())
    print("BRT lines columns:", brt_lines.columns.tolist())
    print("BRT stations columns:", brt_stations.columns.tolist())

    metro_lines = metro_lines[metro_lines.geometry.is_valid]
    metro_stations = metro_stations[metro_stations.geometry.is_valid]
    brt_lines = brt_lines[brt_lines.geometry.is_valid]
    brt_stations = brt_stations[brt_stations.geometry.is_valid]
    
    print(f"\nMetro lines after geometry check: {len(metro_lines)}")
    print(f"Metro stations after geometry check: {len(metro_stations)}")
    print(f"BRT lines after geometry check: {len(brt_lines)}")
    print(f"BRT stations after geometry check: {len(brt_stations)}")

    metro_stations = metro_stations.to_crs(epsg=32639)
    metro_lines = metro_lines.to_crs(epsg=32639)
    brt_stations = brt_stations.to_crs(epsg=32639)
    brt_lines = brt_lines.to_crs(epsg=32639)
    traffic_zones = traffic_zones.to_crs(epsg=32639)

    combined_network = create_combined_network(
        metro_lines=metro_lines,
        metro_stations=metro_stations,
        brt_lines=brt_lines,
        brt_stations=brt_stations,
        metro_buffer=200,
        brt_buffer=300,
        bridge_distance=300,
        line_id_col='Origin',
        station_id_col='Name'
    )
    save_adjacency_matrix(combined_network, output_path, "combined_adjacency_matrix_with_bridge_edges.csv")
    show_adjacency_list_with_distances(
        combined_network, 
        output_path, 
        "combined_adjacency_list_with_bridge_edges.csv", 
        pd.concat([metro_stations, brt_stations]), 
        pd.concat([metro_lines, brt_lines])
    )

Duplicate BRT station names found: ['شهید رجایی', 'وحدت اسلامی', 'چهارراه مولوی', 'هفده شهریور', 'چهارراه ولیعصرعجل الله تعالی فرجه شریف', 'شهید کشاورز', 'سبلان', 'سازمان آب', 'مخابرات', 'الوند', 'بوعلی', 'عباسی', 'امام خمینی رضوان الله تعالی علیه', 'دانشگاه تهران', 'نبوت', 'بیست متری افسریه', 'نیروی هوایی', 'شهید مخبری', 'شهید امامی', 'پونک', 'میدان امام حسین علیه السلام', 'پل', 'پل خاقانی', 'شهید دکترآیت', 'بهبودی', 'وحیدیه', 'دروازه دولت', 'شهید صفدری', 'شهید رحیمی', 'باغ فیض', 'سرسبز', 'داریوش', 'پیروزی', 'ائمه اطهار س', 'ولیعصرعجل الله تعالی فرجه شریف', 'سعادت', 'آیت الله طالقانی', 'شهید رحمانی', 'قالیشویی', 'آذربایجان', 'میدان فردوسی', 'میدان توحید', 'پل جوادیه', 'شمشیری', 'شهید آیت الله اشرفی اصفهانی', 'ابوریحان', 'فرهنگسرای قرآن', 'تیراژه', 'شهید منتظری', 'فرودگاه', 'هلال احمر', 'ولیعصرعجل الله تعالی فرجه شریف', 'آزادی', 'شهید آبشناسان', 'شهید نواب صفوی', 'باسکول', 'اتابک', 'مسجد جامع', 'مرزداران', 'بلوارمعلم', 'آموزش و پرورش', 'شهرداری منطقه هجده', 'کوی نصر', 'مترو آزادگان', '

# 3. Creating Station Summary Table

In [39]:
from pathlib import Path
import geopandas as gpd
import pandas as pd
import warnings
warnings.simplefilter("ignore")

# Disable scientific notation globally
pd.options.display.float_format = '{:.0f}'.format

def assign_traffic_zone(stations_gdf: gpd.GeoDataFrame, 
                       traffic_zones_gdf: gpd.GeoDataFrame, 
                       zone_id_col: str = 'ZoneID') -> gpd.GeoDataFrame:
    """
    Assign traffic zone to each station based on spatial join.
    """
    # Ensure Handle is string
    traffic_zones_gdf[zone_id_col] = traffic_zones_gdf[zone_id_col].astype(str)
    
    if stations_gdf.crs != traffic_zones_gdf.crs:
        traffic_zones_gdf = traffic_zones_gdf.to_crs(stations_gdf.crs)
    
    stations_with_zone = gpd.sjoin(stations_gdf, 
                                  traffic_zones_gdf[[zone_id_col, 'geometry']], 
                                  how='left', 
                                  predicate='within')
    
    stations_with_zone = stations_with_zone.rename(columns={zone_id_col: 'TrafficZone'})
    stations_with_zone['TrafficZone'] = stations_with_zone['TrafficZone'].fillna('Unknown').astype(str)
    
    return stations_with_zone

def assign_lines_to_stations(stations_gdf: gpd.GeoDataFrame, 
                            lines_gdf: gpd.GeoDataFrame, 
                            buffer_distance: int, 
                            line_id_col: str = 'Origin',
                            station_id_col: str = 'Name') -> pd.DataFrame:
    """
    Assign transit lines to each station based on proximity to lines.
    """
    station_lines = []
    
    for _, station in stations_gdf.iterrows():
        station_id = station[station_id_col]
        station_geom = station['geometry']
        
        buffered_station = station_geom.buffer(buffer_distance)
        lines_nearby = lines_gdf[lines_gdf.geometry.intersects(buffered_station)].copy()
        
        line_ids = [str(line_id) for line_id in lines_nearby[line_id_col].unique()]
        lines_str = ';'.join(sorted(line_ids)) if line_ids else 'None'
        
        station_lines.append({
            'StationID': station_id,
            'Lines': lines_str
        })
    
    df = pd.DataFrame(station_lines)
    df['Lines'] = df['Lines'].astype(str)
    
    return df

def create_station_summary_table(metro_stations: gpd.GeoDataFrame,
                                metro_lines: gpd.GeoDataFrame,
                                brt_stations: gpd.GeoDataFrame,
                                brt_lines: gpd.GeoDataFrame,
                                traffic_zones: gpd.GeoDataFrame,
                                output_path: Path,
                                metro_buffer: int = 50,
                                brt_buffer: int = 150,
                                station_id_col: str = 'Name',
                                line_id_col: str = 'Origin',
                                zone_id_col: str = 'Handle') -> None:
    """
    Create a comprehensive table of station attributes, traffic zones, and associated lines,
    and save it as an Excel file.
    """
    target_crs = 'EPSG:32639'
    metro_stations = metro_stations.to_crs(target_crs)
    metro_lines = metro_lines.to_crs(target_crs)
    brt_stations = brt_stations.to_crs(target_crs)
    brt_lines = brt_lines.to_crs(target_crs)
    traffic_zones = traffic_zones.to_crs(target_crs)

    # Print sample Handle values for verification
    print("Sample Handle values from traffic_zones.shp:")
    print(traffic_zones[zone_id_col].head(10).to_list())

    metro_stations['Network'] = 'Metro'
    brt_stations['Network'] = 'BRT'
    all_stations = pd.concat([metro_stations, brt_stations], ignore_index=True)

    stations_with_zone = assign_traffic_zone(all_stations, traffic_zones, zone_id_col)

    metro_station_lines = assign_lines_to_stations(metro_stations, 
                                                 metro_lines, 
                                                 metro_buffer, 
                                                 line_id_col, 
                                                 station_id_col)
    metro_station_lines['Network'] = 'Metro'

    brt_station_lines = assign_lines_to_stations(brt_stations, 
                                               brt_lines, 
                                               brt_buffer, 
                                               line_id_col, 
                                               station_id_col)
    brt_station_lines['Network'] = 'BRT'

    all_station_lines = pd.concat([metro_station_lines, brt_station_lines], ignore_index=True)

    station_summary = stations_with_zone.merge(all_station_lines[['StationID', 'Lines']], 
                                             left_on=station_id_col, 
                                             right_on='StationID', 
                                             how='left')

    station_summary['Lines'] = station_summary['Lines'].astype(str)
    station_summary['TrafficZone'] = station_summary['TrafficZone'].astype(str)

    # Print sample values for debugging
    print("\nSample TrafficZone and Lines values before saving:")
    print(station_summary[[station_id_col, 'TrafficZone', 'Lines']].head(10))

    columns_to_keep = [station_id_col, 'Network', 'TrafficZone', 'Lines'] + \
                      [col for col in stations_with_zone.columns if col not in ['geometry', 'index_right', 'StationID', 'TrafficZone', 'Network', 'Lines']]
    
    final_table = station_summary[columns_to_keep]

    # Save to Excel
    output_path.mkdir(parents=True, exist_ok=True)
    output_file = output_path / 'station_summary_table.xlsx'
    try:
        final_table.to_excel(output_file, index=False, engine='openpyxl')
        print(f"Station summary table saved to {output_file}")
    except ImportError:
        print("openpyxl not installed. Install it using: pip install openpyxl")
        return

    print(f"Total stations: {len(final_table)}")
    print(f"Columns in final table: {final_table.columns.tolist()}")

# Main execution
if __name__ == "__main__":
    data_path = r"E:\Freelancing\P_01_6.4.2025\data\gis"
    output_path = Path(r"E:\Freelancing\P_01_6.4.2025\output")
    
    try:
        metro_lines = gpd.read_file(f'{data_path}/metro_lines.shp')
        metro_stations = gpd.read_file(f'{data_path}/metro_stations.shp')
        brt_lines = gpd.read_file(f'{data_path}/brt_lines.shp')
        brt_stations = gpd.read_file(f'{data_path}/brt_stations.shp')
        traffic_zones = gpd.read_file(f'{data_path}/traffic_zones_utm_join_annotation.shp', dtype={'Handle': str})
    except Exception as e:
        print(f"Error reading shapefiles: {e}")
        exit()

    station_id_col = 'Name'
    for stations_gdf, name in [(metro_stations, 'Metro'), (brt_stations, 'BRT')]:
        if station_id_col not in stations_gdf.columns:
            print(f"Error: '{station_id_col}' column not found in {name} stations")
            print(f"Available columns: {stations_gdf.columns.tolist()}")
            exit()

    metro_lines = metro_lines[metro_lines.geometry.is_valid]
    metro_stations = metro_stations[metro_stations.geometry.is_valid]
    brt_lines = brt_lines[brt_lines.geometry.is_valid]
    brt_stations = brt_stations[brt_stations.geometry.is_valid]
    traffic_zones = traffic_zones[traffic_zones.geometry.is_valid]

    create_station_summary_table(
        metro_stations=metro_stations,
        metro_lines=metro_lines,
        brt_stations=brt_stations,
        brt_lines=brt_lines,
        traffic_zones=traffic_zones,
        output_path=output_path,
        metro_buffer=50,
        brt_buffer=150,
        station_id_col='Name',
        line_id_col='Origin',
        zone_id_col='Text'
    )

Sample Handle values from traffic_zones.shp:
['396', '364', '363', '362', '669', '659', '529', '501', '530', '658']

Sample TrafficZone and Lines values before saving:
              Name TrafficZone                             Lines
0       شهید حقانی         283                             تجریش
1         شهید همت         288                             تجریش
2         شهید همت         288  پارک وی;پایانه تجریش;پایانه معین
3         شهید همت         288  پارک وی;پایانه تجریش;پایانه معین
4  مصلي امام خمینی         188                             تجریش
5       شهيد بهشتي         188                     آزادگان;تجریش
6        شهيد مفتح         176                             تجریش
7   شهدای هفتم تير         167                             تجریش
8          طالقاني         167                             تجریش
9      دروازه دولت         166                     ارم سبز;تجریش
Station summary table saved to E:\Freelancing\P_01_6.4.2025\output\station_summary_table.xlsx
Total stations: 1046
Co

# Post-processing: Removing duplicate rows

In [40]:
import pandas as pd

def remove_duplicate_rows(input_file, output_file=None, sheet_name=0):
    """
    Remove rows with identical values in all columns from an Excel file.
    
    Parameters:
        input_file (str): Path to input Excel file
        output_file (str): Path to save cleaned file (if None, overwrites input)
        sheet_name (str/int): Sheet name or index to process
    """
    # Read the Excel file
    df = pd.read_excel(input_file, sheet_name=sheet_name)
    
    # Count initial rows
    initial_rows = len(df)
    
    # Remove duplicates (keeping first occurrence)
    df_cleaned = df.drop_duplicates(keep='first')
    
    # Count remaining rows
    final_rows = len(df_cleaned)
    
    # Determine output path
    if output_file is None:
        output_file = input_file
    
    # Save to Excel
    df_cleaned.to_excel(output_file, index=False)
    
    print(f"Removed {initial_rows - final_rows} duplicate rows.")
    print(f"Original rows: {initial_rows}, After cleaning: {final_rows}")
    print(f"Saved cleaned data to: {output_file}")

# Example usage:
remove_duplicate_rows(
    input_file=r"E:\Freelancing\P_01_6.4.2025\output\station_summary_table.xlsx",
    output_file=None,  # Set to None to overwrite original
    sheet_name='Sheet1'  # Can use sheet name or index (0 for first sheet)
)

Removed 384 duplicate rows.
Original rows: 1046, After cleaning: 662
Saved cleaned data to: E:\Freelancing\P_01_6.4.2025\output\station_summary_table.xlsx
