## Imports

In [1]:
import os
os.environ['USE_PYGEOS'] = '0'
import pickle
from shapely.geometry import LineString
import geopandas as gpd
import pandas as pd
from tqdm import tqdm
from shapely import wkt

In [2]:
# Define data folder paths
data_folder_path = os.path.join('..', '..', 'data', '01_input_data')
# Define routing  output folder path
routing_output_folder_path = os.path.join('..', '..', 'data', '02_intermediate_output', 'routing')
# Define output folder path
intermediate_output_folder_path = os.path.join('..', '..', 'data','02_intermediate_output')

## Categorize Line Segments


This section categorizes each line segment into four categories based on their intersection with short and safe routes and the average crime mean (csv value):

- 'ideal': The line segment is present in both the short and safe routes and has a crime mean (csv) less than 20.
- 'unavoidable': The line segment is present in both the short and safe routes but has a crime mean (csv) greater than or equal to 20.
- 'avoidable': The line segment is only present in the short route and does not intersect with the safe route.
- 'preferable': The line segment is only present in the safe route and does not intersect with the short route.

In [3]:
def categorize_geometries(gdf_short, gdf_safe):
    """
    Categorize line segments based on their intersection with other segments.
    
    Parameters:
    - gdf_short: GeoDataFrame containing short route line segments.
    - gdf_safe: GeoDataFrame containing safe route line segments.

    Returns:
    - GeoDataFrame with categorized geometries.
    """
    categorized_data = []
    unique_geometries = set()  # To track unique geometries

    # Process geometries from gdf_short
    for _, row_short in gdf_short.iterrows():
        geom_short = row_short['geometry']
        if not isinstance(geom_short, LineString):
            continue

        intersected = False
        for _, row_safe in gdf_safe.iterrows():
            geom_safe = row_safe['geometry']
            if not isinstance(geom_safe, LineString):
                continue

            if geom_short.intersects(geom_safe):
                intersection = geom_short.intersection(geom_safe)
                if isinstance(intersection, LineString):
                    intersected = True
                    csv_value = row_safe['csv']
                    category = 'unavoidable' if csv_value >= 20 else 'ideal'

                    if geom_short not in unique_geometries:
                        categorized_data.append({'geometry': geom_short, 'category': category})
                        unique_geometries.add(geom_short)
                    break

        if not intersected and geom_short not in unique_geometries:
            categorized_data.append({'geometry': geom_short, 'category': 'avoidable'})
            unique_geometries.add(geom_short)

    # Process geometries from gdf_safe
    for _, row_safe in gdf_safe.iterrows():
        geom_safe = row_safe['geometry']
        if not isinstance(geom_safe, LineString):
            continue

        intersected = False
        for _, row_short in gdf_short.iterrows():
            geom_short = row_short['geometry']
            if not isinstance(geom_short, LineString):
                continue

            if geom_safe.intersects(geom_short):
                intersection = geom_safe.intersection(geom_short)
                if isinstance(intersection, LineString):
                    intersected = True
                    break

        if not intersected and geom_safe not in unique_geometries:
            categorized_data.append({'geometry': geom_safe, 'category': 'preferable'})
            unique_geometries.add(geom_safe)

    return gpd.GeoDataFrame(categorized_data, geometry='geometry')

def load_and_categorize_data(short_dir, safe_dir, checkpoint_path=None):
    """
    Load CSV files from the given directories and return categorized GeoDataFrames.

    Parameters:
    - short_dir: Directory path containing short route line segment CSV files.
    - safe_dir: Directory path containing safe route line segment CSV files.
    - checkpoint_path: Path to a checkpoint file for resuming progress.

    Returns:
    - Dictionary of categorized GeoDataFrames keyed by file ID.
    """
    categorized_dfs = {}
    short_files = [f for f in os.listdir(short_dir) if f.endswith('.csv')]
    safe_files = [f for f in os.listdir(safe_dir) if f.endswith('.csv')]
    common_files = set(short_files).intersection(set(safe_files))

    # Load checkpoint if available
    if checkpoint_path and os.path.exists(checkpoint_path):
        with open(checkpoint_path, 'rb') as checkpoint_file:
            checkpoint_data = pickle.load(checkpoint_file)
            categorized_dfs = checkpoint_data['categorized_dfs']
            processed_files = checkpoint_data['processed_files']
    else:
        processed_files = set()

    file_count = 0

    for filename in tqdm(common_files, desc="Processing files"):
        if filename in processed_files:
            continue
        
        short_filepath = os.path.join(short_dir, filename)
        safe_filepath = os.path.join(safe_dir, filename)
        
        short_df = pd.read_csv(short_filepath)
        safe_df = pd.read_csv(safe_filepath)
        
        # Convert the 'geometry' column from WKT format to geometric objects
        short_df['geometry'] = short_df['geometry'].apply(wkt.loads)
        safe_df['geometry'] = safe_df['geometry'].apply(wkt.loads)
        
        # Create GeoDataFrames
        short_gdf = gpd.GeoDataFrame(short_df, geometry='geometry')
        safe_gdf = gpd.GeoDataFrame(safe_df, geometry='geometry')
        
        # Categorize the geometries
        categorized_gdf = categorize_geometries(short_gdf, safe_gdf)
        file_id = os.path.splitext(filename)[0]  # Extract file name without extension
        categorized_dfs[file_id] = categorized_gdf
        processed_files.add(filename)
        
        file_count += 1
        
        # Save checkpoint every 100 files
        if file_count % 100 == 0:
            with open(checkpoint_path, 'wb') as checkpoint_file:
                checkpoint_data = {
                    'categorized_dfs': categorized_dfs,
                    'processed_files': processed_files
                }
                pickle.dump(checkpoint_data, checkpoint_file)
    
    # Save the final checkpoint
    with open(checkpoint_path, 'wb') as checkpoint_file:
        checkpoint_data = {
            'categorized_dfs': categorized_dfs,
            'processed_files': processed_files
        }
        pickle.dump(checkpoint_data, checkpoint_file)
    
    return categorized_dfs

In [8]:
# Paths to CSV factor directories
short_path = os.path.join(routing_output_folder_path, 'csv_factor_0.0')
safe_path = os.path.join(routing_output_folder_path, 'csv_factor_1.0')

# Paths to checkpoint pickle
checkpoint_path = os.path.join(routing_output_folder_path, 'checkpoint.pkl')


# Load and categorize data
categorized_dfs = load_and_categorize_data(short_path, safe_path, checkpoint_path=checkpoint_path)


## Find Most Relevant Category


 In this section, we identify the most relevant category for each line segment by analyzing the population accessing that segment.\n
 
 Each segment may belong to multiple categories, so we need to determine which category is the most pertinent based on the number of people using that route.


In [7]:
# Read young population data
young_data = gpd.read_file(os.path.join(data_folder_path, 'young_population_grid.gpkg'))

In [64]:
gdfs = []

# Dictionary to keep track of geometries and their assigned IDs
geometry_to_id = {}
current_id = 1

# Process each categorized GeoDataFrame
for file_id, gdf in tqdm(categorized_dfs.items(), desc='Processing dataframes'):
    unique_ids = []
    file_ids = []
    for geom in gdf['geometry']:
        geom_str = geom.wkt
        if geom_str in geometry_to_id:
            unique_id = geometry_to_id[geom_str]
        else:
            unique_id = current_id
            geometry_to_id[geom_str] = current_id
            current_id += 1
        unique_ids.append(unique_id)
        file_ids.append(file_id)
    gdf['unique_id'] = unique_ids
    gdf['file_id'] = file_ids
    gdfs.append(gdf)

    
# Combine all GeoDataFrames
combined_gdf = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True), crs=gdfs[0].crs)
combined_gdf.rename(columns={'file_id': 'trip_id'}, inplace=True)
combined_gdf['trip_id'] = pd.to_numeric(combined_gdf['trip_id'], errors='coerce')
combined_gdf

Processing dataframes: 100%|██████████| 20943/20943 [00:15<00:00, 1380.31it/s]


Unnamed: 0,geometry,category,unique_id,keys,trip_id
0,"LINESTRING (-43.58349 -22.94150, -43.58351 -22...",ideal,1,7212,7212
1,"LINESTRING (-43.58351 -22.94129, -43.58293 -22...",ideal,2,7212,7212
2,"LINESTRING (-43.58293 -22.94126, -43.58298 -22...",ideal,3,7212,7212
3,"LINESTRING (-43.58298 -22.94031, -43.58232 -22...",ideal,4,7212,7212
4,"LINESTRING (-43.58232 -22.94029, -43.58237 -22...",ideal,5,7212,7212
...,...,...,...,...,...
757728,"LINESTRING (-43.67520 -22.93416, -43.67515 -22...",ideal,38611,2740,2740
757729,"LINESTRING (-43.67515 -22.93429, -43.67515 -22...",ideal,38612,2740,2740
757730,"LINESTRING (-43.67515 -22.93434, -43.67517 -22...",ideal,38613,2740,2740
757731,"LINESTRING (-43.67517 -22.93439, -43.67522 -22...",ideal,38614,2740,2740


Map population sum to each trip_id

In [66]:

id_to_population_sum = dict(zip(young_data['Id'], young_data['total_population'])) # Map Id to population sum
combined_gdf['population_sum'] = combined_gdf['trip_id'].map(id_to_population_sum) 
if 'Id' not in combined_gdf.columns:
    combined_gdf['Id'] = range(1, len(combined_gdf) + 1)
combined_gdf

Unnamed: 0,geometry,category,unique_id,keys,trip_id,population_sum,Id
0,"LINESTRING (-43.58349 -22.94150, -43.58351 -22...",ideal,1,7212,7212,4.270653,1
1,"LINESTRING (-43.58351 -22.94129, -43.58293 -22...",ideal,2,7212,7212,4.270653,2
2,"LINESTRING (-43.58293 -22.94126, -43.58298 -22...",ideal,3,7212,7212,4.270653,3
3,"LINESTRING (-43.58298 -22.94031, -43.58232 -22...",ideal,4,7212,7212,4.270653,4
4,"LINESTRING (-43.58232 -22.94029, -43.58237 -22...",ideal,5,7212,7212,4.270653,5
...,...,...,...,...,...,...,...
757728,"LINESTRING (-43.67520 -22.93416, -43.67515 -22...",ideal,38611,2740,2740,59.749283,757729
757729,"LINESTRING (-43.67515 -22.93429, -43.67515 -22...",ideal,38612,2740,2740,59.749283,757730
757730,"LINESTRING (-43.67515 -22.93434, -43.67517 -22...",ideal,38613,2740,2740,59.749283,757731
757731,"LINESTRING (-43.67517 -22.93439, -43.67522 -22...",ideal,38614,2740,2740,59.749283,757732


In [67]:
category_sum = combined_gdf.groupby(['unique_id', 'category'])['population_sum'].sum().unstack(fill_value=0)  # Sum population by category
category_sum['max_population_sum'] = category_sum.max(axis=1) # Find the maximum population sum
category_sum['max_category'] = category_sum.idxmax(axis=1) # Find the category with the maximum population sum
category_sum

category,avoidable,ideal,preferable,unavoidable,max_population_sum,max_category
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.0,4.270653,0.0,0.0,4.270653,ideal
2,0.0,4.270653,0.0,0.0,4.270653,ideal
3,0.0,4.270653,0.0,0.0,4.270653,ideal
4,0.0,37.677616,0.0,0.0,37.677616,ideal
5,0.0,86.771126,0.0,0.0,86.771126,ideal
...,...,...,...,...,...,...
144242,0.0,12.859805,0.0,0.0,12.859805,ideal
144243,0.0,12.859805,0.0,0.0,12.859805,ideal
144244,0.0,12.859805,0.0,0.0,12.859805,ideal
144245,0.0,59.749283,0.0,0.0,59.749283,ideal


Merge with geometries

In [69]:
category_sum.reset_index(inplace=True)
unique_geometries = combined_gdf.drop_duplicates(subset='unique_id')
category_sum_with_geometry = pd.merge(category_sum, unique_geometries[['unique_id', 'geometry']], on='unique_id', how='left')

# Convert to gdf
category_sum_with_geometry_gdf = gpd.GeoDataFrame(category_sum_with_geometry, geometry='geometry')

Display category distribution

In [73]:
category_distribution = category_sum['max_category'].value_counts()
category_percentage = category_distribution / category_distribution.sum() * 100
category_percentage

max_category
ideal          87.321659
preferable      6.596370
avoidable       5.693745
unavoidable     0.388226
Name: count, dtype: float64

 Save results

In [None]:
file_path = os.path.join(intermediate_output_folder_path, 'route_segment_type.gpkg')
category_sum_with_geometry_gdf.to_file(file_path, driver='GPKG')