# Load Data

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv("All_Routes_Combined.csv")

In [4]:
df.head(1)

Unnamed: 0,id,name,distance_m,duration_s,ascent_m,descent_m,steps,turns,surface,waytype,waycategory,steepness
0,6101627,Ciclopedonale Lago Ghirla,1885.5,377.1,[51.8],[93.8],2,0,"[[0, 37, 3], [37, 38, 0], [38, 62, 3]]","[[0, 38, 6], [38, 62, 2]]","[[0, 62, 0]]","[[0, 7, -4], [7, 33, 1], [33, 39, -2], [39, 41..."


# Data Cleaning

## Descent & Ascent

In [5]:
# Clean values for ascent and descent
df['ascent_m'] = df['ascent_m'].astype(str).str.strip("[]").astype(float)
df['descent_m'] = df['descent_m'].astype(str).str.strip("[]").astype(float)

## Null Values

In [6]:
df['name'].fillna("Unnamed route", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['name'].fillna("Unnamed route", inplace=True)


In [7]:
# Check for null values
df.isna().sum()

id               0
name             0
distance_m     662
duration_s     662
ascent_m         0
descent_m        0
steps            0
turns            0
surface          0
waytype          0
waycategory      0
steepness        0
dtype: int64

In [8]:
# Drop null values
df = df.dropna()

## Surface Data

In [9]:
# Decode surface values
surface_map = {
    0: "Unknown",
    1: "Paved",
    2: "Unpaved",
    3: "Asphalt",
    4: "Concrete",
    5: "Cobblestone",
    6: "Metal",
    7: "Wood",
    8: "Compacted Gravel",
    9: "Fine Gravel",
    10: "Gravel",
    11: "Dirt",
    12: "Ground",
    13: "Ice",
    14: "Paving Stones",
    15: "Sand",
    16: "Woodchips",
    17: "Grass",
    18: "Grass Paver"
}

In [10]:
import ast

In [11]:
def calc_surface_percentages(surface_data):
    try:
        # Convert string to list if needed
        if isinstance(surface_data, str):
            surface_data = ast.literal_eval(surface_data)

        # Compute segment lengths
        total_length = 0
        surface_lengths = {}

        for seg in surface_data:
            start, end, surf_code = seg
            length = end - start
            total_length += length
            surface_lengths[surf_code] = surface_lengths.get(surf_code, 0) + length

        # Convert to percentage
        for surf in surface_lengths:
            surface_lengths[surf] = round(surface_lengths[surf] / total_length * 100, 2)

        return surface_lengths

    except Exception as e:
        return {}

In [12]:
df['surface_pct'] = df['surface'].apply(calc_surface_percentages)

In [13]:
surface_df = df['surface_pct'].apply(pd.Series)
surface_df = surface_df.rename(columns=surface_map).fillna(0)

In [14]:
df = pd.concat([df, surface_df], axis=1)

In [15]:
df = df.drop(['surface','surface_pct'], axis=1)

In [16]:
surface_df.head()

Unnamed: 0,Asphalt,Unknown,Concrete,Compacted Gravel,Wood,Paved,Gravel,Ground,Dirt,Paving Stones,Unpaved,Grass,Metal,Sand,Grass Paver
0,98.39,1.61,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,96.21,3.79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Waytype

In [17]:
waytype_map = {
    0: "Unknown",
    1: "State Road",
    2: "Road",
    3: "Street",
    4: "Path",
    5: "Track",
    6: "Cycleway",
    7: "Footway",
    8: "Steps",
    9: "Ferry",
    10: "Construction"
}

In [18]:
import ast

def calc_waytype_percentages(waytype_data):
    try:
        # Convert string repr to list if needed
        if isinstance(waytype_data, str):
            waytype_data = ast.literal_eval(waytype_data)

        total_length = 0
        waytype_lengths = {}

        for seg in waytype_data:
            start, end, code = seg
            length = end - start
            total_length += length
            waytype_lengths[code] = waytype_lengths.get(code, 0) + length

        # Convert to percentages
        for code in waytype_lengths:
            waytype_lengths[code] = round(waytype_lengths[code] / total_length * 100, 2)

        return waytype_lengths

    except Exception as e:
        return {}


In [19]:
df['wtype_pct'] = df['waytype'].apply(calc_waytype_percentages)

In [20]:
wtype_df = df['wtype_pct'].apply(pd.Series)
wtype_df = wtype_df.rename(columns=waytype_map).fillna(0)

In [21]:
df = pd.concat([df, wtype_df], axis=1)

In [22]:
df = df.drop(['waytype','wtype_pct'], axis=1)

In [23]:
wtype_df.head()

Unnamed: 0,Cycleway,Road,State Road,Street,Track,Path,Footway,Steps,Construction,Ferry,Unknown
0,61.29,38.71,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Drop Waycategory

In [24]:
df = df.drop(['waycategory'], axis=1)

## Steepness

In [25]:
steepness_map = {
    -5: "downhill_extreme (<-15%)",
    -4: "downhill_very_steep (-15% to -10%)",
    -3: "downhill_steep (-10% to -7%)",
    -2: "downhill_moderate (-7% to -5%)",
    -1: "downhill_gentle (-5% to 0%)",
     0: "flat (0%)",
     1: "uphill_gentle (0% to 3%)",
     2: "uphill_moderate (3% to 5%)",
     3: "uphill_steep (5% to 7%)",
     4: "uphill_very_steep (7% to 10%)",
     5: "uphill_extreme (>10%)"
}


In [26]:
import ast

def calc_steep_percentages(steep_data):
    try:
        # Convert string repr to list if needed
        if isinstance(steep_data, str):
            steep_data = ast.literal_eval(steep_data)

        total_length = 0
        steep_lengths = {}

        for seg in steep_data:
            start, end, code = seg
            length = end - start
            total_length += length
            steep_lengths[code] = steep_lengths.get(code, 0) + length

        # Convert to percentages
        for code in steep_lengths:
            steep_lengths[code] = round(steep_lengths[code] / total_length * 100, 2)

        return steep_lengths

    except Exception as e:
        return {}


In [27]:
df['steep_pct'] = df['steepness'].apply(calc_steep_percentages)

In [28]:
steep_df = df['steep_pct'].apply(pd.Series)
steep_df = steep_df.rename(columns=steepness_map).fillna(0)

In [29]:
df = pd.concat([df, steep_df], axis=1)

In [30]:
df = df.drop(['steepness','steep_pct'], axis=1)

In [31]:
df.shape

(16878, 45)

In [32]:
df

Unnamed: 0,id,name,distance_m,duration_s,ascent_m,descent_m,steps,turns,Asphalt,Unknown,...,uphill_gentle (0% to 3%),downhill_moderate (-7% to -5%),downhill_extreme (<-15%),downhill_gentle (-5% to 0%),uphill_moderate (3% to 5%),uphill_extreme (>10%),downhill_steep (-10% to -7%),uphill_very_steep (7% to 10%),uphill_steep (5% to 7%),flat (0%)
0,6101627,Ciclopedonale Lago Ghirla,1885.5,377.1,51.8,93.8,2,0,98.39,1.61,...,75.81,9.68,3.23,0.00,0.00,0.00,0.0,0.00,0.00,0.0
1,10187640,Tour du Léman - Étape 4,4138.4,827.6,164.5,173.5,2,0,100.00,0.00,...,67.80,4.24,1.69,16.95,9.32,0.00,0.0,0.00,0.00,0.0
2,12509770,La Madeleine Nord,24714.5,4942.9,2182.8,670.8,2,0,96.21,3.79,...,20.85,1.56,1.56,7.69,29.77,7.69,1.9,12.93,15.16,0.0
3,15361105,Unnamed route,984.9,197.0,14.5,21.5,2,0,100.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.00,0.00,100.0
4,15630528,Unnamed route,1662.8,332.5,31.1,9.1,2,0,100.00,0.00,...,100.00,0.00,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17494,9742054,CN 05,74.1,14.8,0.0,0.0,2,0,100.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.00,0.00,100.0
17495,10421910,Pas de Bonnet (886 m) depuis Champtercier,278.5,55.7,22.0,0.0,2,0,100.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.00,100.00,0.0
17496,10421955,Col d'Espinouse (838 m) depuis la D12,268.7,53.7,31.3,0.3,2,0,100.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.0,66.67,33.33,0.0
17497,9314026,Ciclostrada Terre dell'Ovest - Dalla Dora ai p...,142.6,28.5,0.0,2.0,2,0,33.33,66.67,...,0.00,0.00,0.00,100.00,0.00,0.00,0.0,0.00,0.00,0.0


In [34]:
df.to_csv('All_Routes_Processed.csv', index=False)