In [1]:
from pathlib import Path

import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the enriched route table from the results folder; the quote character
# is passed explicitly, as in the original call.
enriched_routes_csv = "../results/enriched_routes/beynes_routes_enriched.csv"
df = pd.read_csv(enriched_routes_csv, quotechar='"')



In [4]:

# Continuous features that are to be normalized.
continuous_features = [
    'free_flow_time', 'total_length',
    'mean_speed', 'speed_std', 'speed_range',
    'lane_changes_per_km', 'priority_changes_per_km', 'yield_priority_changes_per_km',
    'traffic_lights_per_km',
    'bearing_std', 'turns_per_km', 'left_yield_turns_per_km',
    'mean_circuity', 'edge_length_std', 'edges_per_km',
]

# Percentage features, noted by the author as already being on a 0-1 scale
# (so they are not normalized).
percentage_features = [
    'pct_high_speed',
    'pct_motorway', 'pct_trunk', 'pct_primary', 'pct_secondary',
    'pct_tertiary', 'pct_unclassified', 'pct_residential',
]

# Every feature fed to the model: continuous ones first, then the percentages.
features = [*continuous_features, *percentage_features]



In [5]:
import numpy as np

# Count missing values per model feature once, then print the full table
# followed by only those features that actually contain gaps.
missing_counts = df[features].isna().sum()
print("Braki danych w kolumnach:")
print(missing_counts)
print(missing_counts[missing_counts > 0])

# print(df['pct_unpaved'].value_counts())
# # print(df['bridges_per_km'].value_counts())
# columns_to_check = ['tunnels_per_km', 'bridges_per_km', 'pct_motorway', 'pct_trunk', 'pct_lit', 'pct_unpaved']

# # Liczenie zer i innych wartości
# for col in columns_to_check:
#     zeros = (df[col] == 0).sum()
#     non_zeros = (df[col] != 0).sum()
#     print(f"{col}: zeros = {zeros}, non-zeros = {non_zeros}")


Braki danych w kolumnach:
free_flow_time                   0
total_length                     0
mean_speed                       0
speed_std                        0
speed_range                      0
lane_changes_per_km              0
priority_changes_per_km          0
yield_priority_changes_per_km    0
traffic_lights_per_km            0
bearing_std                      0
turns_per_km                     0
left_yield_turns_per_km          0
mean_circuity                    0
edge_length_std                  0
edges_per_km                     0
pct_high_speed                   0
pct_motorway                     0
pct_trunk                        0
pct_primary                      0
pct_secondary                    0
pct_tertiary                     0
pct_unclassified                 0
pct_residential                  0
dtype: int64
Series([], dtype: int64)


In [6]:
# Standardize the continuous features on a copy of the data, so that `df`
# itself keeps the original values. Columns outside `continuous_features`
# are carried over unchanged.
scaler = StandardScaler()
df_scaled = df.copy()
standardized_values = scaler.fit_transform(df[continuous_features])
df_scaled[continuous_features] = standardized_values

# Scale the standardized mean_speed by a fixed factor.
# NOTE(review): presumably meant to make speed weigh more heavily in the
# clustering distance - confirm the choice of 10.0.
weight = 10.0
df_scaled['mean_speed'] *= weight

# Feature matrix in the column order given by `features`.
X = df_scaled[features].values

print("Dane gotowe do clusteringu:", X.shape)


Dane gotowe do clusteringu: (4048, 23)


In [7]:
from sklearn.cluster import KMeans
import pandas as pd


# Split the routes into three clusters with k-means; the fixed random_state
# keeps the random initialisation repeatable between runs.
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X)
labels = kmeans.labels_

# Attach the labels to the original DataFrame for analysis.
df['cluster'] = labels

# Check how many routes ended up in each cluster.
print(df['cluster'].value_counts())

# to_csv does not create missing directories, so make sure the output folder
# exists first; otherwise a fresh checkout fails here after the fit has run.
clustered_routes_csv = Path("../results/clustered_routes/beynes_routes_clustered.csv")
clustered_routes_csv.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(clustered_routes_csv, index=False)

cluster
0    2372
2    1298
1     378
Name: count, dtype: int64


In [8]:

# Mean of the mean_speed column over all routes, as a baseline.
mean_speed_col = df['mean_speed']
global_speed = mean_speed_col.mean()
print(global_speed)

# Mean of mean_speed within each cluster.
cluster_speed = mean_speed_col.groupby(df['cluster']).mean()
print(cluster_speed)

# Label of the cluster with the highest average speed.
fastest_cluster = cluster_speed.idxmax()
print(f"Najszybszy klaster to: {fastest_cluster}")

35.37449526412878
cluster
0    31.155113
1    47.238330
2    39.630155
Name: mean_speed, dtype: float64
Najszybszy klaster to: 1


In [9]:

# NOTE(review): the `*_speed` / `fastest_cluster` names in this cell hold
# free_flow_time statistics; they are left unchanged in case other cells
# read them.
free_flow_time_col = df['free_flow_time']
global_speed = free_flow_time_col.mean()
print(global_speed)

# Group once and reuse the grouping for both per-cluster statistics.
free_flow_time_by_cluster = free_flow_time_col.groupby(df['cluster'])

cluster_speed = free_flow_time_by_cluster.mean()
print(cluster_speed)

cluster_speed_std = free_flow_time_by_cluster.std()
print(cluster_speed_std)

# Label of the cluster with the smallest mean free_flow_time.
fastest_cluster = cluster_speed.idxmin()
print(f"Najszybszy(free_flow_time) klaster to: {fastest_cluster}")

4.062122524516334
cluster
0    2.872941
1    4.621395
2    6.072395
Name: free_flow_time, dtype: float64
cluster
0    1.285145
1    3.104947
2    3.366328
Name: free_flow_time, dtype: float64
Najszybszy(free_flow_time) klaster to: 0


In [10]:

# NOTE(review): the `*_speed` / `fastest_cluster` names in this cell hold
# total_length statistics; they are left unchanged in case other cells
# read them.
total_length_col = df['total_length']
global_speed = total_length_col.mean()
print(global_speed)

# Mean total_length within each cluster.
cluster_speed = total_length_col.groupby(df['cluster']).mean()
print(cluster_speed)

# Label of the cluster with the smallest mean total_length.
fastest_cluster = cluster_speed.idxmin()
print(f"Najkrótszy klaster to: {fastest_cluster}")

2839.1473568630813
cluster
0    1718.950619
1    4142.035611
2    4506.801365
Name: total_length, dtype: float64
Najkrótszy klaster to: 0


In [11]:

# NOTE(review): the `*_speed` / `fastest_cluster` names in this cell hold
# traffic_lights_per_km statistics; they are left unchanged in case other
# cells read them.
traffic_lights_col = df['traffic_lights_per_km']
global_speed = traffic_lights_col.mean()
print(global_speed)

# Mean traffic_lights_per_km within each cluster.
cluster_speed = traffic_lights_col.groupby(df['cluster']).mean()
print(cluster_speed)

# Label of the cluster with the smallest mean traffic_lights_per_km.
fastest_cluster = cluster_speed.idxmin()
print(f"Najmniej swiatel klaster to: {fastest_cluster}")

0.34546760705742124
cluster
0    0.429822
1    0.092148
2    0.265088
Name: traffic_lights_per_km, dtype: float64
Najmniej swiatel klaster to: 1
