In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the enriched route table that the following cells analyse.
ENRICHED_ROUTES_CSV = "../results/enriched_routes/beynes_routes_enriched.csv"
df = pd.read_csv(ENRICHED_ROUTES_CSV, quotechar='"')



In [2]:

# Continuous features to normalize.
# Commented-out entries are candidate features that are currently disabled.
continuous_features = [
    "free_flow_time",
    "total_length",
    "mean_speed",
    "speed_std",
    "speed_range",
    "lane_changes_per_km",
    "priority_changes_per_km",
    "yield_priority_changes_per_km",
    "traffic_lights_per_km",
    # "bearing_std",
    "turns_per_km",
    "left_yield_turns_per_km",
    # "mean_circuity",
    # "edge_length_std",
    # "edges_per_km",
]

# Percentage features on a 0-1 scale (they do not require normalization).
# Every entry is currently disabled, so this list is empty.
percentage_features = [
    # "pct_high_speed",
    # "pct_motorway",
    # "pct_trunk",
    # "pct_primary",
    # "pct_secondary",
    # "pct_tertiary",
    # "pct_unclassified",
    # "pct_residential",
]

# All features used in the model: continuous first, then percentage features.
features = [*continuous_features, *percentage_features]



In [3]:
import numpy as np

# Count NaNs per model feature once, then show the full table followed by
# only those features that actually contain missing values.
missing_per_feature = df[features].isna().sum()

print("Braki danych w kolumnach:")
print(missing_per_feature)
print(missing_per_feature[missing_per_feature > 0])

# print(df['pct_unpaved'].value_counts())
# # print(df['bridges_per_km'].value_counts())
# columns_to_check = ['tunnels_per_km', 'bridges_per_km', 'pct_motorway', 'pct_trunk', 'pct_lit', 'pct_unpaved']

# # Liczenie zer i innych wartości
# for col in columns_to_check:
#     zeros = (df[col] == 0).sum()
#     non_zeros = (df[col] != 0).sum()
#     print(f"{col}: zeros = {zeros}, non-zeros = {non_zeros}")


Braki danych w kolumnach:
free_flow_time                   0
total_length                     0
mean_speed                       0
speed_std                        0
speed_range                      0
lane_changes_per_km              0
priority_changes_per_km          0
yield_priority_changes_per_km    0
traffic_lights_per_km            0
turns_per_km                     0
left_yield_turns_per_km          0
dtype: int64
Series([], dtype: int64)


In [4]:
# Standardize the continuous features on a copy, so `df` keeps the raw values.
scaler = StandardScaler()
df_scaled = df.copy()
standardized_values = scaler.fit_transform(df[continuous_features])
df_scaled[continuous_features] = standardized_values

# Up-weight mean_speed: the standardized column is multiplied by `weight`,
# which enlarges its share of any distance computed on X.
weight = 10.0
df_scaled['mean_speed'] = df_scaled['mean_speed'].mul(weight)

# Plain NumPy matrix holding the model features, in `features` order.
X = df_scaled[features].to_numpy()

print("Dane gotowe do clusteringu:", X.shape)


Dane gotowe do clusteringu: (4048, 11)


In [6]:
from sklearn.cluster import KMeans
import pandas as pd


from pathlib import Path

# Partition the prepared feature matrix X into 5 clusters; random_state is
# fixed so the centroid initialisation is repeatable.
# NOTE(review): n_init is left at the library default, which differs between
# scikit-learn releases - consider pinning it if the labels must be identical
# across environments.
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(X)
labels = kmeans.labels_

# Attach the labels to the original (unscaled) DataFrame for analysis.
df['cluster'] = labels

# Check how many routes fall in each cluster.
print(df['cluster'].value_counts())

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
clustered_csv = Path("../results/clustered_routes/beynes_routes_clustered.csv")
clustered_csv.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(clustered_csv, index=False)

cluster
0    1336
2    1180
4    1021
1     298
3     213
Name: count, dtype: int64


In [7]:

# Mean of mean_speed over all routes, as a baseline for the per-cluster values.
global_speed = df['mean_speed'].mean()
print(global_speed)

# Mean speed within each cluster.
speed_by_cluster = df.groupby('cluster')['mean_speed']
cluster_speed = speed_by_cluster.mean()
print(cluster_speed)

# The fastest cluster is the label with the highest mean speed.
fastest_cluster = cluster_speed.idxmax()
print(f"Najszybszy klaster to: {fastest_cluster}")

35.37449526412878
cluster
0    30.275089
1    48.151823
2    40.451426
3    26.427647
4    34.316771
Name: mean_speed, dtype: float64
Najszybszy klaster to: 1


In [8]:

# NOTE(review): the `*_speed` / `fastest_cluster` names are carried over from
# the mean_speed cell; in this cell they hold free_flow_time statistics.
time_by_cluster = df.groupby('cluster')['free_flow_time']

# Mean free_flow_time over all routes.
global_speed = df['free_flow_time'].mean()
print(global_speed)

# Per-cluster mean and standard deviation of free_flow_time.
cluster_speed = time_by_cluster.mean()
print(cluster_speed)

cluster_speed_std = time_by_cluster.std()
print(cluster_speed_std)

# Cluster label with the lowest mean free_flow_time.
fastest_cluster = cluster_speed.idxmin()
print(f"Najszybszy(free_flow_time) klaster to: {fastest_cluster}")

4.062122524516334
cluster
0    2.518662
1    4.493584
2    6.387142
3    3.086669
4    3.472247
Name: free_flow_time, dtype: float64
cluster
0    1.171885
1    3.007594
2    3.404159
3    1.456169
4    1.455026
Name: free_flow_time, dtype: float64
Najszybszy(free_flow_time) klaster to: 0


In [8]:

# NOTE(review): the `*_speed` / `fastest_cluster` names are carried over from
# the mean_speed cell; in this cell they hold total_length statistics.
length_by_cluster = df.groupby('cluster')['total_length']

# Mean total_length over all routes.
global_speed = df['total_length'].mean()
print(global_speed)

# Mean total_length within each cluster.
cluster_speed = length_by_cluster.mean()
print(cluster_speed)

# Cluster label with the lowest mean total_length.
fastest_cluster = cluster_speed.idxmin()
print(f"Najkrótszy klaster to: {fastest_cluster}")

2839.1473568630813
cluster
0    1718.950619
1    4142.035611
2    4506.801365
Name: total_length, dtype: float64
Najkrótszy klaster to: 0


In [9]:

# NOTE(review): the `*_speed` / `fastest_cluster` names are carried over from
# the mean_speed cell; in this cell they hold traffic_lights_per_km statistics.
lights_by_cluster = df.groupby('cluster')['traffic_lights_per_km']

# Mean traffic_lights_per_km over all routes.
global_speed = df['traffic_lights_per_km'].mean()
print(global_speed)

# Mean traffic_lights_per_km within each cluster.
cluster_speed = lights_by_cluster.mean()
print(cluster_speed)

# Cluster label with the lowest mean traffic_lights_per_km.
fastest_cluster = cluster_speed.idxmin()
print(f"Najmniej swiatel klaster to: {fastest_cluster}")

0.34546760705742124
cluster
0    0.429822
1    0.092148
2    0.265088
Name: traffic_lights_per_km, dtype: float64
Najmniej swiatel klaster to: 1


In [9]:
# Columns that together identify an origin-destination (OD) pair.
od_keys = ['origins', 'destinations']

# Count how many paths share the same origin and destination; transform
# broadcasts the per-pair count of non-null `path` values back onto every row.
paths_per_od = df.groupby(od_keys)['path'].transform('count')
df['num_paths_same_od'] = paths_per_od
print(df[od_keys + ['num_paths_same_od']].head(10))

# Give every OD pair its own integer id; all paths of a pair share that id.
od_group_number = df.groupby(od_keys).ngroup()
df['route_id'] = od_group_number
print(df[od_keys + ['route_id']].head(10))

    origins destinations  num_paths_same_od
0  57680159   41273128#1                 20
1  57680159   41273128#1                 20
2  57680159   41273128#1                 20
3  57680159   41273128#1                 20
4  57680159   41273128#1                 20
5  57680159   41273128#1                 20
6  57680159   41273128#1                 20
7  57680159   41273128#1                 20
8  57680159   41273128#1                 20
9  57680159   41273128#1                 20
    origins destinations  route_id
0  57680159   41273128#1       203
1  57680159   41273128#1       203
2  57680159   41273128#1       203
3  57680159   41273128#1       203
4  57680159   41273128#1       203
5  57680159   41273128#1       203
6  57680159   41273128#1       203
7  57680159   41273128#1       203
8  57680159   41273128#1       203
9  57680159   41273128#1       203


In [10]:
import pandas as pd

# Lift pandas' display limits so the table below is printed without truncation.
for display_option in ("display.max_rows", "display.max_columns", "display.max_colwidth"):
    pd.set_option(display_option, None)

# Number of paths for every (route_id, cluster) pair that occurs.
pair_sizes = df.groupby(["route_id", "cluster"]).size()
counts = pair_sizes.reset_index(name="count")
print(counts)

# NOTE(review): size() only reports pairs that occur for these integer keys,
# so every count is >= 1 and this filter always comes back empty - confirm the
# intended threshold (e.g. routes split across several clusters).
dup = counts.loc[counts["count"].lt(1)]
print(dup)

# Shape of the array of distinct route ids, i.e. (number of OD pairs,).
print(df['route_id'].unique().shape)



     route_id  cluster  count
0           0        2     19
1           0        4      1
2           1        2     20
3           2        2     19
4           2        4      1
5           3        0     20
6           4        0     20
7           5        2     20
8           6        2     20
9           7        1     12
10          7        2      8
11          8        2      1
12          8        4      9
13          9        0     20
14         10        2     19
15         10        4      1
16         11        1     19
17         11        2      1
18         12        2      4
19         12        4     16
20         13        0     34
21         13        3      6
22         14        2     20
23         15        0     20
24         16        0      3
25         16        3      1
26         16        4     12
27         17        0     20
28         18        2     20
29         19        1      3
30         19        2     17
31         20        2     19
32        