In [13]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [14]:
# INPUTS:

df = pd.read_csv("generated_features.csv")
df

Unnamed: 0,longitude,latitude,num_parcel_machines,num_parking_spots,population_density_per_sqkm,num_bus_stops,num_retail_stores,num_restaurants_cafes,office_space_sqm,residential_units,...,accessibility_score_1_10,crime_rate_index_0_1,num_schools_univ,num_public_transport_lines,has_24_7_access_bool,dist_major_road_m,num_poi_general,foot_traffic_index_1_100,zoning_category,avg_household_income_pln_k
0,14.081522,52.977954,3,18,4204.869436,4,3,13,4027,3,...,2,0.891454,3,10,1,353.663921,2,26,3,3.172278
1,14.080502,52.986913,5,33,13338.516750,4,14,9,4391,372,...,10,0.013801,3,10,0,69.933507,19,83,3,12.032770
2,14.098398,52.960650,2,2,13419.330511,3,23,11,3372,460,...,8,0.708190,1,6,0,358.371557,30,86,3,8.678552
3,14.097382,52.969609,4,35,11506.063569,1,9,12,9562,334,...,6,0.373970,2,0,1,352.691188,41,84,2,6.323578
4,14.096365,52.978568,4,42,9723.853980,3,23,5,913,294,...,3,0.162139,2,8,1,183.629688,33,79,1,7.997408
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
308969,23.760091,51.121173,5,40,1247.807737,2,3,14,7871,129,...,3,0.272758,2,1,0,389.256994,45,60,2,9.862997
308970,23.761015,51.130137,5,6,1867.465613,0,14,20,2634,294,...,5,0.691966,1,5,0,58.735044,13,96,1,13.967261
308971,23.761939,51.139101,3,8,7060.637289,0,4,7,2777,194,...,5,0.092285,1,0,0,320.016726,6,53,4,9.179476
308972,23.762863,51.148064,5,33,14849.930287,3,22,10,3694,268,...,2,0.192998,2,10,1,108.754389,8,27,4,6.920888


# K-MEANS

In [15]:
# Parameters

n_clusters = 7

## Preprocessing

In [16]:
# Bool features coversion
df['has_24_7_access_bool'] = df['has_24_7_access_bool'].astype(int)

In [18]:
numerical_features = [
    'num_parking_spots', 'population_density_per_sqkm', 'num_bus_stops',
    'num_retail_stores', 'num_restaurants_cafes', 'office_space_sqm',
    'residential_units', 'avg_property_value_pln_sqm', 'num_competitor_machines',
    'visibility_score_1_10', 'accessibility_score_1_10', 'crime_rate_index_0_1',
    'num_schools_univ', 'num_public_transport_lines', 'has_24_7_access_bool',
    'dist_major_road_m', 'num_poi_general', 'foot_traffic_index_1_100',
    'avg_household_income_pln_k'
]
categorical_features = ['zoning_category']

numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)) # sparse_output=False dla łatwiejszego użycia z RandomForest
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ], 
    remainder='passthrough' # zachowaj kolumny nie wymienione (np. long, lat, num_parcel_machines)
                            # ale do modelowania użyjemy tylko przetworzonych
)

In [22]:
features_for_model_and_clustering = numerical_features + categorical_features
df_to_transform = df[features_for_model_and_clustering].copy()
features_for_model_and_clustering

['num_parking_spots',
 'population_density_per_sqkm',
 'num_bus_stops',
 'num_retail_stores',
 'num_restaurants_cafes',
 'office_space_sqm',
 'residential_units',
 'avg_property_value_pln_sqm',
 'num_competitor_machines',
 'visibility_score_1_10',
 'accessibility_score_1_10',
 'crime_rate_index_0_1',
 'num_schools_univ',
 'num_public_transport_lines',
 'has_24_7_access_bool',
 'dist_major_road_m',
 'num_poi_general',
 'foot_traffic_index_1_100',
 'avg_household_income_pln_k',
 'zoning_category']

In [24]:
processed_features_array = preprocessor.fit_transform(df_to_transform)
ohe_feature_names = preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_features)
processed_feature_names = numerical_features + list(ohe_feature_names)
processed_features_df = pd.DataFrame(processed_features_array, columns=processed_feature_names, index=df.index)


## K-MEANS Clustering

In [25]:
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
df['cluster_label'] = kmeans.fit_predict(processed_features_df)
print(f"Klastrowanie zakończone. Liczba klastrów: {n_clusters}")

Klastrowanie zakończone. Liczba klastrów: 7


# Regression model

## Model

In [29]:
target_imputer = SimpleImputer(strategy='median')
df["num_parcel_machines"] = target_imputer.fit_transform(df[["num_parcel_machines"]])
y_target = df["num_parcel_machines"]

In [None]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1, max_depth=10, min_samples_split=10)
print("Rozpoczynam trenowanie modelu RandomForestRegressor...")
rf_model.fit(processed_features_df, y_target)
print("Trenowanie modelu RandomForestRegressor zakończone.")

Rozpoczynam trenowanie modelu RandomForestRegressor...
Trenowanie modelu RandomForestRegressor zakończone.


In [42]:
feature_importance_df = pd.DataFrame({
    'Feature': processed_feature_names,
    'Importance': feature_importances
})
feature_importance_df = feature_importance_df.sort_values('Importance', ascending=False)

print("Feature Importances (sorted):")
print(feature_importance_df)

Feature Importances (sorted):
                        Feature  Importance
18   avg_household_income_pln_k    0.096143
11         crime_rate_index_0_1    0.095426
1   population_density_per_sqkm    0.095399
15            dist_major_road_m    0.092403
5              office_space_sqm    0.088833
7    avg_property_value_pln_sqm    0.086114
6             residential_units    0.070858
17     foot_traffic_index_1_100    0.055910
0             num_parking_spots    0.049775
16              num_poi_general    0.048069
3             num_retail_stores    0.041414
4         num_restaurants_cafes    0.037983
9         visibility_score_1_10    0.027159
13   num_public_transport_lines    0.026007
10     accessibility_score_1_10    0.025867
2                 num_bus_stops    0.016871
12             num_schools_univ    0.012773
8       num_competitor_machines    0.012307
21            zoning_category_3    0.004701
14         has_24_7_access_bool    0.004700
19            zoning_category_1    0.004135
22

## Score calculation

In [31]:
# --- 4. Obliczenie "Surowego Potencjału" (Raw Potential Score) ---
# Surowy potencjał = suma (przeskalowana_cecha * jej_waznosc_z_modelu_RF)
# Używamy `processed_features_df` które są już przeskalowane i zakodowane
df['raw_potential_score'] = np.sum(processed_features_df.values * feature_importances, axis=1)
print("Obliczono 'raw_potential_score'.")

Obliczono 'raw_potential_score'.


In [34]:
# --- 5. Obliczenie Finalnego "Opportunity Score" ---
# Opportunity Score = Raw Potential Score / (liczba_istniejacych_paczkomatow + epsilon)
# Epsilon zapobiega dzieleniu przez zero i lekko zmniejsza score, gdy paczkomaty już istnieją.
epsilon = 0.1 
df['opportunity_score'] = df['raw_potential_score'] / (df["num_parcel_machines"] + epsilon)


In [35]:
# Można dodatkowo przeskalować opportunity_score, np. do zakresu 0-100, jeśli to potrzebne
# MinMaxScaler dla opportunity_score
min_score = df['opportunity_score'].min()
max_score = df['opportunity_score'].max()
if max_score > min_score: # Uniknięcie dzielenia przez zero jeśli wszystkie wartości są takie same
        df['opportunity_score_scaled_0_100'] = 100 * (df['opportunity_score'] - min_score) / (max_score - min_score)
else:
        df['opportunity_score_scaled_0_100'] = 50 # lub inna domyślna wartość, jeśli wszystkie score'y są identyczne

print("Obliczono 'opportunity_score' i 'opportunity_score_scaled_0_100'.")

Obliczono 'opportunity_score' i 'opportunity_score_scaled_0_100'.


In [36]:
df

Unnamed: 0,longitude,latitude,num_parcel_machines,num_parking_spots,population_density_per_sqkm,num_bus_stops,num_retail_stores,num_restaurants_cafes,office_space_sqm,residential_units,...,has_24_7_access_bool,dist_major_road_m,num_poi_general,foot_traffic_index_1_100,zoning_category,avg_household_income_pln_k,cluster_label,raw_potential_score,opportunity_score,opportunity_score_scaled_0_100
0,14.081522,52.977954,3.0,18,4204.869436,4,3,13,4027,3,...,1,353.663921,2,26,3,3.172278,1,-0.394405,-0.127227,49.588483
1,14.080502,52.986913,5.0,33,13338.516750,4,14,9,4391,372,...,0,69.933507,19,83,3,12.032770,4,0.191652,0.037579,50.391941
2,14.098398,52.960650,2.0,2,13419.330511,3,23,11,3372,460,...,0,358.371557,30,86,3,8.678552,5,0.429839,0.204685,51.206612
3,14.097382,52.969609,4.0,35,11506.063569,1,9,12,9562,334,...,1,352.691188,41,84,2,6.323578,2,0.279989,0.068290,50.541663
4,14.096365,52.978568,4.0,42,9723.853980,3,23,5,913,294,...,1,183.629688,33,79,1,7.997408,2,-0.149843,-0.036547,50.030565
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
308969,23.760091,51.121173,5.0,40,1247.807737,2,3,14,7871,129,...,0,389.256994,45,60,2,9.862997,6,-0.118686,-0.023272,50.095284
308970,23.761015,51.130137,5.0,6,1867.465613,0,14,20,2634,294,...,0,58.735044,13,96,1,13.967261,4,-0.172512,-0.033826,50.043831
308971,23.761939,51.139101,3.0,8,7060.637289,0,4,7,2777,194,...,0,320.016726,6,53,4,9.179476,4,-0.599734,-0.193463,49.265575
308972,23.762863,51.148064,5.0,33,14849.930287,3,22,10,3694,268,...,1,108.754389,8,27,4,6.920888,3,-0.079985,-0.015683,50.132279
