In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [2]:
# INPUTS:

df = pd.read_csv("final_df.csv")
df.drop(columns=["Unnamed: 0", "geometry", "grid_id"], inplace=True)
df

Unnamed: 0,longitude,latitude,Allegro One Box,DHL BOX 24/7,DPD Pickup,Orlen Paczka,Paczkomat InPost,atm,bank,bus_stop,...,retail,school,secondary,suburb,subway,supermarket,tertiary,town,tram_stop,university
0,14.106323,52.956477,0,0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,14.104295,52.974395,0,0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,14.102263,52.992314,0,0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,14.140023,52.921858,0,0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,14.138011,52.939778,0,0,0,0,0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77268,23.762134,51.071580,0,0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
77269,23.763979,51.089508,0,0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
77270,23.765826,51.107436,0,0,0,0,0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
77271,23.767675,51.125364,0,0,0,0,0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# K-MEANS

In [3]:
# Parameters

n_clusters = 7

## Preprocessing

In [4]:
# all of the features are numerical except parcel machine columns as they are target variables
numerical_features = df.columns 

parcel_providers = [
    'Allegro One Box', 'DHL BOX 24/7', 'DPD Pickup', 'Orlen Paczka', 'Paczkomat InPost'
]
numerical_features = numerical_features[~numerical_features.isin(parcel_providers)]

# Define preprocessing steps for numerical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Create the preprocessor with only numerical transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features)
    ],
    remainder='drop'  # Keep other columns not specified
)


In [5]:
df_to_transform = df.copy()

In [6]:
# Apply preprocessing to the data
processed_features_array = preprocessor.fit_transform(df_to_transform)

# Use numerical feature names directly since all features are numerical
processed_feature_names = list(numerical_features)

# Create DataFrame with transformed features
processed_features_df = pd.DataFrame(processed_features_array, columns=processed_feature_names, index=df.index)


## K-MEANS Clustering

In [7]:
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
df['cluster_label'] = kmeans.fit_predict(processed_features_df)
print(f"Klastrowanie zakończone. Liczba klastrów: {n_clusters}")

Klastrowanie zakończone. Liczba klastrów: 7


# Regression model

## Model

In [8]:
# Calculate the total number of parcel machines from different providers
df['num_parcel_machines'] = df[parcel_providers].sum(axis=1)

# Set target variable for the regression model
y_target = df["num_parcel_machines"]

In [9]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1, max_depth=10, min_samples_split=10)
print("Rozpoczynam trenowanie modelu RandomForestRegressor...")
rf_model.fit(processed_features_df, y_target)
print("Trenowanie modelu RandomForestRegressor zakończone.")

Rozpoczynam trenowanie modelu RandomForestRegressor...
Trenowanie modelu RandomForestRegressor zakończone.


In [10]:
processed_features_df

Unnamed: 0,longitude,latitude,atm,bank,bus_stop,cafe,city,college,commercial,convenience,...,retail,school,secondary,suburb,subway,supermarket,tertiary,town,tram_stop,university
0,-2.087290,0.618154,-0.096629,-0.101721,-0.342474,-0.052147,-0.028791,-0.048658,-0.138877,-0.201934,...,-0.147629,-0.198476,-0.221333,-0.117161,-0.006453,-0.205103,-0.276981,-0.109745,-0.052838,-0.034287
1,-2.088104,0.631266,-0.096629,-0.101721,-0.342474,-0.052147,-0.028791,-0.048658,-0.138877,-0.201934,...,-0.147629,-0.198476,-0.221333,-0.117161,-0.006453,-0.205103,-0.276981,-0.109745,-0.052838,-0.034287
2,-2.088919,0.644377,-0.096629,-0.101721,-0.342474,-0.052147,-0.028791,-0.048658,-0.138877,-0.201934,...,-0.147629,-0.198476,-0.221333,-0.117161,-0.006453,-0.205103,-0.276981,-0.109745,-0.052838,-0.034287
3,-2.073777,0.592823,-0.096629,-0.101721,-0.342474,-0.052147,-0.028791,-0.048658,-0.138877,-0.201934,...,-0.147629,-0.198476,-0.221333,-0.117161,-0.006453,-0.205103,-0.276981,-0.109745,-0.052838,-0.034287
4,-2.074584,0.605935,-0.096629,-0.101721,-0.127386,-0.052147,-0.028791,-0.048658,-0.138877,-0.201934,...,0.288679,-0.198476,-0.221333,-0.117161,-0.006453,-0.205103,-0.071968,-0.109745,-0.052838,-0.034287
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77268,1.784743,-0.761051,-0.096629,-0.101721,-0.342474,-0.052147,-0.028791,-0.048658,-0.138877,-0.201934,...,-0.147629,-0.198476,-0.221333,-0.117161,-0.006453,-0.205103,-0.276981,-0.109745,-0.052838,-0.034287
77269,1.785483,-0.747933,-0.096629,-0.101721,-0.342474,-0.052147,-0.028791,-0.048658,-0.138877,-0.201934,...,-0.147629,-0.198476,-0.221333,-0.117161,-0.006453,-0.205103,-0.276981,-0.109745,-0.052838,-0.034287
77270,1.786224,-0.734815,-0.096629,-0.101721,0.302790,-0.052147,-0.028791,-0.048658,-0.138877,-0.201934,...,-0.147629,-0.198476,-0.221333,-0.117161,-0.006453,-0.205103,-0.276981,-0.109745,-0.052838,-0.034287
77271,1.786965,-0.721697,-0.096629,-0.101721,-0.127386,-0.052147,-0.028791,-0.048658,-0.138877,-0.201934,...,-0.147629,-0.198476,-0.221333,-0.117161,-0.006453,-0.205103,-0.276981,-0.109745,-0.052838,-0.034287


In [11]:
feature_importances = rf_model.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': processed_feature_names,
    'Importance': np.round(feature_importances, 4)
})
feature_importance_df = feature_importance_df.sort_values('Importance', ascending=False)

print("Feature Importances (sorted):")
print(feature_importance_df)

Feature Importances (sorted):
            Feature  Importance
10          footway      0.5458
15          parking      0.2070
28      supermarket      0.0322
1          latitude      0.0224
31        tram_stop      0.0192
0         longitude      0.0187
2               atm      0.0156
9       convenience      0.0142
21      residential      0.0132
12    living_street      0.0127
29         tertiary      0.0091
8        commercial      0.0085
17         pharmacy      0.0081
4          bus_stop      0.0075
25        secondary      0.0072
26           suburb      0.0059
24           school      0.0059
23           retail      0.0053
22       restaurant      0.0052
16       pedestrian      0.0052
3              bank      0.0046
18      post_office      0.0040
7           college      0.0033
11             fuel      0.0032
19          primary      0.0031
5              cafe      0.0026
13             mall      0.0024
32       university      0.0023
14     market_place      0.0017
6         

## Score calculation

In [12]:
# --- 4. Obliczenie "Surowego Potencjału" (Raw Potential Score) ---
# Surowy potencjał = suma (przeskalowana_cecha * jej_waznosc_z_modelu_RF)
# Używamy `processed_features_df` które są już przeskalowane i zakodowane
df['raw_potential_score'] = np.sum(processed_features_df.values * feature_importances, axis=1)
print("Obliczono 'raw_potential_score'.")

Obliczono 'raw_potential_score'.


In [13]:
# --- 5. Obliczenie Finalnego "Opportunity Score" ---
# Opportunity Score = Raw Potential Score / (liczba_istniejacych_paczkomatow + epsilon)
# Epsilon zapobiega dzieleniu przez zero i lekko zmniejsza score, gdy paczkomaty już istnieją.

epsilon = 0.1 
# Dla każdego providera osobno obliczamy opportunity score
for provider in parcel_providers:
    df[f'{provider}_opportunity_score'] = df['raw_potential_score'] / (df[provider] + epsilon)



In [14]:
# Można dodatkowo przeskalować opportunity_score, np. do zakresu 0-100, jeśli to potrzebne
# MinMaxScaler dla opportunity_score

# Dla każdego providera obliczamy znormalizowany opportunity score
for provider in parcel_providers:
        min_score = df[f'{provider}_opportunity_score'].min()
        max_score = df[f'{provider}_opportunity_score'].max()
        df[f'scaled_{provider}_opportunity_score'] = 100 * (df[f'{provider}_opportunity_score'] - min_score) / (max_score - min_score)

print("Obliczono 'opportunity_score' i 'opportunity_score_scaled_0_100'.")

Obliczono 'opportunity_score' i 'opportunity_score_scaled_0_100'.


In [15]:
df

Unnamed: 0,longitude,latitude,Allegro One Box,DHL BOX 24/7,DPD Pickup,Orlen Paczka,Paczkomat InPost,atm,bank,bus_stop,...,Allegro One Box_opportunity_score,DHL BOX 24/7_opportunity_score,DPD Pickup_opportunity_score,Orlen Paczka_opportunity_score,Paczkomat InPost_opportunity_score,scaled_Allegro One Box_opportunity_score,scaled_DHL BOX 24/7_opportunity_score,scaled_DPD Pickup_opportunity_score,scaled_Orlen Paczka_opportunity_score,scaled_Paczkomat InPost_opportunity_score
0,14.106323,52.956477,0,0,0,0,0,0.0,0.0,0.0,...,-1.654857,-1.654857,-1.654857,-1.654857,-1.654857,0.110348,0.097301,0.090467,0.086520,0.493117
1,14.104295,52.974395,0,0,0,0,0,0.0,0.0,0.0,...,-1.652074,-1.652074,-1.652074,-1.652074,-1.652074,0.111560,0.098370,0.091461,0.087471,0.498534
2,14.102263,52.992314,0,0,0,0,0,0.0,0.0,0.0,...,-1.649292,-1.649292,-1.649292,-1.649292,-1.649292,0.112772,0.099438,0.092455,0.088421,0.503952
3,14.140023,52.921858,0,0,0,0,0,0.0,0.0,0.0,...,-1.658000,-1.658000,-1.658000,-1.658000,-1.658000,0.108978,0.096093,0.089345,0.085447,0.486998
4,14.138011,52.939778,0,0,0,0,0,0.0,0.0,1.0,...,-1.559194,-1.559194,-1.559194,-1.559194,-1.559194,0.152023,0.134048,0.124634,0.119197,0.679353
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77268,23.762134,51.071580,0,0,0,0,0,0.0,0.0,0.0,...,-1.239561,-1.239561,-1.239561,-1.239561,-1.239561,0.291270,0.256832,0.238794,0.228376,1.301616
77269,23.763979,51.089508,0,0,0,0,0,0.0,0.0,0.0,...,-1.236486,-1.236486,-1.236486,-1.236486,-1.236486,0.292610,0.258013,0.239893,0.229427,1.307602
77270,23.765826,51.107436,0,0,0,0,0,0.0,0.0,3.0,...,-1.177148,-1.177148,-1.177148,-1.177148,-1.177148,0.318460,0.280806,0.261085,0.249695,1.423120
77271,23.767675,51.125364,0,0,0,0,0,0.0,0.0,1.0,...,-1.210320,-1.210320,-1.210320,-1.210320,-1.210320,0.304009,0.268064,0.249238,0.238364,1.358542


In [16]:
df.to_csv("final_df_with_opportunity_scores.csv", index=False)