In [1]:
import os

from pathlib import Path

import numpy as np
import pandas as pd
import geopandas as gpd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# 1. Dataset preparation + train/val/test split

In [2]:
DATA_DIR = Path(os.getcwd()).parent / "data" / "train_data"

df = pd.read_csv(DATA_DIR / "ukraine_all_data.csv")
df.head()

Unnamed: 0,cell,city,is_positive,geometry,bench_count,cafe_count,pharmacy_count,waste_disposal_count,atm_count,post_office_count,...,wash_count,water_crane_count,farmland_count,forest_count,grass_count,allotments_count,meadow_count,cemetery_count,orchard_count,farmyard_count
0,881e7159e9fffff,Ярмолинці,1,POLYGON ((26.83393817535367 49.192988306649454...,0,0,1,0,0,1,...,0,0,,,1.0,,,,,
1,881e6d3a5bfffff,Тростянець,1,POLYGON ((34.95934457201796 50.473494753294666...,11,1,4,0,3,2,...,0,0,1.0,,34.0,2.0,,,,
2,881e6824dbfffff,Полтава,1,"POLYGON ((34.49924943744107 49.5775947405843, ...",10,1,2,3,1,1,...,0,0,,,8.0,,,,,
3,881e6c32c3fffff,Ромни,1,"POLYGON ((33.49082533801754 50.7484969657831, ...",0,0,1,0,0,0,...,0,0,,,,,,,,
4,881e652d99fffff,Чернігів,1,POLYGON ((31.283317311257992 51.49889221417866...,0,0,0,0,0,0,...,0,0,,,21.0,,,,,


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7084 entries, 0 to 7083
Columns: 107 entries, cell to farmyard_count
dtypes: float64(18), int64(85), object(4)
memory usage: 5.8+ MB


In [4]:
sorted(df.columns.to_list())

['allotments_count',
 'apartments_count',
 'apteka_dobrego_dnya_count',
 'apteka_nyzkykh_zin_count',
 'apteka_optovykh_zin_count',
 'apteka_podorozhnyk_count',
 'atb_count',
 'atm_count',
 'bank_count',
 'bar_count',
 'bench_count',
 'bicycle_parking_count',
 'border_count',
 'buffer_stop_count',
 'bureau_de_change_count',
 'cafe_count',
 'car_wash_count',
 'cell',
 'cemetery_count',
 'city',
 'clinic_count',
 'community_centre_count',
 'construction_count',
 'crossing_count',
 'crossover_count',
 'dentist_count',
 'derail_count',
 'detached_count',
 'doctors_count',
 'drinking_water_count',
 'eko_market_count',
 'farmland_count',
 'farmyard_count',
 'fast_food_count',
 'fora_count',
 'forest_count',
 'fuel_count',
 'garage_count',
 'garages_count',
 'geometry',
 'grass_count',
 'halt_count',
 'house_count',
 'hump_yard_count',
 'industrial_count',
 'is_positive',
 'junction_count',
 'level_crossing;crossing_count',
 'level_crossing_count',
 'library_count',
 'meadow_count',
 'mileston

In [56]:
df_data.columns.to_list()

['is_positive',
 'bench_count',
 'cafe_count',
 'pharmacy_count',
 'waste_disposal_count',
 'atm_count',
 'post_office_count',
 'bank_count',
 'restaurant_count',
 'waste_basket_count',
 'fuel_count',
 'shelter_count',
 'toilets_count',
 'fast_food_count',
 'place_of_worship_count',
 'bicycle_parking_count',
 'parking_count',
 'bar_count',
 'dentist_count',
 'drinking_water_count',
 'clinic_count',
 'car_wash_count',
 'payment_terminal_count',
 'recycling_count',
 'library_count',
 'school_count',
 'community_centre_count',
 'vending_machine_count',
 'pub_count',
 'bureau_de_change_count',
 'doctors_count',
 'atb_count',
 'novus_count',
 'eko_market_count',
 'fora_count',
 'apteka_nyzkykh_zin_count',
 'apteka_optovykh_zin_count',
 'apteka_dobrego_dnya_count',
 'apteka_podorozhnyk_count',
 'yes_count',
 'house_count',
 'detached_count',
 'apartments_count',
 'residential_count',
 'shed_count',
 'industrial_count',
 'garage_count',
 'garages_count',
 'retail_count',
 'level_crossing_coun

In [5]:
metadata_cols = ["cell", "city", "geometry", "wkt"]

df_metadata = df[metadata_cols]
df_data = df.drop(columns=metadata_cols)
target_col = "is_positive"

In [6]:
city_vc = df_metadata[df_data['is_positive'].astype(bool)]['city'].value_counts()
city_vc[city_vc > 20]

city
Київ             152
Дніпро            67
Харків            67
Кривий Ріг        52
Запоріжжя         42
Одеса             37
Полтава           36
Миколаїв          32
Львів             25
Кропивницький     23
Суми              23
Чернігів          23
Кам'янське        22
Name: count, dtype: int64

In [7]:
cols_with_nan = df_data.columns[df_data.isna().any()].to_list()

# fill cols with nan with 0 since they all are counts
df_data[cols_with_nan] = df_data[cols_with_nan].fillna(0)

In [8]:
test_cities = ['Львів', 'Харків', 'Одеса']

df_data_test = df_data[df_metadata['city'].isin(test_cities)]
df_data_train = df_data[~df_metadata['city'].isin(test_cities)]

X_train = df_data_train.drop(columns=[target_col])
y_train = df_data_train[target_col]

X_test = df_data_test.drop(columns=[target_col])
y_test = df_data_test[target_col]

In [31]:
X_test

Unnamed: 0,bench_count,cafe_count,pharmacy_count,waste_disposal_count,atm_count,post_office_count,bank_count,restaurant_count,waste_basket_count,fuel_count,...,wash_count,water_crane_count,farmland_count,forest_count,grass_count,allotments_count,meadow_count,cemetery_count,orchard_count,farmyard_count
351,0,0,0,0,0,0,0,0,0,0,...,0,0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0
359,4,5,10,11,4,4,5,0,0,0,...,0,0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
400,0,0,0,0,0,0,0,0,0,0,...,0,0,0.0,1.0,13.0,1.0,0.0,0.0,0.0,0.0
401,0,1,5,0,1,2,2,0,0,0,...,0,0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0
407,0,1,5,1,5,2,1,2,0,0,...,0,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6580,0,1,1,0,0,1,1,0,0,0,...,0,0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
6581,0,0,0,0,0,0,0,0,0,0,...,0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
6582,0,0,0,0,0,0,0,0,0,0,...,0,0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6583,0,0,0,0,0,0,0,0,0,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

print(f"Train set size: {X_train.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

Train set size: 5244
Validation set size: 1312
Test set size: 528


In [32]:
print(f"Number of positive samples in train set: {y_train.sum()} | negative: {len(y_train) - y_train.sum()}")
print(f"Number of positive samples in validation set: {y_val.sum()} | negative: {len(y_val) - y_val.sum()}")
print(f"Number of positive samples in test set: {y_test.sum()} | negative: {len(y_test) - y_test.sum()}")

Number of positive samples in train set: 1255 | negative: 3989
Number of positive samples in validation set: 314 | negative: 998
Number of positive samples in test set: 129 | negative: 399


In [10]:
# check X_train types
X_train.dtypes.unique()

array([dtype('int64'), dtype('float64')], dtype=object)

In [11]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [12]:
data = {
    "X_train_scaled": X_train_scaled,
    "y_train": y_train,
    "X_val_scaled": X_val_scaled,
    "y_val": y_val,
    "X_test_scaled": X_test_scaled,
    "y_test": y_test,
}

# 2. Training

In [13]:
def compute_metrics(data, model, model_name, verbose=True):
    results = {}
    splits = ['train', 'val', 'test']

    for split in splits:
        X = data[f'X_{split}_scaled']
        y = data[f'y_{split}']
        
        y_pred = model.predict(X)
        accuracy = accuracy_score(y, y_pred)
        report = classification_report(y, y_pred, output_dict=True)
        f1_score = report['1']['f1-score']
        precision = report['1']['precision']
        recall = report['1']['recall']

        to_percantage = lambda x: np.round(x * 100, 3)

        results[split] = {
            'accuracy': to_percantage(accuracy),
            'f1_score': to_percantage(f1_score),
            'precision': to_percantage(precision),
            'recall': to_percantage(recall),
        }
    
        if verbose:
            print(f"{model_name} - {split} set metrics:")
            print(f"Accuracy: {accuracy:.4f}, F1 Score: {f1_score:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")
            print(classification_report(y, y_pred))

    return results


## 2.1. Baseine Logistic regression

In [14]:
# checj which columns has NaN values
nan_cols = X_train.columns[X_train.isna().any()].tolist()
if nan_cols:
    print(f"Columns with NaN values: {nan_cols}")

In [15]:
print("Any NaN?", np.isnan(X_train_scaled).any())
print("Any ±inf?", np.isinf(X_train_scaled).any())
print("Feature ranges:", X_train_scaled.min(axis=0), X_train_scaled.max(axis=0))

Any NaN? False
Any ±inf? False
Feature ranges: [-0.14527252 -0.18696475 -0.21801917 -0.15707255 -0.18763595 -0.28459122
 -0.18749523 -0.15580738 -0.08258855 -0.22451987 -0.06224243 -0.18748882
 -0.15292978 -0.173111   -0.10676861 -0.12823332 -0.13959524 -0.16383576
 -0.13106569 -0.12644685 -0.19946854 -0.09300622 -0.11726633 -0.17021339
 -0.11773076 -0.0654196  -0.14688009 -0.14510222 -0.1123136  -0.06131132
 -0.19427445 -0.04572132 -0.07292416 -0.12368431 -0.08648482 -0.17192718
 -0.07884733 -0.11424363 -0.54424159 -0.29092205 -0.12367565 -0.36047254
 -0.48714007 -0.10400653 -0.46920825 -0.08751085 -0.24087503 -0.04539758
 -0.17536402 -0.09775393 -0.15029888 -0.12365742 -0.07343058 -0.11000028
 -0.11548891 -0.06058068 -0.15782168 -0.06938515 -0.04146321 -0.07095008
 -0.06319329 -0.02392509 -0.3328586  -0.01953289 -0.01381052 -0.02610583
 -0.03223822  0.         -0.01953289 -0.01381052 -0.01381052 -0.2502887
 -0.01381052 -0.01381052 -0.01381052  0.          0.          0.
  0.         

In [16]:
lr = LogisticRegression(penalty='l2', C=0.1, solver='lbfgs', max_iter=1000, random_state=42)

lr.fit(X_train_scaled, y_train)

  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.1
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,1000


In [17]:
lr_metrics = compute_metrics(
    data, lr, "Logistic Regression", verbose=True
)

Logistic Regression - train set metrics:
Accuracy: 0.9268, F1 Score: 0.8323, Precision: 0.9208, Recall: 0.7594
              precision    recall  f1-score   support

           0       0.93      0.98      0.95      3989
           1       0.92      0.76      0.83      1255

    accuracy                           0.93      5244
   macro avg       0.92      0.87      0.89      5244
weighted avg       0.93      0.93      0.92      5244

Logistic Regression - val set metrics:
Accuracy: 0.9177, F1 Score: 0.8118, Precision: 0.8962, Recall: 0.7420
              precision    recall  f1-score   support

           0       0.92      0.97      0.95       998
           1       0.90      0.74      0.81       314

    accuracy                           0.92      1312
   macro avg       0.91      0.86      0.88      1312
weighted avg       0.92      0.92      0.91      1312

Logistic Regression - test set metrics:
Accuracy: 0.9072, F1 Score: 0.8350, Precision: 0.7381, Recall: 0.9612
              pr

  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b


In [18]:
print(f"Feature importances (coefficients):")
sorted_indices = np.argsort(lr.coef_[0])

# print the top 10 positive and negative coefficients
top_positive_indices = sorted_indices[-10:]
top_negative_indices = sorted_indices[:10]

print("Top 10 positive coefficients:")
for idx in top_positive_indices:
    print(f"{X_train.columns[idx]}: {lr.coef_[0][idx]:.4f}")

print("\nTop 10 negative coefficients:")
for idx in top_negative_indices:
    print(f"{X_train.columns[idx]}: {lr.coef_[0][idx]:.4f}")

Feature importances (coefficients):
Top 10 positive coefficients:
waste_basket_count: 0.5175
atb_count: 0.6002
community_centre_count: 0.6667
apteka_optovykh_zin_count: 0.6753
atm_count: 0.7341
retail_count: 0.9116
bank_count: 1.1002
pharmacy_count: 1.2099
post_office_count: 1.5512
apartments_count: 1.6885

Top 10 negative coefficients:
restaurant_count: -0.7854
clinic_count: -0.5847
stop_position_count: -0.5212
apteka_dobrego_dnya_count: -0.4465
novus_count: -0.4155
buffer_stop_count: -0.4024
doctors_count: -0.3862
fast_food_count: -0.3711
farmland_count: -0.3566
payment_terminal_count: -0.3226


## 2.1. Decision Tree + Random Forest

In [19]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Train a Decision Tree Classifier
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_scaled, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [20]:
dt_metrics = compute_metrics(
    data, dt, "Decision Tree Classifier", verbose=True
)

Decision Tree Classifier - train set metrics:
Accuracy: 0.9992, F1 Score: 0.9984, Precision: 1.0000, Recall: 0.9968
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3989
           1       1.00      1.00      1.00      1255

    accuracy                           1.00      5244
   macro avg       1.00      1.00      1.00      5244
weighted avg       1.00      1.00      1.00      5244

Decision Tree Classifier - val set metrics:
Accuracy: 0.9070, F1 Score: 0.8076, Precision: 0.8000, Recall: 0.8153
              precision    recall  f1-score   support

           0       0.94      0.94      0.94       998
           1       0.80      0.82      0.81       314

    accuracy                           0.91      1312
   macro avg       0.87      0.88      0.87      1312
weighted avg       0.91      0.91      0.91      1312

Decision Tree Classifier - test set metrics:
Accuracy: 0.8504, F1 Score: 0.7476, Precision: 0.6359, Recall: 0.9070
 

In [21]:
rf = RandomForestClassifier(random_state=42)

rf.fit(X_train_scaled, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [22]:
rf_metrics = compute_metrics(
    data, rf, "Random Forest Classifier", verbose=True
)

Random Forest Classifier - train set metrics:
Accuracy: 0.9992, F1 Score: 0.9984, Precision: 1.0000, Recall: 0.9968
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3989
           1       1.00      1.00      1.00      1255

    accuracy                           1.00      5244
   macro avg       1.00      1.00      1.00      5244
weighted avg       1.00      1.00      1.00      5244

Random Forest Classifier - val set metrics:
Accuracy: 0.9360, F1 Score: 0.8671, Precision: 0.8616, Recall: 0.8726
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       998
           1       0.86      0.87      0.87       314

    accuracy                           0.94      1312
   macro avg       0.91      0.91      0.91      1312
weighted avg       0.94      0.94      0.94      1312

Random Forest Classifier - test set metrics:
Accuracy: 0.8693, F1 Score: 0.7837, Precision: 0.6579, Recall: 0.9690
 

## pruning (DecisionTree only)

In [23]:
from sklearn.model_selection import GridSearchCV

dt_prunned = DecisionTreeClassifier(random_state=42)
dt_prunned.fit(X_train_scaled, y_train)

path = dt_prunned.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas  # list of candidate alphas

In [24]:
param_grid = {'ccp_alpha': ccp_alphas}
grid = GridSearchCV(DecisionTreeClassifier(random_state=0),
                    param_grid, cv=5, scoring='f1', n_jobs=-1)
grid.fit(X_train, y_train)

best_tree = grid.best_estimator_

In [25]:
best_dt_metrics = compute_metrics(
    data, best_tree, "Best Decision Tree Classifier", verbose=True
)

Best Decision Tree Classifier - train set metrics:
Accuracy: 0.8600, F1 Score: 0.6015, Precision: 0.9438, Recall: 0.4414
              precision    recall  f1-score   support

           0       0.85      0.99      0.92      3989
           1       0.94      0.44      0.60      1255

    accuracy                           0.86      5244
   macro avg       0.90      0.72      0.76      5244
weighted avg       0.87      0.86      0.84      5244

Best Decision Tree Classifier - val set metrics:
Accuracy: 0.8575, F1 Score: 0.5890, Precision: 0.9504, Recall: 0.4268
              precision    recall  f1-score   support

           0       0.85      0.99      0.91       998
           1       0.95      0.43      0.59       314

    accuracy                           0.86      1312
   macro avg       0.90      0.71      0.75      1312
weighted avg       0.87      0.86      0.84      1312

Best Decision Tree Classifier - test set metrics:
Accuracy: 0.8769, F1 Score: 0.7111, Precision: 0.8333, R



## Hyperparameter search for Random Forest

In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [10, 25, 50, 100],
    'max_depth': [None, 1, 3, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'max_features': ['sqrt', 'log2']
}

grid = GridSearchCV(RandomForestClassifier(random_state=0),
                    param_grid, cv=5, scoring='f1', n_jobs=-1)
grid.fit(X_train_scaled, y_train)

print("Best params:", grid.best_params_)
best_rf = grid.best_estimator_


Best params: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


In [27]:
best_rf_metrics = compute_metrics(
    data, best_rf, "Best Random Forest Classifier", verbose=True
)

Best Random Forest Classifier - train set metrics:
Accuracy: 0.9992, F1 Score: 0.9984, Precision: 1.0000, Recall: 0.9968
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3989
           1       1.00      1.00      1.00      1255

    accuracy                           1.00      5244
   macro avg       1.00      1.00      1.00      5244
weighted avg       1.00      1.00      1.00      5244

Best Random Forest Classifier - val set metrics:
Accuracy: 0.9337, F1 Score: 0.8643, Precision: 0.8471, Recall: 0.8822
              precision    recall  f1-score   support

           0       0.96      0.95      0.96       998
           1       0.85      0.88      0.86       314

    accuracy                           0.93      1312
   macro avg       0.90      0.92      0.91      1312
weighted avg       0.93      0.93      0.93      1312

Best Random Forest Classifier - test set metrics:
Accuracy: 0.8674, F1 Score: 0.7812, Precision: 0.6545, R

In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score

# define your hyper-parameter grid
param_grid = {
    'n_estimators': [10, 25, 50, 100, 200, 300],
    'max_depth': [None, 2, 5, 8, 12, 25, 50],
    'min_samples_leaf': [1, 2, 5, 10],
    'max_features': ['sqrt','log2']
}

# use stratified k-fold on *training* data only
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid, cv=cv,
    scoring='f1', n_jobs=-1,
)
grid.fit(X_train_scaled, y_train)

# now evaluate the *held-out* validation set
val_preds = grid.best_estimator_.predict(X_val_scaled)
print("Val F1:", f1_score(y_val, val_preds))

test_preds = grid.best_estimator_.predict(X_test_scaled)
print("Test F1:", f1_score(y_test, test_preds))

Val F1: 0.8738170347003155
Test F1: 0.7886435331230284


In [36]:
grid.cv_results_

{'mean_fit_time': array([0.02928104, 0.06166043, 0.12521749, 0.27441921, 0.51909161,
        0.81590323, 0.02590284, 0.05886102, 0.12237897, 0.23901477,
        0.50409718, 0.71823888, 0.02683783, 0.06149278, 0.11372375,
        0.21804323, 0.439217  , 0.63647432, 0.01845183, 0.05681257,
        0.10102162, 0.20027428, 0.40305586, 0.57462068, 0.0245008 ,
        0.05408974, 0.11943603, 0.23159423, 0.45197577, 0.65587401,
        0.02168446, 0.04770646, 0.09417319, 0.19404874, 0.39808054,
        0.6241879 , 0.02745662, 0.0424428 , 0.08384104, 0.16220856,
        0.35791898, 0.51896958, 0.0190814 , 0.04357257, 0.06975002,
        0.17508402, 0.30212455, 0.47293367, 0.01403804, 0.02744303,
        0.05963979, 0.09538121, 0.20317259, 0.30260944, 0.01096597,
        0.02797856, 0.06228476, 0.11548314, 0.21532097, 0.296947  ,
        0.01190615, 0.03244157, 0.04701209, 0.09538417, 0.20058365,
        0.2931417 , 0.01388531, 0.02914357, 0.04472213, 0.10387783,
        0.19335051, 0.28688002,

In [29]:
best_rf_stratified = grid.best_estimator_

best_rf_stratified_metrics = compute_metrics(
    data, best_rf_stratified, "Best Random Forest Classifier (Stratified CV)", verbose=True
)

Best Random Forest Classifier (Stratified CV) - train set metrics:
Accuracy: 0.9992, F1 Score: 0.9984, Precision: 1.0000, Recall: 0.9968
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3989
           1       1.00      1.00      1.00      1255

    accuracy                           1.00      5244
   macro avg       1.00      1.00      1.00      5244
weighted avg       1.00      1.00      1.00      5244

Best Random Forest Classifier (Stratified CV) - val set metrics:
Accuracy: 0.9390, F1 Score: 0.8742, Precision: 0.8634, Recall: 0.8854
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       998
           1       0.86      0.89      0.87       314

    accuracy                           0.94      1312
   macro avg       0.91      0.92      0.92      1312
weighted avg       0.94      0.94      0.94      1312

Best Random Forest Classifier (Stratified CV) - test set metrics:
Accuracy

In [37]:
# save the best model
import joblib

model_dir = Path(os.getcwd()).parent / "models"
model_dir.mkdir(parents=True, exist_ok=True)

joblib.dump(best_rf_stratified, model_dir / "best_rf_stratified.pkl")
joblib.dump(best_tree, model_dir / "best_tree.pkl")
joblib.dump(lr, model_dir / "lr.pkl")
# save the scaler
joblib.dump(scaler, model_dir / "scaler.pkl")
# save the metadata
df_metadata.to_csv(model_dir / "metadata.csv", index=False)
# save the data
df_data.to_csv(model_dir / "data.csv", index=False)

# save scaled data
np.savez(model_dir / "scaled_data.npz",
    X_train_scaled=X_train_scaled,
    y_train=y_train,
    X_val_scaled=X_val_scaled,
    y_val=y_val,
    X_test_scaled=X_test_scaled,
    y_test=y_test,
)

In [39]:
df_metadata

Unnamed: 0,cell,city,geometry,wkt
0,881e7159e9fffff,Ярмолинці,POLYGON ((26.83393817535367 49.192988306649454...,POLYGON ((26.83393817535367 49.192988306649454...
1,881e6d3a5bfffff,Тростянець,POLYGON ((34.95934457201796 50.473494753294666...,POLYGON ((34.95934457201796 50.473494753294666...
2,881e6824dbfffff,Полтава,"POLYGON ((34.49924943744107 49.5775947405843, ...","POLYGON ((34.49924943744107 49.5775947405843, ..."
3,881e6c32c3fffff,Ромни,"POLYGON ((33.49082533801754 50.7484969657831, ...","POLYGON ((33.49082533801754 50.7484969657831, ..."
4,881e652d99fffff,Чернігів,POLYGON ((31.283317311257992 51.49889221417866...,POLYGON ((31.283317311257992 51.49889221417866...
...,...,...,...,...
7079,881e7159cdfffff,Ярмолинці,"POLYGON ((26.84273660477837 49.20551313896144,...","POLYGON ((26.84273660477837 49.20551313896144,..."
7080,881e7249a9fffff,Ясіня,"POLYGON ((24.3639022185752 48.25988410456817, ...","POLYGON ((24.3639022185752 48.25988410456817, ..."
7081,881e7249c1fffff,Ясіня,POLYGON ((24.368559397351653 48.28869462500301...,POLYGON ((24.368559397351653 48.28869462500301...
7082,881e724981fffff,Ясіня,POLYGON ((24.384125548528036 48.26971463352792...,POLYGON ((24.384125548528036 48.26971463352792...


In [46]:
test_cities[0]

'Львів'

In [50]:
city_index

Index([ 518,  569,  632,  690,  707,  710,  752,  789,  926,  963,
       ...
       4584, 4585, 4586, 4587, 4588, 4589, 4590, 4591, 4592, 4593],
      dtype='int64', length=101)

In [51]:
idx = 0
df_metadata_test = df_metadata[df_metadata['city'].isin(test_cities)].reset_index(drop=True)
df_metadata_test = df_metadata_test[df_metadata_test['city'] == test_cities[idx]]
city_index = df_metadata_test.index
df_metadata_test['lr_prob'] = lr.predict_proba(X_test_scaled[city_index])[:, 1]
df_metadata_test['dt_prob'] = best_tree.predict_proba(X_test_scaled[city_index])[:, 1]
df_metadata_test['rf_prob'] = best_rf_stratified.predict_proba(X_test_scaled[city_index])[:, 1]

  ret = a @ b
  ret = a @ b
  ret = a @ b


In [53]:
import folium

def build_map(gdf: gpd.GeoDataFrame, shop_points: gpd.GeoDataFrame, location: list[float, float], predict_col: str) -> folium.Map:
    """Create an interactive folium map with two layers:
    1. Choropleth of predicted probabilities
    2. Circle markers for existing Aurora shops
    """

    print("Building interactive map …")

    map_ = folium.Map(
        location=location,
        zoom_start=12,
        tiles="cartodbpositron",
    )

    # -------------- Probability choropleth layer --------------
    choropleth = folium.Choropleth(
        geo_data=gdf,
        name="Predicted probability",
        data=gdf,
        # columns=["cell", predict_col],
        columns=["geometry", predict_col],
        key_on="feature.properties.cell",
        fill_color="YlOrRd",
        fill_opacity=0.7,
        line_opacity=0.2,
        legend_name="Aurora shop probability",
        highlight=True,
    ).add_to(map_)

    # Add tooltips to show probability per hexagon
    folium.GeoJsonTooltip(
        # fields=["cell", predict_col],
        fields=["geometry", predict_col],
        aliases=["Cell:", "Probability:"],
        localize=True,
    ).add_to(choropleth.geojson)

    # -------------- Existing shop markers layer --------------
    shops_layer = folium.FeatureGroup(name="Existing Aurora shops")
    # for _, row in shop_points.iterrows():
    #     folium.CircleMarker(
    #         location=[row["lat"], row["lon"]],
    #         radius=6,
    #         color="blue",
    #         fill=True,
    #         fill_color="blue",
    #         fill_opacity=0.9,
    #         tooltip=(
    #             folium.Tooltip(
    #                 f"Existing Aurora shop\nCell: {row['cell']}\n"
    #                 f"Predicted prob.: {row[predict_col]:.2%}"
    #             )
    #         ),
    #     ).add_to(shops_layer)
    # shops_layer.add_to(map_)

    # -------------- Controls --------------
    folium.LayerControl(collapsed=False).add_to(map_)

    return map_

In [54]:
lviv_map = build_map(gdf=df_metadata_test, shop_points=None, location=[49.8397, 24.0297], predict_col='lr_prob')
output_html = "lviv_aurora_lr_prediction_map.html"
lviv_map.save(output_html)

Building interactive map …


ValueError: Cannot render objects with any missing geometries:                 cell   city  \
17   881e7689bbfffff  Львів   
24   881e768803fffff  Львів   
35   881e7689e7fffff  Львів   
44   881e7689e7fffff  Львів   
49   881e768917fffff  Львів   
..               ...    ...   
200  881e7689ddfffff  Львів   
201  881e768947fffff  Львів   
202  881e768b01fffff  Львів   
203  881e768b2bfffff  Львів   
204  881e768935fffff  Львів   

                                              geometry  \
17   POLYGON ((24.046877017363837 49.8010789703691,...   
24   POLYGON ((24.01953367184666 49.86641450152708,...   
35   POLYGON ((23.98779478773291 49.80806124131609,...   
44   POLYGON ((23.98779478773291 49.80806124131609,...   
49   POLYGON ((23.971799074603958 49.82660989877568...   
..                                                 ...   
200  POLYGON ((24.027111683396114 49.83523319213955...   
201  POLYGON ((23.92108163748902 49.846197075985124...   
202  POLYGON ((23.921856170869873 49.89000468478745...   
203  POLYGON ((23.913456716874855 49.87736730272355...   
204  POLYGON ((23.94090044305711 49.81206541473399,...   

                                                   wkt   lr_prob   dt_prob  \
17   POLYGON ((24.046877017363837 49.8010789703691,...  0.999898  0.950704   
24   POLYGON ((24.01953367184666 49.86641450152708,...  0.983961  0.539326   
35   POLYGON ((23.98779478773291 49.80806124131609,...  0.999149  0.020559   
44   POLYGON ((23.98779478773291 49.80806124131609,...  1.000000  0.950704   
49   POLYGON ((23.971799074603958 49.82660989877568...  0.999985  0.194245   
..                                                 ...       ...       ...   
200  POLYGON ((24.027111683396114 49.83523319213955...  1.000000  0.845161   
201  POLYGON ((23.92108163748902 49.846197075985124...  0.054524  0.020559   
202  POLYGON ((23.921856170869873 49.89000468478745...  0.054793  0.020559   
203  POLYGON ((23.913456716874855 49.87736730272355...  0.093102  0.020559   
204  POLYGON ((23.94090044305711 49.81206541473399,...  0.053234  0.020559   

     rf_prob  
17     0.962  
24     0.932  
35     0.994  
44     0.958  
49     0.974  
..       ...  
200    0.924  
201    0.120  
202    0.000  
203    0.060  
204    0.016  

[101 rows x 7 columns]

In [38]:
best_rf_stratified.predict_proba(X_test_scaled)[:, 1]

array([0.984     , 0.97      , 0.866     , 0.954     , 0.926     ,
       0.91      , 0.986     , 0.89      , 0.72      , 0.798     ,
       0.888     , 0.96      , 0.234     , 0.872     , 0.948     ,
       0.988     , 0.906     , 0.962     , 0.89      , 0.92      ,
       0.928     , 0.89      , 0.724     , 0.918     , 0.932     ,
       0.922     , 0.874     , 0.972     , 0.984     , 0.424     ,
       0.77      , 0.88      , 0.972     , 0.806     , 0.894     ,
       0.994     , 0.782     , 0.822     , 0.712     , 0.9       ,
       0.992     , 0.912     , 0.988     , 0.866     , 0.958     ,
       0.594     , 0.984     , 0.984     , 0.956     , 0.974     ,
       0.908     , 0.806     , 0.908     , 0.974     , 0.878     ,
       0.866     , 0.888     , 0.958     , 0.862     , 0.952     ,
       0.93      , 0.884     , 0.874     , 0.914     , 0.718     ,
       0.948     , 0.796     , 0.914     , 0.986     , 0.932     ,
       0.996     , 0.986     , 0.91      , 0.976     , 0.914  