In [25]:
import os

from pathlib import Path

import numpy as np
import pandas as pd
import geopandas as gpd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# 1. Dataset preparation + train/val/test split

In [4]:
DATA_DIR = Path(os.getcwd()).parent / "data" / "train_data"

df = pd.read_csv(DATA_DIR / "ukraine_all_data.csv")
df.head()

Unnamed: 0,cell,city,is_positive,geometry,bench_count,cafe_count,pharmacy_count,waste_disposal_count,atm_count,post_office_count,...,dairy_count,frozen_food_count,trade_count,funeral_directors_count,baby_goods_count,wkt,atb_count,novus_count,eko_market_count,fora_count
0,881e7159e9fffff,Ярмолинці,1,POLYGON ((26.83393817535367 49.192988306649454...,0,0,1,0,0,1,...,0,0,0,0,0,POLYGON ((26.83393817535367 49.192988306649454...,0,0,0,0
1,881e6d3a5bfffff,Тростянець,1,POLYGON ((34.95934457201796 50.473494753294666...,11,1,4,0,3,2,...,0,0,0,0,0,POLYGON ((34.95934457201796 50.473494753294666...,1,0,0,0
2,881e6824dbfffff,Полтава,1,"POLYGON ((34.49924943744107 49.5775947405843, ...",10,1,2,3,1,1,...,0,0,0,0,0,"POLYGON ((34.49924943744107 49.5775947405843, ...",0,0,0,0
3,881e6c32c3fffff,Ромни,1,"POLYGON ((33.49082533801754 50.7484969657831, ...",0,0,1,0,0,0,...,0,0,0,0,0,"POLYGON ((33.49082533801754 50.7484969657831, ...",0,0,0,0
4,881e652d99fffff,Чернігів,1,POLYGON ((31.283317311257992 51.49889221417866...,0,0,0,0,0,0,...,0,0,0,0,0,POLYGON ((31.283317311257992 51.49889221417866...,0,0,0,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7084 entries, 0 to 7083
Columns: 109 entries, cell to fora_count
dtypes: int64(105), object(4)
memory usage: 5.9+ MB


In [8]:
sorted(df.columns.to_list())

['alcohol_count',
 'atb_count',
 'atm_count',
 'baby_goods_count',
 'bakery_count',
 'bank_count',
 'bar_count',
 'beauty_count',
 'bench_count',
 'bicycle_count',
 'bicycle_parking_count',
 'bicycle_rental_count',
 'books_count',
 'bureau_de_change_count',
 'bus_station_count',
 'butcher_count',
 'cafe_count',
 'car_count',
 'car_parts_count',
 'car_repair_count',
 'car_wash_count',
 'cell',
 'charging_station_count',
 'chemist_count',
 'cinema_count',
 'city',
 'clinic_count',
 'clothes_count',
 'community_centre_count',
 'computer_count',
 'confectionery_count',
 'convenience_count',
 'copyshop_count',
 'cosmetics_count',
 'courthouse_count',
 'dairy_count',
 'dentist_count',
 'department_store_count',
 'doctors_count',
 'doityourself_count',
 'drinking_water_count',
 'driving_school_count',
 'eko_market_count',
 'electronics_count',
 'fast_food_count',
 'fire_station_count',
 'florist_count',
 'fora_count',
 'fountain_count',
 'frozen_food_count',
 'fuel_count',
 'funeral_directors

In [13]:
metadata_cols = ["cell", "city", "geometry", "wkt"]

df_metadata = df[metadata_cols]
df_data = df.drop(columns=metadata_cols)
target_col = "is_positive"

In [18]:
city_vc = df_metadata[df_data['is_positive'].astype(bool)]['city'].value_counts()
city_vc[city_vc > 20]

city
Київ             152
Дніпро            67
Харків            67
Кривий Ріг        52
Запоріжжя         42
Одеса             37
Полтава           36
Миколаїв          32
Львів             25
Кропивницький     23
Суми              23
Чернігів          23
Кам'янське        22
Name: count, dtype: int64

In [28]:
test_cities = ['Львів', 'Харків', 'Одеса']

df_data_test = df_data[df_metadata['city'].isin(test_cities)]
df_data_train = df_data[~df_metadata['city'].isin(test_cities)]

X_train = df_data_train.drop(columns=[target_col])
y_train = df_data_train[target_col]

X_test = df_data_test.drop(columns=[target_col])
y_test = df_data_test[target_col]

In [29]:
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

print(f"Train set size: {X_train.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

Train set size: 5244
Validation set size: 1312
Test set size: 528


In [30]:
# check X_train types
X_train.dtypes.unique()

array([dtype('int64')], dtype=object)

In [31]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# 2. Training

In [47]:
def compute_metrics(data, model, model_name, verbose=True):
    results = {}
    splits = ['train', 'val', 'test']

    for split in splits:
        X = data[f'X_{split}_scaled']
        y = data[f'y_{split}']
        
        y_pred = model.predict(X)
        accuracy = accuracy_score(y, y_pred)
        report = classification_report(y, y_pred, output_dict=True)
        f1_score = report['1']['f1-score']
        precision = report['1']['precision']
        recall = report['1']['recall']

        to_percantage = lambda x: np.round(x * 100, 3)

        results[split] = {
            'accuracy': to_percantage(accuracy),
            'f1_score': to_percantage(f1_score),
            'precision': to_percantage(precision),
            'recall': to_percantage(recall),
        }
    
        if verbose:
            print(f"{model_name} - {split} set metrics:")
            print(f"Accuracy: {accuracy:.4f}, F1 Score: {f1_score:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")
            print(classification_report(y, y_pred))

    return results


## 2.1. Baseine Logistic regression

In [41]:
import numpy as np
print("Any NaN?", np.isnan(X_train_scaled).any())
print("Any ±inf?", np.isinf(X_train_scaled).any())
print("Feature ranges:", X_train_scaled.min(axis=0), X_train_scaled.max(axis=0))

Any NaN? False
Any ±inf? False
Feature ranges: [-0.14527252 -0.18696475 -0.21801917 -0.15707255 -0.18763595 -0.28459122
 -0.18749523 -0.15580738 -0.08258855 -0.22451987 -0.06224243 -0.18748882
 -0.15292978 -0.173111   -0.10676861 -0.12823332 -0.13959524 -0.16383576
 -0.13106569 -0.12644685 -0.19946854 -0.09300622 -0.11726633 -0.17021339
 -0.11773076 -0.0654196  -0.14688009 -0.14510222 -0.1123136  -0.06131132
 -0.11059022 -0.11924338 -0.12577316 -0.1444025  -0.11570028 -0.13187334
 -0.1219537  -0.13982477 -0.11198395 -0.0901695  -0.06417896 -0.0871815
 -0.09193461 -0.08102192 -0.06771571 -0.07995512 -0.0983533  -0.070536
 -0.09118552 -0.08785294 -0.2828194  -0.11185488 -0.27643351 -0.18806708
 -0.05233379 -0.23393371 -0.12712169 -0.17247018 -0.18551248 -0.16203958
 -0.18240807 -0.20983212 -0.17070503 -0.11164931 -0.15421142 -0.14694834
 -0.15381547 -0.17745908 -0.10761122 -0.14945227 -0.15203951 -0.0898286
 -0.11518253 -0.14854399 -0.09422418 -0.09350386 -0.11889062 -0.11563224
 -0.0447

In [43]:
lr = LogisticRegression(penalty='l2', C=0.1, solver='lbfgs', max_iter=1000, random_state=42)

lr.fit(X_train_scaled, y_train)

  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.1
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,1000


In [44]:
y_pred_train = lr.predict(X_train_scaled)
y_pred_val = lr.predict(X_val_scaled)
y_pred_test = lr.predict(X_test_scaled)

print("Train set accuracy:", accuracy_score(y_train, y_pred_train))
print(classification_report(y_train, y_pred_train))

print("Validation set accuracy:", accuracy_score(y_val, y_pred_val))
print(classification_report(y_val, y_pred_val))

print("Test set accuracy:", accuracy_score(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test))

Train set accuracy: 0.9050343249427918
              precision    recall  f1-score   support

           0       0.90      0.99      0.94      3989
           1       0.94      0.65      0.76      1255

    accuracy                           0.91      5244
   macro avg       0.92      0.82      0.85      5244
weighted avg       0.91      0.91      0.90      5244

Validation set accuracy: 0.8925304878048781
              precision    recall  f1-score   support

           0       0.89      0.99      0.93       998
           1       0.93      0.60      0.73       314

    accuracy                           0.89      1312
   macro avg       0.91      0.79      0.83      1312
weighted avg       0.90      0.89      0.88      1312

Test set accuracy: 0.865530303030303
              precision    recall  f1-score   support

           0       0.90      0.93      0.91       399
           1       0.75      0.67      0.71       129

    accuracy                           0.87       528
   macro

  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b


In [36]:
print(f"Feature importances (coefficients):")
sorted_indices = np.argsort(lr.coef_[0])
for idx in sorted_indices:
    print(f"{X_train.columns[idx]}: {lr.coef_[0][idx]:.4f}")

Feature importances (coefficients):
travel_agency_count: -2.7719
books_count: -1.5696
clothes_count: -1.3660
shoes_count: -1.2941
clinic_count: -1.1832
fast_food_count: -0.8850
confectionery_count: -0.7847
gift_count: -0.7405
cosmetics_count: -0.6365
parcel_locker_count: -0.5871
butcher_count: -0.5726
dairy_count: -0.5235
cafe_count: -0.5038
taxi_count: -0.4123
vending_machine_count: -0.4072
payment_terminal_count: -0.3800
novus_count: -0.3437
dentist_count: -0.3352
car_count: -0.3184
parking_entrance_count: -0.2901
drinking_water_count: -0.2723
tyres_count: -0.2681
optician_count: -0.2442
charging_station_count: -0.2343
post_box_count: -0.1901
shelter_count: -0.1900
florist_count: -0.1745
bicycle_rental_count: -0.1484
kiosk_count: -0.1437
car_repair_count: -0.1248
jewelry_count: -0.1110
restaurant_count: -0.1106
toys_count: -0.0917
fuel_count: -0.0800
bar_count: -0.0780
car_wash_count: -0.0615
recycling_count: -0.0601
doctors_count: -0.0493
department_store_count: -0.0120
alcohol_coun

## 2.1. Decision Tree + Random Forest

In [37]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Train a Decision Tree Classifier
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_scaled, y_train)

y_train_pred_dt = dt.predict(X_train_scaled)
y_val_pred_dt = dt.predict(X_val_scaled)
y_test_pred_dt = dt.predict(X_test_scaled)

In [38]:
print(f"Accuracy of Decision Tree on Train set: {accuracy_score(y_train, y_train_pred_dt)}")
print(classification_report(y_train, y_train_pred_dt))

print(f"Accuracy of Decision Tree on Validation set: {accuracy_score(y_val, y_val_pred_dt)}")
print(classification_report(y_val, y_val_pred_dt))

print(f"Accuracy of Decision Tree on Test set: {accuracy_score(y_test, y_test_pred_dt)}")
print(classification_report(y_test, y_test_pred_dt))

Accuracy of Decision Tree on Train set: 0.9464149504195271
              precision    recall  f1-score   support

           0       0.93      1.00      0.97      3989
           1       1.00      0.78      0.87      1255

    accuracy                           0.95      5244
   macro avg       0.97      0.89      0.92      5244
weighted avg       0.95      0.95      0.94      5244

Accuracy of Decision Tree on Validation set: 0.8803353658536586
              precision    recall  f1-score   support

           0       0.89      0.96      0.92       998
           1       0.84      0.62      0.71       314

    accuracy                           0.88      1312
   macro avg       0.86      0.79      0.82      1312
weighted avg       0.88      0.88      0.87      1312

Accuracy of Decision Tree on Test set: 0.8143939393939394
              precision    recall  f1-score   support

           0       0.88      0.87      0.88       399
           1       0.62      0.64      0.63       129

 

In [45]:
rf = RandomForestClassifier(random_state=42)

rf.fit(X_train_scaled, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [48]:
data = {
    'X_train_scaled': X_train_scaled,
    'y_train': y_train,
    'X_val_scaled': X_val_scaled,
    'y_val': y_val,
    'X_test_scaled': X_test_scaled,
    'y_test': y_test
}

compute_metrics(data, rf, "Random Forest")

Random Forest - train set metrics:
Accuracy: 0.9464, F1 Score: 0.8742, Precision: 0.9980, Recall: 0.7777
              precision    recall  f1-score   support

           0       0.93      1.00      0.97      3989
           1       1.00      0.78      0.87      1255

    accuracy                           0.95      5244
   macro avg       0.97      0.89      0.92      5244
weighted avg       0.95      0.95      0.94      5244

Random Forest - val set metrics:
Accuracy: 0.8941, F1 Score: 0.7566, Precision: 0.8405, Recall: 0.6879
              precision    recall  f1-score   support

           0       0.91      0.96      0.93       998
           1       0.84      0.69      0.76       314

    accuracy                           0.89      1312
   macro avg       0.87      0.82      0.84      1312
weighted avg       0.89      0.89      0.89      1312

Random Forest - test set metrics:
Accuracy: 0.8277, F1 Score: 0.6715, Precision: 0.6284, Recall: 0.7209
              precision    recall 

{'train': {'accuracy': np.float64(94.641),
  'f1_score': np.float64(87.416),
  'precision': np.float64(99.796),
  'recall': np.float64(77.769)},
 'val': {'accuracy': np.float64(89.405),
  'f1_score': np.float64(75.657),
  'precision': np.float64(84.047),
  'recall': np.float64(68.79)},
 'test': {'accuracy': np.float64(82.765),
  'f1_score': np.float64(67.148),
  'precision': np.float64(62.838),
  'recall': np.float64(72.093)}}