In [4]:
import pandas as pd
import pyarrow.parquet as pq
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import time
import copy
import os
import pickle
from tqdm.auto import tqdm
import math

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import ReduceLROnPlateau
from google.colab import drive

drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
import json

FILE_PATH = r"/content/drive/MyDrive/Mastercard/paczkomaty.json"

with open(FILE_PATH, 'r', encoding='utf-8') as f:
    data = json.load(f)

if isinstance(data, list) and data:
    first_item = data[0]
    # Variables assigned for potential use in a notebook environment
    item_name = first_item.get('name')
    item_status = first_item.get('status')
    item_address_details = first_item.get('address_details', {})
    item_city = item_address_details.get('city')
elif isinstance(data, dict):
    # Variable assigned for potential use in a notebook environment
    data_keys = list(data.keys())

generate folder structure

In [3]:
import zipfile
import os
import shutil

source_folder = "/content/drive/MyDrive/Mastercard/dane/"
destination_base_folder = "/content/drive/MyDrive/Mastercard/unzipped_archives/"

print(f"Starting: Unzipping files from '{source_folder}' to '{destination_base_folder}' subfolders.")

os.makedirs(destination_base_folder, exist_ok=True)

all_files_in_source = os.listdir(source_folder)
zip_files = [
    f for f in all_files_in_source
    if f.endswith('.zip') and os.path.isfile(os.path.join(source_folder, f))
]

if not zip_files:
    print(f"No .zip files found in {source_folder}.")
else:
    print(f"Found {len(zip_files)} zip file(s): {', '.join(zip_files)}")

    successful_extractions = 0
    for zip_file_name in zip_files:
        full_zip_file_path = os.path.join(source_folder, zip_file_name)
        new_folder_name = os.path.splitext(zip_file_name)[0]

        if new_folder_name in os.listdir(destination_base_folder):
            print(f"Folder '{new_folder_name}' already exists in '{destination_base_folder}'. Skipping.")
            continue

        specific_zip_destination_folder = os.path.join(destination_base_folder, new_folder_name)

        print(f"Processing: {zip_file_name} -> {specific_zip_destination_folder}")

        os.makedirs(specific_zip_destination_folder, exist_ok=True)

        with zipfile.ZipFile(full_zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(specific_zip_destination_folder)
        successful_extractions += 1

    print(f"\nFinished: Successfully processed {successful_extractions} of {len(zip_files)} zip file(s).")

In [3]:
malopolskie_shp_folder = r"/content/drive/MyDrive/Mastercard/unzipped_archives/malopolskie-latest-free.shp/"
paczkomaty_csv_path = r"/content/drive/MyDrive/Mastercard/paczkomaty.csv"

paczkomaty_df_raw = pd.read_csv(paczkomaty_csv_path)

In [5]:
paczkomaty_df_raw.columns

Index(['address', 'address_details', 'agency', 'air_index_level',
       'apm_doubled', 'distance', 'easy_access_zone', 'functions', 'href',
       'image_url', 'is_next', 'location', 'location_247', 'location_date',
       'location_description', 'location_description_1',
       'location_description_2', 'location_type', 'name', 'opening_hours',
       'operating_hours_extended', 'partner_id', 'payment_available',
       'payment_point_descr', 'payment_type', 'phone_number',
       'physical_type_description', 'physical_type_mapped',
       'recommended_low_interest_box_machines_list', 'status', 'type',
       'virtual'],
      dtype='object')

In [11]:
paczkomaty_df_raw.head(3)

Unnamed: 0,address,address_details,agency,air_index_level,apm_doubled,distance,easy_access_zone,functions,href,image_url,...,payment_available,payment_point_descr,payment_type,phone_number,physical_type_description,physical_type_mapped,recommended_low_interest_box_machines_list,status,type,virtual
0,"{'line1': 'Kościuszki 27', 'line2': '21-412 Ad...","{'city': 'Adamów', 'province': 'lubelskie', 'p...",SDL,,,,True,"['allegro_courier_collect', 'allegro_courier_r...",https://api-pl-points.easypack24.net/v1/points...,https://static.easypack24.net/points/pl/images...,...,True,Płatność apką InPost oraz PayByLink,{'0': 'Payments are not supported'},,,4.0,,Operating,['parcel_locker'],0
1,"{'line1': 'Kleeberga 5B', 'line2': '21-412 Ada...","{'city': 'Adamów', 'province': 'lubelskie', 'p...",SDL,,,,True,"['allegro_courier_collect', 'allegro_courier_r...",https://api-pl-points.easypack24.net/v1/points...,https://static.easypack24.net/points/pl/images...,...,True,Płatność apką InPost oraz PayByLink,{'0': 'Payments are not supported'},,,5.0,['ADA01M'],Operating,['parcel_locker'],0
2,"{'line1': 'Adamów 45', 'line2': '62-590 Adamów'}","{'city': 'Adamów', 'province': 'wielkopolskie'...",KON,,,,True,"['allegro_courier_collect', 'allegro_courier_r...",https://api-pl-points.easypack24.net/v1/points...,https://static.easypack24.net/points/pl/images...,...,True,Płatność apką InPost oraz PayByLink,{'0': 'Payments are not supported'},,,5.0,"['GLN01M', 'GLN03M']",Operating,['parcel_locker'],0


In [23]:
paczkomaty_df = paczkomaty_df_raw.drop(columns=['address', 'air_index_level', 'apm_doubled', 'distance', 'location_date', 'location_description', 'location_description_2', 'phone_number', 'physical_type_description'])

In [25]:
paczkomaty_df['address_details'].iloc[0]

"{'city': 'Adamów', 'province': 'lubelskie', 'post_code': '21-412', 'street': 'Kościuszki', 'building_number': '27', 'flat_number': None}"

In [27]:
# grid 500m/500m - we have center points. Meaning we want to have points 250m in every direction aggregated

paczkomaty_df['address_details'].iloc[0]

"{'city': 'Adamów', 'province': 'lubelskie', 'post_code': '21-412', 'street': 'Kościuszki', 'building_number': '27', 'flat_number': None}"

In [31]:
import ast

def parse_address_string(address_str):
    try:
        return ast.literal_eval(address_str)
    except (ValueError, SyntaxError):
        return None # Or return an empty dict: {}

parsed_addresses = paczkomaty_df['address_details'].apply(parse_address_string)

# Normalize the Series of dictionaries into a DataFrame
address_details_df = pd.json_normalize(parsed_addresses)

paczkomaty_df = pd.concat([paczkomaty_df.drop(columns=['address_details']), address_details_df], axis=1)

In [34]:
destination_base_folder = "/content/drive/MyDrive/Mastercard/unzipped_archives/"

if os.path.exists(destination_base_folder):
    print(f"Listing contents of: {destination_base_folder}")
    for root, dirs, files in os.walk(destination_base_folder):

        level = root.replace(destination_base_folder, '').count(os.sep)
        indent = ' ' * 4 * (level)
        print(f'{indent}{os.path.basename(root)}/')
        subindent = ' ' * 4 * (level + 1)
        for f in files:



            print(f'{subindent}{f}')

else:
    print(f"Folder not found: {destination_base_folder}")


Listing contents of: /content/drive/MyDrive/Mastercard/unzipped_archives/
/
malopolskie-latest-free.shp/
    README
    gis_osm_buildings_a_free_1.cpg
    gis_osm_buildings_a_free_1.dbf
    gis_osm_buildings_a_free_1.prj
    gis_osm_buildings_a_free_1.shp
    gis_osm_buildings_a_free_1.shx
    gis_osm_landuse_a_free_1.cpg
    gis_osm_landuse_a_free_1.dbf
    gis_osm_landuse_a_free_1.prj
    gis_osm_landuse_a_free_1.shp
    gis_osm_landuse_a_free_1.shx
    gis_osm_natural_a_free_1.cpg
    gis_osm_natural_a_free_1.dbf
    gis_osm_natural_a_free_1.prj
    gis_osm_natural_a_free_1.shp
    gis_osm_natural_a_free_1.shx
    gis_osm_natural_free_1.cpg
    gis_osm_natural_free_1.dbf
    gis_osm_natural_free_1.prj
    gis_osm_natural_free_1.shp
    gis_osm_natural_free_1.shx
    gis_osm_places_a_free_1.cpg
    gis_osm_places_a_free_1.dbf
    gis_osm_places_a_free_1.prj
    gis_osm_places_a_free_1.shp
    gis_osm_places_a_free_1.shx
    gis_osm_places_free_1.cpg
    gis_osm_places_free_1.dbf
    

In [36]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import os
import ast # Importujemy moduł ast do parsowania stringów

# --- Konfiguracja Ścieżek ---
# ZMIEŃ TĘ ŚCIEŻKĘ na główny folder zawierający podfoldery z danymi shapefile (np. dla różnych regionów)
bazowy_folder_shp = r"/content/drive/MyDrive/Mastercard/unzipped_archives/"
paczkomaty_csv_path = r"/content/drive/MyDrive/Mastercard/paczkomaty.csv"

# --- Lista kategorii i mapowanie na nazwy plików Shapefile ---
# Ta konfiguracja definiuje, jakich DOKŁADNYCH nazw plików shapefile szukamy w każdym folderze
geofabrik_layers_config = {
    "buildings": "gis_osm_buildings_a_free_1.shp",
    "landuse": "gis_osm_landuse_a_free_1.shp",
    "natural_areas": "gis_osm_natural_a_free_1.shp",
    "natural_points_lines": "gis_osm_natural_free_1.shp",
    "places_areas": "gis_osm_places_a_free_1.shp",
    "places_points": "gis_osm_places_free_1.shp",
    "pofw_areas": "gis_osm_pofw_a_free_1.shp",
    "pofw_points": "gis_osm_pofw_free_1.shp",
    "pois_areas": "gis_osm_pois_a_free_1.shp",
    "pois_points": "gis_osm_pois_free_1.shp",
    "railways": "gis_osm_railways_free_1.shp",
    "roads": "gis_osm_roads_free_1.shp",
    "traffic_areas": "gis_osm_traffic_a_free_1.shp",
    "traffic_points": "gis_osm_traffic_free_1.shp",
    "transport_areas": "gis_osm_transport_a_free_1.shp",
    "transport_points": "gis_osm_transport_free_1.shp",
    "water_areas": "gis_osm_water_a_free_1.shp",
    "waterways": "gis_osm_waterways_free_1.shp"
}

def wczytaj_i_polacz_shapefiles(base_folder, layers_config, target_crs="EPSG:4326"):
    """
    Rekursywnie przeszukuje base_folder w poszukiwaniu plików shapefile zdefiniowanych
    w layers_config, wczytuje je, transformuje CRS i łączy w jeden GeoDataFrame.
    """
    all_geodataframes = []

    if not os.path.exists(base_folder):
        print(f"Błąd: Folder bazowy nie istnieje: {base_folder}")
        return gpd.GeoDataFrame()

    print(f"--- Rozpoczynanie skanowania folderu bazowego: {base_folder} ---")

    for root, dirs, files in os.walk(base_folder):
        # Nazwa bieżącego katalogu, używana jako źródło regionu
        # Jeśli root to sam base_folder, region_source może być np. 'base' lub puste
        region_source = os.path.basename(root) if root != base_folder else "base_directory"

        # Iteruj przez zdefiniowane warstwy w konfiguracji
        for category_key, shp_filename in layers_config.items():
            shp_file_path = os.path.join(root, shp_filename)

            if os.path.exists(shp_file_path):
                print(f"  Znaleziono plik dla '{category_key}': {shp_file_path} (Region: {region_source})")
                try:
                    gdf = gpd.read_file(shp_file_path)
                    if gdf.empty:
                        print(f"    Plik {shp_filename} jest pusty, pomijam.")
                        continue

                    # Ujednolicenie CRS
                    if gdf.crs is None:
                        print(f"    Ostrzeżenie: Brak zdefiniowanego CRS dla {shp_filename}. Zakładam {target_crs}.")
                        gdf = gdf.set_crs(target_crs, allow_override=True) # Użyj ostrożnie
                    elif gdf.crs != target_crs:
                        print(f"    Transformuję CRS dla {shp_filename} z {gdf.crs} do {target_crs}")
                        gdf = gdf.to_crs(target_crs)

                    # Dodanie kolumn źródłowych
                    gdf['layer_source'] = category_key
                    gdf['filepath_source'] = shp_file_path
                    gdf['region_source'] = region_source

                    all_geodataframes.append(gdf)
                    print(f"    Wczytano i przetworzono: {category_key} - {len(gdf)} obiektów.")
                except Exception as e:
                    print(f"    Błąd podczas wczytywania lub przetwarzania {shp_filename} dla kategorii {category_key}: {e}")
            # else:
                # Można dodać informację, jeśli oczekiwany plik nie został znaleziony w danym folderze
                # print(f"  Plik {shp_filename} (dla {category_key}) nie istnieje w {root}")
                pass


    if not all_geodataframes:
        print("Nie znaleziono żadnych pasujących plików shapefile do wczytania.")
        return gpd.GeoDataFrame(crs=target_crs) # Zwróć pusty GDF z docelowym CRS

    print("\n--- Łączenie wszystkich wczytanych GeoDataFrame ---")
    try:
        # Upewnij się, że wszystkie GDF mają kolumnę geometrii przed konkatenacją
        # Geopandas zwykle radzi sobie z tym dobrze, jeśli kolumna geometrii jest spójnie nazwana
        unified_gdf = gpd.GeoDataFrame(pd.concat(all_geodataframes, ignore_index=True), crs=target_crs)
        print(f"Zunifikowany GeoDataFrame utworzony: {len(unified_gdf)} obiektów, CRS: {unified_gdf.crs}")
        return unified_gdf
    except Exception as e:
        print(f"Błąd podczas łączenia GeoDataFrames: {e}")
        # Próba zdiagnozowania problemu z geometrią
        for i, gdf_item in enumerate(all_geodataframes):
            if 'geometry' not in gdf_item.columns:
                print(f"GeoDataFrame pod indeksem {i} (layer: {gdf_item['layer_source'].iloc[0] if not gdf_item.empty else 'N/A'}) nie ma kolumny 'geometry'.")
            elif not isinstance(gdf_item.geometry, gpd.GeoSeries):
                 print(f"GeoDataFrame pod indeksem {i} (layer: {gdf_item['layer_source'].iloc[0] if not gdf_item.empty else 'N/A'}) ma kolumnę 'geometry', ale nie jest to GeoSeries.")
        return gpd.GeoDataFrame(crs=target_crs)


# --- Krok 1: Wczytywanie i łączenie danych Shapefile ---
# Wszystkie shapefile zostaną wczytane i połączone w jeden GeoDataFrame
# z ujednoliconym CRS (domyślnie EPSG:4326)
zunifikowany_geo_data = wczytaj_i_polacz_shapefiles(bazowy_folder_shp, geofabrik_layers_config)

# --- Krok 2: Wczytywanie i przygotowanie danych o paczkomatach z CSV ---
print("\n--- Wczytywanie danych o paczkomatach ---")
paczkomaty_gdf = None # Inicjalizacja na wypadek błędów
try:
    if not os.path.exists(paczkomaty_csv_path):
        raise FileNotFoundError(f"Plik paczkomatów nie istnieje: {paczkomaty_csv_path}")

    paczkomaty_df = pd.read_csv(paczkomaty_csv_path)
    print(f"Wczytano paczkomaty.csv: {len(paczkomaty_df)} wierszy.")

    if 'location' in paczkomaty_df.columns:
        print("Znaleziono kolumnę 'location'. Próba parsowania współrzędnych...")

        def parse_location_string(loc_str):
            try:
                if pd.isna(loc_str) or not isinstance(loc_str, str) or loc_str.strip() == "":
                    return None, None
                loc_dict = ast.literal_eval(loc_str)
                if isinstance(loc_dict, dict):
                    return loc_dict.get('longitude'), loc_dict.get('latitude')
                else:
                    return None, None
            except (ValueError, SyntaxError, TypeError):
                return None, None

        parsed_coords = paczkomaty_df['location'].apply(lambda x: pd.Series(parse_location_string(x), index=['longitude_parsed', 'latitude_parsed']))
        paczkomaty_df['longitude'] = parsed_coords['longitude_parsed']
        paczkomaty_df['latitude'] = parsed_coords['latitude_parsed']

        num_parsed = paczkomaty_df['longitude'].notna().sum()
        print(f"Pomyślnie sparsowano współrzędne dla {num_parsed} paczkomatów.")

        if num_parsed > 0:
            paczkomaty_df.dropna(subset=['longitude', 'latitude'], inplace=True)

            paczkomaty_gdf = gpd.GeoDataFrame(
                paczkomaty_df,
                geometry=gpd.points_from_xy(paczkomaty_df.longitude, paczkomaty_df.latitude),
                crs="EPSG:4326" # Zakładamy, że współrzędne z CSV są w WGS84
            )
            print(f"Utworzono GeoDataFrame dla paczkomatów: {len(paczkomaty_gdf)} obiektów, CRS: {paczkomaty_gdf.crs}")
        else:
            print("Nie udało się sparsować żadnych współrzędnych z kolumny 'location'.")
            paczkomaty_gdf = gpd.GeoDataFrame(crs="EPSG:4326") # Pusty GDF z CRS

    else:
        print("Błąd: Brak kolumny 'location' w pliku paczkomaty.csv, która jest potrzebna do uzyskania współrzędnych.")
        paczkomaty_gdf = gpd.GeoDataFrame(crs="EPSG:4326")

except FileNotFoundError:
    print(f"Błąd: Nie znaleziono pliku {paczkomaty_csv_path}")
    paczkomaty_gdf = gpd.GeoDataFrame(crs="EPSG:4326")
except Exception as e:
    print(f"Błąd podczas wczytywania lub przetwarzania paczkomaty.csv: {e}")
    paczkomaty_gdf = gpd.GeoDataFrame(crs="EPSG:4326")


# --- Krok 3: Przygotowanie do wizualizacji - ujednolicenie CRS (jeśli konieczne) ---
# Zunifikowany_geo_data już powinien być w docelowym CRS (EPSG:4326)
# Paczkomaty_gdf również powinien być w EPSG:4326 po wczytaniu
display_crs = "EPSG:4326"
print(f"\n--- Sprawdzanie CRS przed wizualizacją (docelowy: {display_crs}) ---")

if not zunifikowany_geo_data.empty:
    if zunifikowany_geo_data.crs != display_crs:
        print(f"Ostrzeżenie: Zunifikowany GeoDataFrame ma CRS {zunifikowany_geo_data.crs} zamiast oczekiwanego {display_crs}. Sprawdź logikę wczytywania.")
    else:
        print(f"Zunifikowany GeoDataFrame jest w docelowym CRS: {zunifikowany_geo_data.crs}")
else:
    print("Zunifikowany GeoDataFrame jest pusty.")


if paczkomaty_gdf is not None and not paczkomaty_gdf.empty:
    if paczkomaty_gdf.crs != display_crs:
        print(f"Transformuję CRS dla paczkomatów z {paczkomaty_gdf.crs} do {display_crs}")
        paczkomaty_gdf = paczkomaty_gdf.to_crs(display_crs)
    else:
        print(f"GeoDataFrame paczkomatów jest w docelowym CRS: {paczkomaty_gdf.crs}")
else:
    print("GeoDataFrame paczkomatów jest pusty lub nie został wczytany.")

# --- Krok 4: Wyświetlanie informacji o wczytanych danych ---
if not zunifikowany_geo_data.empty:
    print("\n--- Informacje o zunifikowanym GeoDataFrame (Shapefiles) ---")
    print(f"Liczba obiektów: {len(zunifikowany_geo_data)}")
    print(f"CRS: {zunifikowany_geo_data.crs}")
    print("Przykładowe dane (pierwsze 2 wiersze):")
    print(zunifikowany_geo_data.head(2))
    if 'layer_source' in zunifikowany_geo_data.columns:
        print("\nLiczba obiektów per 'layer_source':")
        print(zunifikowany_geo_data['layer_source'].value_counts())
    if 'region_source' in zunifikowany_geo_data.columns:
        print("\nLiczba obiektów per 'region_source':")
        print(zunifikowany_geo_data['region_source'].value_counts())
else:
    print("\nZunifikowany GeoDataFrame (Shapefiles) jest pusty.")

if paczkomaty_gdf is not None and not paczkomaty_gdf.empty:
    print("\n--- Informacje o GeoDataFrame (Paczkomaty) ---")
    print(f"Liczba paczkomatów: {len(paczkomaty_gdf)}")
    print(f"CRS: {paczkomaty_gdf.crs}")
    print("Przykładowe dane paczkomatów (pierwsze 2 wiersze):")
    print(paczkomaty_gdf.head(2))


--- Rozpoczynanie skanowania folderu bazowego: /content/drive/MyDrive/Mastercard/unzipped_archives/ ---
  Znaleziono plik dla 'buildings': /content/drive/MyDrive/Mastercard/unzipped_archives/malopolskie-latest-free.shp/gis_osm_buildings_a_free_1.shp (Region: malopolskie-latest-free.shp)
    Wczytano i przetworzono: buildings - 1540045 obiektów.
  Znaleziono plik dla 'landuse': /content/drive/MyDrive/Mastercard/unzipped_archives/malopolskie-latest-free.shp/gis_osm_landuse_a_free_1.shp (Region: malopolskie-latest-free.shp)
    Wczytano i przetworzono: landuse - 358997 obiektów.
  Znaleziono plik dla 'natural_areas': /content/drive/MyDrive/Mastercard/unzipped_archives/malopolskie-latest-free.shp/gis_osm_natural_a_free_1.shp (Region: malopolskie-latest-free.shp)
    Wczytano i przetworzono: natural_areas - 205 obiektów.
  Znaleziono plik dla 'natural_points_lines': /content/drive/MyDrive/Mastercard/unzipped_archives/malopolskie-latest-free.shp/gis_osm_natural_free_1.shp (Region: malopolski

In [1]:
#zunifikowany_geo_data.to_parquet(r"/content/drive/MyDrive/Mastercard/geo_pandas_all_voievodeship.parquet")

import geopandas as gpd
geo_df = gpd.read_parquet(r"/content/drive/MyDrive/Mastercard/geo_pandas_all_voievodeship.parquet")

pd.set_option("display.max_rows", None)

In [6]:
objects_to_save = [
    # Residential & Population Density
    'residential',        # Areas where people live
    'city',               # General high-density urban area
    'town',               # Medium-density urban area
    'suburb',             # Residential area, often near a city

    # Commercial & Retail (High Foot Traffic, Shopping)
    'retail',             # General retail areas
    'commercial',         # General commercial/business areas
    'supermarket',        # Key shopping destination
    'mall',               # Major shopping hub
    'convenience',        # High-frequency visit shops
    'market_place',       # Open-air markets, can be high traffic

    # Services & Amenities (Frequent Errands)
    'post_office',        # Existing mail/parcel infrastructure
    'bank',               # Common errand destination
    'atm',                # Indicates commercial activity
    'pharmacy',           # Essential service, regular visits
    'fuel',               # Service stations, often with convenience stores

    # Educational Institutions (Concentrated Population)
    'university',         # High concentration of students & staff
    'college',            # Similar to university
    'school',             # Daily magnet for students, parents, staff

    # Transportation Hubs (High Transit & Accessibility)
    'bus_stop',           # Key public transport access point
    'railway_station',    # Major transportation hub
    'tram_stop',          # Public transport access
    'subway',             # Subway stations in relevant cities

    # Social & Leisure (Indicating Active Areas)
    'restaurant',         # Social gathering points
    'cafe',               # Social gathering points, daily visits

    # Infrastructure & Accessibility
    'parking',            # Indicates destination points with car access
    'footway',            # Presence/density indicates walkability
    'pedestrian',         # Pedestrianized zones with high foot traffic
    'primary',            # Major roads, indicative of accessibility/activity
    'secondary',          # Secondary roads
    'tertiary',           # Local through-roads
    'living_street'       # Residential streets with high pedestrian priority
]

In [7]:
geo_df = geo_df[geo_df['fclass'].isin(objects_to_save)]

In [37]:
geo_df.to_parquet(r"/content/drive/MyDrive/Mastercard/geo_pandas_all_voievodeship_filtered.parquet")
#geo_df = gpd.read_parquet(r"/content/drive/MyDrive/Mastercard/geo_pandas_all_voievodeship_filtered.parquet")

In [34]:
geo_df = geo_df.drop(columns=['type','code', 'osm_id', 'name', 'layer_source', 'filepath_source', 'region_source', 'population',	'layer',	'bridge',	'tunnel',	'ref',	'oneway',	'maxspeed'	,'width'])
geo_df.head(10)

Unnamed: 0,fclass,geometry
1540066,retail,"POLYGON ((19.93483 50.07331, 19.93564 50.07337..."
1540085,residential,"POLYGON ((19.94151 50.08376, 19.94182 50.08457..."
1540265,retail,"POLYGON ((20.00991 50.01664, 20.01013 50.01697..."
1540266,residential,"POLYGON ((20.00119 50.01472, 20.00324 50.01588..."
1540267,residential,"POLYGON ((20.00005 50.01406, 20.00009 50.01412..."
1540268,residential,"POLYGON ((20.00433 50.0156, 20.00491 50.0165, ..."
1540271,residential,"POLYGON ((20.00425 50.01385, 20.00454 50.01436..."
1540272,commercial,"POLYGON ((20.00773 50.01282, 20.00808 50.01359..."
1540273,residential,"POLYGON ((19.99805 50.01608, 19.99834 50.01663..."
1540274,residential,"POLYGON ((19.99872 50.01731, 19.99896 50.0179,..."


In [35]:
import geopandas as gpd
from shapely.geometry import Point, Polygon, LineString # Example imports


original_crs = geo_df.crs

projected_crs = "EPSG:4326"

# Reproject the geometries to the projected CRS
geo_df_projected = geo_df.to_crs(projected_crs)

# Calculate the centroid on the projected geometries
# This provides more accurate centroids in meters (or the units of the projected CRS)
centroids_projected = geo_df_projected['geometry'].centroid

# Create a GeoDataFrame from the projected centroids
# This is necessary to reproject them back to the original CRS
centroids_projected_gdf = gpd.GeoDataFrame(geometry=centroids_projected, crs=projected_crs)

# Reproject the centroids back to the original geographic CRS
centroids_geographic = centroids_projected_gdf.to_crs(original_crs)

# Extract latitude (y-coordinate) and longitude (x-coordinate)
# from the reprojected geographic centroids
geo_df['longitude'] = centroids_geographic.geometry.x
geo_df['latitude'] = centroids_geographic.geometry.y

geo_df.head(10)


  centroids_projected = geo_df_projected['geometry'].centroid


Unnamed: 0,fclass,geometry,longitude,latitude
1540066,retail,"POLYGON ((19.93483 50.07331, 19.93564 50.07337...",19.935363,50.073061
1540085,residential,"POLYGON ((19.94151 50.08376, 19.94182 50.08457...",19.944936,50.085236
1540265,retail,"POLYGON ((20.00991 50.01664, 20.01013 50.01697...",20.011092,50.016835
1540266,residential,"POLYGON ((20.00119 50.01472, 20.00324 50.01588...",20.002765,50.014999
1540267,residential,"POLYGON ((20.00005 50.01406, 20.00009 50.01412...",20.005177,50.012645
1540268,residential,"POLYGON ((20.00433 50.0156, 20.00491 50.0165, ...",20.006563,50.015678
1540271,residential,"POLYGON ((20.00425 50.01385, 20.00454 50.01436...",20.006182,50.013663
1540272,commercial,"POLYGON ((20.00773 50.01282, 20.00808 50.01359...",20.0082,50.013035
1540273,residential,"POLYGON ((19.99805 50.01608, 19.99834 50.01663...",19.999342,50.01636
1540274,residential,"POLYGON ((19.99872 50.01731, 19.99896 50.0179,...",20.000161,50.017585


In [36]:
geo_df['fclass'].value_counts()

Unnamed: 0_level_0,count
fclass,Unnamed: 1_level_1
footway,1394387
residential,1232695
parking,284722
tertiary,212309
bus_stop,125070
secondary,110872
primary,73815
living_street,51400
convenience,33485
retail,26487


In [42]:
coords_df = pd.read_csv(r"/content/drive/MyDrive/Mastercard/better_coords.csv")
coords_df.columns

Index(['longitude', 'latitude', 'geometry'], dtype='object')

In [43]:
geo_df.columns

Index(['fclass', 'geometry', 'longitude', 'latitude'], dtype='object')

In [46]:
import pandas as pd
import geopandas
from shapely.geometry import Point, Polygon
import os
import time
import math

base_path = r"/content/drive/MyDrive/Mastercard/"
# grid_file_path = os.path.join(base_path, "better_coords.csv")
# lockers_file_path = os.path.join(base_path, "geo_pandas_all_voievodeship_filtered.parquet")
# output_file_path = os.path.join(base_path, "grid_2km_with_locker_counts.csv")

HALF_SIDE_KM = 1.0
KM_PER_DEGREE_LAT = 111.132
KM_PER_DEGREE_LON_BASE = 111.320

def calculate_delta_lat_degrees(km):
    return km / KM_PER_DEGREE_LAT

def calculate_delta_lon_degrees(km, center_lat_degrees):
    center_lat_radians = math.radians(center_lat_degrees)
    cos_lat = math.cos(center_lat_radians)
    if abs(cos_lat) < 1e-9:
        return float('inf')
    km_per_degree_lon_at_lat = KM_PER_DEGREE_LON_BASE * cos_lat
    if km_per_degree_lon_at_lat == 0:
        return float('inf')
    return km / km_per_degree_lon_at_lat

print("Krok 1: Wczytywanie paczkomatów...")
lockers_df = geo_df
lockers_df.dropna(subset=['fclass', 'geometry', 'longitude', 'latitude'], inplace=True)
#lockers_df['brand'] = lockers_df['brand'].astype(str).str.strip()
#lockers_df = lockers_df[lockers_df['brand'] != '']
lockers_df['latitude'] = pd.to_numeric(lockers_df['latitude'], errors='coerce')
lockers_df['longitude'] = pd.to_numeric(lockers_df['longitude'], errors='coerce')
lockers_df.dropna(subset=['latitude', 'longitude'], inplace=True)
geometry_lockers = [Point(xy) for xy in zip(lockers_df['longitude'], lockers_df['latitude'])]
lockers_gdf = geopandas.GeoDataFrame(lockers_df, geometry=geometry_lockers, crs="EPSG:4326")
print(f"Wczytano {len(lockers_gdf)} paczkomatów.")

if lockers_gdf.empty:
    print("Brak danych o paczkomatach. Przerywam.")
    exit()

print("Krok 2: Wczytywanie siatki i tworzenie poligonów...")
grid_df_cols_to_use = ['longitude', 'latitude', 'geometry']
grid_df = coords_df
grid_df.dropna(subset=['latitude', 'longitude'], inplace=True)
grid_df['latitude'] = pd.to_numeric(grid_df['latitude'], errors='coerce')
grid_df['longitude'] = pd.to_numeric(grid_df['longitude'], errors='coerce')
grid_df.dropna(subset=['latitude', 'longitude'], inplace=True)
grid_df.reset_index(drop=True, inplace=True)
grid_df['grid_id'] = grid_df.index

if grid_df.empty:
    print("Brak danych siatki. Przerywam.")
    exit()

print(f"Wczytano {len(grid_df)} środków kwadratów. Tworzenie poligonów...")
grid_polygons = []
total_grids = len(grid_df)
for index, row in grid_df.iterrows():
    center_lon = row['longitude']
    center_lat = row['latitude']
    delta_lat_deg = calculate_delta_lat_degrees(HALF_SIDE_KM)
    delta_lon_deg = calculate_delta_lon_degrees(HALF_SIDE_KM, center_lat)
    min_lon, max_lon = center_lon - delta_lon_deg, center_lon + delta_lon_deg
    min_lat, max_lat = center_lat - delta_lat_deg, center_lat + delta_lat_deg
    polygon = Polygon([(min_lon, min_lat), (max_lon, min_lat), (max_lon, max_lat), (min_lon, max_lat), (min_lon, min_lat)])
    grid_polygons.append(polygon)
    if (index + 1) % 10000 == 0 or (index + 1) == total_grids:
        print(f"Utworzono geometrię dla {index + 1}/{total_grids} kwadratów...")

grid_gdf = geopandas.GeoDataFrame(grid_df, geometry=grid_polygons, crs="EPSG:4326")
print(f"Utworzono {len(grid_gdf)} poligonów.")

print("Krok 3: Łączenie przestrzenne...")
start_sjoin_time = time.time()
joined_gdf = geopandas.sjoin(lockers_gdf, grid_gdf, how="inner", predicate="within", lsuffix="locker", rsuffix="grid")
print(f"Łączenie przestrzenne zakończone w {time.time() - start_sjoin_time:.2f}s. Znaleziono {len(joined_gdf)} paczkomatów w kwadratach.")

Krok 1: Wczytywanie paczkomatów...
Wczytano 3704151 paczkomatów.
Krok 2: Wczytywanie siatki i tworzenie poligonów...
Wczytano 77273 środków kwadratów. Tworzenie poligonów...
Utworzono geometrię dla 10000/77273 kwadratów...
Utworzono geometrię dla 20000/77273 kwadratów...
Utworzono geometrię dla 30000/77273 kwadratów...
Utworzono geometrię dla 40000/77273 kwadratów...
Utworzono geometrię dla 50000/77273 kwadratów...
Utworzono geometrię dla 60000/77273 kwadratów...
Utworzono geometrię dla 70000/77273 kwadratów...
Utworzono geometrię dla 77273/77273 kwadratów...
Utworzono 77273 poligonów.
Krok 3: Łączenie przestrzenne...
Łączenie przestrzenne zakończone w 4.23s. Znaleziono 3651309 paczkomatów w kwadratach.
Krok 4: Agregacja...


KeyError: 'brand'

In [51]:
joined_gdf.to_csv(r"/content/drive/MyDrive/Mastercard/joined_gdf.csv")

In [59]:
joined_gdf.drop(columns = ['longitude_locker', 'latitude_locker', 'geometry', 'index_grid', 'grid_id'], inplace=True)

In [63]:
# Group by grid and fclass, then count occurrences
grid_counts = (
    joined_gdf
    .groupby(['longitude_grid', 'latitude_grid', 'fclass'])
    .size()
    .unstack(fill_value=0)
    .reset_index()
)

grid_counts.shape

(65165, 33)

In [54]:
lockers_df = pd.read_csv("/content/drive/MyDrive/Mastercard/grid_2km.csv")

In [56]:
lockers_df.merge(joined_gdf)

Unnamed: 0,longitude,latitude,geometry,grid_id,Allegro One Box,DHL BOX 24/7,DPD Pickup,Orlen Paczka,Paczkomat InPost
0,14.106323,52.956477,POINT (14.106323370274234 52.95647661733012),0,0,0,0,0,0
1,14.104295,52.974395,POINT (14.104294515967855 52.97439533721254),1,0,0,0,0,0
2,14.102263,52.992314,POINT (14.102263503574965 52.992313948428475),2,0,0,0,0,0
3,14.140023,52.921858,POINT (14.140023369985208 52.92185814476897),3,0,0,0,0,0
4,14.138011,52.939778,POINT (14.138010980293023 52.939777869952025),4,0,0,0,0,0
5,14.135996,52.957697,POINT (14.135996451409314 52.957697487255),5,0,0,0,0,0
6,14.13398,52.975617,POINT (14.13397978029525 52.97561699660909),6,0,0,0,0,0
7,14.131961,52.993536,POINT (14.13196096390612 52.99353639794549),7,0,0,0,0,0
8,14.12994,53.011456,POINT (14.129939999191313 53.011455691195195),8,0,0,0,0,0
9,14.127917,53.029375,POINT (14.127916883094302 53.02937487628905),9,0,0,0,0,0


In [64]:
lockers_df.shape

(77273, 9)

In [70]:
final_df = lockers_df.merge(
    grid_counts,
    right_on=['longitude_grid', 'latitude_grid'],
    left_on=['longitude', 'latitude'],
    how='left'
).fillna(0).drop(columns=['longitude_grid', 'latitude_grid'])

print(final_df.shape)
final_df.head(10)

(77273, 40)


Unnamed: 0,longitude,latitude,geometry,grid_id,Allegro One Box,DHL BOX 24/7,DPD Pickup,Orlen Paczka,Paczkomat InPost,atm,...,retail,school,secondary,suburb,subway,supermarket,tertiary,town,tram_stop,university
0,14.106323,52.956477,POINT (14.106323370274234 52.95647661733012),0,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,14.104295,52.974395,POINT (14.104294515967855 52.97439533721254),1,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,14.102263,52.992314,POINT (14.102263503574965 52.992313948428475),2,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,14.140023,52.921858,POINT (14.140023369985208 52.92185814476897),3,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,14.138011,52.939778,POINT (14.138010980293023 52.939777869952025),4,0,0,0,0,0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
5,14.135996,52.957697,POINT (14.135996451409314 52.957697487255),5,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,14.13398,52.975617,POINT (14.13397978029525 52.97561699660909),6,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,14.131961,52.993536,POINT (14.13196096390612 52.99353639794549),7,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,14.12994,53.011456,POINT (14.129939999191313 53.011455691195195),8,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,14.127917,53.029375,POINT (14.127916883094302 53.02937487628905),9,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [73]:
final_df.to_csv(r"/content/drive/MyDrive/Mastercard/final_df.csv")