# WSmart+ Route Data Files

In [6]:
from notebook_setup import setup_google_colab, setup_home_directory

# Name this notebook passes to the project's setup helpers.
NOTEBOOK_NAME = "data_files"
# Presumably the repository root: the recorded output of this cell reports
# that home_dir was added to the system path.
home_dir = setup_home_directory(NOTEBOOK_NAME)
# IN_COLAB gates the Drive mount in the next cell and gdrive is the object
# whose .mount() is called there; gfiles is not used in the visible cells.
IN_COLAB, gdrive, gfiles = setup_google_colab(NOTEBOOK_NAME)

Already added home_dir to system path: /home/pkhunter/Repositories/WSmart-Route


In [None]:
import json
import os
import pickle

import numpy as np
import pandas as pd

from logic.src.pipeline.simulator.wsmart_bin_analysis import OldGridBase
from logic.src.utils.io_utils import chunk_zip_content, reassemble_files

# Upper bound on the number of DataFrame rows pandas renders in cell output.
MAX_DISPLAY_ROWS = 500
# Print NumPy floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)
pd.set_option("display.max_rows", MAX_DISPLAY_ROWS)

# Set to True to render intermediate tables with display() in later cells.
SHOW_TABLES = False
# Mount Google Drive when the setup cell flagged IN_COLAB.
if IN_COLAB:
    gdrive.mount("/content/drive")

# Required to use matplotlib in Windows without breaking the Kernel
if os.name == "nt":
    os.environ["KMP_DUPLICATE_LIB_OK"] = "True"

# Run parameters; they are interpolated into the data file names built in
# the cells below.
seed = 42
n_days = 30
n_bins = 104
n_samples = 1
data_dist = "emp"

## Daily waste data

In [9]:
# Read the cleaned April 2024 workbook, rename the first day's column to
# "Day 0" and discard the "Day 31" column.
data_dir = os.path.join(home_dir, "data", "wsr_simulator")
file_path = os.path.join(
    data_dir, "daily_waste", "enchimentos_abril_2024_cleaned.xlsx"
)
clean_df = pd.read_excel(file_path)
clean_df = clean_df.rename(columns={"Fill Level_Day 0": "Day 0"})
clean_df = clean_df.drop(columns=["Day 31"])
if SHOW_TABLES:
    display(clean_df)

In [None]:
# Read the April 2024 summary CSV into clean_df (replacing whatever an
# earlier cell stored there) and rename the first day's column to "Day 0".
data_dir = os.path.join(home_dir, "data", "wsr_simulator")
file_path = os.path.join(data_dir, "daily_waste", "april_2024_summary.csv")
clean_df = pd.read_csv(file_path)
clean_df = clean_df.rename(columns={"Fill Level_Day 0": "Day 0"})
if SHOW_TABLES:
    display(clean_df)

In [None]:
# Cache clean_df as a pickle whose name encodes the run parameters.
# NOTE(review): `area` is not assigned in any cell above this one in the
# visible notebook (it is first set in the Coordinates section), so running
# the cells top to bottom raises NameError here - confirm the intended order.
# NOTE(review): this path is rooted at home_dir, whereas the other
# daily-waste files in this notebook live under data_dir - confirm.
daily_waste_path = os.path.join(
    home_dir,
    "daily_waste",
    f"{area}{n_bins}_{data_dist}_wsr{n_days}_N{n_samples}_seed{seed}.pkl",
)
# Write only when the file is missing, so an existing pickle is never
# overwritten.
if not os.path.exists(daily_waste_path):
    clean_df.to_pickle(daily_waste_path)

## Coordinates

In [None]:
def process_old_wsba_coords(bins_coords):
    """Return a three-column coordinate table for the given bins.

    The "Latitude" and "Longitude" columns are renamed to "Lat" and "Lng",
    only "ID", "Lat" and "Lng" are kept (in that order), and the row index
    is replaced by a fresh 0..n-1 range. The input frame is not modified.
    """
    column_aliases = {"Latitude": "Lat", "Longitude": "Lng"}
    return (
        bins_coords.rename(columns=column_aliases)
        .loc[:, ["ID", "Lat", "Lng"]]
        .reset_index(drop=True)
    )

In [None]:
# Load the bin metadata for the Rio Maior grid, keep the rows whose
# "Tipo de Residuos" equals the first waste type, and reduce them to
# ID / Lat / Lng coordinates.
area = "Rio Maior"
grid = OldGridBase(data_dir, area)
# load_data() returns a pair; only the second element (bin info) is used.
_, info = grid.load_data()
# Portuguese labels with non-ASCII characters: keep this file UTF-8 encoded.
waste_types = ["Mistura de embalagens", "Embalagens de papel e cartão"]
plastic_bins = info[info["Tipo de Residuos"] == waste_types[0]]
plastic_df = process_old_wsba_coords(plastic_bins)

In [None]:
# Coordinates of the bins that also appear in clean_df, ordered by ID and
# re-indexed from zero.
clean_df_coords = (
    plastic_df[plastic_df["ID"].isin(clean_df["ID"])]
    .sort_values("ID")
    .reset_index(drop=True)
)
clean_coords_file = os.path.join(
    data_dir, "coordinates", "coordenadas_abril_2024_cleaned.xlsx"
)
# Export once; an existing workbook is left untouched.
if not os.path.exists(clean_coords_file):
    clean_df_coords.to_excel(clean_coords_file, index=False)

In [None]:
# Build the daily-waste array for the selected bins and pickle it.
selection_file = os.path.join(
    data_dir, "bins_selection", f"graphs_{n_bins}V_1N_plastic.json"
)
# Read the JSON with an explicit encoding so the result does not depend on
# the platform's default text encoding (this notebook also runs on Windows).
with open(selection_file, encoding="utf-8") as fp:
    indices = json.load(fp)

# Drop the identifier and summary columns; what remains is converted to a
# plain NumPy array.
clean_arr = clean_df.drop(columns=["ID", "Mean", "StD"]).to_numpy()
# swapaxes(..., 1, 2) needs at least three axes, so `indices` is presumably
# a nested list of row positions (one list per sample) - TODO confirm.
clean_arr = np.swapaxes(clean_arr[indices], 1, 2)
daily_waste_file = os.path.join(
    data_dir, "daily_waste", f"riomaior{n_bins}_emp_wsr{n_days}_N1_seed{seed}.pkl"
)
# Unlike the guarded writes above, this always overwrites the target file.
with open(daily_waste_file, "wb") as fp:
    pickle.dump(clean_arr, fp)

## Distance matrix

In [86]:
# Load the raw distance matrix as a float array (labels included; they are
# split off in the next cell).
dm_path = os.path.join(
    data_dir, "distance_matrix", "gmaps_distmat_plastic[riomaior].csv"
)
dist_mat = np.loadtxt(dm_path, delimiter=",")
# Labels to keep, in order: 0 followed by the IDs of the selected bins.
# NOTE(review): 0 is presumably the depot entry - confirm.
dm_ids = pd.concat([pd.Series([0]), clean_df_coords["ID"]], ignore_index=True)

In [87]:
# The first row of the raw array carries the column labels; the remaining
# rows are the data.
col_names = dist_mat[0].astype(int)
data_rows = dist_mat[1:]
df = pd.DataFrame(data_rows, columns=col_names)
# The column labelled -1 holds the row labels: cast it to int so it lines
# up with the integer column labels.
df[-1] = df[-1].astype(int)
# Index by row label, then keep only the dm_ids entries, in dm_ids order,
# on both axes.
df_matrix = df.set_index(-1).loc[dm_ids, dm_ids]
df_matrix.to_csv(
    os.path.join(data_dir, "distance_matrix", "gmaps_distmat_plastic104[riomaior].csv")
)

In [89]:
# Load the reordered distance matrix and strip its first row and first
# column (the labels), leaving only the numeric distances.
np_arr = np.loadtxt(
    os.path.join(data_dir, "distance_matrix", "gmaps_distmat_plastic104[riomaior].csv"),
    delimiter=",",
)[1:, 1:]
# Visiting order expressed as row/column positions in np_arr; the sequence
# starts and ends at position 0.
idx = [
    0, 60, 34, 10, 64, 94, 78, 102, 44, 59,
    82, 15, 5, 20, 77, 29, 32, 12, 79, 28,
    27, 51, 7, 21, 67, 8, 19, 81, 70, 74,
    71, 54, 24, 69, 36, 22, 23, 47, 38, 6,
    85, 48, 100, 86, 18, 96, 55, 14, 33, 92,
    13, 97, 61, 91, 26, 35, 39, 58, 75, 53,
    2, 73, 43, 103, 11, 31, 62, 3, 95, 37,
    101, 42, 17, 76, 66, 16, 41, 56, 4, 57,
    1, 99, 25, 87, 104, 68, 72, 88, 9, 40,
    65, 63, 30, 45, 93, 80, 98, 89, 49, 84,
    83, 90, 50, 46, 52, 0,
]
# Pair each stop with its successor: row_idx[k] -> col_idx[k] is leg k.
row_idx = idx[:-1]
col_idx = idx[1:]
# Distance of every consecutive leg, gathered in one indexing operation.
values = np_arr[row_idx, col_idx]
# Add the legs one at a time, in visiting order, so the floating-point
# result matches a plain running sum. The loop variable deliberately avoids
# the name `id`, which would shadow the builtin for the rest of the session.
dist = 0
for leg_length in values:
    dist += leg_length
dist

261.9850000000001

## Reassemble large files

In [47]:
# Size limit handed to chunk_zip_content.
# NOTE(review): despite the "MBSIZE" name, 95 * 1000**2 reads as a byte
# count (95 MB) - confirm the unit the helper expects.
GITHUB_MAX_MBSIZE = 95 * 1000**2
waste_data_dir = os.path.join(data_dir, "bins_waste")
zip_path = os.path.join(
    waste_data_dir, "wetransfer_dados-rio-maior_2025-03-19_1615.zip"
)
# Only split the archive when it is present locally.
if os.path.isfile(zip_path):
    # os.path.getsize returns a size in bytes; this value is not used in
    # the visible cells.
    zip_mbsize = os.path.getsize(zip_path)
    chunk_files = chunk_zip_content(zip_path, GITHUB_MAX_MBSIZE, waste_data_dir)

In [48]:
# Project helper; presumably rebuilds the original files from the chunks
# stored in waste_data_dir - its exact behaviour is not shown here.
data_files = reassemble_files(waste_data_dir)

In [None]:
# Load the two Rio Maior CSV tables from waste_data_dir.
collections_path = os.path.join(
    waste_data_dir, "Enchimentos_com_Recolhas[RioMaior].csv"
)
sensors_path = os.path.join(waste_data_dir, "Enchimentos_de_Sensores[RioMaior].csv")
# The two files are read with different delimiters: comma for collections,
# semicolon for sensors.
collections = pd.read_csv(collections_path, delimiter=",")
sensors = pd.read_csv(sensors_path, delimiter=";")

In [None]:
# Number of distinct "Matricula do contentor" values in the collections table.
collections["Matricula do contentor"].nunique()

In [None]:
# Number of distinct "Matricula do contentor" values in the sensors table.
sensors["Matricula do contentor"].nunique()

In [None]:
# Keep the first row for each "Matricula do contentor" value, then count
# how often each "description" value occurs among those rows.
ucoll = collections.drop_duplicates(keep="first", subset="Matricula do contentor")
ucoll["description"].value_counts()

In [None]:
# Same breakdown for the sensors table: first row per
# "Matricula do contentor" value, then "description" frequencies.
usensor = sensors.drop_duplicates(keep="first", subset="Matricula do contentor")
usensor["description"].value_counts()

In [None]:
# Write the collections table back to the same path it was read from,
# without the index column. NOTE: this overwrites the input file.
collections.to_csv(collections_path, index=False)
if SHOW_TABLES:
    display(collections)

In [None]:
# Split the sensors table by row position into two consecutive parts.
# NOTE(review): 524904 is a hard-coded position whose origin is not shown
# in this notebook - confirm it still matches the current data.
split_row = 524904
sensors1 = sensors.iloc[:split_row]
sensors2 = sensors.iloc[split_row:]

In [None]:
# Optionally render the first part of the sensors table.
if SHOW_TABLES:
    display(sensors1)

In [None]:
# Optionally render the second part of the sensors table.
if SHOW_TABLES:
    display(sensors2)