# WSmart+ Route Data Files

In [1]:
from notebook_setup import setup_home_directory, setup_google_colab


# Name of this notebook; passed to both setup helpers below.
NOTEBOOK_NAME = 'data_files'
# Path used as the base for all data paths in this notebook. Per the cell output,
# the helper also adds this directory to the system path.
home_dir = setup_home_directory(NOTEBOOK_NAME)
# IN_COLAB gates the Drive mount further down; gdrive is the object used for mounting.
# NOTE(review): what gdrive/gfiles hold outside Colab is not visible here — confirm in notebook_setup.
IN_COLAB, gdrive, gfiles = setup_google_colab(NOTEBOOK_NAME)

Setup completed - added home_dir to system path: /home/pkhunter/Repositories/WSmart-Route


In [2]:
import os
import json
import pickle
import numpy as np
import pandas as pd

from src.pipeline.simulator.wsmart_bin_analysis import OldGridBase
from src.utils.io_utils import chunk_zip_content, reassemble_files


# Notebook-wide display settings.
MAX_DISPLAY_ROWS = 500
SHOW_TABLES = False  # set to True to render the guarded display(...) calls in later cells
pd.set_option('display.max_rows', MAX_DISPLAY_ROWS)

# When running on Google Colab, mount Drive under /content/drive.
if IN_COLAB:
    gdrive.mount('/content/drive')

# Windows only: needed so matplotlib can be used without crashing the kernel.
if os.name == 'nt':
    os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

In [3]:
def process_old_wsba_coords(bins_coords):
    """Build a compact ID/Lat/Lng coordinate table from a bins DataFrame.

    The 'Latitude' and 'Longitude' columns are renamed to 'Lat' and 'Lng',
    only the 'ID', 'Lat' and 'Lng' columns are kept (in that order), and the
    index is replaced by a fresh 0..n-1 range.

    Args:
        bins_coords: DataFrame with at least 'ID', 'Latitude' and 'Longitude'
            columns. It is not modified.

    Returns:
        A new DataFrame with columns ['ID', 'Lat', 'Lng'].

    Raises:
        KeyError: if any of the three output columns is missing after renaming.
    """
    column_aliases = {'Latitude': 'Lat', 'Longitude': 'Lng'}
    kept_columns = ['ID', 'Lat', 'Lng']
    renamed = bins_coords.rename(columns=column_aliases)
    return renamed[kept_columns].reset_index(drop=True)

In [4]:
# Load the cleaned April 2024 fill-level workbook, rename the day-0 column
# and drop the 'Day 31' column.
data_dir = os.path.join(home_dir, "data", "wsr_simulator")
file_path = os.path.join(data_dir, 'daily_waste', 'enchimentos_abril_2024_cleaned.xlsx')
clean_df = (
    pd.read_excel(file_path)
    .rename(columns={'Fill Level_Day 0': 'Day 0'})
    .drop(columns=['Day 31'])
)
if SHOW_TABLES:
    display(clean_df)

In [5]:
# Load the April 2024 summary CSV with the same column clean-up as the workbook load.
# NOTE(review): this reassigns clean_df (and data_dir), replacing whatever the
# previous cell loaded — confirm that the CSV is the intended source.
data_dir = os.path.join(home_dir, "data", "wsr_simulator")
file_path = os.path.join(data_dir, 'daily_waste', 'april_2024_summary.csv')
clean_df = (
    pd.read_csv(file_path)
    .rename(columns={'Fill Level_Day 0': 'Day 0'})
    .drop(columns=['Day 31'])
)
if SHOW_TABLES:
    display(clean_df)

In [6]:
# Load the bin metadata for the chosen area and keep the bins of the first waste type.
area = "Rio Maior"
grid = OldGridBase(data_dir, area)
_, info = grid.load_data()  # only the second returned value is used here
# Waste-type labels compared against the 'Tipo de Residuos' column.
# The second label is spelled with a proper 'ã' so it can match correctly encoded data.
waste_types = ['Mistura de embalagens', 'Embalagens de papel e cartão']
plastic_bins = info[info['Tipo de Residuos'] == waste_types[0]]
# Reduce the selected bins to an ID/Lat/Lng table.
plastic_df = process_old_wsba_coords(plastic_bins)

In [7]:
# Coordinates of the bins that also appear in clean_df, ordered by ID.
clean_df_coords = (
    plastic_df.loc[plastic_df['ID'].isin(clean_df['ID'])]
    .sort_values('ID')
    .reset_index(drop=True)
)
clean_coords_file = os.path.join(data_dir, 'coordinates', 'coordenadas_abril_2024_cleaned.xlsx')
# Write the workbook only once; an existing file is left untouched.
if not os.path.exists(clean_coords_file):
    clean_df_coords.to_excel(clean_coords_file, index=False)

In [8]:
# Read the stored bin selection used to index rows of the fill-level table.
selection_file = os.path.join(data_dir, 'bins_selection', 'graphs_100V_1N_plastic.json')
with open(selection_file) as fp:
    indices = json.load(fp)

# Drop the identifier and summary columns, leaving the remaining columns as a NumPy array.
clean_arr = clean_df.drop(columns=['ID', 'Mean', 'StD']).to_numpy()
# NOTE(review): swapping axes 1 and 2 requires a 3-D array, so `indices` is presumably
# a nested list (e.g. one list of row indices per sample) — confirm against the JSON file.
clean_arr = clean_arr[indices].swapaxes(1, 2)

# Persist the resulting array; an existing file at this path is overwritten.
daily_waste_file = os.path.join(data_dir, 'daily_waste', 'riomaior100_emp_wsr31_N1_seed42.pkl')
with open(daily_waste_file, 'wb') as fp:
    pickle.dump(clean_arr, fp)

## Reassemble large files

In [47]:
# Size limit handed to chunk_zip_content: 95 * 1000**2.
# NOTE(review): despite the "MBSIZE" name this value looks like a byte count — confirm
# the unit chunk_zip_content expects.
GITHUB_MAX_MBSIZE = 95_000_000
waste_data_dir = os.path.join(data_dir, 'bins_waste')
zip_path = os.path.join(waste_data_dir, "wetransfer_dados-rio-maior_2025-03-19_1615.zip")
# Chunk the archive only when it is present locally; otherwise this cell does nothing.
if os.path.isfile(zip_path):
    # os.path.getsize reports bytes; the value is not used further in this cell.
    zip_mbsize = os.path.getsize(zip_path)
    chunk_files = chunk_zip_content(zip_path, GITHUB_MAX_MBSIZE, waste_data_dir)

In [48]:
# Presumably rebuilds the original files from the chunks stored in waste_data_dir;
# the helper's return value is kept in data_files — confirm semantics in src.utils.io_utils.
data_files = reassemble_files(waste_data_dir)

In [None]:
# Load the two Rio Maior fill-level tables. The files use different separators:
# the collections file is comma-separated, the sensors file semicolon-separated.
collections_path = os.path.join(waste_data_dir, "Enchimentos_com_Recolhas[RioMaior].csv")
sensors_path = os.path.join(waste_data_dir, "Enchimentos_de_Sensores[RioMaior].csv")
collections = pd.read_csv(collections_path, sep=',')
sensors = pd.read_csv(sensors_path, sep=';')

In [None]:
# Count of distinct 'Matricula do contentor' values in the collections table (missing values are not counted).
collections['Matricula do contentor'].nunique()

In [None]:
# Count of distinct 'Matricula do contentor' values in the sensors table (missing values are not counted).
sensors['Matricula do contentor'].nunique()

In [None]:
# Keep the first row seen for each 'Matricula do contentor', then tally the
# 'description' values over those unique rows.
ucoll = collections.drop_duplicates(subset=['Matricula do contentor'], keep='first')
ucoll['description'].value_counts()

In [None]:
# Keep the first row seen for each 'Matricula do contentor' in the sensors table,
# then tally the 'description' values over those unique rows.
usensor = sensors.drop_duplicates(subset=['Matricula do contentor'], keep='first')
usensor['description'].value_counts()

In [None]:
# Write the collections table back to collections_path without the index column.
# NOTE(review): this overwrites the same file the table was read from — confirm that is intended.
collections.to_csv(collections_path, index=False)
if SHOW_TABLES:
    display(collections)

In [None]:
# Split the sensors table by row position: rows before split_row go to sensors1,
# rows from split_row onward go to sensors2.
# NOTE(review): 524904 is a hard-coded position, presumably specific to this export — confirm.
split_row = 524904
sensors1, sensors2 = sensors.iloc[:split_row], sensors.iloc[split_row:]

In [None]:
# Optional preview of the first part of the sensors split.
if SHOW_TABLES:
    display(sensors1)

In [None]:
# Optional preview of the second part of the sensors split.
if SHOW_TABLES:
    display(sensors2)