# data partition notebook
use this notebook to configure and create a dataset with training, test, and validation partitions, so that data does not have to be used directly from the scrape. partition workflow:
- select garage, date range and output path for dataset
- generates data and saves entire data set as bulk set
- configure parameters to save partitioned dataset
- saves partitioned dataset

datasets are saved as `.pkl` (pickle) files so they are easy to load back into python

## imports

In [1]:
import sys
import os
import pandas as pd
import pickle
import random

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../lib")))

from data_load import load_data, load_week_data

## define constants

In [2]:
# --- what to export -------------------------------------------------------
# garage whose weekly data is saved on its own and then partitioned
GARAGE = "North Garage"
# date range of interest (midnight to midnight, written in 24-hour form)
DATE_START = pd.to_datetime("2025-01-01 00:00:00")
DATE_END = pd.to_datetime("2025-06-01 00:00:00")

# --- how to export --------------------------------------------------------
# True: pick the weeks of each partition at random; False: take them in order
RAND_WEEKS = True
# True: also dump the data of every garage into one bulk pickle
ENABLE_BULKSET_SAVE = False
# path where data will be saved
OUTPUT_PATH = "../datasets/"

## load data

In [3]:
# load the scraped data; data_by_weeks is indexed by garage name in the cells
# below, and each entry is used there as a list of per-week dataframes
# NOTE(review): GARAGE, DATE_START and DATE_END are not passed to this call —
# confirm load_week_data applies the intended date range on its own
# NOTE(review): start_dates / end_dates are not used in the visible cells
start_dates, end_dates, data_by_weeks = load_week_data()

## save bulk set
saves data for all garages in given date range to `../datasets/bulkset.pkl`. data saved in format:

```
data: {
    "North Garage": [],
    "South Garage": [],
    "South Campus Garage": [],
    "West Garage": []
}
```

where each garage has a list of dataframes, and the ith dataframe holds one week's worth of data

In [4]:
# destination of the pickle that holds the weekly dataframes of every garage
file_path = f"{OUTPUT_PATH}bulkset.pkl"

# the bulk dump is opt-in: nothing is written unless ENABLE_BULKSET_SAVE is set
if ENABLE_BULKSET_SAVE:
    with open(file_path, mode="wb") as bulk_file:
        pickle.dump(data_by_weeks, bulk_file)
        print("saved data for each garage in " + file_path)

## save bulk set for specific garage
saves a list of dataframes (each df represents one week of data) to pickle file

In [5]:
# work on a shallow copy: the partition step pops weeks off this list, and an
# alias would let it empty data_by_weeks[GARAGE] as well
garage_data_by_weeks = list(data_by_weeks[GARAGE])

# fail with a readable message instead of an IndexError on the lookups below
if not garage_data_by_weeks:
    raise ValueError(f"no weekly data loaded for {GARAGE}")

# first timestamp of the first week and first timestamp of the last week
data_start = str(garage_data_by_weeks[0].iloc[0]["timestamp"])
data_end = str(garage_data_by_weeks[-1].iloc[0]["timestamp"])
print(f"data loaded for {GARAGE} from {str(DATE_START)}-{str(DATE_END)} contains {len(garage_data_by_weeks)} weeks of data")
print(f"data starts on week of {data_start}")
print(f"data ends on week of {data_end}")

# the file name uses the part of each timestamp string before the first space.
# the pieces are computed outside the f-string because reusing the same quote
# type inside an f-string expression is a syntax error before python 3.12
garage_dir = OUTPUT_PATH + GARAGE.replace(" ", "_")
start_day = data_start.split(" ")[0]
end_day = data_end.split(" ")[0]
file_path = f"{garage_dir}/{start_day}_{end_day}_set.pkl"

# open() does not create missing folders, so make sure the garage folder exists
os.makedirs(garage_dir, exist_ok=True)

with open(file_path, "wb") as f:
    pickle.dump(garage_data_by_weeks, f)
    print(f"wrote {GARAGE} data to {file_path}")

data loaded for North Garage from 2025-01-01 00:00:00-2025-06-01 00:00:00 contains 8 weeks of data
data starts on week of 2025-02-13 16:57:00
data ends on week of 2025-04-14 00:13:00
wrote North Garage data to ../datasets/North_Garage/2025-02-13_2025-04-14_set.pkl


## save partitioned dataset
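configure the size of each partition by specifying the number of weeks it should contain. partitions do not overlap: each week is assigned to at most one partition. weeks are chosen randomly when `RAND_WEEKS` is `True`; otherwise they are taken in order from the start or the end of the data, as set by `DIRECTION`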

In [6]:
# number of weeks that go into each partition
TRAINING_SIZE = 5
TEST_SIZE = 2
VALIDATION_SIZE = 1

# only used when weeks are chosen deterministically: "last weeks" takes weeks
# from the end of the data, any other value takes them from the start
DIRECTION = "last weeks"

# rebuild the pool of available weeks on every run of this cell. the choose_*
# helpers pop weeks out of garage_data_by_weeks, so without this reset a second
# run of the cell would start from an already drained list
garage_data_by_weeks = list(data_by_weeks[GARAGE])

# catch an impossible configuration before any week is moved
requested_weeks = TRAINING_SIZE + TEST_SIZE + VALIDATION_SIZE
if requested_weeks > len(garage_data_by_weeks):
    raise ValueError(
        f"partition sizes need {requested_weeks} weeks but only "
        f"{len(garage_data_by_weeks)} are available for {GARAGE}"
    )

# filled by the choose_* helpers; each value is a list of weekly dataframes
partitioned_data = {
    "training": [],
    "test": [],
    "validation": []
}

def choose_rand_weeks(num_weeks, set_type, weeks=None, partitions=None):
    """move `num_weeks` randomly chosen weeks out of the pool into one partition.

    every chosen dataframe is removed from the pool, so a week can never end up
    in two partitions. prints the first timestamp of each chosen week and a
    two-row preview of every dataframe currently in the partition.

    args:
        num_weeks: number of weekly dataframes to move
        set_type: key of the partition to fill (e.g. "training")
        weeks: list of weekly dataframes to draw from, modified in place.
            defaults to the notebook-level `garage_data_by_weeks`
        partitions: dict mapping a partition name to a list of dataframes.
            defaults to the notebook-level `partitioned_data`

    raises:
        ValueError: if `num_weeks` is larger than the number of weeks left in
            the pool. this is checked before anything is moved, so a failed
            call leaves the pool and the partition untouched
    """
    if weeks is None:
        weeks = garage_data_by_weeks
    if partitions is None:
        partitions = partitioned_data

    # validate up front: failing halfway would leave a partially filled set
    if num_weeks > len(weeks):
        raise ValueError(
            f"cannot choose {num_weeks} weeks for {set_type}: "
            f"only {len(weeks)} weeks left"
        )

    chosen_weeks = []
    for _ in range(num_weeks):
        # the pool shrinks on every pop, so the index range is recomputed
        rand_ind = random.randrange(len(weeks))
        chosen_weeks.append(weeks[rand_ind].iloc[0]["timestamp"])
        partitions[set_type].append(weeks.pop(rand_ind))

    print(f"\n\n{set_type} info:\n")
    print(f"chosen weeks starting at: \n{chosen_weeks}\n")
    print("set preview:")
    for df in partitions[set_type]:
        print(df.head(2))

def choose_deterministic_weeks(num_weeks, set_type, direction, weeks=None, partitions=None):
    """move `num_weeks` weeks from one end of the pool into one partition.

    weeks are taken one at a time from the end of the pool when `direction` is
    "last weeks" and from the start for any other value. every chosen dataframe
    is removed from the pool, so a week can never end up in two partitions.
    prints the first timestamp of each chosen week and a two-row preview of
    every dataframe currently in the partition.

    args:
        num_weeks: number of weekly dataframes to move
        set_type: key of the partition to fill (e.g. "training")
        direction: "last weeks" to take from the end, anything else to take
            from the start
        weeks: list of weekly dataframes to draw from, modified in place.
            defaults to the notebook-level `garage_data_by_weeks`
        partitions: dict mapping a partition name to a list of dataframes.
            defaults to the notebook-level `partitioned_data`

    raises:
        IndexError: if `num_weeks` is larger than the number of weeks left in
            the pool. this is checked before anything is moved, so a failed
            call leaves the pool and the partition untouched
    """
    if weeks is None:
        weeks = garage_data_by_weeks
    if partitions is None:
        partitions = partitioned_data

    # validate up front: failing halfway would leave a partially filled set
    if num_weeks > len(weeks):
        raise IndexError(
            f"cannot choose {num_weeks} weeks for {set_type}: "
            f"only {len(weeks)} weeks left"
        )

    # index of the end of the list that weeks are popped from
    remove = -1 if direction == "last weeks" else 0

    chosen_weeks = []
    for _ in range(num_weeks):
        chosen_weeks.append(weeks[remove].iloc[0]["timestamp"])
        partitions[set_type].append(weeks.pop(remove))

    print(f"\n\n{set_type} info:\n")
    print(f"chosen weeks starting at: \n{chosen_weeks}\n")
    print("set preview:")
    for df in partitions[set_type]:
        print(df.head(2))

# seed for the random split, so a restart + run all reproduces the same
# partitions; set to None to draw a different split on every run
PARTITION_SEED = 42

# build the output name from the garage set path without overwriting file_path.
# reassigning file_path meant a second run of this cell (for example after
# toggling RAND_WEEKS) found no "_set" left to replace and silently wrote over
# the previously saved partition file
set_path_root, set_path_ext = os.path.splitext(file_path)
# strip only the trailing "_set" marker, not every "_set" found in the path
if set_path_root.endswith("_set"):
    set_path_root = set_path_root[: -len("_set")]

if RAND_WEEKS:
    random.seed(PARTITION_SEED)
    choose_rand_weeks(TRAINING_SIZE, "training")
    choose_rand_weeks(TEST_SIZE, "test")
    choose_rand_weeks(VALIDATION_SIZE, "validation")
    partition_suffix = "_partitioned_r"
else:
    choose_deterministic_weeks(TRAINING_SIZE, "training", DIRECTION)
    choose_deterministic_weeks(TEST_SIZE, "test", DIRECTION)
    choose_deterministic_weeks(VALIDATION_SIZE, "validation", DIRECTION)
    # first word of DIRECTION, computed outside the f-string because reusing
    # the same quote type inside an f-string is a syntax error before python 3.12
    direction_tag = DIRECTION.split(" ")[0]
    partition_suffix = f"_partitioned_d_{direction_tag}"

partition_file_path = set_path_root + partition_suffix + set_path_ext

with open(partition_file_path, "wb") as f:
    pickle.dump(partitioned_data, f)
    print("wrote data to " + partition_file_path)



training info:

chosen weeks starting at: 
[Timestamp('2025-02-24 00:21:00'), Timestamp('2025-04-14 00:13:00'), Timestamp('2025-02-17 00:37:00'), Timestamp('2025-02-13 16:57:00'), Timestamp('2025-03-17 01:09:00')]

set preview:
              timestamp   garage name  fullness
746 2025-02-24 00:21:00  North Garage         5
742 2025-02-24 00:21:00  North Garage         5
             timestamp   garage name  fullness
74 2025-04-14 00:13:00  North Garage         5
78 2025-04-14 06:29:00  North Garage         6
             timestamp   garage name  fullness
34 2025-02-17 00:37:00  North Garage         5
30 2025-02-17 00:37:00  North Garage         5
            timestamp   garage name  fullness
2 2025-02-13 16:57:00  North Garage        69
6 2025-02-13 17:17:00  North Garage        62
              timestamp   garage name  fullness
538 2025-03-17 01:09:00  North Garage         5
558 2025-03-17 09:01:00  North Garage        70


test info:

chosen weeks starting at: 
[Timestamp('2025-03-0