# Flatten the list of object IDs representing the candidates

In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
from tqdm import tqdm
import pickle

from pathlib import Path

### Aux functions

In [None]:
def extract_grid_resolution_and_offset_filename(file_candidates : str) -> tuple[str, str] :
    '''
    Extract the grid resolution and grid offset tokens from a candidates file name.

    The name is split on underscores; the second and third tokens are returned
    verbatim, i.e., as strings (no numeric conversion is performed).

    Parameters
    ----------
    file_candidates : str
        Name of the candidates file, expected to contain at least three
        underscore-separated tokens.

    Returns
    -------
    grid_res : str
        The second underscore-separated token of the name.
    grid_offset : str
        The third underscore-separated token of the name.

    Raises
    ------
    IndexError
        If the name contains fewer than three underscore-separated tokens.
    '''
    tokens = file_candidates.split('_')

    # Fail with an explicit message instead of an anonymous "list index out of range".
    if len(tokens) < 3:
        raise IndexError(
            f"Cannot extract grid resolution and offset from '{file_candidates}': "
            "expected at least three underscore-separated tokens."
        )

    grid_res, grid_offset = tokens[1], tokens[2]
    return grid_res, grid_offset


def flatten_lists_ids(np_list_candidates : np.ndarray) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    ''' 
    Flatten the object ID lists associated with the candidates into a 1D array.
    # NOTE: we do this because we can then use joblib's shared memory.

    Parameters
    ----------
    np_list_candidates : np.ndarray
        An array of 1-D sequences (NumPy arrays or lists), where each sequence contains
        the object IDs associated with a candidate. May be empty.

    Returns
    -------
    flat_ids : np.ndarray
        A 1D uint32 array containing all the object IDs associated with the candidates,
        concatenated together in candidate order.
    indptr : np.ndarray
        A uint32 array with one more entry than the number of candidates. The IDs of
        candidate `i` are `flat_ids[indptr[i]:indptr[i + 1]]`; hence `indptr[0]` is 0 and
        `indptr[-1]` is the total number of IDs.
    lens : np.ndarray
        An int32 array containing the length of each candidate's list of associated object IDs.

    Raises
    ------
    OverflowError
        If the total number of IDs cannot be represented as a uint32 offset.
    '''

    num_candidates = len(np_list_candidates)

    # Compute the lengths of each candidate's list.
    lens = np.fromiter((len(a) for a in np_list_candidates),
                       dtype=np.int32, count=num_candidates)

    # Compute the starting/ending positions of each candidate's list in the flattened array.
    # The running sum is accumulated in int64 so that an overflow of the uint32 range can be
    # detected instead of silently wrapping around.
    offsets = np.zeros(num_candidates + 1, dtype=np.int64)
    np.cumsum(lens, dtype=np.int64, out=offsets[1:])

    max_offset = np.iinfo(np.uint32).max
    if offsets[-1] > max_offset:
        raise OverflowError(
            f"Total number of object IDs ({int(offsets[-1])}) exceeds the maximum "
            f"uint32 offset ({max_offset})."
        )
    indptr = offsets.astype(np.uint32)

    # Flatten the lists into a single vector. np.concatenate rejects an empty sequence,
    # so the no-candidates case is handled explicitly.
    if num_candidates == 0:
        flat_ids = np.empty(0, dtype=np.uint32)
    else:
        flat_ids = np.concatenate(np_list_candidates).astype(np.uint32, copy=False)

    return flat_ids, indptr, lens

### Main code

Build a single numpy vector containing the candidates of all the grids. The vector contains the flattened lists of the object IDs associated with the candidates. The notebook also generates auxiliary data structures that can be used to pinpoint the exact locations of the cells making up the candidates (recall that the candidates have been generated from different grids).

Optionally, the notebook can filter out the candidates whose number of associated objects is below a given threshold. This can be very useful to reduce the execution time of the Monte Carlo simulations. Note that even low thresholds, e.g., >= 5, greatly reduce the number of candidates to consider.

In [None]:
min_num_objects = 1 # Filter out the candidates that have fewer than this number of associated objects.
path_candidates = './data_simulator/huge_dataset/gencand/'
dir_candidates = Path(path_candidates)

# Collect the candidate files, skipping any previously generated "flattened" output.
# The paths are sorted so that the order of the grids (and thus the layout of the
# flattened vector) is reproducible across runs and file systems.
list_candidates_paths = sorted(f for f in dir_candidates.iterdir()
                               if (f.is_file() and ("flattened" not in f.name)))

# Read the candidates to be tested over a set of grids.
# The per-grid arrays are gathered in a list and concatenated once at the end: calling
# np.append inside the loop would copy the whole accumulated array for every file.
list_chunks = []
list_grids_info = []
for path in tqdm(list_candidates_paths, 
                 desc="Processing candidate files",
                 unit="file"):

    # Read the candidates that have been generated for a specific grid.
    candidates = pd.read_pickle(path)

    # Here we filter out the candidates that have fewer than 'min_num_objects' associated objects.
    candidates = candidates.loc[candidates['list_users'].apply(len) >= min_num_objects]

    # Retrieve from the index the list of candidates generated for this grid.
    list_grid_candidates = candidates.index

    # Generate a numpy array holding, for each candidate, the list of users associated with it.
    cand = candidates['list_users'].to_numpy()
    list_chunks.append(cand)

    # Keep track of which grid the candidates come from and how many of them there are, so
    # that the flattened vector can later be mapped back to the individual grids.
    grid_res, grid_offset = extract_grid_resolution_and_offset_filename(path.stem)
    list_grids_info.append((path.name, grid_res, grid_offset, list_grid_candidates, cand.size))

# Fail early and explicitly when there is nothing to process.
if not list_chunks:
    raise FileNotFoundError(f"No candidate files found in '{path_candidates}'.")

np_list_candidates = np.concatenate(list_chunks)
del list_chunks  # free memory

print(f"Total number of candidates: {np_list_candidates.size}")

if np_list_candidates.size == 0:
    raise ValueError(f"No candidate has at least {min_num_objects} associated object(s); "
                     "nothing to flatten.")


### Flatten the object ID lists associated with the candidates into a 1D array (plus aux arrays) ###
flat_ids, indptr, lens = flatten_lists_ids(np_list_candidates)
del np_list_candidates  # free memory 

# Create a dictionary containing all the necessary information needed to reconstruct the non-flattened candidates from all the grids. 
dict_flattened_candidates = {'flat_ids' : flat_ids,
                             'start_pos' : indptr, 
                             'lengths' : lens,
                             'grid_info' : list_grids_info}

# Save the dictionary to disk, in the same directory as the candidate files. The file name
# contains "flattened", so it is skipped when this cell is run again.
out_path = dir_candidates / "dict_flattened_candidates.pkl"
with out_path.open("wb") as f:
    pickle.dump(dict_flattened_candidates, f)
