In [None]:
from itertools import combinations

import numpy as np
import pandas as pd
from tqdm import tqdm

### Main code

Read the data.

In [None]:
# Load the two inputs used by the rest of the notebook.

# Input 1: parquet file with the users' classification; its 'label' column is read further down.
path_users_classification = './data_simulator/huge_dataset/dataset_simulator_trajectories.compressed.parquet.classified.parquet'
users_labels = pd.read_parquet(path_users_classification)
display(users_labels)

# Input 2: pickled mapping between users and grid cells, wrapped into a DataFrame.
# NOTE(review): unpickling can execute arbitrary code -- only load this file if it comes from a trusted source.
path_users_cells_mapping = './mapping_users_cells_grid_50m.pkl'
mapping_users_cells = pd.DataFrame(pd.read_pickle(path_users_cells_mapping))
display(mapping_users_cells)

### Various operations

In [None]:
# Classification setting: the labels are assumed to follow a Bernoulli distribution, so the mean of the
# 'label' column is the global positive rate and its complement is the global negative rate.
global_positive_rate = users_labels.loc[:, 'label'].mean()
global_negative_rate = 1 - global_positive_rate
print(f"Global positive rate: {global_positive_rate} - global negative rate: {global_negative_rate}")

In [None]:
# Add the labels to the dataframe that maps 'users to cells', as a new 'label' column.
# pandas aligns the assigned Series with the frame on index labels (not on row position).
# NOTE(review): this assumes users_labels is indexed by the same user IDs that form the index of
#               mapping_users_cells; rows without a matching index value would get a NaN label -- confirm.
mapping_users_cells['label'] = users_labels['label']
display(mapping_users_cells)

In [None]:
# Cell IDs can have gaps in them. Build a lookup Series that sends every distinct cell ID, taken in
# ascending order, to its rank 0..n-1, so that later steps can work on a compact, contiguous ID range
# (this prepares more efficient set intersections over the user IDs the cells refer to).
sorted_cell_ids = np.sort(mapping_users_cells['cell_id'].unique())
remapping_indices_cells = pd.Series(range(len(sorted_cell_ids)), index=sorted_cell_ids)
del sorted_cell_ids
display(remapping_indices_cells)

# Same compaction for the user IDs, which are read from the index of the mapping frame.
sorted_user_ids = np.sort(mapping_users_cells.index.unique())
remapping_indices_users = pd.Series(range(len(sorted_user_ids)), index=sorted_user_ids)
del sorted_user_ids
display(remapping_indices_users)

In [None]:
# Work on an independent (deep) copy, so the original users-to-cells mapping is left untouched.
remapping_users_cells = mapping_users_cells.copy()

# Remap user IDs (the index) and then cell IDs (the 'cell_id' column) to continuous ranges,
# displaying the frame after each of the two steps.
remapping_users_cells.index = remapping_users_cells.index.map(remapping_indices_users)
display(remapping_users_cells)
remapping_users_cells['cell_id'] = remapping_users_cells.loc[:, 'cell_id'].map(remapping_indices_cells)
display(remapping_users_cells)

# Regenerate the users-to-labels lookup on the remapped IDs: one row per 'uid', holding the first
# (non-null) label found for that user.
remapping_users_labels = remapping_users_cells.groupby('uid')['label'].first()
display(remapping_users_labels)

In [None]:
# Cell-level aggregations which, taken together, form an augmented version of the grid:
#   - list_users:    set of the (remapped) user IDs found in the cell;
#   - num_users:     number of distinct users in the cell;
#   - positive_rate: mean of the labels of the rows that fall in the cell.
stats_config = dict(
    list_users=pd.NamedAgg(column='uid', aggfunc=set),
    num_users=pd.NamedAgg(column='uid', aggfunc='nunique'),
    positive_rate=pd.NamedAgg(column='label', aggfunc='mean'),
)

# 'uid' is turned into a regular column (reset_index) so it can be aggregated; the result is indexed
# by 'cell_id' and sorted on it in ascending order.
aug_grid = (
    remapping_users_cells
    .reset_index()
    .groupby('cell_id')
    .agg(**stats_config)
    .sort_values(by='cell_id', ascending=True)
)
display(aug_grid)

In [None]:
# Accumulator for the combinations of cells (index tuples) selected for further checking in later cells.
list_combinations_cells_tocheck = list()

In [None]:
# Unoptimized pure-Python version: intersect the user-ID sets of every pair of cells.
# TODO: this part can be almost entirely implemented with a frequent itemset mining algorithm, e.g., apriori.
#       For instance, a possible Python implementation is https://github.com/tommyod/Efficient-Apriori/tree/master.
# NOTE: `combinations` (itertools) is imported in the import cell at the top of the notebook.

intersections = {}
cnt_threshold = 5 # TODO: to be set according to the statistical power we want in the hypothesis test.

# zip() lazily yields (cell ID, user set) tuples; combinations() consumes it once and produces every
# unordered pair of cells exactly once, following the order of aug_grid's index.
pairs = zip(aug_grid.index, aug_grid['list_users'])
for (cell_id, list_users), (other_cell_id, other_list_users) in combinations(pairs, 2):

    # Users present in both cells, and how many they are.
    intersection = list_users & other_list_users
    cnt = len(intersection)

    # Keep only the cell pairs that have strictly more than 'cnt_threshold' users in common.
    # The threshold should be calculated according to the statistical power we want to have in the hypothesis tests.
    # NOTE(review): the comparison is strict (>); switch to >= if "at least cnt_threshold" is the intended rule.
    if cnt > cnt_threshold:
        intersections[(cell_id, other_cell_id)] = intersection


# Store the results of the set intersections in a pandas DataFrame: the (cell_id, other_cell_id) tuple
# keys become a two-level MultiIndex, and the single 'list_users' column holds the shared user sets.
res_intersections = pd.Series(data=intersections, name='list_users').to_frame()
del intersections

display(res_intersections)

In [None]:
# Compute the local positive rate of the combinations of cells stored in res_intersections, then
# select the combinations whose rate deviates from the global one by more than 'eps'.

# 1 - Turn each user ID in a set into a row of its own (the index still identifies the combination).
users_by_combination = res_intersections.explode('list_users', ignore_index=False)
display(users_by_combination)

# 2 - For every user ID, find the associated predicted label.
users_by_combination['labels'] = users_by_combination['list_users'].map(remapping_users_labels)
display(users_by_combination)

# 3 - For every combination of cells found in the index, compute the local positive rate.
#     Grouping on all the index levels makes this work for combinations of any size.
res_intersections['positive_rate'] = (
    users_by_combination
    .groupby(level=list(range(users_by_combination.index.nlevels)))['labels']
    .mean()
)
del users_by_combination
display(res_intersections)


# Select the combinations of cells whose local positive rate differs from the global one by more than 'eps'.
eps = 0.2
combs_cells_tocheck = res_intersections[(res_intersections['positive_rate'] - global_positive_rate).abs() > eps]
print(f"Number of combinations of cells to test: {len(combs_cells_tocheck)}")
display(combs_cells_tocheck)

# Append only the combinations that are not in the list yet, so that re-running this cell does not
# add duplicates. On a first run this is equivalent to extending with the whole selection.
already_listed = set(list_combinations_cells_tocheck)
list_combinations_cells_tocheck.extend(
    comb for comb in combs_cells_tocheck.index.to_list() if comb not in already_listed
)
del already_listed

In [None]:
# Generic code: starting from the user sets of combinations of 'n' cells (res_intersections), build the
# user sets of combinations of 'n+1' cells by intersecting pairs of combinations that share their
# first 'n-1' cells.
# TODO: encapsulate this in a class.
# NOTE: `combinations` (itertools) must already be imported when this cell runs.
dim_itemset = res_intersections.index.nlevels

base_lvls = list(range(dim_itemset - 1))
users_per_combination = res_intersections.loc[:, 'list_users']
if base_lvls:
    groups = users_per_combination.groupby(level=base_lvls, sort=False)
else:
    # Base case (dim_itemset == 1): there is no shared prefix, so all the rows form one single group
    # with an empty key. groupby() cannot be used here because it rejects an empty list of levels.
    groups = [((), users_per_combination)]

intersections = {}
for keys, sub in groups:
    # Depending on the pandas version, grouping on a single level yields either a scalar key or a
    # 1-tuple: normalize to a tuple without wrapping twice. Cell IDs are never tuples themselves.
    if not isinstance(keys, tuple):
        keys = (keys,)

    # Drop the first 'n-1' levels in the multi-index, leaving only the last one.
    s = sub.droplevel(base_lvls) if base_lvls else sub

    # Generate all the possible tuples of length 'n+1' by keeping fixed the first "n-1" keys and
    # considering all the possible combinations of length 2 that can be generated in the last level
    # of the multi-index.
    for a, b in combinations(s.index, 2):
        # Compute the set intersection, and its cardinality.
        intersection = s[a] & s[b]
        cnt = len(intersection)

        # Keep only the combinations that have strictly more than 'cnt_threshold' users in common.
        # The threshold should be calculated according to the statistical power we want to have in the hypothesis tests.
        # NOTE(review): the comparison is strict (>); switch to >= if "at least cnt_threshold" is the intended rule.
        if cnt > cnt_threshold:
            intersections[(*keys, a, b)] = intersection

del groups, users_per_combination

new_res_intersections = pd.Series(data=intersections, name='list_users').to_frame()
del intersections
display(new_res_intersections)

In [None]:
# TODO: then, reuse the code already present in one of the cells above to add to a list those combinations of cells
#       for which we deem it necessary to run the statistical test.