In [1]:
# Load data from the processed data file
import math
import os
import pickle

import pandas as pd

PROCESSED_DATA_DIR = "processed"
GEOLIFE_DATA = {}

# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file,
# so this must only ever be pointed at a pickle that comes from a trusted source.
with open(os.path.join(PROCESSED_DATA_DIR, 'geolife_processed_data.pkl'), 'rb') as f:
    GEOLIFE_DATA = pickle.load(f)

# Sanity check: GEOLIFE_DATA maps participant -> {label -> DataFrame}. Show the
# schema and first rows of the very first DataFrame only; the two `break`s stop
# both loops after that single entry.
for participant, dataframes in GEOLIFE_DATA.items():
    for label, df in dataframes.items():
        print(f"Participant: {participant}, Label: {label}")
        # DataFrame.info() writes its report to stdout itself and returns None,
        # so wrapping it in print() only added a stray "None" line to the output.
        df.info()
        print(df.head())
        break
    break

Participant: 1, Label: 0_bus
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72 entries, 0 to 71
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   latitude   72 non-null     float64       
 1   longitude  72 non-null     float64       
 2   date_time  72 non-null     datetime64[ns]
 3   timestamp  72 non-null     float64       
dtypes: datetime64[ns](1), float64(3)
memory usage: 2.4 KB
None
    latitude   longitude           date_time     timestamp
0  39.975133  116.329467 2007-06-27 09:49:22  1.182938e+09
1  39.975250  116.329067 2007-06-27 09:50:07  1.182938e+09
2  39.975267  116.329283 2007-06-27 09:50:11  1.182938e+09
3  39.975250  116.329417 2007-06-27 09:50:33  1.182938e+09
4  39.975283  116.330033 2007-06-27 09:51:58  1.182938e+09


In [2]:
from dataset import GeoLifeMobilityDataset
import numpy as np
from torch.utils.data import Dataset, Subset, DataLoader


In [3]:
def generate_indeces_split(total_data_len, num_clients, mean=None, std=None, seed=42):
    """Randomly partition ``range(total_data_len)`` into ``num_clients`` index arrays.

    Partition sizes are drawn from a normal distribution, clipped to at least 1,
    rescaled so that they add up to ``total_data_len`` and truncated to integers.
    A random permutation of all indices is then cut into consecutive, disjoint
    slices of those sizes.

    Args:
        total_data_len: Number of samples to distribute.
        num_clients: Number of partitions to create; must be positive.
        mean: Mean of the size distribution. ``None`` selects
            ``total_data_len / num_clients``.
        std: Standard deviation of the size distribution. ``None`` selects
            ``total_data_len / (num_clients * 2)``. An explicit ``0`` is
            honoured and makes every drawn size equal to ``mean``.
        seed: Seed of the private random generator; the same seed always
            produces the same split.

    Returns:
        A list with ``num_clients`` 1-D integer arrays. Together they contain
        every index in ``range(total_data_len)`` exactly once. A partition can
        be empty when its rescaled size truncates to 0.

    Raises:
        ValueError: If ``num_clients`` is not positive.
    """
    if num_clients <= 0:
        raise ValueError(f"num_clients must be positive, got {num_clients}")

    # Compare against None instead of using `or`, which would silently replace
    # an explicit 0 (e.g. std=0 for equal-sized draws) with the default.
    if mean is None:
        mean = total_data_len / num_clients
    if std is None:
        std = total_data_len / (num_clients * 2)

    # A private legacy RandomState yields exactly the stream that
    # np.random.seed(seed) followed by np.random.* calls would, but without
    # reseeding the process-wide NumPy generator as a hidden side effect.
    rng = np.random.RandomState(seed)

    drawn_sizes = rng.normal(mean, std, num_clients)
    clipped_sizes = np.maximum(drawn_sizes, 1).astype(int)
    partition_sizes = (clipped_sizes / np.sum(clipped_sizes) * total_data_len).astype(int)

    # Integer truncation normally leaves a few samples unassigned; hand the
    # whole remainder to one randomly chosen client so the sizes sum exactly.
    remainder = total_data_len - partition_sizes.sum()
    if remainder != 0:
        partition_sizes[rng.randint(0, num_clients)] += remainder

    shuffled_indices = rng.permutation(total_data_len)

    # Cut the shuffled indices at the running totals of the partition sizes.
    cut_points = np.cumsum(partition_sizes)[:-1]
    return np.split(shuffled_indices, cut_points)

def get_client_dataset_split_following_normal_distribution(client_idx, num_clients, dataset, mean=None, std=None, seed=42):
    """Return the ``Subset`` of ``dataset`` that belongs to client ``client_idx``.

    The indices of ``dataset`` are partitioned into ``num_clients`` groups by
    ``generate_indeces_split`` (using ``mean``, ``std`` and ``seed``), and the
    group at position ``client_idx`` is wrapped in a ``Subset``.
    """
    all_partitions = generate_indeces_split(
        len(dataset),
        num_clients,
        mean,
        std,
        seed,
    )
    return Subset(dataset, all_partitions[client_idx])

In [4]:
# Drop every DataFrame whose label key contains one of the excluded transport
# modes (the check is a substring match on the label key).
excluded_modes = ('run', 'motorcycle')
filter_geo_dataset = {
    client_id: {
        label_key: frame
        for label_key, frame in frames_by_label.items()
        if not any(mode in label_key for mode in excluded_modes)
    }
    for client_id, frames_by_label in GEOLIFE_DATA.items()
}
GEOLIFE_DATA = filter_geo_dataset

# Remaining transport modes; 'run' and 'motorcycle' were removed above.
labels = ['walk', 'bus', 'car', 'taxi', 'subway', 'train', 'bike']
sorted_labels = sorted(labels)
# Each label name maps to its position in the alphabetically sorted list.
label_mapping = dict(zip(sorted_labels, range(len(sorted_labels))))

In [5]:
# Client ids 1 through 64 (inclusive) are passed to the dataset.
selected_clients = [*range(1, 65)]

dataset = GeoLifeMobilityDataset(
    GEOLIFE_DATA,
    selected_clients,
    label_mapping,
    # feature_extractor=GeoLifeMobilityDataset.rich_extractor
)
# Show which feature extractor the dataset ended up with.
print(dataset.feature_extractor)

<function GeoLifeMobilityDataset.default_data_extractor at 0x00000204A671BA60>


In [6]:
# Partition 0 of 1: with a single partition the split holds every sample.
partition_id, num_partitions = 0, 1

In [7]:
# Build the Subset holding this partition's share of the dataset.
client_dataset = get_client_dataset_split_following_normal_distribution(
    client_idx=partition_id,
    num_clients=num_partitions,
    dataset=dataset,
)

In [8]:
print(f"Total number of samples in client dataset: {len(client_dataset)}")

# Tally the samples per label name; keys appear in first-seen order.
label_counts = {}
for sample_idx in range(len(client_dataset)):
    _, label = client_dataset[sample_idx]
    label_name = sorted_labels[label]
    label_counts[label_name] = label_counts.get(label_name, 0) + 1

print("\nLabel distribution:")
for label, count in label_counts.items():
    print(f"{label}: {count} samples ({count/len(client_dataset)*100:.2f}%)")

Total number of samples in client dataset: 9254

Label distribution:
subway: 581 samples (6.28%)
walk: 3834 samples (41.43%)
bike: 1562 samples (16.88%)
bus: 1831 samples (19.79%)
car: 779 samples (8.42%)
taxi: 509 samples (5.50%)
train: 158 samples (1.71%)


In [21]:
def latlon_to_cell(lat, lon, cell_size_m=500):
    """Map a latitude/longitude pair to the indices of its grid cell.

    Degrees are converted to metres with fixed factors (111,000 m per degree
    of latitude and 85,000 m per degree of longitude) and divided by the cell
    size. The longitude factor does not vary with latitude, so the grid is
    only approximate.
    NOTE(review): 85,000 m/degree presumably targets ~40 degrees north, where
    this data was recorded -- confirm before reusing for other regions.

    Args:
        lat: Latitude in decimal degrees.
        lon: Longitude in decimal degrees.
        cell_size_m: Edge length of a grid cell in metres.

    Returns:
        Tuple ``(lat_cell, lon_cell)`` of integer cell indices.
    """
    # Floor instead of int(): int() truncates toward zero, which would merge
    # the cells on both sides of 0 degrees into one double-width cell 0.
    lat_cell = math.floor(lat * 111000 / cell_size_m)
    lon_cell = math.floor(lon * 85000 / cell_size_m)
    return (lat_cell, lon_cell)

In [None]:
def get_client_quality_statistics(partition_id, num_partitions, label_mapping, default_data_extractor_dataset, spatial_granularity_m=500):
    """Compute data-quality statistics for one client's partition of the dataset.

    The client's samples are obtained with
    ``get_client_dataset_split_following_normal_distribution`` and iterated as
    ``(data_tensor, label_tensor)`` pairs, where columns 0, 1 and 2 of
    ``data_tensor`` are read as latitude, longitude and timestamp (seconds).

    Args:
        partition_id: Index of the client partition to analyse.
        num_partitions: Total number of partitions the dataset is split into.
        label_mapping: Dict mapping label name -> integer label index.
        default_data_extractor_dataset: Dataset to split; must yield
            ``(data_tensor, label_tensor)`` pairs as described above.
        spatial_granularity_m: Grid cell size in metres passed to
            ``latlon_to_cell``.

    Returns:
        Dict with:
          * ``label_diversity``: number of distinct labels seen.
          * ``spatial_diversity``: number of distinct grid cells visited.
          * ``temporal_diversity``: number of distinct (hour, weekday) slots.
          * ``sampling_regularity_std``: despite its name (kept for
            compatibility), this is the *inverse* of the median per-sample
            standard deviation of the gaps between consecutive timestamps, so
            larger means more regular sampling. It is ``0.0`` when no sample
            has at least two timestamps.

    Raises:
        KeyError: If a sample's label index does not occur in ``label_mapping``.
    """
    client_dataset = get_client_dataset_split_following_normal_distribution(partition_id, num_partitions, default_data_extractor_dataset)

    # Build the index -> name lookup once instead of scanning label_mapping for
    # every sample. setdefault keeps the first name if an index is duplicated.
    index_to_label = {}
    for name, index in label_mapping.items():
        index_to_label.setdefault(index, name)

    labels = set()
    spatial_cells = set()
    time_slots = set()
    sampling_regularity_stds = []

    for data_tensor, label_tensor in client_dataset:
        labels.add(index_to_label[label_tensor.item()])

        # First two columns are lat, lon. Convert the slice to Python floats in
        # one call rather than calling .item() on every element.
        for lat, lon in data_tensor[:, :2].tolist():
            spatial_cells.add(latlon_to_cell(lat, lon, spatial_granularity_m))

        # Third column is the timestamp in seconds; convert the whole column at
        # once instead of building one pandas Timestamp per point.
        timestamps = np.asarray(data_tensor[:, 2].tolist(), dtype=float)
        moments = pd.to_datetime(timestamps, unit='s')
        time_slots.update(zip(moments.hour.tolist(), moments.weekday.tolist()))

        # Absolute gaps between consecutive points. A sample with fewer than
        # two points has no gaps; np.std of an empty array is NaN and would
        # turn the median below into NaN, so such samples are skipped.
        time_diffs = np.abs(np.diff(timestamps))
        if time_diffs.size:
            sampling_regularity_stds.append(np.std(time_diffs))

    if sampling_regularity_stds:
        # The epsilon avoids dividing by zero when the median deviation is 0.
        sampling_regularity = 1 / (np.median(sampling_regularity_stds) + 1e-8)
    else:
        # No usable sample: report the lowest regularity instead of NaN.
        sampling_regularity = 0.0

    return {
        "label_diversity": len(labels),
        "spatial_diversity": len(spatial_cells),
        "temporal_diversity": len(time_slots),
        "sampling_regularity_std": sampling_regularity,
    }

In [32]:
num_partitions = 5

# Quality statistics per partition, keyed by partition id. The dict is filled
# and printed one partition at a time, so partial results survive an interrupt.
results = {}
for partition_id in range(num_partitions):
    partition_stats = get_client_quality_statistics(
        partition_id, num_partitions, label_mapping, dataset
    )
    results[partition_id] = partition_stats
    print(partition_stats)



{'label_diversity': 7, 'spatial_diversity': 15566, 'temporal_diversity': 156, 'sampling_regularity_std': 0.04675872400696376}
{'label_diversity': 7, 'spatial_diversity': 13779, 'temporal_diversity': 147, 'sampling_regularity_std': 0.04757550756199304}
{'label_diversity': 7, 'spatial_diversity': 14661, 'temporal_diversity': 151, 'sampling_regularity_std': 0.04753992680587862}


KeyboardInterrupt: 

In [24]:
# Dump the whole statistics dictionary (partition id -> statistics dict).
print(results)

{0: {'label_diversity': 7, 'spatial_diversity': 15566, 'temporal_diversity': 156, 'sampling_regularity_std': 0.04675872400696376}, 1: {'label_diversity': 7, 'spatial_diversity': 13779, 'temporal_diversity': 147, 'sampling_regularity_std': 0.04757550756199304}, 2: {'label_diversity': 7, 'spatial_diversity': 14661, 'temporal_diversity': 151, 'sampling_regularity_std': 0.04753992680587862}, 3: {'label_diversity': 7, 'spatial_diversity': 16732, 'temporal_diversity': 164, 'sampling_regularity_std': 0.04812510017595744}, 4: {'label_diversity': 7, 'spatial_diversity': 16931, 'temporal_diversity': 161, 'sampling_regularity_std': 0.047274909728495164}}


In [28]:
# Min-max scale every statistic across partitions, then combine the scaled
# columns into a single equally weighted score per partition.

df = pd.DataFrame.from_dict(results, orient='index')  # one row per partition
column_min = df.min()
column_span = df.max() - column_min + 1e-8  # epsilon guards constant columns
df_norm = (df - column_min) / column_span

# Accumulate 0.25 * column, one column at a time, in a fixed order.
score = 0.25 * df_norm["label_diversity"]
for metric in ("spatial_diversity", "temporal_diversity", "sampling_regularity_std"):
    score = score + 0.25 * df_norm[metric]
df["score"] = score

print(df)
print(df["score"].iloc[1])

   label_diversity  spatial_diversity  temporal_diversity  \
0                7              15566                 156   
1                7              13779                 147   
2                7              14661                 151   
3                7              16732                 164   
4                7              16931                 161   

   sampling_regularity_std     score  
0                 0.046759  0.274088  
1                 0.047576  0.149442  
2                 0.047540  0.271711  
3                 0.048125  0.734215  
4                 0.047275  0.550326  
0.14944229778592286
