In [1]:
import numpy as np
import random

# One seed value shared by both random number generators seeded below
seed = 2710

# Seed Python's built-in `random` module and NumPy's global RNG with the same value
random.seed(seed)
np.random.seed(seed)

In [2]:
import numpy as np

def custom_train_test_split(x, y, k, n_obs):
    """Split samples into train/test, holding out up to ``n_obs`` samples per (y, k) pair.

    Every distinct combination of a value in ``y`` and a value in ``k`` is
    handled separately: its sample indices are shuffled with NumPy's global
    RNG (``np.random.shuffle``), the first ``n_obs`` of them go to the test
    set and the remainder goes to the train set. A combination with at most
    ``n_obs`` samples therefore ends up entirely in the test set.

    Parameters
    ----------
    x, y, k : array-like
        Samples and their two per-sample labels. All three must have the same
        length along the first axis and must support indexing with an integer
        index array (e.g. ``numpy.ndarray``).
    n_obs : int
        Maximum number of samples per (y, k) combination placed in the test
        set. Must be non-negative.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test, k_train, k_test)``.

    Raises
    ------
    ValueError
        If ``x``, ``y`` and ``k`` differ in length, or if ``n_obs`` is negative.
    """
    if not (len(x) == len(y) == len(k)):
        # zip() below would otherwise silently truncate to the shortest input
        raise ValueError(
            f'x, y and k must have the same length, got {len(x)}, {len(y)} and {len(k)}'
        )
    if n_obs < 0:
        raise ValueError(f'n_obs must be non-negative, got {n_obs}')

    # Pair every y value with its k value so rows can be compared as combinations
    yk_comb = np.array(list(zip(y, k)))

    # Distinct (y, k) rows, in the sorted order returned by np.unique
    unique_combinations = np.unique(yk_comb, axis=0)

    train_indices = []
    test_indices = []

    for comb in unique_combinations:
        # Positions of all samples belonging to the current combination
        comb_indices = np.where((yk_comb == comb).all(axis=1))[0]

        # In-place shuffle using the global NumPy RNG (reproducible via np.random.seed)
        np.random.shuffle(comb_indices)

        # First n_obs shuffled indices are held out for test, the rest is train
        test_indices.extend(comb_indices[:n_obs])
        train_indices.extend(comb_indices[n_obs:])

    # Force an integer dtype: np.array([]) would be float64, which NumPy refuses
    # to use as an index, so an empty train or test set would raise IndexError.
    train_indices = np.array(train_indices, dtype=np.intp)
    test_indices = np.array(test_indices, dtype=np.intp)

    # Apply the same index sets to all three arrays so they stay aligned
    x_train, x_test = x[train_indices], x[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]
    k_train, k_test = k[train_indices], k[test_indices]

    return x_train, x_test, y_train, y_test, k_train, k_test

In [3]:
import pickle

dataset = 'mobiact_pamap'

# Per-dataset settings as (num_df_domains, n_obs):
#   num_df_domains - domain ids below this value are selected as "Df" later on,
#                    ids at or above it as "Dp"
#   n_obs          - number of samples per (class, domain) pair passed to
#                    custom_train_test_split for the test split
dataset_settings = {
    'realworld_pamap': (15, 10),
    'pamap_realworld': (6, 50),
    'mobiact_pamap': (61, 10),
    'pamap_mobiact': (6, 10),
}

# Fail loudly on an unknown name: otherwise num_df_domains / n_obs would be
# undefined, or silently keep stale values from a previous run in a live kernel.
if dataset not in dataset_settings:
    raise ValueError(
        f'Unknown dataset {dataset!r}; expected one of {sorted(dataset_settings)}'
    )
num_df_domains, n_obs = dataset_settings[dataset]

# Load the dataset
# NOTE(security): pickle.load can execute arbitrary code; only open trusted files.
with open(f'{dataset}.pkl', 'rb') as f:
    x, y, k = pickle.load(f)

print(f'Loaded full dataset {dataset} with shape {x.shape}, from {len(set(k))} domains and {len(set(y))} classes')

Loaded full dataset mobiact_pamap with shape (12369, 3, 128), from 67 domains and 4 classes


In [4]:
# Select the Df subset: samples whose domain id is below num_df_domains
mask_df = k < num_df_domains
x_df, y_df, k_df = x[mask_df], y[mask_df], k[mask_df]

print(f'Loaded Df data with shape {x_df.shape}, from {len(set(k_df))} domains and {len(set(y_df))} classes')

# Persist the Df subset as a single (x, y, k) tuple
with open(f'splits/{dataset}_df.pkl', 'wb') as f:
    pickle.dump((x_df, y_df, k_df), f)

Loaded Df data with shape (10816, 3, 128), from 61 domains and 4 classes


In [5]:
# Select the Dp subset: samples whose domain id is num_df_domains or higher
mask_dp = k >= num_df_domains
x_dp, y_dp, k_dp = (arr[mask_dp] for arr in (x, y, k))

print(f'Loaded Dp data with shape {x_dp.shape}, from {len(set(k_dp))} domains and {len(set(y_dp))} classes')

# Persist the Dp subset as a single (x, y, k) tuple
with open(f'splits/{dataset}_dp.pkl', 'wb') as f:
    pickle.dump((x_dp, y_dp, k_dp), f)

Loaded Dp data with shape (1553, 3, 128), from 6 domains and 4 classes


In [6]:
# Split Dp: up to n_obs samples per (class, domain) pair go to the test part
# ("te"), everything else goes to the "map" part.
(x_dp_map, x_dp_te,
 y_dp_map, y_dp_te,
 k_dp_map, k_dp_te) = custom_train_test_split(x_dp, y_dp, k_dp, n_obs)

# Report and store the map part
print(f'Divided Dp data into map with shape {x_dp_map.shape}, from {len(set(k_dp_map))} domains and {len(set(y_dp_map))} classes')
with open(f'splits/{dataset}_dp_map.pkl', 'wb') as f:
    pickle.dump((x_dp_map, y_dp_map, k_dp_map), f)

# Report and store the test part
print(f'And into test with shape {x_dp_te.shape}, from {len(set(k_dp_te))} domains and {len(set(y_dp_te))} classes')
with open(f'splits/{dataset}_dp_te.pkl', 'wb') as f:
    pickle.dump((x_dp_te, y_dp_te, k_dp_te), f)

Divided Dp data into map with shape (1313, 3, 128), from 6 domains and 4 classes
And into test with shape (240, 3, 128), from 6 domains and 4 classes


In [7]:
# Overall sizes of the two Dp parts and the fraction that went to test
train_samples, test_samples = len(y_dp_map), len(y_dp_te)
print(f'Dp map has {train_samples} train samples and {test_samples} test samples ({test_samples / (train_samples + test_samples):.2f} test)')

# Per (class, domain) breakdown of how many samples landed in each part
domain_values = np.unique(k_dp)
for y_val in np.unique(y_dp):
    is_class_map = y_dp_map == y_val
    is_class_te = y_dp_te == y_val
    for k_val in domain_values:
        n_map = np.sum(is_class_map & (k_dp_map == k_val))
        n_te = np.sum(is_class_te & (k_dp_te == k_val))
        print(f'Class {y_val}, domain {k_val}: {n_map} train samples, {n_te} test samples')

Dp map has 1313 train samples and 240 test samples (0.15 test)
Class 0, domain 61: 70 train samples, 10 test samples
Class 0, domain 62: 112 train samples, 10 test samples
Class 0, domain 63: 110 train samples, 10 test samples
Class 0, domain 64: 86 train samples, 10 test samples
Class 0, domain 65: 116 train samples, 10 test samples
Class 0, domain 66: 107 train samples, 10 test samples
Class 1, domain 61: 67 train samples, 10 test samples
Class 1, domain 62: 23 train samples, 10 test samples
Class 1, domain 63: 81 train samples, 10 test samples
Class 1, domain 64: 72 train samples, 10 test samples
Class 1, domain 65: 1 train samples, 10 test samples
Class 1, domain 66: 48 train samples, 10 test samples
Class 2, domain 61: 37 train samples, 10 test samples
Class 2, domain 62: 41 train samples, 10 test samples
Class 2, domain 63: 32 train samples, 10 test samples
Class 2, domain 64: 26 train samples, 10 test samples
Class 2, domain 65: 26 train samples, 10 test samples
Class 2, domain 