In [1]:
import numpy as np

def custom_train_test_split(x, y, k, n_obs, seed=None):
    """Hold out ``n_obs`` samples of every (y, k) combination as a test set.

    Samples are grouped by their ``(y[i], k[i])`` pair. Within each group the
    sample indices are shuffled; the first ``n_obs`` go to the test split and
    the rest go to the train split. A group with ``n_obs`` or fewer samples
    therefore ends up entirely in the test split.

    Parameters
    ----------
    x, y, k :
        Objects that support indexing with an integer index array (for
        example NumPy arrays). ``y`` and ``k`` are paired element by element.
        NOTE(review): presumably 1-D class and domain labels of the same
        length as ``x`` -- confirm against the caller.
    n_obs : int
        Number of samples per (y, k) group placed in the test split.
        Must be non-negative.
    seed : int or None, optional
        When given, the shuffle uses ``np.random.default_rng(seed)`` so the
        split is reproducible. When ``None`` (default), NumPy's global random
        state is used, so a prior ``np.random.seed(...)`` still controls the
        result.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test, k_train, k_test)``.

    Raises
    ------
    ValueError
        If ``n_obs`` is negative.
    """
    if n_obs < 0:
        raise ValueError(f'n_obs must be non-negative, got {n_obs}')

    # Both the legacy global module and a Generator expose an in-place
    # ``shuffle``, so they can be used interchangeably below.
    rng = np.random if seed is None else np.random.default_rng(seed)

    # Pair every label with its domain so that groups can be compared row-wise.
    yk_comb = np.array(list(zip(y, k)))

    # Distinct (y, k) pairs, in sorted order.
    unique_combinations = np.unique(yk_comb, axis=0)

    train_indices = []
    test_indices = []

    for comb in unique_combinations:
        # Positions of all samples belonging to the current (y, k) group.
        comb_indices = np.where((yk_comb == comb).all(axis=1))[0]

        # Randomize which members of the group are held out.
        rng.shuffle(comb_indices)

        test_indices.extend(comb_indices[:n_obs])
        train_indices.extend(comb_indices[n_obs:])

    # Force an integer dtype: an empty list would otherwise become a float64
    # array, which NumPy refuses to use as an index.
    train_indices = np.array(train_indices, dtype=np.intp)
    test_indices = np.array(test_indices, dtype=np.intp)

    # Apply the same index arrays to all three inputs so rows stay aligned.
    x_train, x_test = x[train_indices], x[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]
    k_train, k_test = k[train_indices], k[test_indices]

    return x_train, x_test, y_train, y_test, k_train, k_test

In [8]:
import pickle

dataset = 'realworld_mobiact'

# Load the (x, y, k) triple stored for this dataset.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only open split files that come from a trusted source.
with open(f'splits/{dataset}_dp.pkl', 'rb') as f:
    x, y, k = pickle.load(f)

# Seed NumPy's global RNG before splitting so that re-running this cell
# holds out the same samples (the split shuffles indices at random).
np.random.seed(42)
x_train, x_test, y_train, y_test, k_train, k_test = custom_train_test_split(x, y, k, n_obs=10)

# Sanity check: every sample must land in exactly one of the two partitions.
assert len(y_train) + len(y_test) == len(y)

print(f'Train: {len(y_train)} samples')
print(f'Test: {len(y_test)} samples')
print(f'Proportion: {len(y_test) / len(y):.2f}')

print()

# Per (class, domain) breakdown of the split; the unique values are computed
# once instead of on every outer-loop iteration.
classes = np.unique(y)
domains = np.unique(k)
for y_val in classes:
    for k_val in domains:
        n_train = np.sum((y_train == y_val) & (k_train == k_val))
        n_test = np.sum((y_test == y_val) & (k_test == k_val))
        print(f'Class {y_val}, Domain {k_val}: Train {n_train} Test {n_test}')

Train: 8379 samples
Test: 2437 samples
Proportion: 0.23

Class 0, Domain 15: Train 106 Test 10
Class 0, Domain 16: Train 106 Test 10
Class 0, Domain 17: Train 106 Test 10
Class 0, Domain 18: Train 107 Test 10
Class 0, Domain 19: Train 106 Test 10
Class 0, Domain 20: Train 106 Test 10
Class 0, Domain 21: Train 107 Test 10
Class 0, Domain 22: Train 106 Test 10
Class 0, Domain 23: Train 107 Test 10
Class 0, Domain 24: Train 106 Test 10
Class 0, Domain 25: Train 107 Test 10
Class 0, Domain 26: Train 106 Test 10
Class 0, Domain 27: Train 106 Test 10
Class 0, Domain 28: Train 106 Test 10
Class 0, Domain 29: Train 106 Test 10
Class 0, Domain 30: Train 106 Test 10
Class 0, Domain 31: Train 106 Test 10
Class 0, Domain 32: Train 106 Test 10
Class 0, Domain 33: Train 107 Test 10
Class 0, Domain 34: Train 107 Test 10
Class 0, Domain 35: Train 106 Test 10
Class 0, Domain 36: Train 107 Test 10
Class 0, Domain 37: Train 107 Test 10
Class 0, Domain 38: Train 106 Test 10
Class 0, Domain 39: Train 106 T

In [10]:
import pickle

dataset = 'mobiact_realworld'

# Load the (x, y, k) triple stored for this dataset.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only open split files that come from a trusted source.
with open(f'splits/{dataset}_dp.pkl', 'rb') as f:
    x, y, k = pickle.load(f)

# Seed NumPy's global RNG before splitting so that re-running this cell
# holds out the same samples (the split shuffles indices at random).
np.random.seed(42)
x_train, x_test, y_train, y_test, k_train, k_test = custom_train_test_split(x, y, k, n_obs=50)

# Sanity check: every sample must land in exactly one of the two partitions.
assert len(y_train) + len(y_test) == len(y)

print(f'Train: {len(y_train)} samples')
print(f'Test: {len(y_test)} samples')
print(f'Proportion: {len(y_test) / len(y):.2f}')

print()

# Per (class, domain) breakdown of the split; the unique values are computed
# once instead of on every outer-loop iteration.
classes = np.unique(y)
domains = np.unique(k)
for y_val in classes:
    for k_val in domains:
        n_train = np.sum((y_train == y_val) & (k_train == k_val))
        n_test = np.sum((y_test == y_val) & (k_test == k_val))
        print(f'Class {y_val}, Domain {k_val}: Train {n_train} Test {n_test}')

Train: 8791 samples
Test: 2992 samples
Proportion: 0.25

Class 0, Domain 61: Train 187 Test 50
Class 0, Domain 62: Train 177 Test 50
Class 0, Domain 63: Train 204 Test 50
Class 0, Domain 64: Train 179 Test 50
Class 0, Domain 65: Train 209 Test 50
Class 0, Domain 66: Train 184 Test 50
Class 0, Domain 67: Train 178 Test 50
Class 0, Domain 68: Train 194 Test 50
Class 0, Domain 69: Train 182 Test 50
Class 0, Domain 70: Train 185 Test 50
Class 0, Domain 71: Train 197 Test 50
Class 0, Domain 72: Train 185 Test 50
Class 0, Domain 73: Train 193 Test 50
Class 0, Domain 74: Train 199 Test 50
Class 0, Domain 75: Train 197 Test 50
Class 1, Domain 61: Train 171 Test 50
Class 1, Domain 62: Train 171 Test 50
Class 1, Domain 63: Train 165 Test 50
Class 1, Domain 64: Train 87 Test 50
Class 1, Domain 65: Train 154 Test 50
Class 1, Domain 66: Train 171 Test 50
Class 1, Domain 67: Train 188 Test 50
Class 1, Domain 68: Train 173 Test 50
Class 1, Domain 69: Train 166 Test 50
Class 1, Domain 70: Train 184 Te