In [17]:
import torch
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import StratifiedShuffleSplit

In [18]:
class Writer:
    """Sequentially numbered sample writer bound to one output directory.

    Every call to ``write`` stores a single ``(X, y)`` pair as a compressed
    NumPy archive and advances the running index by one.
    """

    def __init__(self, outdir, start_idx=0):
        """Create ``outdir`` (including parents) if missing and set the first index."""
        target = Path(outdir)
        target.mkdir(parents=True, exist_ok=True)
        self.outdir = target
        self.idx = start_idx

    def write(self, X, y):
        """Save one sample under the current index, then increment the index.

        The file stem is ``<idx>.pt``. Because that name does not end in
        ``.npz``, ``np.savez_compressed`` appends the suffix itself, so the
        file on disk is ``<idx>.pt.npz`` and has to be read with ``np.load``.
        The archive holds the two keys ``X`` and ``y``.
        """
        stem = f'{self.idx}.pt'
        np.savez_compressed(self.outdir / stem, X=X, y=y)
        self.idx += 1

In [19]:
def write_to_file(writer, X, y):
    """Write every ``(xi, yi)`` pair from ``X`` and ``y`` through ``writer``.

    Args:
        writer: Object exposing ``write(xi, yi)``.
        X: Iterable of samples.
        y: Iterable of labels, paired positionally with ``X``. ``zip`` stops
            at the shorter of the two, so surplus items are ignored.

    Returns:
        ``True`` when every pair was written; ``False`` when a write raised
        an ``Exception`` (its traceback is printed to stderr first).
    """
    import traceback

    try:
        for xi, yi in zip(X, y):
            writer.write(xi, yi)
    except Exception:
        # Catch Exception instead of using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate, and report the
        # cause rather than discarding it silently.
        traceback.print_exc()
        return False
    return True

In [20]:
# Read the 'X' and 'y' arrays stored in DoS.npz under data_dir.
data_dir = '../Data/CHD_w29_s14_ID_Data/wavelet/'
data = np.load(Path(data_dir).joinpath('DoS.npz'))
X = data['X']
y = data['y']

In [27]:
# Hold out a random fraction of the sample indices for testing.
# The permutation comes from a seeded generator so the same split is
# produced after every kernel restart.
y = y.squeeze()  # drop size-1 axes from the label array
test_fraction = 0.3
split_seed = 42
test_size = int(y.shape[0] * test_fraction)  # int() truncates the product
indices = np.random.default_rng(split_seed).permutation(y.shape[0])
test_idx, train_idx = indices[:test_size], indices[test_size:]

(array([ 9280, 30432,  8873, ...,  5671, 28750, 32409]),
 array([45281, 29792,  9544, ..., 34303, 21388, 45809]))

In [30]:
# Show the 'train' and 'test' arrays stored in idex.npz.
# Note: this rebinds `data`, which previously referred to the DoS.npz archive.
index_file = '../Data/CHD_w29_s14_ID_Data/1/idex.npz'
data = np.load(index_file)
(data['train'], data['test'])

(array([60, 80, 90, 68, 51, 27, 18, 56, 63, 74,  1, 61, 42, 41,  4, 15, 17,
        40, 38,  5, 91, 59,  0, 34, 28, 50, 11, 35, 23, 52, 10, 31, 66, 57,
        79, 85, 32, 84, 14, 89, 19, 29, 49, 97, 98, 69, 20, 94, 72, 77, 25,
        37, 81, 46, 39, 65, 58, 12, 88, 70, 87, 36, 21, 83,  9, 96, 67, 64,
        47, 44]),
 array([26, 86,  2, 55, 75, 93, 16, 73, 54, 95, 53, 92, 78, 13,  7, 30, 22,
        24, 33,  8, 43, 62,  3, 71, 45, 48,  6, 99, 82, 76]))

In [5]:
# Configure the stratified shuffle splits and one train/val writer per split.
num_splits = 5
test_size = 0.3  # passed to StratifiedShuffleSplit as a fraction (rebinds test_size)
split_seed = 42  # fixed seed so the folds written to disk are reproducible
outdir = '../Data/CHD_w29_s14_ID_Data/'
# Output directories are numbered from 1: <outdir>train/1/ ... train/<num_splits>/
train_writers = [Writer(outdir=outdir + f'train/{i + 1}/') for i in range(num_splits)]
val_writers = [Writer(outdir=outdir + f'val/{i + 1}/') for i in range(num_splits)]
sss = StratifiedShuffleSplit(
    n_splits=num_splits,
    test_size=test_size,
    random_state=split_seed,
)
sss.get_n_splits(X, y)

5

In [6]:
from concurrent import futures


# Write every split to disk in parallel: two tasks per split (train, val).
# Arguments passed to submit() are pickled for the worker process, so each
# task works on its own copy of the writer and of the array slices.
with futures.ProcessPoolExecutor(num_splits) as executor:
    todo = []
    for i, (train_idx, test_idx) in enumerate(sss.split(X, y)):
        print(i, len(train_idx), len(test_idx))
        X_train = X[train_idx]
        y_train = y[train_idx]
        X_val = X[test_idx]
        y_val = y[test_idx]
        todo.append(executor.submit(write_to_file, train_writers[i], X_train, y_train))
        todo.append(executor.submit(write_to_file, val_writers[i], X_val, y_val))

    success = 0
    fail = 0
    for future in futures.as_completed(todo):
        try:
            ok = future.result()
        except Exception as err:
            # A task that raised (e.g. a pickling error or a crashed worker)
            # is counted as a failure instead of aborting the whole tally.
            print(f'Task raised {err!r}')
            ok = False
        if ok:
            success += 1
        else:
            fail += 1
print(f'Sucess: {success} - Fails: {fail}')

0 231084 99036
1 231084 99036
2 231084 99036
3 231084 99036
4 231084 99036
Sucess: 10 - Fails: 0


In [None]:
# Read back the first training sample of split 1 to check what was written.
# Writer saves compressed NumPy archives (np.savez_compressed appends '.npz'
# to the '<idx>.pt' stem), so np.load is the matching reader.
sample = np.load(Path(outdir) / 'train' / '1' / '0.pt.npz')
sample['X'].shape, sample['y']