In [1]:
import torch
import numpy as np
import pandas as pd

In [2]:
from torch import utils
from torchbearer.cv_utils import DatasetValidationSplitter

In [3]:
# Directory that holds the raw CSV inputs (train.csv is previewed right below).
root_dir = 'external_data'
# Show the first five rows of the training CSV as the cell output.
pd.read_csv(f'{root_dir}/train.csv').head(n=5)

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
def mnist_convert_to_pt_from_csv(root_dir=root_dir, seed=None):
    """Convert the MNIST CSV files under ``root_dir`` into train/test/unlabeled tensors.

    Reads ``train.csv`` and ``test.csv``, stacks their rows, shuffles them and
    splits the result 40% / 15% / 45% into train / test / unlabeled. For every
    split the first column is saved as ``y_<split>.pt`` (int64, shape
    ``(N, 1)``) and the remaining columns as ``x_<split>.pt`` (float32, shape
    ``(N, 28, 28)``) inside ``<root_dir>/mnist_tensors``.

    Args:
        root_dir: Directory containing ``train.csv`` and ``test.csv``.
        seed: Optional ``random_state`` for the row shuffle. ``None`` (the
            default) keeps the shuffle non-deterministic.

    Raises:
        ValueError: If the stacked frames contain missing values, which
            happens when the two CSV files do not share the same columns
            (for example a ``test.csv`` without a label column).
    """
    import os  # local import so this cell does not depend on another import cell

    dest_dir = f'{root_dir}/mnist_tensors'
    # torch.save does not create missing parent directories.
    os.makedirs(dest_dir, exist_ok=True)

    # Stack the two files row-wise (axis=0): every row is one sample. Joining
    # them column-wise would glue unrelated train/test samples side by side.
    df = pd.concat([pd.read_csv(f'{root_dir}/train.csv'),
                    pd.read_csv(f'{root_dir}/test.csv')], axis=0, ignore_index=True)
    if df.isna().values.any():
        raise ValueError(
            'train.csv and test.csv must share the same columns (label + pixels); '
            'stacking them produced missing values'
        )

    data = df.sample(frac=1, random_state=seed).values
    data_size = data.shape[0]
    train_end, test_end = int(data_size * 0.4), int(data_size * 0.55)
    splits = {
        'train': data[:train_end],
        'test': data[train_end:test_end],
        'unlabeled': data[test_end:],
    }

    for name, rows in splits.items():
        split = torch.tensor(rows).float()
        # Keep the sample dimension: (N, 784) -> (N, 28, 28). Reshaping to
        # (-1, 28) would yield N*28 rows and no longer line up with the labels.
        torch.save(split[:, 1:].reshape(-1, 28, 28), f'{dest_dir}/x_{name}.pt')
        # NOTE(review): labels keep their (N, 1) shape here, whereas the EMNIST
        # converter in this notebook stores 1-D labels - confirm which one
        # downstream code expects.
        torch.save(split[:, :1].long(), f'{dest_dir}/y_{name}.pt')

# Write the MNIST tensor files under f'{root_dir}/mnist_tensors' (default root_dir).
mnist_convert_to_pt_from_csv()

In [None]:
def get_loader_from_tensor(root_dir: str, batch_size: int = 32, frac_train_to_be_val: float = 0.2, labeled_idx: np.ndarray = np.arange(0)) -> dict:
    """Build train/val/test/unlabeled DataLoaders from tensors saved in ``root_dir``.

    The samples of the unlabeled pool selected by ``labeled_idx`` are appended
    to the training set and removed from the pool. The enlarged training set is
    then split into train and validation parts by ``DatasetValidationSplitter``.

    Side effects:
        Writes ``x_chosen.pt``, ``y_chosen.pt``, ``x_train.pt``, ``y_train.pt``,
        ``x_unlabeled.pt`` and ``y_unlabeled.pt`` into the current working
        directory (not into ``root_dir``), so the files in ``root_dir`` are
        left untouched.

    Args:
        root_dir: Directory containing ``x_/y_{train,test,unlabeled}.pt``.
        batch_size: Batch size used for all four loaders.
        frac_train_to_be_val: Fraction of the training set handed to the
            splitter as validation share.
        labeled_idx: Positions in the unlabeled pool that move to the training
            set. The default (empty array) moves nothing.

    Returns:
        Dict with the keys ``'train'``, ``'val'``, ``'test'`` and
        ``'unlabeled'``, each mapping to a ``DataLoader``. Train and val are
        shuffled; test and unlabeled keep their order.
    """
    x_pool, y_pool = torch.load(f'{root_dir}/x_unlabeled.pt'), torch.load(f'{root_dir}/y_unlabeled.pt')

    # Explicit torch index/mask tensors: works the same for arrays, lists and
    # the empty default.
    picked = np.asarray(labeled_idx)
    chosen_idx = torch.as_tensor(picked, dtype=torch.long)
    keep_mask = torch.from_numpy(~np.isin(np.arange(x_pool.shape[0]), picked))
    x_chosen, y_chosen = x_pool[chosen_idx], y_pool[chosen_idx]
    x_unlabeled, y_unlabeled = x_pool[keep_mask], y_pool[keep_mask]

    x_test, y_test = torch.load(f'{root_dir}/x_test.pt'), torch.load(f'{root_dir}/y_test.pt')

    x_train, y_train = torch.load(f'{root_dir}/x_train.pt'), torch.load(f'{root_dir}/y_train.pt')
    # Newly labeled samples go to the end; the order is not permuted here
    # (the train/val loaders below shuffle on their own).
    x_train, y_train = torch.cat([x_train, x_chosen]), torch.cat([y_train, y_chosen])

    # Snapshot of the current pools, written relative to the working directory.
    torch.save(x_chosen, 'x_chosen.pt'); torch.save(y_chosen, 'y_chosen.pt')
    torch.save(x_train, 'x_train.pt'); torch.save(y_train, 'y_train.pt')
    torch.save(x_unlabeled, 'x_unlabeled.pt'); torch.save(y_unlabeled, 'y_unlabeled.pt')

    unlabeled_dataset = utils.data.TensorDataset(x_unlabeled, y_unlabeled)
    test_dataset = utils.data.TensorDataset(x_test, y_test)
    dataset = utils.data.TensorDataset(x_train, y_train)
    splitter = DatasetValidationSplitter(len(dataset), frac_train_to_be_val)
    val_dataset = splitter.get_val_dataset(dataset)
    train_dataset = splitter.get_train_dataset(dataset)

    return {
        'train': utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True),
        'val': utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True),
        'test': utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False),
        'unlabeled': utils.data.DataLoader(unlabeled_dataset, batch_size=batch_size, shuffle=False),
    }

# Build the four loaders from the tensors stored in the mnist_tensors folder
# and show the resulting dict as the cell output.
loaders = get_loader_from_tensor(root_dir=f'{root_dir}/mnist_tensors')
loaders

In [None]:
def convert_emnist_to_pt(root_dir='external_data'):
    """Save the EMNIST ``digits`` samples as train/test/unlabeled tensors.

    The ``test`` samples are stored as they are. The ``train`` samples are cut
    in their given order: the first 45% become the train split, the remainder
    the unlabeled split. Images are written as float tensors to
    ``x_<split>.pt`` and labels as int64 tensors to ``y_<split>.pt`` inside
    ``<root_dir>/emnist_tensors``, which is created if it does not exist.

    Args:
        root_dir: Directory under which ``emnist_tensors`` is written.
    """
    import os  # local import so this cell does not depend on another import cell
    from emnist import  extract_samples

    dest_dir = f'{root_dir}/emnist_tensors'
    # torch.save does not create missing parent directories.
    os.makedirs(dest_dir, exist_ok=True)

    x_test, y_test = extract_samples(dataset='digits', usage='test')
    x_train, y_train = extract_samples(dataset='digits', usage='train')
    train_end = int(x_train.shape[0] * 0.45)

    splits = {
        'train': (x_train[:train_end], y_train[:train_end]),
        'test': (x_test, y_test),
        'unlabeled': (x_train[train_end:], y_train[train_end:]),
    }
    for name, (images, labels) in splits.items():
        torch.save(torch.tensor(images).float(), f'{dest_dir}/x_{name}.pt')
        torch.save(torch.tensor(labels).long(), f'{dest_dir}/y_{name}.pt')
    
# Write the EMNIST digit tensors using the function's default root ('external_data').
convert_emnist_to_pt()

In [1]:
import torch

In [None]:
# Scratch lookup: evaluating the attribute shows the TensorDataset class as the cell output.
torch.utils.data.TensorDataset