In [None]:
import os
import pickle
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

class Cifar10LT(Dataset):
    """Long-tailed (class-imbalanced) subset of CIFAR-10.

    The pickled batch files (``data_batch_1`` .. ``data_batch_5`` for training,
    ``test_batch`` otherwise) are read from ``root``. A subset of sample
    indices is then selected so that the per-class sample count decays
    exponentially from the first class to the last one.

    Args:
        root: Directory containing the pickled CIFAR-10 batch files.
        imb_factor: Ratio between the sample count of the last class and that
            of the first class (e.g. 0.01 keeps 1% for the last class).
        transform: Optional callable applied to each H x W x C image array.
        train: Load the training batches when True, the test batch otherwise.
        seed: Optional seed for the per-class sub-sampling. ``None`` (default)
            uses NumPy's global RNG, so ``np.random.seed`` still controls it.

    Attributes set on the instance: ``data`` (one flat row per image),
    ``labels`` (NumPy array of class ids) and ``indices`` (positions in
    ``data``/``labels`` that belong to the long-tailed subset).
    """

    def __init__(self, root, imb_factor=0.1, transform=None, train=True, seed=None):
        self.transform = transform
        self.train = train
        self.data, self.labels = self._load_data(root, train)

        self.indices = self._generate_imbalance(imb_factor, seed=seed)

    def _load_data(self, root, train):
        """Read and concatenate the pickled batches; return ``(data, labels)``."""
        if train:
            batches = [f"data_batch_{i}" for i in range(1, 6)]
        else:
            batches = ["test_batch"]

        data = []
        labels = []

        for batch in batches:
            with open(os.path.join(root, batch), 'rb') as f:
                # SECURITY: pickle.load can execute arbitrary code; only point
                # `root` at batch files obtained from a trusted source.
                batch_data = pickle.load(f, encoding='bytes')
                data.append(batch_data[b'data'])
                labels += batch_data[b'labels']

        data = np.concatenate(data)
        return data, np.array(labels)

    def _generate_imbalance(self, imb_factor, seed=None):
        """Pick the sample indices that form the long-tailed subset.

        The class at rank ``i`` (in sorted label order) keeps
        ``int(max_samples * imb_factor ** (i / (num_classes - 1)))`` randomly
        chosen samples, where ``max_samples = len(labels) // num_classes``.

        Args:
            imb_factor: Ratio between the last and the first class size.
            seed: Optional seed for the shuffling; ``None`` uses the global
                NumPy RNG.

        Returns:
            A list of integer indices into ``self.data`` / ``self.labels``,
            grouped class by class.
        """
        labels = self.labels
        classes = np.unique(labels)
        num_classes = len(classes)
        if num_classes == 0:
            # Nothing to sample from; avoids dividing by zero below.
            return []
        max_samples = len(labels) // num_classes

        # With a single class there is no tail, so the exponent must stay 0
        # instead of dividing by (num_classes - 1) == 0.
        denom = max(num_classes - 1, 1)

        # Calculate number of samples per class
        num_samples_per_class = [
            int(max_samples * (imb_factor ** (rank / denom))) for rank in range(num_classes)
        ]

        # A dedicated generator makes the subset reproducible without touching
        # the global RNG state; without a seed the legacy global RNG is used.
        rng = np.random if seed is None else np.random.RandomState(seed)

        indices = []
        # Iterate over the label values actually present rather than assuming
        # they are exactly 0 .. num_classes - 1.
        for rank, class_value in enumerate(classes):
            class_indices = np.where(labels == class_value)[0]
            rng.shuffle(class_indices)
            selected_indices = class_indices[:num_samples_per_class[rank]]
            indices.extend(selected_indices.tolist())

        return indices

    def __len__(self):
        """Number of samples in the long-tailed subset."""
        return len(self.indices)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for position ``idx`` of the subset."""
        sample_idx = self.indices[idx]
        img = self.data[sample_idx]
        label = self.labels[sample_idx]
        # The flat row is laid out as (C, H, W); convert it to (H, W, C).
        img = np.transpose(np.reshape(img, (3, 32, 32)), (1, 2, 0))
        if self.transform is not None:
            img = self.transform(img)
        return img, label

# Set parameters
seed = 0  # Seed for the NumPy / PyTorch global RNGs so reruns give the same subset and batch order
imb_factor_train = 0.01  # Imbalance factor for training set
imb_factor_test = imb_factor_train     # Imbalance factor for test set
batch_size = 64
root = './cifar-10-batches-py'

# Cifar10LT samples its subset with NumPy's RNG and DataLoader(shuffle=True)
# draws its permutation from PyTorch's RNG; seed both before building anything
# so the long-tailed split (which is saved to disk later) is reproducible.
np.random.seed(seed)
torch.manual_seed(seed)

# Preprocessing (no augmentation is applied): convert to a tensor, then shift
# and scale each channel with mean 0.5 / std 0.5, i.e. [0, 1] -> [-1, 1].
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# Create training dataset
train_dataset = Cifar10LT(root=root, imb_factor=imb_factor_train, transform=transform, train=True)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Create test dataset (sub-sampled with the same imbalance factor as the training set)
test_dataset = Cifar10LT(root=root, imb_factor=imb_factor_test, transform=transform, train=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Print training set information
print(f"Training set samples: {len(train_dataset)}")
unique_train_labels, train_counts = np.unique(train_dataset.labels[train_dataset.indices], return_counts=True)
print("Samples per class in training set:", dict(zip(unique_train_labels, train_counts)))

# Print test set information
print(f"Test set samples: {len(test_dataset)}")
unique_test_labels, test_counts = np.unique(test_dataset.labels[test_dataset.indices], return_counts=True)
print("Samples per class in test set:", dict(zip(unique_test_labels, test_counts)))

# Example: Iterate through training data loader
for images, labels in train_loader:
    print("Training batch image shape:", images.shape)
    print("Training batch label shape:", labels.shape)
    break  # Only show the first batch

# Example: Iterate through test data loader
for images, labels in test_loader:
    print("Test batch image shape:", images.shape)
    print("Test batch label shape:", labels.shape)
    break  # Only show the first batch


In [None]:
import os
import numpy as np

# Output directory for the exported long-tailed subset
save_dir = './cifar10lt_100'
os.makedirs(save_dir, exist_ok=True)

# Each entry pairs a target file name with the array restricted to the
# indices selected for that split (training files first, then test files).
arrays_to_save = (
    ('train_images.npy', train_dataset.data[train_dataset.indices]),
    ('train_labels.npy', train_dataset.labels[train_dataset.indices]),
    ('test_images.npy', test_dataset.data[test_dataset.indices]),
    ('test_labels.npy', test_dataset.labels[test_dataset.indices]),
)

# Write every array as a .npy file inside save_dir
for file_name, array in arrays_to_save:
    np.save(os.path.join(save_dir, file_name), array)

print("Data saved to:", save_dir)


In [None]:
# Read the exported training arrays back from save_dir
train_images, train_labels = (
    np.load(os.path.join(save_dir, file_name))
    for file_name in ('train_images.npy', 'train_labels.npy')
)

# Copy into PyTorch tensors: float32 for the images, int64 for the labels
train_images_tensor = torch.tensor(train_images, dtype=torch.float32)
train_labels_tensor = torch.tensor(train_labels, dtype=torch.long)
