# Data Processing | CIFAR-10 Dataset

In [11]:
import numpy as np
import torch
from torchvision import datasets
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms.v2 as v2

### Loading Data

In [None]:
# Loading dataset
def load_data():
    '''
    Fetch the CIFAR-10 training and test splits via torchvision.

    Both splits are stored under ./data and are downloaded if they are
    not already present there. No transform is attached to either split.

    Returns:
        (train_dataset, test_dataset) as torchvision CIFAR10 objects.
    '''
    # Training split first, then test split.
    train_dataset, test_dataset = [
        datasets.CIFAR10(root='./data', train=is_train, download=True)
        for is_train in (True, False)
    ]

    return train_dataset, test_dataset

Files already downloaded and verified
Files already downloaded and verified


### Normalization

In [13]:
def RGB_stats(X):

    '''
    Compute the mean and standard deviation of X along its last axis'
    entries, reducing over the first three axes.

    X is assumed to be a 4-D, channel-last batch of images, e.g.
    (N, H, W, C) -- TODO confirm against the caller. Anything that
    numpy can convert to such an array (including a list) is accepted.

    Returns:
        (mu, sigma): two arrays with one entry per channel.
    '''
    # Every axis except the trailing (channel) one is collapsed.
    reduce_over = (0, 1, 2)

    return np.mean(X, axis=reduce_over), np.std(X, axis=reduce_over)

### Data Splitting and Loaders

In [14]:
def split_data(dataset, train_ratio=0.8, seed=None):

    '''
    Randomly split a dataset into a training part and a validation part.

    Args:
        dataset: object exposing indexable `data` and `targets`
            attributes; `data` is assumed to have the same length as
            `targets`.
        train_ratio: fraction of the samples (between 0 and 1) that
            goes to the training part; the rest becomes validation.
        seed: optional integer. When given, the shuffle uses its own
            seeded generator so the split is reproducible; when None,
            torch's global random state is used.

    Returns:
        x_train, y_train, x_val, y_val -- four lists, where each x_*
        entry is paired with the y_* entry at the same position.

    Raises:
        ValueError: if train_ratio is outside [0, 1].
    '''

    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be in [0, 1], got {train_ratio}")

    n_samples = len(dataset.targets)
    train_size = int(train_ratio * n_samples)

    # A dedicated generator keeps a seeded split independent of (and
    # harmless to) the global RNG state.
    generator = None if seed is None else torch.Generator().manual_seed(seed)
    idx = torch.randperm(n_samples, generator=generator).tolist()
    train_idx, val_idx = idx[:train_size], idx[train_size:]

    x_train = [dataset.data[i] for i in train_idx]
    y_train = [dataset.targets[i] for i in train_idx]
    x_val = [dataset.data[i] for i in val_idx]
    y_val = [dataset.targets[i] for i in val_idx]

    return x_train, y_train, x_val, y_val

In [15]:
def dataloaders(train_dataset, val_dataset, test_dataset, batch_size=32):

    '''
    Wrap the three datasets in DataLoaders that share one batch size.

    Only the training loader shuffles its samples; the validation and
    test loaders iterate in dataset order.

    Returns:
        (train_loader, val_loader, test_loader)
    '''

    def _loader(data, shuffled):
        # Single place where the common batch size is applied.
        return DataLoader(data, batch_size=batch_size, shuffle=shuffled)

    return (
        _loader(train_dataset, True),
        _loader(val_dataset, False),
        _loader(test_dataset, False),
    )

### Transformations

In [16]:
def transformations(mu, stdev):

    '''
    Build the training and evaluation transforms.

    Both pipelines operate directly on image tensors. The training
    pipeline applies a random horizontal flip and a random rotation of
    up to 10 degrees before normalizing; the evaluation pipeline only
    normalizes (no data augmentation).

    Normalization is deliberately the LAST step. Converting a normalized
    float tensor to a PIL image and back (ToPILImage -> ToTensor) would
    rescale and truncate the values to 8 bits, discarding the
    normalization, so no such round-trip is performed here.

    Args:
        mu: per-channel means. Must be expressed on the same value scale
            as the tensors the transforms will receive.
        stdev: per-channel standard deviations, on that same scale.

    Returns:
        (transform, test_transform): the training-time and the
        evaluation-time transform, in that order.
    '''

    transform = transforms.Compose([
        # Augment first, while the pixels are still un-normalized.
        transforms.RandomHorizontalFlip(0.5),
        transforms.RandomRotation(degrees=(-10, 10)),
        transforms.Normalize(mean=mu, std=stdev),
    ])

    test_transform = transforms.Compose([
        transforms.Normalize(mean=mu, std=stdev),
    ])

    return transform, test_transform

### Dataset Class

In [17]:
class Dataset(torch.utils.data.Dataset):
    '''
    Map-style dataset pairing images with integer class labels.

    The base class is spelled out as torch.utils.data.Dataset because
    this class reuses the name `Dataset`. Inheriting from the bare name
    would make a re-run of this cell subclass the previous definition of
    this very class instead of the PyTorch base class.

    Args:
        X: indexable collection of images. Each image is assumed to be a
            3-D, channel-last array or tensor, e.g. (H, W, C).
        Y: indexable collection of labels aligned with X.
        transform: optional callable applied to the channel-first float
            image tensor.
    '''

    def __init__(self, X, Y, transform=None):
        self.X = X
        self.Y = Y
        self.transform = transform

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        '''Return (image, label): a float32 channel-first image tensor
        (after the optional transform) and an int64 label tensor.'''

        x, y = self.X[idx], self.Y[idx]

        # torch.tensor() warns when handed an existing tensor, so clone
        # tensors explicitly. Either way the stored sample is copied and
        # is never aliased by the returned tensor.
        x = x.detach().clone() if torch.is_tensor(x) else torch.tensor(x)
        x = x.permute(2, 0, 1).float()  # channel-last -> channel-first
        if self.transform:
            x = self.transform(x)

        if torch.is_tensor(y):
            y = y.detach().clone().to(torch.long)
        else:
            y = torch.tensor(y, dtype=torch.long)

        return x, y

# Data Pipeline

In [None]:
def data_pipeline(batch_size=32, transform_func=transformations):

    '''
    Run the full pipeline: load CIFAR-10, split off a validation set,
    build the transforms, and return the three DataLoaders.

    Args:
        batch_size: batch size shared by all three loaders.
        transform_func: callable taking (mu, stdev) and returning
            (train_transform, eval_transform).

    Returns:
        (train_loader, val_loader, test_loader)
    '''

    train, test = load_data()

    # Split first, so the validation samples stay out of everything
    # that is fitted on the training data.
    x_train, y_train, x_val, y_val = split_data(train)

    # Normalization statistics come from the training split only; using
    # the full set would leak validation information into preprocessing.
    mu, stdev = RGB_stats(np.asarray(x_train))

    # Training data gets the first transform; validation and test data
    # share the second one.
    train_transform, test_transform = transform_func(mu, stdev)

    train_dataset = Dataset(x_train, y_train, transform=train_transform)
    val_dataset = Dataset(x_val, y_val, transform=test_transform)
    test_dataset = Dataset(test.data, test.targets, transform=test_transform)

    # Distinct names for the loaders, so the *_dataset variables are not
    # silently rebound to a different kind of object.
    train_loader, val_loader, test_loader = dataloaders(
        train_dataset, val_dataset, test_dataset, batch_size=batch_size
    )

    return train_loader, val_loader, test_loader