In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Raw input file and the normalised artefacts written by this cell.
raw_train_path = "raw/train.csv"
normalised_train_path = "processed/norm_train.csv"
normalised_param_path = "processed/norm_params.csv"

# Output locations for the train / validation splits.
train_path = "processed/train.csv"
val_path = "processed/val.csv"

raw_train_df = pd.read_csv(raw_train_path)

# Standardise every column of the raw frame: subtract the column mean and
# divide by the population standard deviation (ddof=0). The statistics used
# for each column are recorded in `params`.
params = {}
normalised_columns = {}
for col, values in raw_train_df.items():
    mean = values.mean()
    std = values.std(ddof=0)
    if not std:
        # A constant column has zero spread; divide by 1.0 instead of 0.
        std = 1.0
    params[col] = {"mean": mean, "std": std}
    normalised_columns[col] = (values - mean) / std

train_df = pd.DataFrame(normalised_columns)
train_df.to_csv(normalised_train_path, index=False)

# `params` maps column -> {"mean": ..., "std": ...}, so the resulting frame
# has one column per feature and one row per statistic.
# NOTE(review): index=False drops the "mean"/"std" row labels, so readers of
# this file have to rely on row order (mean first, then std) - confirm that
# every consumer does.
params_df = pd.DataFrame(params)
params_df.to_csv(normalised_param_path, index=False)

In [14]:
# Split the normalised data 80/20 into training and validation sets and
# persist both splits. random_state is fixed so the split is reproducible.
#
# The input is re-read from `normalised_train_path` rather than taken from the
# in-memory `train_df`: this cell rebinds `train_df` to the training split, so
# splitting `train_df` itself would carve another 20% off the already-split
# data every time the cell is re-run. Reading the persisted file makes the
# cell idempotent. float_precision="round_trip" is used so the re-read values
# match what was written to the CSV.
norm_train_df = pd.read_csv(normalised_train_path, float_precision="round_trip")

train_df, val_df = train_test_split(norm_train_df, test_size=0.2, random_state=42)
print(f"Train size: {len(train_df)}\nVal size: {len(val_df)}")

train_df.to_csv(train_path, index=False)
val_df.to_csv(val_path, index=False)

Train size: 419331
Val size: 104833


In [1]:
import torch
from torch.utils.data import Subset, DataLoader
from torchvision import datasets, transforms
import os

# Directory the MNIST files are downloaded into / loaded from.
data_dir = "./mnist"

# Per-sample preprocessing: convert the image to a tensor, then normalise its
# single channel with mean 0.1307 and std 0.3081.
# NOTE(review): these look like the usual MNIST training-set statistics -
# confirm before reusing them for another dataset.
preprocessing_steps = [
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.1307,), std=(0.3081,)),
]
transform = transforms.Compose(preprocessing_steps)

# Training split of MNIST; download=True fetches the files into `data_dir`.
train = datasets.MNIST(data_dir, train=True, transform=transform, download=True)

100%|██████████| 9.91M/9.91M [00:02<00:00, 4.86MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 119kB/s]
100%|██████████| 1.65M/1.65M [00:01<00:00, 1.24MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 2.25MB/s]
