# Collect all data

In [1]:
import struct

def get_byte_list(lbl_file_name, img_file_name):
    '''
    Load an IDX label file and its companion IDX image file as raw bytes.

    The label header (8 bytes) and image header (16 bytes) are decoded as
    big-endian unsigned 32-bit integers. Both headers must declare the same
    number of items, otherwise an AssertionError is raised.

    Returns a list of (label, image) tuples in file order, where label is
    the result of reading 1 byte and image the result of reading rows*cols
    bytes.
    '''
    with open(lbl_file_name, 'rb') as labels_in, open(img_file_name, 'rb') as images_in:
        _lbl_magic, n_labels = struct.unpack('>II', labels_in.read(8))
        _img_magic, n_images, n_rows, n_cols = struct.unpack('>IIII', images_in.read(16))

        assert n_labels == n_images

        image_size = n_rows * n_cols
        # Tuple elements are evaluated left to right: label first, then image.
        return [
            (labels_in.read(1), images_in.read(image_size))
            for _ in range(n_labels)
        ]

In [2]:
# List the MNIST files whose names start with 't' (train-* and t10k-*) together with their sizes
!ls -l ../src/data/mnist/t*

-rw-rw-r-- 1 adrian adrian  7840016 maj 14 16:08 ../src/data/mnist/t10k-images.idx3-ubyte
-rw-rw-r-- 1 adrian adrian    10008 maj 14 16:08 ../src/data/mnist/t10k-labels.idx1-ubyte
-rw-rw-r-- 1 adrian adrian 47040016 maj 14 16:08 ../src/data/mnist/train-images.idx3-ubyte
-rw-rw-r-- 1 adrian adrian    60008 maj 14 16:08 ../src/data/mnist/train-labels.idx1-ubyte


In [3]:
from pathlib import Path

DATA_DIR = Path('../src/data/mnist')

# Read both splits as lists of (label, image) byte tuples.
train_data = get_byte_list(
    lbl_file_name=DATA_DIR / 'train-labels.idx1-ubyte',
    img_file_name=DATA_DIR / 'train-images.idx3-ubyte',
)
test_data = get_byte_list(
    lbl_file_name=DATA_DIR / 't10k-labels.idx1-ubyte',
    img_file_name=DATA_DIR / 't10k-images.idx3-ubyte',
)

# Merge the two splits into one pool, train samples first.
data = [*train_data, *test_data]
print('tot num data', len(data))

tot num data 70000


# Shuffle the data

In [4]:
# Value passed to random.seed() before the data is shuffled, so the shuffle order is repeatable
SEED = 20180516

In [5]:
from random import shuffle
from random import seed

# Seed the global RNG so the shuffle below always yields the same order.
seed(SEED)

# zip(*pairs) transposes the pairs; element 0 is the tuple of labels.
head_labels = [*zip(*data[:4])][0]
print('first 4 before:', head_labels)

shuffle(data)  # shuffles the list in place

head_labels = [*zip(*data[:4])][0]
print('first 4 after:', head_labels)

first 4 before: (b'\x05', b'\x00', b'\x04', b'\x01')
first 4 after: (b'\x04', b'\t', b'\x01', b'\x07')


# Split into folds

In [6]:
def print_folds(folds):
    '''
    Print the labels of all folds side by side.

    One line is printed per sample index (as many lines as the first fold
    has samples): the index, followed by the label found at that index in
    every fold, decoded from big-endian bytes. Every field, including the
    last one, is followed by a tab.
    '''
    n_rows = len(folds[0])
    for row_idx in range(n_rows):
        cells = [str(row_idx)]
        cells.extend(
            str(int.from_bytes(fold[row_idx][0], byteorder='big'))
            for fold in folds
        )
        # Trailing tab before the newline matches the per-field end='\t' layout.
        print('\t'.join(cells), end='\t\n')

In [7]:
NUM_FOLDS = 5

n_data = len(data)
n_data_per_fold, n_leftover = divmod(n_data, NUM_FOLDS)
print(n_data_per_fold, 'per fold; rest:', n_leftover)

# Consecutive, equally sized slices of the data; any leftover samples past
# NUM_FOLDS * n_data_per_fold are not assigned to a fold.
folds = [
    data[k * n_data_per_fold:(k + 1) * n_data_per_fold]
    for k in range(NUM_FOLDS)
]

14000 per fold; rest: 0


# Write data to files

In [8]:
import os

directory = DATA_DIR.parent / 'mnist_iid_cv'

# The root folder comes first, then one sub-folder per fold (fold1 .. foldN).
fold_dirs = [directory / ('fold' + str(k)) for k in range(1, NUM_FOLDS + 1)]

for new_dir in [directory, *fold_dirs]:
    try:
        new_dir.mkdir()
    except FileExistsError:
        # An existing folder is reported but otherwise left untouched.
        print("A folder already exists:", new_dir)

A folder already exists: ../src/data/mnist_iid_cv


In [9]:
# Show the entries now present in the output directory (os.listdir returns them in arbitrary order)
os.listdir(directory)

['fold2', 'fold4', 'fold3', 'fold1', 'fold5']

In [10]:
def partition_mnist_list(pairs, nr_data_per_car, output_dir, rows=28, cols=28):
    '''
    Split (label, image) byte pairs into consecutive chunks and write each
    chunk ("car") as a pair of IDX-style files.

    Chunk number i (1-based) takes the next nr_data_per_car[i-1] pairs and is
    written to output_dir/car{i}-labels.byte and output_dir/car{i}-images.byte.

    pairs: list of (label, image) tuples, both bytes objects.
    nr_data_per_car: iterable of non-negative ints, one requested chunk size
        per car.
    output_dir: existing directory to write into (str or Path).
    rows, cols: image dimensions stored in the image header (default 28x28).

    The item count stored in each header is the number of samples actually
    written, so the header stays consistent with the payload even when
    `pairs` runs out before all requested chunk sizes are satisfied.

    Raises ValueError if a requested chunk size is negative.
    '''
    output_dir = Path(output_dir)

    lbl_magic_nr = b'\x00\x00\x08\x01'  # ubyte, 1-dim
    img_magic_nr = b'\x00\x00\x08\x03'  # ubyte, 3-dim
    n_rows = rows.to_bytes(4, byteorder='big')
    n_cols = cols.to_bytes(4, byteorder='big')

    for car_i, nr_data in enumerate(nr_data_per_car, start=1):
        if nr_data < 0:
            raise ValueError(f"negative chunk size for car {car_i}: {nr_data}")

        number_list = pairs[:nr_data]
        pairs = pairs[nr_data:]

        # Use the real chunk length, not the requested one, for the headers.
        n_items = len(number_list).to_bytes(4, byteorder='big')

        lbls_file_name = output_dir / f"car{car_i}-labels.byte"
        imgs_file_name = output_dir / f"car{car_i}-images.byte"
        with open(lbls_file_name, 'wb') as lbl_file, \
                open(imgs_file_name, 'wb') as img_file:

            lbl_file.write(lbl_magic_nr + n_items)
            img_file.write(img_magic_nr + n_items + n_rows + n_cols)

            for (lbl, img) in number_list:
                lbl_file.write(lbl)
                img_file.write(img)

In [11]:
NUM_CARS = 100  # number of per-car partitions written into every fold

# Build the fold folder names from the fold index, the same way the folders
# were created ('fold1', 'fold2', ...). Pairing a sorted os.listdir() with
# `folds` breaks as soon as the directory holds any extra entry, or when
# lexicographic order differs from numeric order (e.g. 'fold10' < 'fold2').
folders = ['fold' + str(1 + i) for i in range(len(folds))]
for fold_name, data_pairs in zip(folders, folds):
    output_dir = directory / fold_name
    print(output_dir)
    tot_data = len(data_pairs)
    # Every car gets the same share; samples beyond NUM_CARS equal shares are not written.
    partition_mnist_list(data_pairs, [tot_data // NUM_CARS] * NUM_CARS, output_dir)

../src/data/mnist_iid_cv/fold1
../src/data/mnist_iid_cv/fold2
../src/data/mnist_iid_cv/fold3
../src/data/mnist_iid_cv/fold4
../src/data/mnist_iid_cv/fold5


# Read how many digits each client has

In [12]:
import numpy
import struct

def read_mnist_data(fname_img, fname_lbl):
    '''
    Load an IDX image file and an IDX label file into numpy arrays.

    fname_img: path of the image file (16-byte big-endian header:
        magic, count, rows, cols; then uint8 pixel data).
    fname_lbl: path of the label file (8-byte big-endian header:
        magic, count; then one byte per label).

    Return: tuple (img, lbl) where img is a uint8 array of shape
    (count, rows, cols) and lbl is a 1-dim int8 array.

    A label count that disagrees with the label header is only reported
    with a printed message; an image payload that does not match its
    header makes the reshape raise.
    '''
    # The label file is read first, then the image file.
    with open(fname_lbl, 'rb') as label_stream:
        _lbl_magic, n_labels = struct.unpack(">II", label_stream.read(8))
        # NOTE(review): labels are read as signed 8-bit values.
        lbl = numpy.fromfile(label_stream, dtype=numpy.int8)

    if len(lbl) != n_labels:
        print('Header mismatch. #labels != header number')

    with open(fname_img, 'rb') as image_stream:
        img_header = image_stream.read(16)
        _img_magic, n_images, n_rows, n_cols = struct.unpack(">IIII", img_header)
        pixels = numpy.fromfile(image_stream, dtype=numpy.uint8)

    img = pixels.reshape(n_images, n_rows, n_cols)
    return (img, lbl)

In [13]:
import os

# Folder next to DATA_DIR that holds the per-fold car files read below
directory = DATA_DIR.parent.joinpath('mnist_iid_cv')

def read_car_data(folder_names, input_dir, car_i):
    '''
    Collect the data of a single car across several fold folders.

    For every name in folder_names the files
    input_dir/<name>/car{car_i}-images.byte and
    input_dir/<name>/car{car_i}-labels.byte are read with read_mnist_data.

    Return: tuple (images, labels), each the concatenation of the per-fold
    arrays in the order given by folder_names.
    '''
    per_fold = [
        read_mnist_data(
            fold_dir / f"car{car_i}-images.byte",
            fold_dir / f"car{car_i}-labels.byte",
        )
        for fold_dir in (input_dir / fold_name for fold_name in folder_names)
    ]

    images_per_fold = [images for images, _labels in per_fold]
    labels_per_fold = [labels for _images, labels in per_fold]

    return (numpy.concatenate(images_per_fold), numpy.concatenate(labels_per_fold))
    
# os.listdir returns entries in arbitrary order; sort them so the folds are
# always concatenated in the same order inside read_car_data.
folders = sorted(os.listdir(directory))
folders.remove('fold1')  # fold1 is excluded from the data read below

# One (images, labels) pair per car, for cars 1..100.
list_of_pairs = [read_car_data(folders, directory, car_i) for car_i in range(1, 101)]

In [14]:
from itertools import groupby
import operator, functools

# freq[d] = number of cars that hold at least one sample of digit d.
# This must be a real mapping: dir() returns the list of names in scope and
# only appeared to work because that list had more than ten entries.
freq = {digit: 0 for digit in range(10)}

for i, (_car_imgs, car_lbls) in enumerate(list_of_pairs):
    # Sorting makes equal labels adjacent, so groupby yields one run per distinct label.
    groups = groupby(numpy.sort(car_lbls))
    keys, data_length = zip(*[(key, len(list(run))) for (key, run) in groups])
    print(i+1, keys)                                           # distinct labels of this car
    print(i+1, functools.reduce(operator.add, data_length))    # total samples of this car
    for num in keys:
        # int() turns the numpy scalar into a plain dict key; a label outside 0-9 raises KeyError.
        freq[int(num)] += 1

print("Frequency")
for i in range(10):
    print(i,":", freq[i])

1 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
1 560
2 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
2 560
3 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
3 560
4 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
4 560
5 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
5 560
6 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
6 560
7 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
7 560
8 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
8 560
9 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
9 560
10 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
10 560
11 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
11 560
12 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
12 560
13 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
13 560
14 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
14 560
15 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
15 560
16 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
16 560
17 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
17 560
18 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
18 560
19 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
19 560
20 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
20 560
21 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
21 560
22 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
22 560
23 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
23 560
24 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
24 560
25 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
