# One-hot encode

In this notebook, we will:
1. Read the training and test (image, label) pairs from the ubyte files into NumPy arrays
2. One-hot encode the labels of both the training set and the test set
3. Export the data and labels as CSV files


In [2]:
import numpy as np

In [3]:
def load_mnist(path, kind='train'):
    """Load one image/label pair stored as IDX ("ubyte") files.

    The files read are ``<path>/<kind>-labels-idx1-ubyte`` and
    ``<path>/<kind>-images-idx3-ubyte``.

    Parameters
    ----------
    path : str
        Directory containing the two IDX files.
    kind : str, default 'train'
        File-name prefix selecting the split (e.g. 'train' or 't10k').

    Returns
    -------
    images : numpy.ndarray
        Float array of shape ``(num_images, rows * cols)``; the raw 0..255
        bytes are rescaled linearly to the range [-1, 1].
    labels : numpy.ndarray
        ``uint8`` array of shape ``(num_images,)``.

    Raises
    ------
    ValueError
        If a file does not start with the expected IDX magic number, or if
        the amount of data does not match the counts declared in the headers.
    """
    from numpy import fromfile, uint8
    import os
    import struct

    labels_path = os.path.join(path, '%s-labels-idx1-ubyte' % kind)
    images_path = os.path.join(path, '%s-images-idx3-ubyte' % kind)

    # Label file: 8-byte big-endian header (magic, item count), then one byte per label.
    with open(labels_path, 'rb') as lbpath:
        magic, n = struct.unpack('>II', lbpath.read(8))
        if magic != 2049:
            raise ValueError(
                '%s: unexpected magic number %d (expected 2049)' % (labels_path, magic))
        labels = fromfile(lbpath, dtype=uint8)
    if len(labels) != n:
        raise ValueError(
            '%s: header declares %d labels but %d were read' % (labels_path, n, len(labels)))

    # Image file: 16-byte big-endian header (magic, count, rows, cols), then raw pixels.
    with open(images_path, 'rb') as imgpath:
        magic, num, rows, cols = struct.unpack('>IIII', imgpath.read(16))
        if magic != 2051:
            raise ValueError(
                '%s: unexpected magic number %d (expected 2051)' % (images_path, magic))
        pixels = fromfile(imgpath, dtype=uint8)

    if num != len(labels):
        raise ValueError(
            'image count (%d) does not match label count (%d)' % (num, len(labels)))
    if pixels.size != num * rows * cols:
        raise ValueError(
            '%s: expected %d pixel bytes but read %d' % (images_path, num * rows * cols, pixels.size))

    # Use the dimensions from the header rather than assuming 28x28 images.
    images = pixels.reshape(num, rows * cols)
    # Rescale 0..255 -> 0..1 -> -0.5..0.5 -> -1..1.
    images = ((images / 255.) - .5) * 2
    return images, labels

In [4]:
# Load the training split and report container type, shape and element type.
X_train, y_train = load_mnist('../data', kind='train')
print(f'Xtrain:  data type= {type(X_train)}, data shape= {X_train.shape}, element type={X_train.dtype}')
print(f'ytrain:  data type= {type(y_train)}, data shape= {y_train.shape}, element type={y_train.dtype}')

# Load the test split (files prefixed "t10k") and print the same report,
# labelled as the test set so it cannot be confused with the training report.
X_test, y_test = load_mnist('../data', kind='t10k')
print(f'Xtest:   data type= {type(X_test)}, data shape= {X_test.shape}, element type={X_test.dtype}')
print(f'ytest:   data type= {type(y_test)}, data shape= {y_test.shape}, element type={y_test.dtype}')

Xtrain:  data type= <class 'numpy.ndarray'>, data shape= (60000, 784), element type=float64
ytrain:  data type= <class 'numpy.ndarray'>, data shape= (60000,), element type=uint8
Xtrain:  data type= <class 'numpy.ndarray'>, data shape= (10000, 784)
ytrain:  data type= <class 'numpy.ndarray'>, data shape= (10000,)


One-hot Encode

In [5]:
def convert_to_one_hot(Y, C):
    """Return the one-hot encoding of the integer labels in ``Y``.

    ``Y`` is flattened first, so any input shape yields a 2-D result with one
    row per label and ``C`` columns. Row ``i`` is the ``Y[i]``-th row of the
    ``C x C`` identity matrix, i.e. all zeros except a 1 at column ``Y[i]``.
    """
    identity = np.eye(C)
    flat_labels = Y.reshape(-1)
    # Fancy-indexing the identity matrix picks one unit row per label.
    return identity[flat_labels]

In [6]:
# One-hot encode the test labels using 10 classes.
y_test_onehot = convert_to_one_hot(Y=y_test, C=10)

In [7]:
# One-hot encode the training labels using 10 classes.
y_train_onehot = convert_to_one_hot(Y=y_train, C=10)

In [8]:
# Sanity check: flatten the test labels to 1-D, the same reshape that
# convert_to_one_hot applies before indexing, and display the resulting shape.
vector = y_test.reshape(-1)
vector.shape

(10000,)

In [9]:
# Display the 10x10 identity matrix whose rows convert_to_one_hot selects from.
np.eye(10)

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])

In [10]:
# Check the encoded test labels: one row per label, one column per class.
y_test_onehot.shape

(10000, 10)

In [11]:
# Preview the first nine encoded test labels.
y_test_onehot[:9]

array([[0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]])

Output numpy array to csv using [numpy.savetxt](https://numpy.org/doc/stable/reference/generated/numpy.savetxt.html)

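We save the full training set (60,000 images) and the full test set (10,000 images); lower the slice bounds in the cells below to export only a smaller subset.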

In [16]:
# Export the first 60000 rows of X_train. Because the column delimiter is a
# newline, every value lands on its own line rather than in comma-separated rows.
np.savetxt(
    '../../Neural-Network-CUDA/data/train_x.csv',
    X_train[:60000],
    fmt='%3.11f',
    delimiter='\n',
)

In [17]:
# Export the first 60000 one-hot training labels. '%1.0f' writes each entry
# with no decimals, and the newline delimiter puts every entry on its own line.
np.savetxt(
    '../../Neural-Network-CUDA/data/train_y.csv',
    y_train_onehot[:60000],
    fmt='%1.0f',
    delimiter='\n',
)

In [21]:
# Export the first 10000 rows of X_test. Because the column delimiter is a
# newline, every value lands on its own line rather than in comma-separated rows.
np.savetxt(
    '../../Neural-Network-CUDA/data/test_x.csv',
    X_test[:10000],
    fmt='%3.11f',
    delimiter='\n',
)

In [22]:
# Export the first 10000 one-hot test labels. '%1.0f' writes each entry
# with no decimals, and the newline delimiter puts every entry on its own line.
np.savetxt(
    '../../Neural-Network-CUDA/data/test_y.csv',
    y_test_onehot[:10000],
    fmt='%1.0f',
    delimiter='\n',
)