In [1]:
import sys
# Abort right away when the notebook is run on an interpreter older than Python 3.
if sys.version_info < (3,):
    raise Exception("Python 3 not detected.")
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
from scipy import io


# Sanity check: load every dataset and print the shape of each array it provides.
for data_name in ["mnist", "spam", "cifar10"]:
    # np.load on an .npz archive returns an NpzFile that keeps the file open;
    # the context manager closes it once the shapes have been printed.
    with np.load(f"../data/{data_name}-data.npz") as data:
        print("\nloaded %s data!" % data_name)
        fields = "test_data", "training_data", "training_labels"
        for field in fields:
            print(field, data[field].shape)



loaded mnist data!
test_data (10000, 1, 28, 28)
training_data (60000, 1, 28, 28)
training_labels (60000,)

loaded spam data!
test_data (1000, 32)
training_data (4172, 32)
training_labels (4172,)

loaded cifar10 data!
test_data (10000, 3072)
training_data (50000, 3072)
training_labels (50000,)


In [3]:
# Q2: Data Partitioning

def shuffle_partition(data_name, count, seed=None):
    """Randomly split a dataset's sample indices into validation and training parts.

    Reads ``training_labels`` from ``../data/{data_name}-data.npz`` to find the
    number of samples ``n``, shuffles the indices ``0 .. n-1`` and returns the
    first ``count`` shuffled indices for validation and the remainder for
    training. The indices are intended to be applied to both ``training_data``
    and ``training_labels``.

    Args:
        data_name: Dataset name used to build the ``.npz`` file path.
        count: Number of samples to set aside for validation; must satisfy
            ``0 <= count <= n``.
        seed: Optional seed for a reproducible split. When ``None`` (default)
            the global ``np.random`` state is used, so an earlier
            ``np.random.seed(...)`` call still controls the result.

    Returns:
        A tuple ``(validation_indices, training_indices)`` of two disjoint
        1-D integer arrays that together contain every index exactly once.

    Raises:
        ValueError: If ``count`` is negative or larger than the number of samples.
    """
    # Only the sample count is needed; close the archive as soon as it is known
    # instead of leaving the NpzFile handle open.
    with np.load(f"../data/{data_name}-data.npz") as data:
        num_samples = len(data["training_labels"])

    # Slicing would silently produce a wrong-sized split for these values.
    if not 0 <= count <= num_samples:
        raise ValueError(
            f"count must be between 0 and {num_samples}, got {count}"
        )

    indices = np.arange(num_samples)
    if seed is None:
        # Preserve the original behaviour: draw from the global NumPy RNG.
        np.random.shuffle(indices)
    else:
        # A dedicated generator keeps the split reproducible without touching
        # the global RNG state.
        np.random.default_rng(seed).shuffle(indices)
    return indices[:count], indices[count:]
    
# Fix the global NumPy seed so the shuffled splits below come out the same on
# every Restart & Run All (shuffle_partition shuffles with np.random).
np.random.seed(0)

# MNIST: 10,000 training samples set aside for validation.
mnist_validation, mnist_training = shuffle_partition("mnist", 10000)
# Spam: 20% of the 4,172 training samples (834) set aside for validation.
spam_validation, spam_training = shuffle_partition("spam", 834)
# CIFAR-10: 5,000 training samples set aside for validation.
cifar10_validation, cifar10_training = shuffle_partition("cifar10", 5000)
