# CIFAR-10 data inspection and conversion pipeline
Author: **Andrea Incerti Delmonte**

Email: **andrea.incertidelmonte@gmail.com**
1. Dataset load
2. Data inspection
3. Data conversion to tf_record
4. TF_record inspection

In [None]:
import numpy as np
import tensorflow as tf
import tarfile
import os
import cPickle
import matplotlib.pyplot as plt
# Visualizations will be shown in the notebook.
%matplotlib inline
import random

## 1. Dataset load

### 1.1 Download and extract dataset if necessary

In [None]:
# Local layout: the archive is stored in CIFAR10_LOCAL_FOLDER and the batch
# files are read from the CIFAR10_TARGET_FOLDER sub-folder after extraction.
CIFAR10_LOCAL_FOLDER = "./cifar-10_dataset"
CIFAR10_TARGET_FOLDER = "cifar-10-batches-py"

# Remote location: the archive name appended to the hosting directory URL.
CIFAR10_FILENAME = "cifar-10-python.tar.gz"
CIFAR10_DOWNLOAD_URL = "".join(["https://www.cs.toronto.edu/~kriz/", CIFAR10_FILENAME])

In [None]:
# Fetch the archive into CIFAR10_LOCAL_FOLDER.
# NOTE(review): maybe_download presumably skips the transfer when the file is
# already present -- confirm against the TensorFlow documentation.
tf.contrib.learn.datasets.base.maybe_download(CIFAR10_FILENAME, CIFAR10_LOCAL_FOLDER, CIFAR10_DOWNLOAD_URL)

# Open the tarball in a `with` block so its file handle is released as soon as
# extraction finishes instead of staying open for the rest of the session.
# NOTE(review): extractall() trusts the member paths stored in the archive; do
# not reuse this cell on archives from untrusted sources.
with tarfile.open(os.path.join(CIFAR10_LOCAL_FOLDER, CIFAR10_FILENAME), "r:gz") as cifar10_archive:
    cifar10_archive.extractall(CIFAR10_LOCAL_FOLDER)

In [None]:
# Folder holding the extracted batch files; every loader below reads from it.
extracted_data_folder = os.path.join(CIFAR10_LOCAL_FOLDER, CIFAR10_TARGET_FOLDER)
# Bare expression on purpose: the notebook shows the folder content as cell output.
os.listdir(extracted_data_folder)

### 1.2 Load Cifar10 metadata

In [None]:
# Read the dataset-level metadata shipped alongside the batch files.
# The `with` block closes the file as soon as it has been unpickled.
# NOTE(review): unpickling can execute arbitrary code; only load files that
# come from the official CIFAR-10 archive.
with open(os.path.join(extracted_data_folder, "batches.meta"), "rb") as metadata_f:
    metadata_dict = cPickle.load(metadata_f)
print(metadata_dict)

# Both values are used below to size the pre-allocated training arrays:
# rows per batch file and number of values per image row.
dataset_batch_size = metadata_dict["num_cases_per_batch"]
image_lenght = metadata_dict["num_vis"]

#### 1.2.1 Labels to classes lookup table

In [None]:
# Lookup table: the position in the list is the numeric label, the entry at
# that position is the matching class name.
labels_LUT = metadata_dict["label_names"]
for label_id, class_name in enumerate(labels_LUT):
    print("Label {} = {}".format(label_id, class_name))

### 1.3 Load training data

In [None]:
# Pre-allocate room for the four training batches (data_batch_1 .. data_batch_4):
# one row of `image_lenght` uint8 values per image and one int64 label per image.
training_images = np.zeros(shape=[dataset_batch_size*4, image_lenght], dtype=np.uint8)
training_labels = np.zeros(shape=[dataset_batch_size*4], dtype=np.int64)

for i in range(4):
    # Close each batch file right after unpickling it; the previous version
    # left all four handles open.
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    with open(os.path.join(extracted_data_folder, "data_batch_{}".format(i+1)), "rb") as training_f:
        training_dict = cPickle.load(training_f)

    # Batch i fills the i-th slice of `dataset_batch_size` rows.
    start_index = i*dataset_batch_size
    end_index = start_index + dataset_batch_size
    training_images[start_index:end_index,:] = training_dict["data"]
    training_labels[start_index:end_index] = np.asarray(training_dict["labels"])

print("training_images.shape {}".format(training_images.shape))
print("training_labels.shape {}".format(training_labels.shape))

### 1.4 Load evaluation data

In [None]:
# data_batch_5 is not part of the training arrays above; it is loaded here as
# the evaluation split. The `with` block closes the file once it is unpickled.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
with open(os.path.join(extracted_data_folder, "data_batch_5"), "rb") as evaluation_f:
    evaluation_dict = cPickle.load(evaluation_f)
evaluation_images = evaluation_dict["data"]
# The pickled labels are converted to a numpy array before use.
evaluation_labels = np.asarray(evaluation_dict["labels"])

print("evaluation_images.shape {}".format(evaluation_images.shape))
print("evaluation_labels.shape {}".format(evaluation_labels.shape))

### 1.5 Load test data

In [None]:
# Load the dedicated test batch. The `with` block closes the file once it is
# unpickled instead of leaving the handle open.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
with open(os.path.join(extracted_data_folder, "test_batch"), "rb") as test_f:
    test_dict = cPickle.load(test_f)
test_images = test_dict["data"]
# The pickled labels are converted to a numpy array before use.
test_labels = np.asarray(test_dict["labels"])

print("test_images.shape {}".format(test_images.shape))
print("test_labels.shape {}".format(test_labels.shape))

## 2. Dataset inspection

### 2.1 Load random examples from training set

In [None]:
# Geometry of a single image; plot_image() reshapes each flat image row to
# (IMG_CHANNELS, IMG_HEIGHT, IMG_WIDTH).
IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS = 32, 32, 3

In [None]:
def plot_image(image, index, label):
    """Show one image with its index, label and class name in the title.

    Args:
        image: flat array of IMG_CHANNELS * IMG_HEIGHT * IMG_WIDTH values,
            laid out channel by channel.
        index: number reported in the title as the image index.
        label: integer label; also used to look up the class name in labels_LUT.
    """
    channels_first = image.reshape((IMG_CHANNELS, IMG_HEIGHT, IMG_WIDTH))
    # Move the channel axis last so the array is (height, width, channels).
    channels_last = channels_first.transpose(1, 2, 0)
    plt.imshow(channels_last)

    caption = "Image index {}, label {}, class {}".format(index, label, labels_LUT[label])
    plt.title(caption)
    plt.show()

In [None]:
# Plot five randomly chosen training images.
for i in range(5):
    # randrange() excludes its upper bound. The previous randint(0, N) could
    # return N itself, which is one past the last valid row and raises IndexError.
    index = random.randrange(training_images.shape[0])
    plot_image(training_images[index], index, training_labels[index])

### 2.2 Is Cifar10 a balanced dataset?

In [None]:
def data_histogram(data, title):
    """Draw a histogram with one bar per integer value found in `data`.

    Args:
        data: array of integer values; must provide `.min()` and `.max()`.
        title: text placed above the plot.
    """
    _, axis = plt.subplots(1, 1)

    # Edges sit on half-integers (min-0.5, min+0.5, ..., max+0.5) so every
    # integer value falls in the middle of its own bin.
    bin_edges = np.arange(min(data) - 0.5, max(data) + 1 + 0.5)
    axis.hist(data, bins=bin_edges, rwidth=0.5)

    # One tick per integer value between the smallest and the largest.
    tick_positions = range(data.min(), data.max() + 1)
    plt.xticks(tick_positions)
    plt.title(title)

    # Horizontal grid lines only.
    axis.yaxis.grid(True)
    plt.show()

In [None]:
# Label distribution over the four training batches.
data_histogram(data=training_labels, title="Training data distribution")

In [None]:
# Label distribution over the evaluation batch (data_batch_5).
data_histogram(data=evaluation_labels, title="Evaluation data distribution")

In [None]:
# Label distribution over the test batch.
data_histogram(data=test_labels, title="Test data distribution")

## 3. Data conversion to tf_record

In [None]:
# All TFRecord files share one output folder; the base path already ends with
# a slash, so the file names are appended directly.
TFRECORDS_BASE_PATH = "./cifar-10_dataset/tf_records/"
TRAIN_TFRECORDS = "".join((TFRECORDS_BASE_PATH, "train.tfrecords"))
EVAL_TFRECORDS = "".join((TFRECORDS_BASE_PATH, "eval.tfrecords"))
TEST_TFRECORDS = "".join((TFRECORDS_BASE_PATH, "test.tfrecords"))

In [None]:
def tf_record_builder(input_files, output_file):
    """Convert pickled batch files into a single TFRecord file.

    Every example is written as a `tf.train.Example` with two features:
    'image' (the raw bytes of one row of the batch's "data" array) and
    'label' (the matching entry of the batch's "labels" sequence, as int64).

    Args:
        input_files: iterable of paths to pickled batch files. Each file must
            unpickle to a dict with a "data" array indexed by example along
            its first axis and a "labels" sequence of the same length.
        output_file: path of the TFRecord file to write. Its parent directory
            is created when it does not exist yet.
    """
    # The writer does not create missing folders, so make sure the target
    # directory exists before opening the output file.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        tf.gfile.MakeDirs(output_dir)

    with tf.python_io.TFRecordWriter(output_file) as record_writer:

        for f_name in input_files:
            # NOTE(review): unpickling can execute arbitrary code; only feed
            # this function files from a trusted source.
            with tf.gfile.Open(f_name, "rb") as f:
                data_dict = cPickle.load(f)

            # Look the two entries up once per file rather than once per example.
            images = data_dict["data"]
            labels = data_dict["labels"]
            example_count = images.shape[0]

            for i in range(example_count):
                example = tf.train.Example(features=tf.train.Features(
                    feature={
                        'image': tf.train.Feature(bytes_list=tf.train.BytesList(value=[images[i].tobytes()])),
                        'label': tf.train.Feature(int64_list=tf.train.Int64List(value=[labels[i]]))
                    }))
                record_writer.write(example.SerializeToString())

            print("Writen {} examples from {} to {}".format(example_count, f_name, output_file))

### 3.1 Create train.tfrecords

In [None]:
# Paths of data_batch_1 .. data_batch_4, the batch files used for training.
training_files_list = [
    os.path.join(extracted_data_folder, "data_batch_{}".format(batch_number))
    for batch_number in range(1, 5)
]

tf_record_builder(training_files_list, TRAIN_TFRECORDS)

### 3.2 Create eval.tfrecords

In [None]:
# The evaluation records come from the single batch file data_batch_5.
eval_file_path = os.path.join(extracted_data_folder, "data_batch_5")
tf_record_builder(input_files=[eval_file_path], output_file=EVAL_TFRECORDS)

### 3.3 Create test.tfrecords

In [None]:
# The test records come from the single batch file test_batch.
test_file_path = os.path.join(extracted_data_folder, "test_batch")
tf_record_builder(input_files=[test_file_path], output_file=TEST_TFRECORDS)

## 4. tf_record inspection

### 4.1 tf_record general structure
```
feature {
  key: "image"
  value {
    bytes_list {
      value: ";+2Dbw\213\221\225\225\203}\216\220\211\201\211\206|\213\213\205\210\213\230\243\250\237..."
    }
  }
}
feature {
  key: "label"
  value {
    int64_list {
      value: 6
    }
  }
}
```

### 4.2 Extract and plot 5 images from train.tfrecords

In [None]:
# Read train.tfrecords back and plot the first `image_to_extract` examples to
# check that images and labels were stored correctly.
image_to_extract = 5
tf_record_iterator = tf.python_io.tf_record_iterator(path=TRAIN_TFRECORDS)

i = 0
for string_record in tf_record_iterator:
    # Decode the serialized protobuf back into an Example message.
    record = tf.train.Example()
    record.ParseFromString(string_record)
    stored_features = record.features.feature

    # Each feature holds a single value: the int64 label and the raw image bytes.
    label = stored_features['label'].int64_list.value[0]
    encoded_image = stored_features['image'].bytes_list.value[0]

    # The bytes are the flat uint8 image row written by tf_record_builder.
    image = np.frombuffer(encoded_image, dtype=np.uint8)
    plot_image(image, i, label)

    i += 1
    if i == image_to_extract:
        break