# Exploration and Transformation of Dataset
- Reference: [Home Page](http://www.cs.toronto.edu/~kriz/cifar.html)
- CIFAR-10 is a **completely balanced and fully labelled** dataset. To study the effects of class imbalance and to make room for unsupervised techniques, we deliberately unbalance the dataset: we remove the labels from a randomly sized subset of each class and treat those examples as unlabelled.

In [1]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Seed NumPy's global random state so that the np.random.* calls further down
# (the random class fractions and the per-class shuffles) give the same result
# on every fresh, top-to-bottom run of the notebook.
np.random.seed(0)

In [2]:
def unpickle(file):
    """Load and return the object stored in the pickle file at `file`.

    The file is opened in binary mode and read with ``encoding='bytes'``, so
    8-bit strings pickled by Python 2 are returned as ``bytes`` objects
    (this includes dictionary keys).

    NOTE: ``pickle.load`` can execute arbitrary code while loading; only call
    this on files from a trusted source.
    """
    with open(file, 'rb') as handle:
        return pickle.load(handle, encoding='bytes')

In [3]:
# Load the five training batches and the test batch; each call returns
# whatever object was pickled in the corresponding file.
train_batch_paths = (
    "../input/cifar10/data_batch_1",
    "../input/cifar10/data_batch_2",
    "../input/cifar10/data_batch_3",
    "../input/cifar10/data_batch_4",
    "../input/cifar10/data_batch_5",
)
dict_db1, dict_db2, dict_db3, dict_db4, dict_db5 = map(unpickle, train_batch_paths)
dict_tb = unpickle("../input/cifar10/test_batch")

# Keys of the first batch, in the order the dictionary yields them
keys = list(dict_db1.keys())

print(keys)

In [4]:
# Look the image matrix and the label list up by name instead of by their
# position in `keys`: positional access silently picks the wrong entry if a
# batch file ever lists its keys in a different order. The keys are bytes
# because the batches are loaded with encoding='bytes'.
# NOTE(review): b"data" / b"labels" follow the CIFAR-10 python batch layout;
# confirm they match the keys printed by the previous cell.
DATA_KEY, LABELS_KEY = b"data", b"labels"

# One DataFrame of features per batch (one row per example)
db1 = pd.DataFrame(dict_db1[DATA_KEY])
db2 = pd.DataFrame(dict_db2[DATA_KEY])
db3 = pd.DataFrame(dict_db3[DATA_KEY])
db4 = pd.DataFrame(dict_db4[DATA_KEY])
db5 = pd.DataFrame(dict_db5[DATA_KEY])
tb = pd.DataFrame(dict_tb[DATA_KEY])

# One Series of labels per batch, aligned row-for-row with the frames above
y1 = pd.Series(dict_db1[LABELS_KEY])
y2 = pd.Series(dict_db2[LABELS_KEY])
y3 = pd.Series(dict_db3[LABELS_KEY])
y4 = pd.Series(dict_db4[LABELS_KEY])
y5 = pd.Series(dict_db5[LABELS_KEY])
y_test = pd.Series(dict_tb[LABELS_KEY])

print(db1.shape, db2.shape, db3.shape, db4.shape, db5.shape, tb.shape)
print(len(y1), len(y2), len(y3), len(y4), len(y5), len(y_test))

In [5]:
# Stack the five training batches into one frame / one label series.
# ignore_index=True gives the result a fresh 0..N-1 index; without it every
# batch keeps its own 0-based row labels, so the combined index would contain
# each label five times and label-based lookups (.loc) would be ambiguous.
# Positional access (.iloc, enumerate) is unaffected by this.
db_train = pd.concat([db1, db2, db3, db4, db5], axis=0, ignore_index=True)
y_train = pd.concat([y1, y2, y3, y4, y5], axis=0, ignore_index=True)
assert len(db_train) == len(y_train), "features and labels must have the same number of rows"
print(db_train.shape, y_train.shape)

## Visualization of a single image

In [6]:
# Position of the training example to display
ind = 11

# Take the flat row, view it as (3, 32, 32) and move the size-3 axis last so
# that the array has the (rows, cols, channels) layout plt.imshow draws as RGB.
example = np.asarray(db_train.iloc[ind]).reshape(3, 32, 32).transpose(1, 2, 0)

plt.figure(figsize=(1.5, 1.5))
plt.imshow(example)

# Label stored for the same example
print(y_train.iloc[ind])

# Splitting the Dataset into Labelled and Unlabelled (Leads to Unbalancing)

In [7]:
# Draw one random weight per class, normalise the weights so they sum to 1,
# scale to 10000 and truncate each share to an integer count.
fracs = np.random.random(10)
num_examples = (fracs / np.sum(fracs) * 10000).astype(int).tolist()

# num_examples[i] is how many examples of class i will be treated as unlabelled.
# Because every share is truncated, the total is slightly below 10000.
# The counts are random per class, so the split also makes the dataset unbalanced.
print(np.sum(num_examples), num_examples)

In [8]:
# For every class id 0-9, collect the positions (0-based, in y_train order)
# of the training examples that carry that label.
labels = np.asarray(y_train)
per_class_positions = [np.flatnonzero(labels == cls_id) for cls_id in range(10)]

# Stack the per-class position arrays into a single numpy array
# NOTE: this yields a regular 2-D array only when every class has the same
# number of examples.
classes_ind = np.array(per_class_positions)

# Shape of the array, then the first 10 positions of class 0
print(classes_ind.shape)
print(classes_ind[0][:10])

In [9]:
# Split each class's positions into an unlabelled part (the first
# num_examples[i] positions after shuffling) and a labelled part (the rest).
# Lists of arrays are used because the split sizes differ from class to class.
lab_ind, unl_ind = [], []

for i in range(10):
    # Shuffle a copy: classes_ind[i] shares memory with classes_ind, so
    # shuffling it directly would silently reorder classes_ind as well.
    cls_ind = classes_ind[i].copy()
    np.random.shuffle(cls_ind)
    unl_part, lab_part = np.split(cls_ind, [num_examples[i]])
    unl_ind.append(unl_part)
    lab_ind.append(lab_part)

print(len(lab_ind), len(unl_ind))

# Flatten the per-class arrays into two plain lists of row positions
l_ind = np.concatenate(lab_ind).tolist()
u_ind = np.concatenate(unl_ind).tolist()

print(len(l_ind), len(u_ind))
print(l_ind[:10], u_ind[:10])

In [10]:
# Pick rows by position:
#   lab_train / y_lab_train -> examples (and labels) we keep as labelled
#   unl_train / y_unl_train -> examples (and labels) we treat as unlabelled
lab_train, unl_train = db_train.iloc[l_ind], db_train.iloc[u_ind]
y_lab_train, y_unl_train = y_train.iloc[l_ind], y_train.iloc[u_ind]
print(lab_train.shape, unl_train.shape, y_lab_train.shape, y_unl_train.shape)

In [11]:
# Write every split to its own CSV file (row index omitted), in this order
csv_outputs = [
    ("train_lab_x.csv", lab_train),
    ("train_lab_y.csv", y_lab_train),
    ("train_unl_x.csv", unl_train),
    ("train_unl_y.csv", y_unl_train),
    ("test_x.csv", tb),
    ("test_y.csv", y_test),
]
for csv_name, table in csv_outputs:
    table.to_csv(csv_name, index=False)