## Reikalingos bibliotekos:

In [7]:
import pickle
import glob
import numpy as np
from PIL import Image
import random
import os

## Pagalbinės funkcijos, reikalingos CIFAR duomenų failui atidaryti ir nuotraukoms išsaugoti

In [8]:
def unpickle(filepath):
    """Load and return the pickled object stored at ``filepath``.

    The file is opened in binary mode and unpickled with
    ``encoding="bytes"``, so 8-bit strings written by Python 2 come back
    as ``bytes`` objects rather than being decoded to ``str``.

    Args:
        filepath: Path of the pickle file to read.

    Returns:
        Whatever object was pickled into the file.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(filepath, "rb") as stream:
        return pickle.load(stream, encoding="bytes")

def makedir(dir):
    """Create the directory ``dir``, including missing parents, if needed.

    Calling this for a directory that already exists is a no-op.

    Args:
        dir: Path of the directory to create.

    Raises:
        FileExistsError: If ``dir`` exists but is not a directory.
    """
    # exist_ok=True replaces a separate os.path.exists() check, which left a
    # window where another process could create the directory in between.
    os.makedirs(dir, exist_ok=True)

def create_data_directories(output_dir, label_names):
    """Create the ``train``/``valid`` directory tree under ``output_dir``.

    The resulting layout is ``output_dir/{train,valid}/<label_name>/`` with
    one sub-directory per entry of ``label_names``.

    Args:
        output_dir: Root directory of the tree.
        label_names: Iterable of class names, one sub-directory each.
    """
    split_roots = [
        os.path.join(output_dir, split_name) for split_name in ("train", "valid")
    ]

    makedir(output_dir)
    for split_root in split_roots:
        makedir(split_root)

    for label_name in label_names:
        for split_root in split_roots:
            makedir(os.path.join(split_root, label_name))
        
def get_save_path(output_dir, train_or_valid, label_name, filename):
    """Build the path under which an image is saved as a JPEG.

    The extension of ``filename`` (if any) is replaced with ``.jpg`` and the
    result is joined onto ``output_dir/train_or_valid/label_name``.

    Args:
        output_dir: Root output directory.
        train_or_valid: Name of the split sub-directory.
        label_name: Name of the class sub-directory.
        filename: Original image file name.

    Returns:
        The joined path ending in ``<stem>.jpg``.
    """
    stem = os.path.splitext(filename)[0]
    return os.path.join(output_dir, train_or_valid, label_name, stem + ".jpg")

#### CIFAR duomenų masyvo transponavimas ir išsaugojimas į nuotraukas

`train_split`, `seed` ir `output_dir` gali būti keičiami. `seed` nurodome tam, kad galėtume atkartoti identišką duomenų išmaišymą.

In [9]:
# 0.7 means that 70% of the data goes to the training set and 30% to the
# validation set.
train_split = 0.7
seed = 1
output_dir = "data/"


# One label name per line; the line index is the numeric label id.
with open("./cifar-10-batches-py/batches.meta.txt") as file:
    label_names = [line.rstrip() for line in file]


# Sorted because glob gives no ordering guarantee, and the seeded shuffle
# below is only reproducible if the samples start in the same order.
data_filepaths = sorted(glob.glob("./cifar-10-batches-py/*_batch*"))

# Every sample from every batch file, as (pixels, label_name, filename).
train_and_test_data = []

for filepath in data_filepaths:
    # Named `batch`/`label_id` rather than `dict`/`id` so the builtins are
    # not overwritten for the rest of the session.
    batch = unpickle(filepath)
    images = batch[b"data"]
    label_ids = batch[b"labels"]
    filenames = [str(filename, "UTF-8") for filename in batch[b"filenames"]]

    train_and_test_data.extend(
        (image, label_names[label_id], filename)
        for image, label_id, filename in zip(images, label_ids, filenames)
    )

create_data_directories(output_dir, label_names)

# A dedicated Random instance keeps the shuffle reproducible without
# touching the global random state.
random.Random(seed).shuffle(train_and_test_data)

# Index of the first sample that belongs to the validation set.
train_last_id = int(len(train_and_test_data) * train_split)

splits = (
    ("train", train_and_test_data[:train_last_id]),
    ("valid", train_and_test_data[train_last_id:]),
)

for train_or_valid, samples in splits:
    for image, label_name, filename in samples:
        # Reshape the flat pixel row to (3, 32, 32) and move the channel axis
        # last, giving the (32, 32, 3) array that is saved as an RGB image.
        image = np.transpose(np.reshape(image, (3, 32, 32)), (1, 2, 0))
        Image.fromarray(image, mode="RGB").save(
            get_save_path(output_dir, train_or_valid, label_name, filename)
        )
    

6
10000
10000
10000
10000
10000
10000
