# Imports

In [None]:
import os
import numpy as np
import shutil
import matplotlib.pyplot as plt

Set the paths and import the label array

In [None]:
# Folder that holds the generated square images together with their label archive
data_folder = os.path.join(os.getcwd(), "Data Generation/Pre Processed Data Generated/Square Images")

# Load the label array stored under the key 'arr_0' of the NPZ archive.
# Reading it inside a `with` block closes the underlying archive file as soon
# as the array has been extracted instead of leaving the handle open.
with np.load(os.path.join(data_folder, 'y_piece_generated.npz')) as label_archive:
    labels = label_archive['arr_0']

# Collect the names of the JPEG images located directly in the data folder.
# NOTE(review): os.listdir returns entries in arbitrary order, while the
# split cell further down pairs image_files[i] with labels[i] -- confirm that
# this listing order matches the order in which the labels were generated.
image_files = [f for f in os.listdir(data_folder) if f.endswith('.jpg')]

Plot the class distribution of the generated data before rebalancing, i.e. before removing some of the training examples that are empty squares. We save this plot in the data directory for later reference.

In [None]:
# Map each class index to a human-readable piece name
class_names = {0: 'Empty', 1: 'White pawn', 2: 'White knight', 3: 'White bishop', 4: 'White rook',
               5: 'White queen', 6: 'White king', 7: 'Black pawn', 8: 'Black knight', 9: 'Black bishop',
               10: 'Black rook', 11: 'Black queen', 12: 'Black king'}

# Draw the histogram on an explicit Axes object: set_xticks, set_xticklabels,
# set_xlabel, set_ylabel and set_title are Axes methods and do not exist on
# the pyplot module, so calling them on `plt` raises AttributeError.
fig, ax = plt.subplots()

# One bin per class index (0..12); align='left' centres each bar on its index
hist, bins, _ = ax.hist(labels, bins=range(14), align='left', rwidth=0.8)
ax.set_xticks(list(class_names.keys()))
ax.set_xticklabels(list(class_names.values()), rotation='vertical')
ax.set_xlabel('Class')
ax.set_ylabel('Frequency')
ax.set_title('Data Distribution of the generated data before rebalancing data by removing empty square examples.')

# Store the count of each class in an array to define the losses later
samples_per_class_generated = hist

# Annotate every bar with the share of all samples it represents, in percent
num_samples = labels.shape[0]
for class_id, count in enumerate(hist):
    ax.text(class_id, count + 5, "{:.2f}%".format(100 * count / num_samples), ha='center')

# Kept from the original cell; wspace is the horizontal spacing between subplots
fig.subplots_adjust(wspace=0.5)

# Save the figure one level above the image folder
fig.savefig(os.path.join(data_folder, os.pardir, "data_distribution_generated_before_rebalancing.png"))

# show the plot
plt.show()

In [None]:


# The same permutation is applied to the image list and to the label array,
# so both must have the same length. Without this check, surplus labels would
# be silently ignored and images could be stored with the wrong label.
if len(labels) != len(image_files):
    raise ValueError(
        f"Found {len(image_files)} images but {len(labels)} labels; "
        "cannot pair images with labels."
    )

# Shuffle the image files and their corresponding labels in the same order
# NOTE(review): no seed is set in this cell and copies from earlier runs are
# not removed from the subfolders, so re-running the cell can leave the same
# image in more than one split.
indices = np.random.permutation(len(image_files))
image_files = [image_files[i] for i in indices]
labels = labels[indices]

# NOTE(review): the original cell announced a "Delete some training examples"
# step at this point but contained no code for it; nothing is removed here.

# Define the ratios of the training, validation, and testing sets
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1

# Calculate the sizes of the training, validation, and testing sets.
# The test set receives whatever remains after rounding the other two down.
num_images = len(image_files)
num_train = int(train_ratio * num_images)
num_val = int(val_ratio * num_images)
num_test = num_images - num_train - num_val

# Define the paths to the subfolders
train_folder = os.path.join(data_folder, 'Training')
val_folder = os.path.join(data_folder, 'Validation')
test_folder = os.path.join(data_folder, 'Testing')

# Create the subfolders; exist_ok makes this a no-op when they already exist
for folder in [train_folder, val_folder, test_folder]:
    os.makedirs(folder, exist_ok=True)

# Copy every image into the subfolder of its split and store its label next
# to it. The label file is named after the position `i` in the shuffled
# order, not after the image file.
for i, image_file in enumerate(image_files):
    if i < num_train:
        folder = train_folder
    elif i < num_train + num_val:
        folder = val_folder
    else:
        folder = test_folder

    src_file = os.path.join(data_folder, image_file)
    dst_file = os.path.join(folder, image_file)
    shutil.copy(src_file, dst_file)

    label = labels[i]
    dst_label_file = os.path.join(folder, f'label_{i}.npy')
    np.save(dst_label_file, label)

