In [531]:
import os
import struct
import numpy as np
import cv2 as cv
import random

In [532]:
# Target size, in pixels, that every glyph image is resized to before flattening.
width = 28
height = 28
# Fraction of the samples written to the training files; the rest goes to validation.
percentage_split_dataset = 0.90

In [533]:
def vectorize_image_opencv(image_path, width, height):
    """Load an image as grayscale, resize it and return it as an inverted 1-D vector.

    Args:
        image_path: Path of the image file to read.
        width: Target width in pixels.
        height: Target height in pixels.

    Returns:
        A 1-D array of ``width * height`` values in which every pixel ``v`` of
        the resized image has been replaced by ``255 - v``.

    Raises:
        ValueError: If OpenCV cannot read ``image_path``.
    """
    # Read the image with OpenCV. imread reports failure by returning None
    # rather than raising, so check explicitly instead of failing later
    # inside cv.resize with an unrelated-looking error.
    img = cv.imread(image_path, cv.IMREAD_GRAYSCALE)
    if img is None:
        raise ValueError("Could not read image: {!r}".format(image_path))

    # Resize the image to a fixed size (OpenCV takes the size as (width, height)).
    img_resized = cv.resize(img, (width, height))

    # Flatten the image to a 1-D vector and reverse the colors in a single
    # vectorized operation instead of a per-pixel Python loop.
    return 255 - img_resized.flatten()

In [534]:
# Quick check: vectorize a single sample glyph and inspect the resulting vector.
vectorize_image_opencv("Arad_Ancient_Hebrew/Text_1/Alphabet/10_Yod/1.bmp", width, height)

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0, 255, 255, 255,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0, 255, 255, 255, 255,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0, 255, 255, 255, 255, 255, 255, 255,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0, 255, 255, 255, 255, 255, 255, 255, 255, 255,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0, 255, 255, 255, 255, 255, 255, 255, 255, 255,
       255, 255,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0, 255, 255, 255, 255, 255, 255, 255, 25

In [535]:
def add_elements_from_folder(directory):
    """Build the label list for one letter folder.

    The folder name is expected to look like ``"<number>_<name>"`` (for
    example ``"1_Alef"``); the label is that number minus one, as a string.

    Args:
        directory: Path of the letter folder.

    Returns:
        A list holding the label once per regular file found directly inside
        ``directory``.

    Raises:
        ValueError: If the folder name does not start with an integer.
    """
    # Drop any trailing separator first, otherwise basename() returns an
    # empty string for paths such as "Alphabet/1_Alef/".
    name_folder = os.path.basename(os.path.normpath(directory))
    name_folder = str(int(name_folder.split("_")[0]) - 1)

    # Count only regular files: get_images_to_vector skips everything that is
    # not a file, so counting every entry could leave labels and images with
    # different lengths.
    nb_elements = sum(
        1
        for item in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, item))
    )

    # Repeat the label once per counted file.
    return [name_folder] * nb_elements

In [536]:
def generate_idx1_ubyte(labels, train_output, validation_output, split_ratio=None):
    """Write labels to a training and a validation file in IDX1 unsigned-byte format.

    The first ``int(len(labels) * split_ratio)`` labels go to ``train_output``
    and the remaining ones to ``validation_output``. Each file starts with the
    big-endian magic number ``0x00000801`` and the label count, followed by
    one unsigned byte per label.

    Args:
        labels: Sequence of labels; each one is converted with ``int()``.
        train_output: Path of the training label file.
        validation_output: Path of the validation label file.
        split_ratio: Fraction of the labels written to the training file.
            Defaults to the module-level ``percentage_split_dataset``.

    Raises:
        ValueError: If a label cannot be converted to an integer.
        struct.error: If a label does not fit in an unsigned byte (0..255).
    """
    if split_ratio is None:
        split_ratio = percentage_split_dataset

    split_index = int(len(labels) * split_ratio)
    _write_idx1_file(train_output, labels[:split_index])
    _write_idx1_file(validation_output, labels[split_index:])


def _write_idx1_file(path, labels):
    """Write ``labels`` to ``path`` as one IDX1 unsigned-byte file."""
    values = [int(label) for label in labels]
    # Pack header and payload together so an invalid label fails before the
    # output file is created or truncated.
    payload = struct.pack('>II%dB' % len(values), 0x00000801, len(values), *values)

    # Create the destination folder when it does not exist yet.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(path, 'wb') as output_file:
        output_file.write(payload)

In [537]:
# Quick check: labels produced for one letter folder (one entry per element in it).
add_elements_from_folder("Arad_Ancient_Hebrew/Text_1/Alphabet/1_Alef")

['0', '0', '0', '0']

In [538]:
def get_immediate_subdirectories(a_dir):
    """Return the sorted names of the directories located directly inside ``a_dir``.

    ``os.listdir`` gives no ordering guarantee, so the names are sorted to
    make every traversal built on this helper deterministic: labels and
    images are collected by two separate walks that must visit the folders
    in the same order.

    Args:
        a_dir: Path of the directory to inspect.

    Returns:
        A sorted list of sub-directory names (not full paths).
    """
    return sorted(
        name
        for name in os.listdir(a_dir)
        if os.path.isdir(os.path.join(a_dir, name))
    )

In [539]:
# Quick check: list the text folders available in the dataset root.
get_immediate_subdirectories("Arad_Ancient_Hebrew")

['Text_1',
 'Text_111',
 'Text_16',
 'Text_17a',
 'Text_17b',
 'Text_18',
 'Text_2',
 'Text_21',
 'Text_24',
 'Text_3',
 'Text_31',
 'Text_38',
 'Text_39a',
 'Text_39b',
 'Text_40',
 'Text_5',
 'Text_7',
 'Text_8']

In [540]:
def get_hebrew_labels(directory):
    """Collect the labels of every sample stored under ``directory``.

    Visits ``<directory>/<text>/Alphabet/<letter>`` for each text folder and
    each letter folder, and concatenates the label lists returned by
    ``add_elements_from_folder`` in traversal order.

    Args:
        directory: Root folder of the dataset.

    Returns:
        A flat list of labels.
    """
    collected = []
    for text_folder in get_immediate_subdirectories(directory):
        alphabet_path = "/".join((directory, text_folder, "Alphabet"))
        for letter_folder in get_immediate_subdirectories(alphabet_path):
            # Extending with an empty list is a no-op, so empty folders add nothing.
            collected.extend(add_elements_from_folder(alphabet_path + "/" + letter_folder))
    return collected

In [541]:
def generate_idx3_ubyte(images, train_output, validation_output,
                        split_ratio=None, rows=None, cols=None):
    """Write images to a training and a validation file in IDX3 unsigned-byte format.

    The first ``int(len(images) * split_ratio)`` images go to ``train_output``
    and the remaining ones to ``validation_output``. Each file starts with the
    big-endian magic number ``0x00000803``, the image count, ``rows`` and
    ``cols``, followed by the raw pixel bytes of every image.

    Args:
        images: Sequence of images, each holding ``rows * cols`` pixel values.
        train_output: Path of the training image file.
        validation_output: Path of the validation image file.
        split_ratio: Fraction of the images written to the training file.
            Defaults to the module-level ``percentage_split_dataset``.
        rows: Image height stored in the header. Defaults to the
            module-level ``height``.
        cols: Image width stored in the header. Defaults to the
            module-level ``width``.

    Raises:
        ValueError: If the images do not all hold ``rows * cols`` pixels.
    """
    if split_ratio is None:
        split_ratio = percentage_split_dataset
    if rows is None:
        rows = height
    if cols is None:
        cols = width

    # Make sure the images are a uint8 numpy array.
    images = np.array(images, dtype=np.uint8)

    # The header must describe the payload exactly, otherwise readers would
    # misalign every image after the first one.
    if images.size != len(images) * rows * cols:
        raise ValueError(
            "Each image must contain rows * cols = {} pixels".format(rows * cols)
        )

    split_index = int(len(images) * split_ratio)
    # Write the training images, then the validation images.
    _write_idx3_file(train_output, images[:split_index], rows, cols)
    _write_idx3_file(validation_output, images[split_index:], rows, cols)


def _write_idx3_file(path, images, rows, cols):
    """Write ``images`` (a uint8 array) to ``path`` as one IDX3 unsigned-byte file."""
    # Create the destination folder when it does not exist yet.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(path, 'wb') as output_file:
        output_file.write(struct.pack('>IIII', 0x00000803, len(images), rows, cols))
        # tobytes() emits the images one after another in C order.
        output_file.write(images.tobytes())

In [542]:
# Quick check: print the vector obtained for a standalone test image.
print(vectorize_image_opencv("to_vectorize.bmp", width, height))

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0 255   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0 255 255 255 255
 248   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0 255 255 255 255 255 255   0   0   0   0   0   0   0
   0   0 255 255   0   0   0   0   0   0   0   0   0   0 255 255 255 255
 255 255 255   0   0   0   0   0   0   0   0 255 255 255 255   0   0   0
   0   0   0   0 255 255 255 255 255 255 255 255 255   0   0   0   0   0
   0   0 255 255 255 255 255 255   0   0   0   0 255 255 255 255 255 255
 255 255 255 255   0   0   0   0   0   0   0   0 255 255 255 255 255 255
   0   0 255 255 255 255 255 255 255 255 255 255 255   0   0   0   0   0
   0   0   0   0 255 255 255 255 255 255 255 255 255 255 255 255 255 255
 255 255   0   0   0   0   0   0   0   0   0   0   0   0 255 255 255 255
 255 255 255 255 255 255 255 255 255 255   0   0   

In [543]:
def get_images_to_vector(directory) :
    """Vectorize every regular file found directly inside ``directory``.

    Args:
        directory: Folder whose files are read as images.

    Returns:
        A list with one vector per file, each produced by
        ``vectorize_image_opencv`` using the module-level ``width`` and
        ``height``. Sub-directories are ignored.
    """
    candidate_paths = (
        os.path.join(directory, entry) for entry in os.listdir(directory)
    )
    return [
        vectorize_image_opencv(path, width, height)
        for path in candidate_paths
        if os.path.isfile(path)
    ]

In [544]:
def get_hebrew_images(directory):
    """Collect the image vectors of every sample stored under ``directory``.

    Visits ``<directory>/<text>/Alphabet/<letter>`` for each text folder and
    each letter folder, and concatenates the vectors returned by
    ``get_images_to_vector`` in traversal order.

    Args:
        directory: Root folder of the dataset.

    Returns:
        A flat list of image vectors.
    """
    collected = []
    for text_folder in get_immediate_subdirectories(directory):
        alphabet_path = "/".join((directory, text_folder, "Alphabet"))
        for letter_folder in get_immediate_subdirectories(alphabet_path):
            # Extending with an empty list is a no-op, so empty folders add nothing.
            collected.extend(get_images_to_vector(alphabet_path + "/" + letter_folder))
    return collected

In [545]:
# Quick check: print the vectors obtained for every file of one letter folder.
print(get_images_to_vector("Arad_Ancient_Hebrew/Text_1/Alphabet/1_Alef"))

[array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 255,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 255, 255,
       255, 255, 248,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 255,
       255, 255, 255, 255, 255,   0,   0,   0,   0,   0,   0,   0,   0,
         0, 255, 255,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
       255, 255, 255, 255, 255, 255, 255,   0,   0,   0,   0,   0,   0,
         0,   0, 255, 255, 255, 255,   0,   0,   0,   0,   0,   0,   0,
       255, 255, 255, 255, 255, 255, 255, 255, 255,   0,   0,   0,   0,
         0,   0,   0, 255, 255, 255, 255, 255, 255,   0,   0,   0,   0,
       255, 255, 255, 255, 255, 255, 255, 255, 255, 255,   0,   0,   0,
         0,   0,   0,   0,   0, 255, 255, 255, 255, 255, 255,  

In [546]:
def get_name_letter_from_label(number_label) :
    """Look up the letter name of a zero-based label in ``dictionnary_labels.txt``.

    The file is read from the current working directory. A line matches when
    the digits of its first underscore-separated field equal
    ``number_label + 1``; the second field is returned as the name.

    NOTE(review): assumes lines look like ``"<number>_<name>"``, mirroring
    folder names such as ``1_Alef`` -- confirm against the actual file.

    Args:
        number_label: Zero-based numeric label.

    Returns:
        The stripped letter name, or ``None`` when no line matches.
    """
    wanted = str(number_label + 1)

    # The context manager closes the file on every exit path, including the
    # early return.
    with open('dictionnary_labels.txt', 'r') as dictionnary_file:
        for line in dictionnary_file:
            parts = line.split("_")
            if len(parts) < 2:
                continue
            # Compare the whole number, not a substring: a plain "in" test
            # lets "1" match "10_Yod". Keeping only the digits also tolerates
            # stray whitespace or a byte-order mark before the number.
            number = "".join(char for char in parts[0] if char.isdigit())
            if number == wanted:
                return parts[1].strip()

    return None

In [547]:
# Walk the dataset twice: once for the labels, once for the image vectors.
# Both walks must visit the folders in the same order so that labels[i]
# describes images[i].
labels = get_hebrew_labels("Arad_Ancient_Hebrew")
images = get_hebrew_images("Arad_Ancient_Hebrew")

In [548]:
def duplicate_dict_labels_images(labels, images):
    """Pair each label with the image stored at the same position.

    Args:
        labels: Sequence of labels.
        images: Sequence of images, same length as ``labels``.

    Returns:
        A list of ``(label, image)`` tuples in input order.

    Raises:
        ValueError: If the two sequences have different lengths.
    """
    if len(labels) == len(images):
        return list(zip(labels, images))

    # Lengths differ: pairing would silently drop samples, so refuse.
    raise ValueError("Les deux listes doivent avoir la même longueur")

In [549]:
def randomize_dataset(labels, images, seed=None) :
    """Shuffle labels and images together, keeping each label with its image.

    Args:
        labels: Sequence of labels.
        images: Sequence of images, same length as ``labels``.
        seed: Optional seed. When given, a dedicated ``random.Random(seed)``
            generator is used so the shuffle (and therefore the
            train/validation split) is reproducible. When ``None``, the
            global ``random`` module state is used.

    Returns:
        A ``(rand_labels, rand_images)`` tuple of two lists in the same
        shuffled order.

    Raises:
        ValueError: If ``labels`` and ``images`` have different lengths
            (raised by ``duplicate_dict_labels_images``).
    """
    dup_dataset = duplicate_dict_labels_images(labels, images)

    # Sampling the whole population without replacement is a shuffle that
    # leaves the input list untouched.
    rng = random if seed is None else random.Random(seed)
    rand_dataset = rng.sample(dup_dataset, len(dup_dataset))

    rand_labels = [label for label, _ in rand_dataset]
    rand_images = [image for _, image in rand_dataset]

    return rand_labels, rand_images

In [550]:
# Shuffle labels and images together, then write both with the same split so
# the label and image files stay aligned.
# NOTE(review): this rebinds `labels`/`images`, so re-running the cell
# reshuffles the already shuffled data.
labels, images = randomize_dataset(labels, images)
generate_idx1_ubyte(labels, "train/train-labels-idx1-ubyte", "validation/validation-labels-idx1-ubyte")
generate_idx3_ubyte(images, "train/train-images-idx3-ubyte", "validation/validation-images-idx3-ubyte")

In [551]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top.
from mnist import MNIST

# Point the loader at the 'train' folder, presumably to read back the files
# written by the previous cell -- confirm the file names it expects.
mndata = MNIST('train')

# NOTE(review): this rebinds `images` and `labels` (in swapped order) to the
# loader's output, replacing the lists built earlier in the notebook.
images, labels = mndata.load_training()

In [552]:
# Sanity check of the reloaded dataset: both counts must match.
print(len(labels))
print(len(images))

# Fixed sample to inspect. The former random pick was dead code: it was
# overwritten by this assignment before ever being used.
index = 10
print(mndata.display(images[index]))
print(labels[index])
print(get_name_letter_from_label(labels[index]))

384
384

............................
......................@@@@..
.....................@@@@@@.
.....................@@@@@@.
.....................@@@@@@.
.....................@@@@@@.
....................@@@@@@@.
...................@@@@@@@@.
..................@@@@@@@@..
.................@@@@@@@@...
.................@@@@@@@....
................@@@@@@@.....
...............@@@@@@.......
..............@@@@@@........
.............@@@@@@.........
............@@@@@@..........
...........@@@@@............
..........@@@@@.............
........@@@@@@..............
.......@@@@@@...............
......@@@@@.................
.....@@@@@..................
...@@@@@@...................
..@@@@@@....................
.@@@@@......@@..............
.@@@@@@@@@@@@@@.............
.@@@@@@@@@@@@@..............
............................
11
Lamed
