Loads the handwritten math symbol dataset into a .pickle file  
Dataset can be downloaded from https://www.kaggle.com/datasets/xainano/handwrittenmathsymbols  
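After downloading, extract the 'extracted_images' directory from 'archive.zip' into the notebook's working directory (the code below reads it via the relative path 'extracted_images')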

Both the dataset and the resulting .pickle file are too large to upload to GitHub

In [1]:
import pickle, os
from PIL import Image
from torchvision.transforms import PILToTensor

In [2]:
# Function that takes the path of a .jpeg file and returns a FloatTensor

# single converter instance reused for every image
transform = PILToTensor()
def jpeg_to_tensor(path):
    """Load the image stored at `path` and return it as a float tensor.

    Args:
        path: location of the image file, passed straight to Image.open.

    Returns:
        The tensor produced by PILToTensor for the image, cast to float,
        with requires_grad switched on.
    """
    # Image.open is lazy and keeps the file open; the context manager
    # guarantees the handle is released even if the conversion raises
    with Image.open(path) as image:
        # PIL image to pytorch float tensor (.float() yields a new tensor,
        # so the result stays valid after the image is closed)
        image_tensor = transform(image).float()
    # NOTE(review): input samples normally do not need gradients; kept so
    # the returned tensor is unchanged for existing consumers - confirm
    # whether anything downstream relies on this
    image_tensor.requires_grad_(True)
    return image_tensor

In [3]:
# Container for everything that gets pickled:
#   'train' / 'val' / 'test' : lists of (image tensor, label name) tuples
#   'label_map'              : label name -> integer index
dataset = {
    'train': [],
    'val': [],
    'test': [],
    'label_map': {}
}


def _listing_order(name):
    # case-insensitive sort key, with the raw name as a tie-breaker so the
    # order is fully determined even if two names differ only by case
    return (name.upper(), name)


# iterate through directories
# downloaded from https://www.kaggle.com/datasets/xainano/handwrittenmathsymbols
# os.listdir returns entries in arbitrary order, so sort explicitly: this keeps
# the label indices and the split membership reproducible across machines, and
# this ordering reproduces the label_map shown in this notebook's recorded output
for image_dir in sorted(os.listdir('extracted_images'), key=_listing_order):
    path = os.path.join('extracted_images', image_dir)
    # ignore stray non-directory entries, which would make the listing below fail
    if not os.path.isdir(path):
        continue
    # list of samples/files in directory, in a deterministic order
    X = sorted(os.listdir(path), key=_listing_order)
    # directory name is the label
    y = image_dir
    # len(X) = # of samples in directory
    n = len(X)

    # first 80% of files goes in train set, next 10% val set, remaining files goes in testing set
    # index where training set ends
    train_split = int(0.8*n)
    # index where validation set ends
    val_split = train_split + int(0.1*n)
    # label map assigns a numerical value to each label
    dataset['label_map'][y] = len(dataset['label_map'])
    print(f"Loading {n} samples for label '{y}'")

    # slice of file names belonging to each split
    split_files = (
        ('train', X[:train_split]),           # all elements up to train_split
        ('val', X[train_split:val_split]),    # from train_split up to val_split
        ('test', X[val_split:]),              # all elements after val_split
    )

    # iterate through samples in current directory
    # .jpeg file converts to float tensor before adding to dataset
    for split_name, image_files in split_files:
        for image_file in image_files:
            # convert .jpeg to float tensor
            x = jpeg_to_tensor(os.path.join(path, image_file))
            # samples keep the label name; use dataset['label_map'] to get its index
            dataset[split_name].append((x, y))

Loading 1300 samples for label '!'
Loading 14294 samples for label '('
Loading 14355 samples for label ')'
Loading 25112 samples for label '+'
Loading 1906 samples for label ','
Loading 33997 samples for label '-'
Loading 6914 samples for label '0'
Loading 26520 samples for label '1'
Loading 26141 samples for label '2'
Loading 10909 samples for label '3'
Loading 7396 samples for label '4'
Loading 3545 samples for label '5'
Loading 3118 samples for label '6'
Loading 2909 samples for label '7'
Loading 3068 samples for label '8'
Loading 3737 samples for label '9'
Loading 13104 samples for label '='
Loading 12367 samples for label 'A'
Loading 2546 samples for label 'alpha'
Loading 1339 samples for label 'ascii_124'
Loading 8651 samples for label 'b'
Loading 2025 samples for label 'beta'
Loading 5802 samples for label 'C'
Loading 2986 samples for label 'cos'
Loading 4852 samples for label 'd'
Loading 137 samples for label 'Delta'
Loading 868 samples for label 'div'
Loading 3003 samples for 

In [4]:
# display the label name -> integer index mapping built while loading the dataset
dataset['label_map']

{'!': 0,
 '(': 1,
 ')': 2,
 '+': 3,
 ',': 4,
 '-': 5,
 '0': 6,
 '1': 7,
 '2': 8,
 '3': 9,
 '4': 10,
 '5': 11,
 '6': 12,
 '7': 13,
 '8': 14,
 '9': 15,
 '=': 16,
 'A': 17,
 'alpha': 18,
 'ascii_124': 19,
 'b': 20,
 'beta': 21,
 'C': 22,
 'cos': 23,
 'd': 24,
 'Delta': 25,
 'div': 26,
 'e': 27,
 'exists': 28,
 'f': 29,
 'forall': 30,
 'forward_slash': 31,
 'G': 32,
 'gamma': 33,
 'geq': 34,
 'gt': 35,
 'H': 36,
 'i': 37,
 'in': 38,
 'infty': 39,
 'int': 40,
 'j': 41,
 'k': 42,
 'l': 43,
 'lambda': 44,
 'ldots': 45,
 'leq': 46,
 'lim': 47,
 'log': 48,
 'lt': 49,
 'M': 50,
 'mu': 51,
 'N': 52,
 'neq': 53,
 'o': 54,
 'p': 55,
 'phi': 56,
 'pi': 57,
 'pm': 58,
 'prime': 59,
 'q': 60,
 'R': 61,
 'rightarrow': 62,
 'S': 63,
 'sigma': 64,
 'sin': 65,
 'sqrt': 66,
 'sum': 67,
 'T': 68,
 'tan': 69,
 'theta': 70,
 'times': 71,
 'u': 72,
 'v': 73,
 'w': 74,
 'X': 75,
 'y': 76,
 'z': 77,
 '[': 78,
 ']': 79,
 '{': 80,
 '}': 81}

In [5]:
# serialize the whole dataset dictionary to disk using the newest pickle protocol
with open('dataset.pickle', mode='wb') as pickle_handle:
    pickle.dump(
        dataset,
        pickle_handle,
        protocol=pickle.HIGHEST_PROTOCOL,
    )