### Condense OCR-Handwriting Dataset for Google Colab prior to Uploading Data

For this project, I have decided to use approximately 20,000 training samples. To keep the dataset small enough to train on locally, I will build a training dataset of 20,000 samples and a testing/evaluation dataset of 5,000 samples.

In [1]:
import os, shutil

import pandas as pd

### Training Dataset Setup

In [2]:
# Read the training labels, then discard rows with missing values
# and rows that are exact duplicates of an earlier row.
data = (
    pd.read_csv("archive/CSV/written_name_train.csv")
    .dropna()
    .drop_duplicates()
)

data

Unnamed: 0,FILENAME,IDENTITY
0,TRAIN_00001.jpg,BALTHAZAR
1,TRAIN_00002.jpg,SIMON
2,TRAIN_00003.jpg,BENES
3,TRAIN_00004.jpg,LA LOVE
4,TRAIN_00005.jpg,DAPHNE
...,...,...
330956,TRAIN_330957.jpg,LENNY
330957,TRAIN_330958.jpg,TIFFANY
330958,TRAIN_330959.jpg,COUTINHO DESA
330959,TRAIN_330960.jpg,MOURAD


In [3]:
# Add a string-length feature, keep only labels whose length is within the
# allowed range, draw a random sample, then sort by length in ascending order.

MIN_TEXT_LEN = 4         # shortest label kept (inclusive)
MAX_TEXT_LEN = 11        # longest label kept (inclusive)
N_TRAIN_SAMPLES = 20000  # number of training rows to keep
SEED = 42                # fixed seed so a fresh run selects the same rows

# .assign returns a new frame instead of writing a column into a filtered one
data = data.assign(text_len=data['IDENTITY'].str.len())

length_ok = (data['text_len'] >= MIN_TEXT_LEN) & (data['text_len'] <= MAX_TEXT_LEN)
data = data[length_ok]

# random_state makes the sample reproducible across kernel restarts
data = data.sample(n=N_TRAIN_SAMPLES, random_state=SEED)

data = data.sort_values(by=['text_len'])

data

Unnamed: 0,FILENAME,IDENTITY,text_len
109698,TRAIN_109699.jpg,LOTH,4
111763,TRAIN_111764.jpg,BURY,4
264134,TRAIN_264135.jpg,MILO,4
64903,TRAIN_64904.jpg,LUCY,4
96561,TRAIN_96562.jpg,ENZO,4
...,...,...,...
132274,TRAIN_132275.jpg,FAGEGALTIER,11
163749,TRAIN_163750.jpg,LEBOURGEOIS,11
318364,TRAIN_318365.jpg,JOHNCHARLES,11
279437,TRAIN_279438.jpg,TRESMONTANT,11


In [4]:
# Remove the temporary text_len column before the frame is used further.
data = data.drop("text_len", axis="columns")

data

Unnamed: 0,FILENAME,IDENTITY
109698,TRAIN_109699.jpg,LOTH
111763,TRAIN_111764.jpg,BURY
264134,TRAIN_264135.jpg,MILO
64903,TRAIN_64904.jpg,LUCY
96561,TRAIN_96562.jpg,ENZO
...,...,...
132274,TRAIN_132275.jpg,FAGEGALTIER
163749,TRAIN_163750.jpg,LEBOURGEOIS
318364,TRAIN_318365.jpg,JOHNCHARLES
279437,TRAIN_279438.jpg,TRESMONTANT


In [5]:
# Move the image files listed in the `data` dataframe into the training folder.

current_parent_dir = os.path.join("archive", "train_v2", "train")

# Destination folder.
# NOTE: this is an absolute, machine-specific path; adjust it before running elsewhere.
new_parent_dir = "/Users/leedunn/Desktop/Projects_to_Train/OCR/data/train"

# makedirs also creates the intermediate "data" folder, and exist_ok=True
# lets this cell be re-run without raising FileExistsError.
os.makedirs(new_parent_dir, exist_ok=True)

# Move files from the old folder to the new folder
for x in data['FILENAME']:
    current_location = os.path.join(current_parent_dir, x)
    new_location = os.path.join(new_parent_dir, x)
    # A file moved by an earlier run is no longer in the source folder; skip it
    # so a re-run does not fail. Files from an earlier run with a different
    # sample stay in the destination folder and are not removed here.
    if not os.path.exists(current_location) and os.path.exists(new_location):
        continue
    shutil.move(current_location, new_location)

In [7]:
# Write the label table to "_annotations.csv" inside the image folder.
annotations_path = os.path.join(new_parent_dir, "_annotations.csv")
data.to_csv(annotations_path)

### Evaluation Dataset Setup

In [8]:
# Read the validation labels, then discard rows with missing values
# and rows that are exact duplicates of an earlier row.
data = (
    pd.read_csv("archive/CSV/written_name_validation.csv")
    .dropna()
    .drop_duplicates()
)

data

Unnamed: 0,FILENAME,IDENTITY
0,VALIDATION_0001.jpg,BILEL
1,VALIDATION_0002.jpg,LAUMIONIER
2,VALIDATION_0003.jpg,LEA
3,VALIDATION_0004.jpg,JEAN-ROCH
4,VALIDATION_0005.jpg,RUPP
...,...,...
41365,VALIDATION_41366.jpg,CHAILLAN
41366,VALIDATION_41367.jpg,BAROUH
41367,VALIDATION_41368.jpg,MAXENCE
41368,VALIDATION_41369.jpg,HAMELIN


In [9]:
# Add a string-length feature, keep only labels whose length is within the
# allowed range, draw a random sample, then sort by length in ascending order.

MIN_TEXT_LEN = 4       # shortest label kept (inclusive)
MAX_TEXT_LEN = 11      # longest label kept (inclusive)
N_EVAL_SAMPLES = 5000  # number of evaluation rows to keep
SEED = 42              # fixed seed so a fresh run selects the same rows

# .assign returns a new frame instead of writing a column into a filtered one
data = data.assign(text_len=data['IDENTITY'].str.len())

length_ok = (data['text_len'] >= MIN_TEXT_LEN) & (data['text_len'] <= MAX_TEXT_LEN)
data = data[length_ok]

# random_state makes the sample reproducible across kernel restarts
data = data.sample(n=N_EVAL_SAMPLES, random_state=SEED)

data = data.sort_values(by=['text_len'])

data

Unnamed: 0,FILENAME,IDENTITY,text_len
30133,VALIDATION_30134.jpg,JEAN,4
17881,VALIDATION_17882.jpg,JEAN,4
32985,VALIDATION_32986.jpg,GARY,4
3250,VALIDATION_3251.jpg,GROS,4
37784,VALIDATION_37785.jpg,MAUD,4
...,...,...,...
10795,VALIDATION_10796.jpg,KAZMIERCZAK,11
23409,VALIDATION_23410.jpg,KWIATKOWSKI,11
29152,VALIDATION_29153.jpg,GIOVANNETTI,11
28161,VALIDATION_28162.jpg,NIEDZIELSKI,11


In [10]:
# Remove the temporary text_len column before the frame is used further.
data = data.drop("text_len", axis="columns")

data

Unnamed: 0,FILENAME,IDENTITY
30133,VALIDATION_30134.jpg,JEAN
17881,VALIDATION_17882.jpg,JEAN
32985,VALIDATION_32986.jpg,GARY
3250,VALIDATION_3251.jpg,GROS
37784,VALIDATION_37785.jpg,MAUD
...,...,...
10795,VALIDATION_10796.jpg,KAZMIERCZAK
23409,VALIDATION_23410.jpg,KWIATKOWSKI
29152,VALIDATION_29153.jpg,GIOVANNETTI
28161,VALIDATION_28162.jpg,NIEDZIELSKI


In [14]:
# Move the image files listed in the `data` dataframe into the evaluation folder.

current_parent_dir = os.path.join("archive", "validation_v2", "validation")

# Destination folder.
# NOTE: this is an absolute, machine-specific path; adjust it before running elsewhere.
new_parent_dir = "/Users/leedunn/Desktop/Projects_to_Train/OCR/data/eval"

# makedirs creates any missing parent folders, and exist_ok=True lets this
# cell be re-run without raising FileExistsError.
os.makedirs(new_parent_dir, exist_ok=True)

# Move files from the old folder to the new folder
for x in data['FILENAME']:
    current_location = os.path.join(current_parent_dir, x)
    new_location = os.path.join(new_parent_dir, x)
    # A file moved by an earlier run is no longer in the source folder; skip it
    # so a re-run does not fail. Files from an earlier run with a different
    # sample stay in the destination folder and are not removed here.
    if not os.path.exists(current_location) and os.path.exists(new_location):
        continue
    shutil.move(current_location, new_location)

In [16]:
# Write the label table to "_annotations.csv" inside the image folder.
annotations_path = os.path.join(new_parent_dir, "_annotations.csv")
data.to_csv(annotations_path)