# Resource Generation Notebook

In [1]:
#--------------------------------------
# imports
#--------------------------------------
import pandas as pd 
from tqdm import tqdm
from glob import glob
from shutil import copy
import numpy as np
import random
import os

# Enable tqdm's pandas integration so that `progress_apply` (used later on the
# "filename" column) is available on pandas objects.
tqdm.pandas()

In [2]:
#--------------------------------------
# string resources
#--------------------------------------
# Character inventories used below to build graphemes, synthetic words and
# the vocabulary files.

# Digit characters used for numeric and mixed samples.
numbers                 =       ['০', '১', '২', '৩', '৪', '৫', '৬', '৭', '৮', '৯']
# Punctuation marks used for mixed samples.
punctuations            =       ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '।']
# Vowel signs; appended after consonants/conjuncts when graphemes are built.
vowelDiacritic          =       ['ꠣ', 'ꠤ', 'ꠥ' , 'ꠦ' , 'ꠂ' , 'ꠧ']
# Signs appended after any base (vowel, consonant or conjunct) when graphemes are built.
consonantDiacritic      =       ['ꠋ', '꠬']
# Base consonants.
# NOTE(review): the list appears to contain a repeated entry, and its last entry
# also appears in `vowels`; harmless for grapheme/vocab generation because both
# de-duplicate their results.
consonants              =       ['ꠇ', 'ꠈ', 'ꠉ', 'ꠊ', 'ꠌ', 'ꠍ', 'ꠎ', 'ꠏ', 'ꠐ', 'ꠑ', 'ꠒ', 'ꠓ', 'ꠘ', 'ꠔ', 'ꠕ', 'ꠖ', 
                                'ꠗ', 'ꠙ', 'ꠚ', 'ꠛ', 'ꠜ', 'ꠝ', 'ꠎ', 'ꠞ', 'ꠟ', 'ꠡ', 'ꠢ', 'ꠠ', 'ꠅ']
# Independent vowels (some entries are multi-character sequences).
vowels                  =       ['ꠅ', 'ꠀ',  'ꠁ', 'ꠃ', 'ꠄ', 'ꠀꠄ', 'ꠅ꠆ꠎꠣ'] 
# The three lists below are not referenced by any code visible in this notebook.
vowelConsonantConjunct  =       ['ꠀꠔ', 'ꠀꠞ', 'ꠀꠟ', 'ꠀꠡ']
consonant_conjunct1     =       ['ꠇ꠆ꠇ','ꠇ꠆ꠔ','ꠌ꠆ꠌ','ꠌ꠆ꠍ','ꠎ꠆ꠎ','ꠔ꠆ꠔ','ꠘ꠆ꠔ','ꠘ꠆ꠖ','ꠘ꠆ꠎ','ꠘ꠆ꠘ','ꠛ꠆ꠛ','ꠝ꠆ꠛ','ꠝ꠆ꠝ','ꠞꠟ','ꠟ꠆ꠟ','ꠡ꠆ꠇ','ꠡꠌ','ꠡꠍ','ꠡ꠆ꠐ','ꠡꠑ','ꠡ꠆ꠛ']
consonant_conjunct2     =       ['ꠇ꠆ꠞ','ꠇ꠆ꠐ' ,'ꠇ꠆ꠟ','ꠇ꠆ꠡ','ꠈ꠆ꠔ','ꠉ꠆ꠉ','ꠎ꠆ꠘ','ꠉ꠆ꠘ','ꠉ꠆ꠞ','ꠉ꠆ꠟ','ꠐ꠆ꠐ','ꠐ꠆ꠞ','ꠒ꠆ꠒ','ꠒ꠆ꠞ','ꠔ꠆ꠞ','ꠖ꠆ꠖ','ꠖ꠆ꠞ','ꠘ꠆ꠇ','ꠘ꠆ꠌ',
                                'ꠘ꠆ꠍ','ꠘ꠆ꠐ','ꠘ꠆ꠒ','ꠘ꠆ꠗ','ꠘ꠆ꠡ','ꠙ꠆ꠐ','ꠙ꠆ꠔ','ꠙ꠆ꠙ' ,'ꠙ꠆ꠞ' ,'ꠙ꠆ꠟ','ꠚ꠆ꠞ','ꠛ꠆ꠞ' ,'ꠛ꠆ꠟ' ,'ꠝ꠆ꠙ','ꠝ꠆ꠞ','ꠞ꠆ꠛ','ꠟ꠆ꠐ','ꠟ꠆ꠙ','ꠡ꠆ꠐ',
                                'ꠡ꠆ꠕ','ꠡ꠆ꠞ','ꠡꠡ','ꠢ꠆ꠞ']



# Master list of conjunct clusters used as grapheme bases.
# Some entries carry a trailing space or are repeated; the grapheme-building
# cell strips spaces and de-duplicates, so those entries are harmless there.
consonant_conjunct = ['ꠀꠔ', 'ꠀꠞ', 'ꠀꠟ', 'ꠀꠡ', 'ꠄ꠆ꠎ', 'ꠅ꠆ꠌ', 'ꠅ꠆ꠍ', 'ꠅ꠆ꠎ', 'ꠅ꠆ꠏ', 'ꠇ', 'ꠇ꠆ꠇ', 'ꠇ꠆ꠎ', 'ꠇ꠆ꠐ', 'ꠇ꠆ꠐ ', 'ꠇ꠆ꠐ꠆ꠎ', 'ꠇ꠆ꠐ꠆ꠞ', 'ꠇ꠆ꠔ', 'ꠇ꠆ꠔ꠆ꠞ', 
                    'ꠇ꠆ꠛ', 'ꠇ꠆ꠝ', 'ꠇ꠆ꠞ', 'ꠇ꠆ꠞ꠆ꠎ', 'ꠇ꠆ꠟ', 'ꠇ꠆ꠟ꠆ꠎ', 'ꠇ꠆ꠡ', 'ꠇ꠆ꠡ꠆ꠎ', 'ꠇ꠆ꠡ꠆ꠘ', 'ꠇ꠆ꠡ꠆ꠛ', 'ꠇ꠆ꠡ꠆ꠝ', 'ꠇ꠆ꠡ꠆ꠝ꠆ꠎ', 'ꠈ꠆ꠎ', 'ꠈ꠆ꠔ', 'ꠈ꠆ꠞ', 'ꠈꠔ', 'ꠉ꠆ꠉ', 
                    'ꠉ꠆ꠎ', 'ꠉ꠆ꠗ', 'ꠉ꠆ꠗ꠆ꠎ', 'ꠉ꠆ꠗ꠆ꠞ', 'ꠉ꠆ꠘ', 'ꠉ꠆ꠘ꠆ꠎ', 'ꠉ꠆ꠛ', 'ꠉ꠆ꠝ', 'ꠉ꠆ꠞ', 'ꠉ꠆ꠞ꠆ꠎ', 'ꠉ꠆ꠟ', 'ꠉ꠆ꠟ꠆ꠎ', 'ꠉꠉ', 'ꠊ꠆ꠎ', 'ꠊ꠆ꠘ', 'ꠊ꠆ꠞ', 'ꠌ꠆ꠅ', 'ꠌ꠆ꠌ', 
                    'ꠌ꠆ꠍ', 'ꠌ꠆ꠍ꠆ꠛ', 'ꠌ꠆ꠍ꠆ꠞ', 'ꠌ꠆ꠎ', 'ꠌ꠆ꠛ', 'ꠎ꠆ꠅ', 'ꠎ꠆ꠎ', 'ꠎ꠆ꠎ꠆ꠛ', 'ꠎ꠆ꠏ', 'ꠎ꠆ꠘ', 'ꠎ꠆ꠛ', 'ꠎ꠆ꠞ', 'ꠐ꠆ꠎ', 'ꠐ꠆ꠐ', 'ꠐ꠆ꠛ', 'ꠐ꠆ꠝ', 'ꠐ꠆ꠞ', 'ꠐ꠆ꠞ꠆ꠎ', 'ꠑ꠆ꠎ', 
                    'ꠒ꠆ꠎ', 'ꠒ꠆ꠒ', 'ꠒ꠆ꠛ', 'ꠒ꠆ꠞ', 'ꠒ꠆ꠞ꠆ꠎ', 'ꠓ꠆ꠎ', 'ꠓ꠆ꠞ', 'ꠔ꠆ꠎ', 'ꠔ꠆ꠔ', 'ꠔ꠆', 'ꠔ꠆ꠔ꠆ꠎ', 'ꠔ꠆ꠔ꠆ꠛ', 'ꠔ꠆ꠕ', 'ꠔ꠆ꠘ', 'ꠔ꠆ꠛ', 'ꠔ꠆ꠝ', 'ꠔ꠆ꠝ꠆ꠎ', 'ꠔ꠆ꠞ', 
                    'ꠔ꠆ꠞ꠆ꠎ', 'ꠕ꠆ꠎ', 'ꠕ꠆ꠛ', 'ꠕ꠆ꠞ', 'ꠕ꠆ꠞ꠆ꠎ', 'ꠖ꠆ꠉ', 'ꠖ꠆ꠊ', 'ꠖ꠆ꠎ', 'ꠖ꠆ꠖ', 'ꠖ꠆ꠖ꠆ꠛ', 'ꠖ꠆ꠗ', 'ꠖ꠆ꠛ', 'ꠖ꠆ꠜ', 'ꠖ꠆ꠜ꠆ꠞ', 'ꠖ꠆ꠝ', 'ꠖ꠆ꠞ', 'ꠖ꠆ꠞ꠆ꠎ', 
                    'ꠗ꠆ꠎ', 'ꠗ꠆ꠘ', 'ꠗ꠆ꠛ', 'ꠗ꠆ꠝ', 'ꠗ꠆ꠞ', 'ꠘ꠆ꠇ', 'ꠘ꠆ꠌ', 'ꠘ꠆ꠍ', 'ꠘ꠆ꠎ', 'ꠘ꠆ꠐ', 'ꠘ꠆ꠐ꠆ꠎ', 'ꠘ꠆ꠐ꠆ꠞ', 'ꠘ꠆ꠐ꠆ꠞ꠆ꠎ', 'ꠘ꠆ꠑ', 'ꠘ꠆ꠑ꠆ꠎ', 'ꠘ꠆ꠒ', 'ꠘ꠆ꠒ꠆ꠎ', 'ꠘ꠆ꠒ꠆ꠛ', 
                    'ꠘ꠆ꠒ꠆ꠞ', 'ꠘ꠆ꠓ', 'ꠘ꠆ꠔ', 'ꠘ꠆ꠔ', 'ꠘ꠆ꠔ꠆ꠎ', 'ꠘ꠆ꠔ꠆ꠛ', 'ꠘ꠆ꠔ꠆ꠞ', 'ꠘ꠆ꠔ꠆ꠞ꠆ꠎ', 'ꠘ꠆ꠕ', 'ꠘ꠆ꠕ꠆ꠎ', 'ꠘ꠆ꠕ꠆ꠞ', 'ꠘ꠆ꠖ', 'ꠘ꠆ꠖ', 'ꠘ꠆ꠖ꠆ꠎ', 'ꠘ꠆ꠖ꠆ꠛ', 'ꠘ꠆ꠖ꠆ꠞ', 'ꠘ꠆ꠗ', 
                    'ꠘ꠆ꠗ꠆ꠎ', 'ꠘ꠆ꠗ꠆ꠞ', 'ꠘ꠆ꠘ', 'ꠘ꠆ꠘ', 'ꠘ꠆ꠛ', 'ꠘ꠆ꠝ', 'ꠘ꠆ꠡ', 'ꠘ꠆ꠡ꠆ꠎ', 'ꠙ꠆ꠎ', 'ꠙ꠆ꠐ', 'ꠙ꠆ꠐ꠆ꠎ', 'ꠙ꠆ꠔ', 'ꠙ꠆ꠘ', 'ꠙ꠆ꠙ', 'ꠙ꠆ꠙ ', 'ꠙ꠆ꠞ', 'ꠙ꠆ꠞ ', 
                    'ꠙ꠆ꠞ꠆ꠎ', 'ꠙ꠆ꠟ', 'ꠙ꠆ꠟ꠆ꠎ', 'ꠙ꠆ꠡ', 'ꠚ꠆ꠎ', 'ꠚ꠆ꠞ', 'ꠚ꠆ꠞ꠆ꠎ', 'ꠚ꠆ꠟ', 'ꠚ꠆ꠟ꠆ꠎ', 'ꠛ꠆ꠎ', 'ꠛ꠆ꠖ', 'ꠛ꠆ꠗ', 'ꠛ꠆ꠛ', 'ꠛ꠆ꠛ', 'ꠛ꠆ꠞ', 'ꠛ꠆ꠞ ', 'ꠛ꠆ꠞ꠆ꠎ', 
                    'ꠛ꠆ꠟ', 'ꠛ꠆ꠟ ', 'ꠛ꠆ꠟ꠆ꠎ', 'ꠜ꠆ꠎ', 'ꠜ꠆ꠛ', 'ꠜ꠆ꠞ', 'ꠜ꠆ꠟ', 'ꠝ꠆ꠎ', 'ꠝ꠆ꠘ', 'ꠝ꠆ꠘ꠆ꠎ', 'ꠝ꠆ꠙ', 'ꠝ꠆ꠙ꠆ꠎ', 'ꠝ꠆ꠙ꠆ꠞ', 'ꠝ꠆ꠚ', 'ꠝ꠆ꠛ', 'ꠝ꠆ꠛ꠆ꠞ', 'ꠝ꠆ꠜ', 'ꠝ꠆ꠜ꠆ꠞ', 
                    'ꠝ꠆ꠝ', 'ꠝ꠆ꠝ ', 'ꠝ꠆ꠞ', 'ꠝ꠆ꠟ', 'ꠞ꠆ꠇ', 'ꠞ꠆ꠇ꠆ꠎ', 'ꠞ꠆ꠇ꠆ꠐ', 'ꠞ꠆ꠈ', 'ꠞ꠆ꠉ', 'ꠞ꠆ꠉ꠆ꠎ', 'ꠞ꠆ꠉ꠆ꠞ', 'ꠞ꠆ꠊ', 'ꠞ꠆ꠊ꠆ꠎ', 'ꠞ꠆ꠌ', 'ꠞ꠆ꠌ꠆ꠎ', 'ꠞ꠆ꠍ', 'ꠞ꠆ꠎ', 
                    'ꠞ꠆ꠎ꠆ꠅ', 'ꠞ꠆ꠎ꠆ꠎ', 'ꠞ꠆ꠏ', 'ꠞ꠆ꠐ', 'ꠞ꠆ꠐ꠆ꠎ', 'ꠞ꠆ꠐ꠆ꠞ', 'ꠞ꠆ꠒ', 'ꠞ꠆ꠒ꠆ꠞ', 'ꠞ꠆ꠓ꠆ꠎ', 'ꠞ꠆ꠔ', 'ꠞ꠆ꠔ꠆ꠎ', 'ꠞ꠆ꠔ꠆ꠝ', 'ꠞ꠆ꠔ꠆ꠞ', 'ꠞ꠆ꠕ', 'ꠞ꠆ꠕ꠆ꠎ', 'ꠞ꠆ꠖ', 'ꠞ꠆ꠖ꠆ꠛ', 
                    'ꠞ꠆ꠖ꠆ꠞ', 'ꠞ꠆ꠗ', 'ꠞ꠆ꠗ꠆ꠛ', 'ꠞ꠆ꠘ', 'ꠞ꠆ꠘ꠆ꠎ', 'ꠞ꠆ꠘ꠆ꠒ', 'ꠞ꠆ꠘ꠆ꠔ', 'ꠞ꠆ꠙ', 'ꠞ꠆ꠙ꠆ꠐ', 'ꠞ꠆ꠙ꠆ꠟ', 'ꠞ꠆ꠚ', 'ꠞ꠆ꠛ', 'ꠞ꠆ꠛ꠆ꠎ', 'ꠞ꠆ꠜ', 'ꠞ꠆ꠝ', 'ꠞ꠆ꠝ꠆ꠎ', 'ꠞ꠆ꠝ꠆ꠕ', 
                    'ꠞ꠆ꠝ꠆ꠙ', 'ꠞ꠆ꠟ', 'ꠞ꠆ꠟ꠆ꠎ', 'ꠞ꠆ꠟ꠆ꠒ', 'ꠞ꠆ꠡ', 'ꠞ꠆ꠡ꠆ꠎ', 'ꠞ꠆ꠡ꠆ꠐ', 'ꠞ꠆ꠡ꠆ꠛ', 'ꠞ꠆ꠡ꠆ꠝ', 'ꠞ꠆ꠢ', 'ꠞ꠆ꠢ꠆ꠎ', 'ꠞꠟ', 'ꠟ꠆ꠇ', 'ꠟ꠆ꠇ꠆ꠎ', 'ꠟ꠆ꠉ', 'ꠟ꠆ꠌ', 'ꠟ꠆ꠎ', 
                    'ꠟ꠆ꠐ', 'ꠟ꠆ꠐ꠆ꠎ', 'ꠟ꠆ꠐ꠆ꠞ', 'ꠟ꠆ꠒ', 'ꠟ꠆ꠒ꠆ꠎ', 'ꠟ꠆ꠒ꠆ꠞ', 'ꠟ꠆ꠙ', 'ꠟ꠆ꠚ', 'ꠟ꠆ꠛ', 'ꠟ꠆ꠛ꠆ꠎ', 'ꠟ꠆ꠜ', 'ꠟ꠆ꠝ', 'ꠟ꠆ꠟ', 'ꠡ꠆ꠇ', 'ꠡ꠆ꠇ ', 'ꠡ꠆ꠇ꠆ꠎ', 'ꠡ꠆ꠇ꠆ꠞ', 
                    'ꠡ꠆ꠇ꠆ꠞ꠆ꠎ', 'ꠡ꠆ꠈ', 'ꠡ꠆ꠌ', 'ꠡ꠆ꠍ', 'ꠡ꠆ꠎ', 'ꠡ꠆ꠐ', 'ꠡ꠆ꠐ꠆ꠎ', 'ꠡ꠆ꠐ꠆ꠞ', 'ꠡ꠆ꠐ꠆ꠞ꠆ꠎ', 'ꠡ꠆ꠑ', 'ꠡ꠆ꠑ꠆ꠎ', 'ꠡ꠆ꠔ', 'ꠡ꠆ꠔ꠆ꠎ', 'ꠡ꠆ꠔ꠆ꠛ', 'ꠡ꠆ꠔ꠆ꠞ', 'ꠡ꠆ꠕ', 'ꠡ꠆ꠕ꠆ꠎ', 
                    'ꠡ꠆ꠘ', 'ꠡ꠆ꠘ꠆ꠎ', 'ꠡ꠆ꠙ', 'ꠡ꠆ꠙ꠆ꠎ', 'ꠡ꠆ꠙ꠆ꠞ', 'ꠡ꠆ꠙ꠆ꠞ꠆ꠎ', 'ꠡ꠆ꠙ꠆ꠟ', 'ꠡ꠆ꠙ꠆ꠟ꠆ꠎ', 'ꠡ꠆ꠚ', 'ꠡ꠆ꠛ', 'ꠡ꠆ꠝ', 'ꠡ꠆ꠝ꠆ꠎ', 'ꠡ꠆ꠞ', 'ꠡ꠆ꠞ꠆ꠎ', 'ꠡ꠆ꠟ', 'ꠡ꠆ꠟ꠆ꠎ', 
                    'ꠡꠌ', 'ꠡꠍ', 'ꠡꠑ', 'ꠡꠡ', 'ꠢ꠆ꠎ', 'ꠢ꠆ꠘ', 'ꠢ꠆ꠛ', 'ꠢ꠆ꠝ', 'ꠢ꠆ꠞ', 'ꠢ꠆ꠟ']

# Synthetic Grapheme based dictionary creation
* **NUM_MIX_DATA**: The number of samples to create in which numbers, punctuation marks and graphemes are mixed at random
* **NUM_NUM_DATA**: The number of numeric samples to create
* **NUM_GPM_LOOP**: The number of passes over the grapheme list. In each pass, every grapheme in the set is used at least once.
* **DICT_NAME**   : The path to save the dictionary (word based)  

In [3]:
# Number of mixed (grapheme / number / punctuation) samples to generate.
NUM_MIX_DATA= 300000
# Number of numeric samples to generate.
NUM_NUM_DATA= 100000
# Number of times `create_words` is run over the full grapheme list.
NUM_GPM_LOOP= 5000
# Output path of the synthetic dictionary (one word per line).
DICT_NAME   = '/home/apsisdev/OCR/SylhetiNagri/dicts/synthdict.txt'

In [4]:
#-------------------------------
# creating graphemes
#-------------------------------
# Every base (vowel, consonant, conjunct) on its own, then followed by the
# first consonant diacritic, the second one, and finally both in sequence.
graphemes = [
    base + suffix
    for suffix in ("",
                   consonantDiacritic[0],
                   consonantDiacritic[1],
                   consonantDiacritic[0] + consonantDiacritic[1])
    for base in vowels + consonants + consonant_conjunct
]

# Consonants and conjuncts combined with each vowel sign, with and without
# the first consonant diacritic after it.
for v in vowelDiacritic:
    for c in consonants + consonant_conjunct:
        graphemes.extend((c + v, c + v + consonantDiacritic[0]))

# Clean-up: drop stray spaces coming from the source lists, collapse the
# two-diacritic sequence to the first diacritic only, then keep the sorted
# unique values (np.unique both sorts and de-duplicates).
graphemes = list(
    np.unique([
        g.replace(' ', '').replace('ꠋ'+'꠬','ꠋ')
        for g in graphemes
    ])
)




In [5]:
def random_exec(poplutation=[0,1],weights=[0.5,0.5],match=0):
    """Draw one weighted sample and report whether it equals ``match``.

    Args:
        poplutation: Candidate values to draw from (spelling kept as-is so
            existing keyword callers keep working).
        weights: Relative weight of each candidate, in the same order.
        match: Value the drawn sample is compared against.

    Returns:
        True when the drawn value equals ``match``, otherwise False.
    """
    (drawn,) = random.choices(population=poplutation, weights=weights, k=1)
    return drawn == match

def create_words(graphemes,
                min_len=1,
                max_len=10,
                mods=(),
                mod_weights=(0.3,0.7)):
    """Partition a shuffled copy of ``graphemes`` into randomly sized words.

    Every grapheme is used exactly once per call; the last word may be
    shorter than the length drawn for it. The input sequence is not modified.

    Args:
        graphemes: Sequence of grapheme strings to distribute over words.
        min_len: Minimum number of graphemes per word (must be >= 1, otherwise
            an empty slice could be drawn and the loop would not advance).
        max_len: Maximum number of graphemes per word.
        mods: Optional modifier strings. When non-empty, one randomly chosen
            modifier is appended to one randomly chosen grapheme of a word.
            With the default (empty) nothing is appended.
        mod_weights: Two relative weights ``(apply, skip)`` deciding per word
            whether a modifier is appended.

    Returns:
        List of words, each the concatenation of 1..max_len graphemes.

    Raises:
        ValueError: If ``min_len`` is below 1 or ``max_len`` is below ``min_len``.
    """
    if min_len < 1:
        raise ValueError("min_len must be at least 1")
    if max_len < min_len:
        raise ValueError("max_len must not be smaller than min_len")

    # Work on a private copy so the caller's sequence keeps its order.
    _graphemes = list(graphemes)
    random.shuffle(_graphemes)

    words = []
    index = 0
    length = len(_graphemes)
    while index < length:
        _len = random.randint(min_len, max_len)
        word = _graphemes[index:index + _len]
        # Only roll for a modifier when there is something to append;
        # random.choice on an empty sequence would raise IndexError.
        if mods and random_exec(weights=mod_weights):
            widx = random.randrange(len(word))
            word[widx] += random.choice(mods)
        words.append("".join(word))
        index += _len
    return words
def create_numbers(numbers,
                min_len=1,
                max_len=10,
                num_samples=1000000):
    """Generate random digit strings, some of them containing a ".".

    Args:
        numbers: Sequence of digit characters to sample from.
        min_len: Minimum number of digits per sample.
        max_len: Maximum number of digits per sample.
        num_samples: Number of samples to generate.

    Returns:
        List of ``num_samples`` strings. For each sample ``random_exec()``
        decides (with its default weights) whether a "." is appended after
        one randomly chosen digit.
    """
    samples = []
    for _ in range(num_samples):
        count = random.randint(min_len, max_len)
        digits = [random.choice(numbers) for _ in range(count)]
        if random_exec():
            digits[random.randint(0, count - 1)] += "."
        samples.append("".join(digits))
    return samples

def create_mixed_data(numbers,
                    graphemes,
                    punctuations,    
                    num_samples=100000,
                    lens= [1,2,3,4,5,6,7,8,9,10],
                    weights= [0.05,0.05,0.1,0.15,0.15,0.15,0.15,0.1,0.05,0.05],
                    comp_weights= [0.33,0.34,0.33]):
    """Generate words that mix graphemes, digits and punctuation at random.

    Args:
        numbers: Sequence of digit characters.
        graphemes: Sequence of grapheme strings.
        punctuations: Sequence of punctuation characters.
        num_samples: Number of words to generate.
        lens: Candidate word lengths (in components).
        weights: Relative weight of each entry in ``lens``.
        comp_weights: Relative weights for picking a grapheme, a number or a
            punctuation mark (in that order) for each component.

    Returns:
        List of ``num_samples`` generated words.
    """
    # Component type -> pool the component is drawn from.
    pools = {"g": graphemes, "n": numbers, "p": punctuations}
    kinds = ["g", "n", "p"]

    samples = []
    for _ in tqdm(range(num_samples)):
        (size,) = random.choices(population=lens, weights=weights, k=1)
        parts = []
        for _ in range(size):
            (kind,) = random.choices(population=kinds, weights=comp_weights, k=1)
            parts.append(random.choice(pools[kind]))
        samples.append("".join(parts))
    return samples

In [6]:
# Mixed samples: graphemes, numbers and punctuation interleaved at random.
words=create_mixed_data(numbers,graphemes,punctuations,num_samples=NUM_MIX_DATA)
dfm=pd.DataFrame({"word":words})

# Numeric samples.
words=create_numbers(numbers,num_samples=NUM_NUM_DATA)
dfn=pd.DataFrame({"word":words})

# Grapheme-only words: each pass partitions the whole grapheme list into words.
gwords=[]
for i in tqdm(range(NUM_GPM_LOOP)):
    gwords+=create_words(graphemes)
dfg=pd.DataFrame({"word":gwords})

dfs=[dfm,dfn,dfg]

# Merge the three sources and shuffle the rows.
df=pd.concat(dfs,ignore_index=True)
df=df.sample(frac=1)

# The words are non-ASCII, so the encoding is pinned instead of relying on the
# platform default. Iterating the column directly avoids one positional
# `.iloc` lookup per row, which is slow for millions of rows.
with open(DICT_NAME,"w",encoding="utf-8") as f:
    for word in tqdm(df["word"]):
        f.write(f"{word}\n")

100%|██████████| 300000/300000 [00:02<00:00, 122157.86it/s]
100%|██████████| 5000/5000 [00:13<00:00, 357.31it/s]
100%|██████████| 5025556/5025556 [01:25<00:00, 58619.96it/s]


# Vocab creation
* **SINGLE_LINE_VOCAB_TXT**: file path for creating (mostly) synthtiger format vocabulary/charset 

    ```text 
    012345abcd....
    ```
* **MULTI_LINE_VOCAB_TXT** : file path for creating (mostly) synthindic format line separated vocab/charset

    ```text
    0
    1
    2
    a
    b
    c
    .
    .

    ```

In [7]:
# Output path for the charset written as one single line of characters.
SINGLE_LINE_VOCAB_TXT="/home/apsisdev/OCR/SylhetiNagri/vocabs/charset.txt"
# Output path for the vocab written as one character per line.
MULTI_LINE_VOCAB_TXT ="/home/apsisdev/OCR/SylhetiNagri/vocabs/vocab.txt"

In [8]:
# Collect every distinct character that occurs in the base resources.
# Multi-character entries are split into their individual characters; a set
# handles de-duplication, and sorting gives a stable, reproducible order.
unicodes=sorted({
    u
    for comp in vowels+consonants+consonantDiacritic+vowelDiacritic+punctuations+numbers
    for u in comp
})
# synthindic: one character per line.
# The characters are non-ASCII, so the encoding is pinned instead of relying
# on the platform default.
with open(MULTI_LINE_VOCAB_TXT,"w",encoding="utf-8") as f:
    for u in unicodes:
        f.write(f"{u}\n")
# synthtiger: all characters on a single line.
charset="".join(unicodes)
with open(SINGLE_LINE_VOCAB_TXT,"w",encoding="utf-8") as f:
    f.write(charset)

# Train set words
* **CSV_PATH**: The csv that contains the following columns: ```['filename', 'SN', 'source', 'fold']```
* **DICT_NAME**: The path to save the dictionary created from the unique words in the train-set   

In [9]:
# CSV describing the dataset; the code below reads its "filename", "SN" and "fold" columns.
CSV_PATH ="/home/apsisdev/OCR/SylhetiNagri/datasets/SNdataset/SN_OCR.csv"
# Output path of the train-set dictionary. Rebinds DICT_NAME, which earlier
# held the synthetic dictionary path.
DICT_NAME="/home/apsisdev/OCR/SylhetiNagri/dicts/traindict.txt"

In [10]:
# Load the dataset index and keep only the rows of the training fold.
df=pd.read_csv(CSV_PATH)
train=df.loc[df.fold=="train"]
# Unique labels of the training fold, in order of first appearance.
words=list(train.SN.unique())
print("Train data unique words:",len(words))
# The labels are non-ASCII, so the encoding is pinned instead of relying on
# the platform default.
with open(DICT_NAME,"w",encoding="utf-8") as f:
    for word in tqdm(words):
        f.write(f"{word}\n")

Train data unique words: 1573


100%|██████████| 1573/1573 [00:00<00:00, 1853269.72it/s]


# Data Separation
* **DATASET_PATH**: The path where train and test folders will be created/ the path to save the separated data
* **IMAGE_FOLDER_PATH**: The path where all the images are available. Namely *data* folder from [here](/home/apsisdev/OCR/SylhetiNagri/datasets/SNdataset/data) 


In [11]:
# Root folder under which the "train" and "test" folders are created.
DATASET_PATH="/home/apsisdev/OCR/SylhetiNagri/datasets/" 
# Folder that holds all source images referenced by the CSV's "filename" column.
IMAGE_FOLDER_PATH="/home/apsisdev/OCR/SylhetiNagri/datasets/SNdataset/data"

In [12]:
# Turn each file name into a path inside IMAGE_FOLDER_PATH (with a progress bar).
df["filename"] = df["filename"].progress_apply(
    lambda name: os.path.join(IMAGE_FOLDER_PATH, name)
)

100%|██████████| 4305/4305 [00:00<00:00, 837227.19it/s]


In [13]:
#---------------------------------------
# functions
#---------------------------------------
def create_dir(base,ext):
    """Ensure the directory ``base/ext`` exists and return its path.

    Uses ``os.makedirs(..., exist_ok=True)`` rather than an exists-then-mkdir
    check, so the call is safe to repeat, is not subject to a race between the
    check and the creation, and also creates missing parent directories.

    Args:
        base: Parent directory.
        ext: Name of the directory to create inside ``base``.

    Returns:
        The joined path ``os.path.join(base, ext)``.

    Raises:
        FileExistsError: If the path exists but is not a directory.
    """
    _path=os.path.join(base,ext)
    os.makedirs(_path,exist_ok=True)
    return _path

#---------------------------------------
# directories of folders and files
#---------------------------------------
# Split folders directly under the dataset root (train first, then test).
train_path, test_path = (
    create_dir(DATASET_PATH, split) for split in ("train", "test")
)
# "images" sub-folder inside each split folder.
train_img_path, test_img_path = (
    create_dir(split_dir, "images") for split_dir in (train_path, test_path)
)
# Location of the data.txt file of each split (only the path is built here).
train_data_txt, test_data_txt = (
    os.path.join(split_dir, "data.txt") for split_dir in (train_path, test_path)
)

In [14]:
def copy_paths(df,img_path):
    """Copy every file listed in ``df["filename"]`` into ``img_path``.

    Each file keeps its base name in the destination folder. A progress bar
    is shown while copying.

    Args:
        df: DataFrame with a "filename" column of source file paths.
        img_path: Destination directory.
    """
    for src in tqdm(df["filename"].tolist()):
        copy(src, os.path.join(img_path, os.path.basename(src)))

def create_data_txt(df,data_txt):
    """Write one tab-separated ``path<TAB>label`` line per row of ``df``.

    The first column of ``df`` is taken as the path and the second as the
    label (positional access, so the column order matters).

    NOTE(review): the path written is whatever the first column holds. In this
    notebook that is the source image path, not the location the image was
    copied to; confirm that this is intended.

    Args:
        df: DataFrame whose first two columns are path and label.
        data_txt: Path of the text file to (over)write.
    """
    # Iterate both columns directly instead of doing two positional `.iloc`
    # lookups per row.
    rows = zip(df.iloc[:, 0], df.iloc[:, 1])
    # The labels are non-ASCII, so the encoding is pinned instead of relying
    # on the platform default.
    with open(data_txt,"w",encoding="utf-8") as f:
        for _path,_label in tqdm(rows,total=len(df)):
            f.write(f"{_path}\t{_label}\n")
        

In [15]:
# Select the rows of each fold, keeping only the path and label columns.
train = df.loc[df.fold == "train", ["filename", "SN"]]
test = df.loc[df.fold == "test", ["filename", "SN"]]

# Copy the images into the per-split image folders (test first, then train).
copy_paths(test, test_img_path)
copy_paths(train, train_img_path)

# Write the path/label listing of each split.
create_data_txt(train, train_data_txt)
create_data_txt(test, test_data_txt)

100%|██████████| 1306/1306 [00:00<00:00, 15639.98it/s]
100%|██████████| 2999/2999 [00:00<00:00, 15346.96it/s]
100%|██████████| 2999/2999 [00:00<00:00, 25599.59it/s]
100%|██████████| 1306/1306 [00:00<00:00, 30254.01it/s]
