In [30]:
import numpy as np
import math, json
from rdp import rdp
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import animation
from IPython.display import HTML
import tqdm.notebook as tq
import pickle
import re
from collections import Counter 

In [31]:
# Unicode reference for the Arabic block: https://jrgraphix.net/r/Unicode/0600-06FF
#
# Decomposition table: each dotted/composite Arabic letter maps to the list of
# components it is written with ("." stands for a single dot).
# generate_words() only uses the *length* of each list, i.e. how many drawing
# entries belong to that letter.
# NOTE(review): the order of the components presumably mirrors the order in
# which they were drawn in the dataset — confirm against the recordings.
map_chars = {
    "\u0623":["\u0621", "\u0627"], # أ  (hamza + alef)
    # NOTE(review): U+0605 is "Arabic number mark above"; the madda sign is
    # U+0653. Only the list length is used in this file, so it is harmless
    # here — confirm before relying on the component values themselves.
    "\u0622":["\u0605", "\u0627"], # آ
    "\u0625":["\u0627", "\u0621"], # إ  (alef + hamza)
    "\u0628":["\u066E", "."], # ب
    "\u062A":[".", ".", "\u066E"], # ت
    "\u062B":[".", ".", ".", "\u066E"], # ث
    "\u062C":["\u062D", "."], # ج
    "\u062E":[".", "\u062D"], # خ
    "\u0630":[".", "\u062F"], # ذ
    "\u0632":[".", "\u0631"], # ز
    "\u0634":[".", ".", ".", "\u0633"], # ش
    "\u0636":[".", "\u0635"], # ض
    "\u0637":["\u0627", "\uFEBB"], # ط
    "\u0638":[".", "\u0627", "\uFEBB"], # ظ
    "\u063A":[".", "\u0639"], # غ
    "\u0641":[".", "\u066F"], # ف
    "\u0642":[".", ".", "\u066F"], # ق
    "\u06A4":[".", ".", ".", "\u066F"], # ڤ
    # Two-component (final/isolated) form of kaf; preprocess() substitutes a
    # single-component form when kaf is followed by another letter.
    "\u0643":["\u0621", "\u0644"], # ك
    "\u0646":[".", "\u06BA"], # ن
    "\u0624":["\u0621", "\u0648"], # ؤ
    "\u064A":["\u0649", ".", "."], # ي
    "\u0626":["\u0621", "\u0649"], # ئ
    "\u0629":[".", ".", "\u0647"], # ة  (teh marbuta: two dots + heh)
}

In [32]:
def preprocess(text):
    """Split an annotation string into per-character component records.

    Diacritics and ASCII digits are removed first, and spaces are skipped.
    Every remaining character yields one single-key dict ``{char: components}``
    where ``components`` is:

    * the ``map_chars`` list for characters present in that table,
    * the one-character string ``'\\uFEDB'`` for a kaf (U+0643) that is
      immediately followed by a non-space character,
    * the character itself otherwise.

    Args:
        text: annotation text (one or more space-separated words).

    Returns:
        A list of single-key dicts, one per kept character, in text order.
    """
    # The string is iterated character by character, so the literal "[" and
    # "]" it contains are stripped from the text as well.
    diacritics = "[ًٌٍَُِّْ]"
    numbers = '0123456789'
    for unwanted in diacritics + numbers:
        text = text.replace(unwanted, '')

    last_pos = len(text) - 1
    char_comps = []
    for pos, ch in enumerate(text):
        if ch == " ":
            continue

        if ch not in map_chars:
            components = ch
        elif ch == "\u0643" and pos < last_pos and text[pos + 1] != ' ':
            # Kaf followed by another letter is written as a single component.
            components = '\uFEDB'
        else:
            components = map_chars[ch]

        char_comps.append({ch: components})

    return char_comps

In [33]:
def generate_words(file):
    """Split the drawing stored in ``file`` into one drawing per annotated word.

    The annotation is taken from the file name: the last ``/``-separated path
    component without its final five characters (the ``.json`` suffix), with
    ASCII digits and underscores removed. Each space-separated word is passed
    through ``preprocess`` to find out how many drawing entries it occupies,
    and the JSON drawing is sliced accordingly.

    Word boundaries are derived from the preprocessed components of each word
    rather than from character offsets in the raw annotation, so characters
    that ``preprocess`` strips (diacritics) and leading or repeated spaces no
    longer shift the boundaries of the following words.

    Args:
        file: path to a JSON drawing file whose name encodes the annotation.

    Returns:
        A list of single-key dicts ``{word: drawing_slice}`` in annotation
        order, where ``word`` is the word with stripped characters removed.
    """
    word_drawings = []
    annot = file.split('/')[-1][:-5]
    annot = re.sub('[0-9]', '', annot)
    annot = re.sub('_', '', annot)

    # Close the file deterministically instead of leaking the handle.
    with open(file) as fh:
        drawing = json.load(fh)

    start = 0
    for raw_word in annot.split(' '):
        comps = preprocess(raw_word)
        if not comps:
            # Empty piece produced by leading/trailing/double spaces.
            continue
        word = "".join(list(comp.keys())[0] for comp in comps)
        # The number of components of a character is the number of drawing
        # entries that character consumes.
        n_entries = sum(len(list(comp.values())[0]) for comp in comps)
        word_drawings.append({word: drawing[start:start + n_entries]})
        start += n_entries
    return word_drawings

In [34]:
def apply_rdb(drawing, verbose = 0, epsilon = 2.0):
    """Simplify every stroke of a drawing with the Ramer-Douglas-Peucker algorithm.

    Args:
        drawing: iterable of single-key dicts ``{char: stroke}`` where
            ``stroke`` is a sequence of points.
        verbose: when truthy, print each processed character and the total
            point counts before and after simplification.
        epsilon: tolerance forwarded to ``rdp`` (defaults to the previously
            hard-coded value ``2.0``).

    Returns:
        A new list of ``{char: simplified_stroke}`` dicts in the same order.
        Empty strokes are passed through as empty lists.
    """
    new_drawing = []
    total_prev_strokes = 0
    total_post_strokes = 0
    for item in drawing:
        char = list(item.keys())[0]
        stroke = item[char]
        if len(stroke):
            if verbose:
                print('processing ', char)
            post_stroke = rdp(stroke, epsilon = epsilon)
            total_post_strokes += len(post_stroke)
            total_prev_strokes += len(stroke)
        else:
            # Without this branch an empty stroke either raised
            # UnboundLocalError (first item) or silently re-used the points of
            # the previous stroke.
            post_stroke = []
        new_drawing.append({char:post_stroke})
    if verbose:
        print('reduced from ', total_prev_strokes, ' to ', total_post_strokes)
    return new_drawing

In [35]:
import glob
# Paths of the JSON drawing files (despite the "npy" in the name).
# NOTE: glob.glob is called without recursive=True, so "**" behaves like "*"
# and only files exactly one directory below dataset/ are matched.
npy_files = glob.glob('dataset/**/*.json')

In [36]:
# Sanity check: show the drawing slice of the first word of the first file.
preview_words = generate_words(npy_files[0])
if preview_words:
    item = preview_words[0]
    word = next(iter(item))
    print(item[word])

[{'و': [[386.76275634765625, 430.04132080078125], [375.49652099609375, 428.93536376953125], [373.561279296875, 428.93536376953125], [371.76416015625, 428.79718017578125], [368.446533203125, 427.48388671875], [367.4097900390625, 426.37799072265625], [366.64947509765625, 423.75140380859375], [366.51123046875, 421.74688720703125], [366.92596435546875, 418.63653564453125], [368.446533203125, 415.80267333984375], [370.24359130859375, 413.3143310546875], [372.31719970703125, 411.86285400390625], [374.5980224609375, 411.033447265625], [377.08624267578125, 410.8260498046875], [379.57452392578125, 411.3790283203125], [381.78631591796875, 412.4849853515625], [383.928955078125, 414.4202880859375], [386.00244140625, 416.90850830078125], [387.86865234375, 419.604248046875], [389.73480224609375, 422.4381103515625], [390.9788818359375, 425.548583984375], [391.94659423828125, 428.31329345703125], [392.2230224609375, 431.63104248046875], [392.1539306640625, 434.5340576171875], [391.4627685546875, 437.2

In [37]:
def add_z(drawing):
    """Convert a drawing into a sequence of ``[dx, dy, z]`` rows.

    Every point is encoded as its offset from the previously emitted point
    (the very first point therefore has offset ``[0, 0]``). ``z`` is 1 on the
    last point of each stroke and 0 otherwise. A stroke made of a single point
    is padded with a second point at ``(x + 1, y + 1)`` so that it still
    produces a start row and an end row.

    Empty strokes are skipped, and the input strokes are never modified.

    Args:
        drawing: iterable of single-key dicts ``{char: stroke}`` where
            ``stroke`` is a sequence of ``(x, y)`` points.

    Returns:
        ``np.array`` built from the list of ``[dx, dy, z]`` rows (an empty
        array when no points were produced).
    """
    new_data = []
    x_prev, y_prev = None, None
    for item in drawing:
        char = list(item.keys())[0]
        # Copy so that the padding below does not leak into the caller's data.
        stroke = list(item[char])
        if not stroke:
            # Nothing to emit; also avoids indexing into an empty first stroke.
            continue

        if x_prev is None:
            x_prev, y_prev = stroke[0]

        if len(stroke) == 1:
            x, y = stroke[0]
            stroke.append([x + 1, y + 1])

        last = len(stroke) - 1
        for i, (x, y) in enumerate(stroke):
            z = 1 if i == last else 0
            new_data.append([x - x_prev, y - y_prev, z])
            x_prev, y_prev = x, y
    return np.array(new_data)

In [38]:
# Per-word sample counter. The dataset-building cell below creates a fresh
# Counter under the same name before using it, so this cell is redundant.
cnts = Counter()

In [41]:
# Build the train/valid/test splits: every word drawing is simplified with
# apply_rdb, converted to [dx, dy, z] rows with add_z, and routed to a split
# according to which of 'train' / 'valid' / 'test' occurs in the file path.
# At most 5 samples are kept per word; the counter is shared by all splits.
train_data, valid_data, test_data = [], [], []
count = 0
cnts = Counter()
for file in glob.glob('dataset/**/*.json'):
    for item in generate_words(file):
        word = next(iter(item))
        if cnts[word] >= 5:
            continue

        new_drawing = apply_rdb(item[word])
        strokes = add_z(new_drawing)
        cnts[word] += 1

        if 'train' in file:
            train_data.append(strokes)
        elif 'valid' in file:
            valid_data.append(strokes)
        elif 'test' in file:
            test_data.append(strokes)

print("save dataset")
# NOTE: despite the .npz extension the file is a pickle, not a NumPy archive.
dataset = {'train':train_data, 'valid':valid_data, 'test':test_data}
with open('dataset_words_str.npz', 'wb') as f:
    pickle.dump(dataset, f, protocol=2)

save dataset


In [43]:
# Number of distinct words counted while building the dataset.
len(cnts)

2430