# Arabic Font Classification
Author: Mahmoud Aslan

Date: 18-7-2020

## Ensure Reproducibility

In [1]:
import tensorflow as tf
assert tf.__version__ == '2.2.0'

In [2]:
!pip install tensorflow-determinism

Collecting tensorflow-determinism
  Downloading https://files.pythonhosted.org/packages/76/56/79d74f25b326d8719753172496abc524980fa67d1d98bb247021376e370a/tensorflow-determinism-0.3.0.tar.gz
Building wheels for collected packages: tensorflow-determinism
  Building wheel for tensorflow-determinism (setup.py) ... [?25l[?25hdone
  Created wheel for tensorflow-determinism: filename=tensorflow_determinism-0.3.0-cp36-none-any.whl size=9156 sha256=fa6be7fd992b0393c6035a185d5a3d12fc2de2f6cf2d88a72846ee9fa2cd20c2
  Stored in directory: /root/.cache/pip/wheels/66/c3/18/13959a90d3e0d10182a99866d6ff4d0119e9daed6ce014b54c
Successfully built tensorflow-determinism
Installing collected packages: tensorflow-determinism
Successfully installed tensorflow-determinism-0.3.0


In [3]:
import numpy as np
import random as rn
import os

rn.seed(42)
np.random.seed(42)
tf.random.set_seed(42)
rng = tf.random.experimental.Generator.from_seed(1234)

os.environ['TF_DETERMINISTIC_OPS'] = '1'
os.environ['PYTHONHASHSEED']=str(0)
# os.environ['CUDA_VISIBLE_DEVICES'] = ''


## Fetch and load data

In [4]:
!wget 'https://github.com/mhmoodlan/arabic-font-classification/releases/download/v0.1.0/rufa.tar.gz' && tar -xzf '/content/rufa.tar.gz'

--2020-07-21 05:58:29--  https://github.com/mhmoodlan/arabic-font-classification/releases/download/v0.1.0/rufa.tar.gz
Resolving github.com (github.com)... 140.82.118.3
Connecting to github.com (github.com)|140.82.118.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github-production-release-asset-2e65be.s3.amazonaws.com/280475345/18288f80-c87d-11ea-95ed-712fd4c4a137?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20200721%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200721T055829Z&X-Amz-Expires=300&X-Amz-Signature=9b6277706705425f8acd52ea94a912c72f17c2895dda88aee237c2e9749645fb&X-Amz-SignedHeaders=host&actor_id=0&repo_id=280475345&response-content-disposition=attachment%3B%20filename%3Drufa.tar.gz&response-content-type=application%2Foctet-stream [following]
--2020-07-21 05:58:29--  https://github-production-release-asset-2e65be.s3.amazonaws.com/280475345/18288f80-c87d-11ea-95ed-712fd4c4a137?X-Amz-Algorithm=AWS4-HMAC-SHA25

In [5]:
print('# of real Ruqaa images: ', len(os.listdir('/content/rufa/real/ruqaa/')))
print('# of real Farsi images: ', len(os.listdir('/content/rufa/real/farsi/')))
print('# synthesized Ruqaa: ', len(os.listdir('/content/rufa/synth/ruqaa/')))
print('# synthesized Farsi: ', len(os.listdir('/content/rufa/synth/farsi/')))

# of real Ruqaa images:  260
# of real Farsi images:  256
# synthesized Ruqaa:  20000
# synthesized Farsi:  20000


In [6]:
from pathlib import Path

synth_dir = Path('/content/rufa/synth')
real_dir = Path('/content/rufa/real')

CLASS_NAMES = np.array([item.name for item in real_dir.glob('*')])
CLASS_NAMES

array(['ruqaa', 'farsi'], dtype='<U5')

In [7]:
synth_paths = tf.data.Dataset.list_files(str(synth_dir / '*/*.jpg'), seed=42)
real_paths = tf.data.Dataset.list_files(str(real_dir / '*/*.jpg'), seed=42)

## Train, val, mismatch, test split

In [8]:
_max_data_size = 2**np.int('32')
_test_ratio = '0.2'


def test_set_check(item):
    id = tf.strings.split(tf.strings.split(item, os.sep)[-1], '.')[0]
    hash = tf.strings.to_hash_bucket_fast(id, _max_data_size)
    return tf.cast(hash, tf.float64) < float(_test_ratio) * _max_data_size

def train_set_check(item):
    id = tf.strings.split(tf.strings.split(item, os.sep)[-1], '.')[0]
    hash = tf.strings.to_hash_bucket_fast(id, _max_data_size)
    return tf.cast(hash, tf.float64) >= float(_test_ratio) * _max_data_size

In [9]:
train_paths = synth_paths.filter(train_set_check)
val_paths = synth_paths.filter(test_set_check)
mismatch_paths = real_paths.filter(test_set_check)
test_paths = real_paths.filter(train_set_check)

## Preprocessing

In [10]:
AUTOTUNE = tf.data.experimental.AUTOTUNE
BATCH_SIZE = 32
mapping = {0: 'farsi', 1: 'ruqaa'}

def prepare_for_training(ds, cache=True, shuffle_buffer_size=1000):
  if cache:
    if isinstance(cache, str):
      ds = ds.cache(cache)
    else:
      ds = ds.cache()

  ds = ds.shuffle(buffer_size=shuffle_buffer_size, seed=42)
  ds = ds.batch(BATCH_SIZE)
  ds = ds.prefetch(buffer_size=AUTOTUNE)

  return ds

def parse_image(data_instance):
  parts = tf.strings.split(data_instance, os.sep)
  label = tf.cast(tf.argmax(tf.cast(parts[-2] == np.array(list(mapping.values())), dtype=tf.float16)), tf.float16)

  image = tf.io.read_file(data_instance)
  image = tf.image.decode_jpeg(image, 1)
  image = tf.image.convert_image_dtype(image, tf.float32)

  if parts[-3] == 'synth':
    noise = rng.normal(shape=tf.shape(image), mean=0.0, stddev=0.015, dtype=tf.float32)
    image = tf.add( image, noise)
    image = tf.clip_by_value(image, 0.0, 1.0)

    image = tf.image.adjust_jpeg_quality(image, 90)

  return image, label

In [11]:
train_ds = train_paths.map(parse_image)
val_ds = val_paths.map(parse_image)
mismatch_ds = mismatch_paths.map(parse_image)
full_train_ds = train_ds.concatenate(val_ds.concatenate(mismatch_ds))
test_ds = test_paths.map(parse_image)

train_ds = prepare_for_training(train_ds)
val_ds = prepare_for_training(val_ds)
mismatch_ds = prepare_for_training(mismatch_ds)
full_train_ds = prepare_for_training(full_train_ds)
test_ds = prepare_for_training(test_ds)

## Model

In [12]:
def cnn(input_shape, output_shape):
    num_classes = output_shape[0]
    dropout_seed = 708090
    kernel_seed = 42
  
    model = tf.keras.models.Sequential([
      tf.keras.layers.Conv2D(16, 3, activation='relu', input_shape=input_shape, kernel_initializer=tf.keras.initializers.GlorotUniform(seed=kernel_seed)),
      tf.keras.layers.MaxPooling2D(),
      tf.keras.layers.Dropout(0.1, seed=dropout_seed),
      tf.keras.layers.Conv2D(32, 5, activation='relu', kernel_initializer=tf.keras.initializers.GlorotUniform(seed=kernel_seed)),
      tf.keras.layers.MaxPooling2D(),
      tf.keras.layers.Dropout(0.1, seed=dropout_seed),
      tf.keras.layers.Conv2D(64, 10, activation='relu', kernel_initializer=tf.keras.initializers.GlorotUniform(seed=kernel_seed)),
      tf.keras.layers.MaxPooling2D(),
      tf.keras.layers.Dropout(0.1, seed=dropout_seed),
      tf.keras.layers.Flatten(),
      tf.keras.layers.Dense(128, activation='relu', kernel_regularizer='l2', kernel_initializer=tf.keras.initializers.GlorotUniform(seed=kernel_seed)),
      tf.keras.layers.Dropout(0.2, seed=dropout_seed),
      tf.keras.layers.Dense(16, activation='relu', kernel_regularizer='l2', kernel_initializer=tf.keras.initializers.GlorotUniform(seed=kernel_seed)),
      tf.keras.layers.Dropout(0.2, seed=dropout_seed),
      tf.keras.layers.Dense(num_classes, activation='sigmoid', kernel_initializer=tf.keras.initializers.GlorotUniform(seed=kernel_seed))
    ])

    return model

In [28]:
epochs = 6
callbacks = None

In [29]:
model = cnn((100, 100, 1), (1,))
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False), optimizer='Adam', metrics='accuracy')
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_6 (Conv2D)            (None, 98, 98, 16)        160       
_________________________________________________________________
max_pooling2d_6 (MaxPooling2 (None, 49, 49, 16)        0         
_________________________________________________________________
dropout_10 (Dropout)         (None, 49, 49, 16)        0         
_________________________________________________________________
conv2d_7 (Conv2D)            (None, 45, 45, 32)        12832     
_________________________________________________________________
max_pooling2d_7 (MaxPooling2 (None, 22, 22, 32)        0         
_________________________________________________________________
dropout_11 (Dropout)         (None, 22, 22, 32)        0         
_________________________________________________________________
conv2d_8 (Conv2D)            (None, 13, 13, 64)       

## Training and evaluation

#### Load pretrained weights, or...

In [35]:
!wget 'https://raw.githubusercontent.com/mhmoodlan/arabic-font-classification/master/codebase/code/font_classifier/weights/FontModel_RuFaDataset_cnn_weights(4).h5' -O weights.h5
model.load_weights('/content/weights.h5')

--2020-07-21 06:10:36--  https://raw.githubusercontent.com/mhmoodlan/arabic-font-classification/master/codebase/code/font_classifier/weights/FontModel_RuFaDataset_cnn_weights(4).h5
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2090968 (2.0M) [application/octet-stream]
Saving to: ‘weights.h5’


2020-07-21 06:10:36 (27.7 MB/s) - ‘weights.h5’ saved [2090968/2090968]



#### or train from scratch

In [None]:
model.fit(
  full_train_ds,
  epochs=epochs,
  callbacks=callbacks
)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


#### Evaluate

In [36]:
test_score = model.evaluate(test_ds)
print(f"Test score: {test_score}")

Test score: [0.2316255122423172, 0.971222996711731]


In [37]:
assert np.allclose(test_score, [.231625, .971222])