In [1]:
# Install PyArabic; the imports cell below pulls `araby` from it.
!pip install pyarabic

Collecting pyarabic
[?25l  Downloading https://files.pythonhosted.org/packages/b8/77/da852ee13bce3affc55b746cebc0fdc0fc48628dbc5898ce489112cd6bd1/PyArabic-0.6.6.tar.gz (101kB)
[K     |███▎                            | 10kB 21.2MB/s eta 0:00:01[K     |██████▌                         | 20kB 3.2MB/s eta 0:00:01[K     |█████████▊                      | 30kB 4.4MB/s eta 0:00:01[K     |█████████████                   | 40kB 2.9MB/s eta 0:00:01[K     |████████████████▏               | 51kB 3.3MB/s eta 0:00:01[K     |███████████████████▍            | 61kB 3.9MB/s eta 0:00:01[K     |██████████████████████▋         | 71kB 4.3MB/s eta 0:00:01[K     |█████████████████████████▉      | 81kB 4.6MB/s eta 0:00:01[K     |█████████████████████████████   | 92kB 5.2MB/s eta 0:00:01[K     |████████████████████████████████| 102kB 3.1MB/s 
[?25hBuilding wheels for collected packages: pyarabic
  Building wheel for pyarabic (setup.py) ... [?25l[?25hdone
  Created wheel for pyarabic: file

In [1]:
# List the working directory contents before the dataset is downloaded.
!ls

sample_data


We use a dataset of Arabic poetry verses (baits), each labeled with the meter it is written in.

In [2]:
# Download the poem-meter dataset. Without -O, wget names the file after the
# last URL component (baits.zip) and the unzip below cannot find its input.
# NOTE(review): the rest of the notebook reads from final_baits/ — confirm the
# archive at this URL still extracts into that directory.
!wget -O final_baits.zip 'https://raw.githubusercontent.com/zaidalyafeai/ARBML/master/datasets/Poem Meters/baits.zip'
# -o overwrites already-extracted files so re-running the cell does not prompt.
!unzip -o final_baits.zip

--2020-01-02 18:52:12--  https://raw.githubusercontent.com/zaidalyafeai/Arabic-Poetry/master/final_baits.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2267882 (2.2M) [application/zip]
Saving to: ‘final_baits.zip’


2020-01-02 18:52:12 (30.7 MB/s) - ‘final_baits.zip’ saved [2267882/2267882]

Archive:  final_baits.zip
   creating: final_baits/
  inflating: final_baits/train.txt   
  inflating: final_baits/labels.txt  
  inflating: final_baits/test.txt    


## Imports

In [3]:
import tensorflow as tf
import numpy as np
import os
import time
import glob
from random import shuffle
from pyarabic import araby
from tensorflow.keras.layers import GRU, Embedding, Dense, Input, Dropout, Bidirectional, BatchNormalization, Flatten, Reshape
from tensorflow.keras.models import Sequential
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [0]:
# Turn on eager execution so the model can be called directly on tensors below.
# NOTE(review): this looks like the TF 1.x spelling of the call — confirm the
# installed TensorFlow version still exposes it.
tf.enable_eager_execution()

In [0]:
# Load the meter names, one per line; a name's position in the list is the
# integer label used to look it up later.
with open('final_baits/labels.txt', 'r') as labels_file:
  label2name = [row.replace('\n', '') for row in labels_file.readlines()]

## Read the Dataset

Preprocess each verse by stripping diacritics (tashkeel) and removing extraneous punctuation and control characters.

In [0]:
def extract_data(path, thresh = 70, on_shatrs = False, random_state = None):
  """Load labeled verses from a text file, clean them and return them shuffled.

  Every line longer than one character is parsed as ``<int label> <verse>``.
  Before parsing, diacritics are removed with ``araby.strip_tashkeel`` and
  every character listed in ``excluded`` is deleted from the whole text.

  Side effect: the module-level ``vocab`` is replaced with the sorted set of
  characters found in the returned samples joined by single spaces.

  Args:
    path: Path of the text file to read (decoded as UTF-8).
    thresh: Not used by the current implementation; kept so existing callers
      that pass it keep working.
    on_shatrs: When True, each verse is split on '#' and every part becomes
      its own sample carrying the verse's label. When False, the whole verse
      is one sample.
    random_state: Forwarded to ``shuffle`` so the ordering can be made
      reproducible. ``None`` keeps the previous unseeded behaviour.

  Returns:
    A tuple ``(X, y)`` of shuffled samples and their integer labels.

  Raises:
    ValueError: If a kept line has no space separating label from verse, or
      the label is not an integer.
  """
  global vocab

  # Close the file deterministically and decode explicitly: the data is Arabic
  # text, so relying on the platform default encoding is fragile.
  with open(path, 'r', encoding='utf-8') as data_file:
    raw = data_file.read()
  raw = araby.strip_tashkeel(raw)

  # Characters to delete outright (punctuation, tatweel, RTL mark, BOM, ...).
  excluded = '!()*-ـ.:=o[]«»;؛,،~?؟\u200f\ufeffـ'
  # One translate pass replaces the per-character string concatenation loop.
  cleaned = raw.translate(str.maketrans('', '', excluded))

  X = []
  y = []
  for line in cleaned.split('\n'):
    # Skip blank lines and single-character leftovers.
    if len(line) <= 1:
      continue
    label, bait = line.split(' ', 1)
    label = int(label)
    bait = bait.strip()

    if on_shatrs:
      for shatr in bait.split('#'):
        X.append(shatr.strip())
        y.append(label)
    else:
      X.append(bait)
      y.append(label)

  # Create the vocab from every character that survived cleaning.
  vocab = sorted(set(' '.join(X)))

  # Shuffle samples and labels together. `shuffle` is the sklearn.utils
  # version (it is imported after, and therefore shadows, random.shuffle).
  X, y = shuffle(X, y, random_state = random_state)
  return X, y

In [0]:
# Load the training file with each full verse as one sample (no splitting on
# '#'); the call also sets the global `vocab` that the encoding step uses.
X, y = extract_data("final_baits/train.txt", on_shatrs=False)

In [8]:
# Preview the first five samples next to the name of their meter.
for idx in range(5):
  verse, meter_id = X[idx], y[idx]
  print(verse, ' ', label2name[meter_id])

كلما غاض لنا بحر بدا # خضرم طام كريم المورد   ramal
نظمتها درا عليه غصت في # بحر القريض وسرت فيه ملجلجا   kamel
وإذا تحاشدت المحامل حوله # قمنا فننشد من غرائب مدحنا   kamel
لنا آخذ المرباع قبل ربيعة # فأنى لبكر أن تفاخرنا بكر   taweel
قاصد وجهها تزور بني الحا # رث أهل الغناء عند الشروب   khafeef


## Create Sequences
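Verses are encoded at the character level: each character is mapped to an integer index and every sequence is zero-padded to a fixed length of 100. The encoding is applied after the train/validation split below.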

## Create Numpy Arrays

In [0]:
# Hold out 15% of the samples for validation; random_state pins the split for
# a given ordering of X and y.
X_train, X_valid , y_train, y_valid = train_test_split(X, y, test_size = 0.15, random_state = 41)

In [0]:
# Map every vocabulary character to a 1-based index; 0 is left free because
# the padding step fills with 0.
char2idx = {symbol: position for position, symbol in enumerate(vocab, start=1)}

def to_sequences(X):
  """Encode each line as character indices and zero-pad at the end to length 100.

  Looks every character up in the module-level ``char2idx``, so a character
  missing from that mapping raises ``KeyError``.
  """
  encoded = []
  for line in X:
    encoded.append([char2idx[char] for char in line])
  return pad_sequences(encoded, padding='post', value=0, maxlen = 100)
 
# Encode both splits as padded index arrays.
# NOTE(review): this overwrites X_train / X_valid in place, so the cell is not
# idempotent — re-running it feeds integer arrays back into to_sequences,
# whose char2idx lookup is keyed by characters.
X_train = to_sequences(X_train)
X_valid = to_sequences(X_valid)

# Labels become NumPy arrays so they can be passed to model.fit with the inputs.
y_train = np.array(y_train)
y_valid = np.array(y_valid)

## Create the model

In [0]:
# Character-level classifier: embedding -> three stacked bidirectional GRUs ->
# dense head with one softmax unit per meter label.
model = Sequential()

layer_stack = [
  Input((100,)),
  # +1 because index 0 is reserved for padding.
  Embedding(len(char2idx)+1, 256),
  Bidirectional(GRU(units = 256, return_sequences=True)),
  Bidirectional(GRU(units = 256, return_sequences=True)),
  # The last recurrent layer returns only its final state.
  Bidirectional(GRU(units = 256)),
  Dense(128, activation = 'relu'),
  Dropout(0.3),
  Dense(len(label2name), activation = 'softmax'),
]
for layer in layer_stack:
  model.add(layer)

# Labels are plain integers, hence the sparse variant of the loss.
model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

In [12]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 256)          9984      
_________________________________________________________________
bidirectional (Bidirectional (None, 100, 512)          787968    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 512)          1181184   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 512)               1181184   
_________________________________________________________________
dense (Dense)                (None, 128)               65664     
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 14)                1

In [13]:
# Sanity check: a dummy batch of 10 all-zero sequences of length 100 should
# produce one row per sample and one column per label.
model(tf.zeros((10, 100))).shape

TensorShape([Dimension(10), Dimension(14)])

## Train the model

In [0]:
# Training callbacks:
#  - cut the learning rate by 10x when val_loss stalls for 2 epochs
#  - keep only the checkpoint with the best val_accuracy in full_verse.h5
callbacks = [
  tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, min_delta=0.0001, min_lr=0.0001),
  tf.keras.callbacks.ModelCheckpoint('full_verse.h5', monitor='val_accuracy', verbose=1, save_best_only=True, mode='max'),
]

In [26]:
# Train for up to 15 epochs in batches of 128, validating on the held-out
# split after every epoch and running the callbacks defined above.
model.fit(X_train, y_train, validation_data= (X_valid, y_valid), epochs = 15, batch_size= 128, shuffle = True, callbacks=callbacks)

Train on 40055 samples, validate on 7069 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f85c60e8438>

In [0]:
# Reload the file written by the ModelCheckpoint callback (save_best_only) so
# the cells below use that checkpoint instead of the last-epoch weights.
model = tf.keras.models.load_model('full_verse.h5')

## Tests

In [0]:
def classify(sentence):
  """Print the predicted meter name and its probability for one verse.

  The verse is stripped of diacritics, encoded with the module-level
  ``char2idx`` and zero-padded to the training sequence length
  (``X_train.shape[1]``) before being passed to the global ``model``.

  Args:
    sentence: The verse text to classify.

  Returns:
    None. The result is printed as ``<meter name> <probability>``.
  """
  sentence = araby.strip_tashkeel(sentence)
  # Characters that never occurred in the training vocabulary have no index;
  # drop them instead of failing with KeyError.
  sequence = [char2idx[char] for char in sentence if char in char2idx]
  sequence = pad_sequences([sequence], maxlen = X_train.shape[1], padding='post', value=0)

  # predict returns one row per input; a single verse was passed in.
  pred = model.predict(sequence)[0]
  best = int(np.argmax(pred))
  print(label2name[best], np.max(pred))
  

In [30]:
# Spot-check the classifier on a handful of verses, some with diacritics.
sample_verses = [
  "ما تردون على هذا المحب # دائبا يشكو إليكم في الكتب",
  "ولد الهدى فالكائنات ضياء # وفم الزمان تبسم وسناء",
  " لك يا منازل في القلوب منازل # أقفرت أنت وهن منك أواهل",
  "ومن لم يمت بالسيف مات بغيره # تعددت الأسباب والموت واحد",
  "أنا النبي لا كذب # أنا ابن عبد المطلب",
  "هذه دراهم اقفرت # أم ربور محتها الدهور",
  "هزجنا في بواديكم # فأجزلتم عطايانا",
  "بحر سريع ماله ساحل # مستفعلن مستفعلن فاعلن",
  "مَا مَضَى فَاتَ وَالْمُؤَمَّلُ غَيْبٌ # وَلَكَ السَّاعَةُ الَّتِيْ أَنْتَ فِيْهَا",
  "يا ليلُ الصبّ متى غدهُ # أقيامُ الساعة موعدهُ",
]
for sample_verse in sample_verses:
  classify(sample_verse)

ramal 0.9998889
kamel 0.9227781
kamel 0.97875893
taweel 0.99984443
mujtath 0.4425953
rajaz 0.90593857
hazaj 0.99126935
saree 0.9779699
khafeef 0.9998227
mutadarak 0.99999654


In [0]:
# Write the current model to full_verse.h5, the file the converter cell reads.
model.save('full_verse.h5')

In [18]:
# Integer label -> meter name, as a dict instead of a list.
label2meter = dict(enumerate(label2name))

{0: 'saree', 1: 'kamel', 2: 'mutakareb', 3: 'mutadarak', 4: 'munsareh', 5: 'madeed', 6: 'mujtath', 7: 'ramal', 8: 'baseet', 9: 'khafeef', 10: 'taweel', 11: 'wafer', 12: 'hazaj', 13: 'rajaz'}


In [0]:
# Install tensorflowjs, needed for the tensorflowjs_converter command in the next cell.
!pip install tensorflowjs

In [0]:
# Convert the saved Keras HDF5 model into model/, quantizing weights to 2 bytes.
!tensorflowjs_converter --quantization_bytes 2 --input_format keras  full_verse.h5 model/ 

In [0]:
import csv
def create_csv(file, dict):
    """Write a mapping to ``file`` as a two-column CSV, one ``key,value`` row per entry.

    Args:
      file: Destination path; an existing file is overwritten.
      dict: Mapping to dump. The name shadows the builtin but is kept so
        callers passing it by keyword keep working.
    """
    # newline='' hands line-ending control to the csv module (otherwise extra
    # blank rows can appear on platforms that translate '\n'). Explicit UTF-8
    # keeps non-ASCII keys writable whatever the platform default encoding is.
    with open(file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(dict.items())

In [0]:
# Dump the character-to-index mapping to char2idx.csv.
create_csv("char2idx.csv", char2idx)