# Deep Learning Model for Arabic dialect classification

First, let's import libraries and get the data.

In [1]:
import pickle

import keras
import numpy as np
import pandas as pd
from keras import layers
from keras.callbacks import EarlyStopping
from keras.callbacks import TensorBoard
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the pre-cleaned training and validation splits.
# lineterminator='\n' is passed explicitly so only a newline ends a CSV row.
train = pd.read_csv(
    "../input/clean-dialect-text/train.csv",
    lineterminator='\n',
)
val = pd.read_csv(
    "../input/clean-dialect-text/validation.csv",
    lineterminator='\n',
)

In [3]:
# Split each frame into model input (cleaned text) and target (dialect label).
X, y = train['clean_text'], train['dialect']
x_val, y_val = val['clean_text'], val['dialect']

## 1- Convert input-data from text to tokens and output-data to labels

In [4]:
# Build the vocabulary from the training text only; the validation text is
# encoded with that same vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)

# One more than the number of known words, so the embedding layer has a row
# for every index in word_index plus index 0.
vocab_size = len(tokenizer.word_index) + 1

X_train_tokens, X_val_tokens = (
    tokenizer.texts_to_sequences(texts) for texts in (X, x_val)
)

In [5]:
# Display the vocabulary size (number of entries in tokenizer.word_index, plus one).
vocab_size

400865

In [6]:
# Map dialect names to integer class ids.
encoder = LabelEncoder()

# Fit on the training labels only, then reuse the same mapping for validation.
y_train = encoder.fit_transform(y)
# Encode from the source column rather than from `y_val` itself: overwriting
# `y_val` with its own encoded version made this cell fail when re-run,
# because the second run would pass integers to `encoder.transform`.
y_val = encoder.transform(val['dialect'])

In [7]:
# Peek at the first five encoded validation labels.
y_val[:5]

array([12,  8,  8, 11,  5])

## 2- Padding the Input

We need to make sure that the training and validation inputs all have the same length.

To do this, I checked the length of the longest sequence and padded every sequence to a slightly larger fixed length.

In [8]:
# Disabled one-off check: scans the tokenized training sequences and prints
# the length of the longest one.
# max_length = 0

# for lis in X_train_tokens:
#     length = len(lis)
#     if length > max_length:
#         max_length = length
    
    
# print(max_length)

In [9]:
# Fixed sequence length fed to the network.
maxlen = 100

# Zero-pad at the end ('post') so every row has exactly `maxlen` token ids.
X_train, X_val = (
    pad_sequences(sequences, maxlen=maxlen, padding='post')
    for sequences in (X_train_tokens, X_val_tokens)
)

## 3- Making the model

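- Embedding size: chosen by trying different values between 100 and 300.
- After trying different layer types, a bidirectional LSTM (BiLSTM) turned out to be slightly better.
- Keras has no built-in F1-score metric. TensorFlow Addons provides one (left commented out below); in this notebook the macro F1-score is instead computed with scikit-learn on the test set after training.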

In [10]:
# Disabled: macro F1 metric from TensorFlow Addons. It is not passed to
# model.compile in this notebook, which tracks accuracy only.
# import tensorflow_addons as tfa 

# f1 = tfa.metrics.F1Score( average='macro')

In [11]:
embedding_dim = 200
# Size the output layer from the fitted label encoder instead of hard-coding
# the class count, so the model stays correct if the set of dialects changes.
num_classes = len(encoder.classes_)

# Embedding -> BiLSTM -> two dense layers -> softmax over the dialect classes.
# NOTE(review): the inputs are zero-padded at the end, but the Embedding layer
# is not created with mask_zero=True, so the LSTM also reads the padding
# steps. Consider enabling masking and re-validating.
model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=maxlen))
model.add(layers.Bidirectional(layers.LSTM(64)))
#model.add(layers.Dropout(0.1))
model.add(layers.Dense(64, activation='relu', kernel_initializer='he_uniform'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(num_classes, activation='softmax'))

# The labels are integer class ids (from LabelEncoder), hence the sparse loss.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['acc'])
model.summary()



2022-03-10 19:57:30.844291: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-03-10 19:57:30.935310: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-03-10 19:57:30.936013: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-03-10 19:57:30.937099: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 200)          80173000  
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               135680    
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_2 (Dense)              (None, 18)                594       
Total params: 80,319,610
Trainable params: 80,319,610
Non-trainable params: 0
____________________________________________

## 4- Fitting the model

- epochs = 4, because the model starts to overfit when trained for longer.
- Batch size = 256, also chosen by trial and error.

In [12]:
# Train for a fixed number of epochs, evaluating on the validation split
# after each one. The per-epoch metrics are kept in `history`.
history = model.fit(
    X_train,
    y_train,
    batch_size=256,
    epochs=4,
    verbose=True,
    validation_data=(X_val, y_val),
    # callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.05)],
)


2022-03-10 19:57:33.811982: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/4


2022-03-10 19:57:37.246231: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


Epoch 2/4
Epoch 3/4
Epoch 4/4


## Checking the F1-score on the test set

In [13]:
# Load the held-out test split and run it through the same fitted
# tokenizer / encoder / padding as the training data.
test = pd.read_csv(
    "../input/clean-dialect-text/test.csv",
    lineterminator='\n',
)
x_test, y_test = test['clean_text'], test['dialect']

Y_test = encoder.transform(y_test)
X_test = pad_sequences(
    tokenizer.texts_to_sequences(x_test),
    maxlen=maxlen,
    padding='post',
)

In [14]:
# Class probabilities for every test row, one column per class.
output = model.predict(X_test)
# argmax over the class axis already yields a 1-D array of predicted class
# ids, so there is no need to add an axis and reshape it away again.
Y_pred = output.argmax(axis=1)
Y_true = np.array(Y_test)

In [15]:
# Macro-averaged F1 on the test set: the unweighted mean of the per-class F1
# scores, so every dialect counts equally.
f1_score(Y_true, Y_pred, average='macro')

0.46536801166160724

In [16]:
# Disabled scratch code: map a predicted class id back to its dialect name.
# NOTE(review): np.argmax without `axis` returns an index into the flattened
# array, so this is only meaningful when `output` holds a single prediction.
# print(np.argmax(output))
# list(encoder.inverse_transform([np.argmax(output)]))

## 5- Saving the model, tokenizer and the encoder

The model:

In [17]:
# Persist the network in two parts: the architecture as JSON and the learned
# weights as an HDF5 file.
model_json = model.to_json()
with open("model.json", mode="w") as json_file:
    json_file.write(model_json)

model.save_weights("model.h5")

print("Saved model to disk")

Saved model to disk


Tokenizer:

In [18]:
# Persist the fitted tokenizer so that inference code can reproduce the exact
# word -> index mapping used during training.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

Encoder:

In [19]:
# Persist the fitted label encoder so predicted class ids can be mapped back
# to dialect names later.
with open('encoder.pkl', mode='wb') as handle:
    pickle.dump(encoder, handle)

## Conclusions:

- Because Arabic dialects are similar to one another, it would be better to use fewer classes.
- I believe that a classical ML model is better suited to this task than this deep learning model.