# Needed libraries

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense , LSTM ,Dropout ,SpatialDropout1D
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.callbacks import EarlyStopping ,  ModelCheckpoint
from sklearn.model_selection import train_test_split

# Reading the preprocessed dataset

In [2]:
dataset = pd.read_csv('outputs/output_3.csv')
df = dataset.drop(columns=['Unnamed: 0'])
df

Unnamed: 0,Text,dialect
0,حاب نضحك لكن رانا سيريو هاد اليومين,13
1,ما تتخيلي السعادة بعد ما قريت الاسم وشوفت الصو...,15
2,ريتها ما تبلى الضحكه الله يبسطك على طول,6
3,اللي كانت تحط الكورة تحت ملابسها ️وتسوي نفسها ...,11
4,القادسيهالكويت ممكن تردد القناة الناقلة,16
...,...,...
166423,ماكرهناش يزيدو هاد الايموجي الخوت,16
166424,تخيل وانت قاعد معا باتك ويقولك سامحني يا وليدي...,4
166425,ونحنا حدك قول الله,7
166426,نمدح هيئة الكهربا في الشارجة,9


# Seting some parameters and creating the word tokenizer

In [3]:
epochs = 10
emb_dim = 100
batch_size = 256
n_most_common_words = 80000
max_len = 250


tokenizer = Tokenizer(num_words=n_most_common_words, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['Text'].values)

word_index = tokenizer.word_index
print("Unique Tokens are:",len(word_index))

Unique Tokens are: 310985


# Creating the tensors to be input for the model

In [4]:
X = tokenizer.texts_to_sequences(df['Text'].values)
X = pad_sequences(X , maxlen=max_len)
print("Shape of data tensor",X.shape)

Shape of data tensor (166428, 250)


In [5]:
X

array([[    0,     0,     0, ..., 53280,   189,  5607],
       [    0,     0,     0, ..., 31567,  2184,    13],
       [    0,     0,     0, ..., 53282,     7,   198],
       ...,
       [    0,     0,     0, ...,  5508,   499,     6],
       [    0,     0,     0, ...,  2516,     3,  7659],
       [    0,     0,     0, ...,   923,  2005,  2998]])

In [6]:
Y = pd.get_dummies(df['dialect']).values
print("Shape of label tensor",Y.shape)

Shape of label tensor (166428, 18)


In [7]:
Y

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

# Spliting the dataframe

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, Y,  test_size=0.1, random_state=42)

In [9]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(149785, 250) (16643, 250) (149785, 18) (16643, 18)


# Creating the LSTM model

In [10]:
model = Sequential()
model.add(Embedding(n_most_common_words+1,emb_dim,input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(100,dropout=0.2,recurrent_dropout=0.2))
model.add(Dense(18,activation='softmax'))
model.compile(loss='categorical_crossentropy' , optimizer='adam' ,metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 250, 100)          8000100   
                                                                 
 spatial_dropout1d (SpatialD  (None, 250, 100)         0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 18)                1818      
                                                                 
Total params: 8,082,318
Trainable params: 8,082,318
Non-trainable params: 0
_________________________________________________________________
None


# Setting some callbacks like early stoping and auto saving best models

In [11]:
my_callbacks=[  EarlyStopping(monitor = 'val_loss',min_delta = 0,patience = 2,verbose = 1,restore_best_weights = True) ,
                ModelCheckpoint(filepath='model.{epoch:02d}-{val_loss:.2f}.h5',monitor='val_loss',mode='min',save_best_only=True,verbose=1)  ]


# The Training Process

In [12]:
history = model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=5,batch_size=128, callbacks=my_callbacks)


Epoch 1/5
Epoch 00001: val_loss improved from inf to 1.88547, saving model to model.01-1.89.h5
Epoch 2/5
Epoch 00002: val_loss improved from 1.88547 to 1.73899, saving model to model.02-1.74.h5
Epoch 3/5
Epoch 00003: val_loss did not improve from 1.73899
Epoch 4/5

Epoch 00004: val_loss did not improve from 1.73899
Epoch 00004: early stopping


# Saving the last model

In [13]:
model.save("outputs/my_h5_model.h5")
print('Done')

Done


# Model Evaluation
- Here you can see the model accuracy is not good enough but you have to know the the model just trained on only 5 epochs, But each epoch take more than 2 hours because thne amount of data was not small, so i want to say that the model can give better accuracy if it was trained on more number of epochs.

In [14]:
accr = model.evaluate(X_test,y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))


Test set
  Loss: 1.739
  Accuracy: 0.454


# Testing the model on a given text
- Although the model accuracy was not good but when it take easy recognizable text it predict it very well

In [38]:
txt = ["راه صعيب عليك"]
seq = tokenizer.texts_to_sequences(txt)
padded = pad_sequences(seq, maxlen=max_len)
pred = model.predict(padded)
labels = ['EG'    ,
'PL'    ,
'KW'    ,
'LY'    ,
'QA'    ,
'JO'    ,
'LB'    ,
'SA'    ,
'AE'    ,
'BH'    ,
'OM'    ,
'SY'    ,
'DZ'    ,
'IQ'    ,
'SD'    ,
'MA'    ,
'YE'    , 
'TN']
print(pred, labels[np.argmax(pred)])

[[2.4054772e-03 1.9003147e-04 4.7690082e-05 1.8438566e-02 6.7165340e-05
  5.8164464e-05 1.5003608e-04 1.4943248e-04 9.9206969e-05 3.9845829e-05
  3.7252918e-05 1.8293475e-04 2.0087668e-01 3.6764908e-05 6.1633380e-04
  7.5468510e-01 8.6046719e-05 2.1833224e-02]] MA
