In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from keras.wrappers.scikit_learn import KerasClassifier
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import TensorBoard
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer
from keras.layers.recurrent import SimpleRNN
from keras.models import Sequential
from keras.layers import Dense , Activation , Dropout
import keras.preprocessing.text
import keras.backend as K
from keras.metrics import Precision , Recall , Accuracy , TruePositives , TrueNegatives , FalsePositives , FalseNegatives

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
df = pd.read_csv('final_dialect_dataset.csv')
df.head()

Unnamed: 0,id,text,dialect
0,1175358310087892992,لكن بالنهاية ينتفض يغير,IQ
1,1175416117793349632,يعني هذا محسوب على البشر حيونه ووحشيه وتطلب...,IQ
2,1175450108898565888,مبين من كلامه خليجي,IQ
3,1175471073770573824,يسلملي مرورك وروحك الحلوه,IQ
4,1175496913145217024,وين هل الغيبه اخ محمد,IQ


In [3]:
df.shape

(458197, 3)

> In order to save time and computation, I will perform stratified sampling and take only 50% of the data.

In [4]:
df, df_extra = train_test_split(df, test_size=0.5, stratify=df['dialect'])

## ML Model

In [4]:
# The text will be our training independent x variable and the dialect is our dependent y variable
X = df.iloc[:, 1].apply(lambda x: np.str_(x)).values
y = df.iloc[:, 2].values

In [5]:
def tokenize(text):
    """
    Tokenize the text function and remove stop words (Will be passed as a parameter to the count vectorizer)
    
    Arguments:
        text -> Text message which needs to be tokenized
    Output:
        no_stop_words -> List of tokens from the provided text
    """
    tokens = word_tokenize(text)
    no_stop_words = [word for word in tokens if word not in stopwords.words('arabic')]
    return no_stop_words

In [6]:
# Perform a train-test split (80% training & 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

Now we will build a pipeline that take the features from our text data using count tokens (CountVectorizer) and tf-idf scores (TfidfTransformer) and then it will be passed to a multinomial naive bayes classifier which will give us the prediction.

In [34]:
pipeline = Pipeline([('count', CountVectorizer(tokenizer=tokenize, max_df = 0.8)),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])

In [35]:
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('count',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=0.8,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize at 0x000002B2DFE55288>,
                                 vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
   

In [42]:
y_pred = pipeline.predict(X_test)

In [41]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          AE       0.69      0.15      0.24      5334
          BH       0.70      0.09      0.15      5234
          DZ       0.88      0.19      0.31      3180
          EG       0.35      0.97      0.52     11507
          IQ       0.97      0.11      0.19      3108
          JO       0.74      0.07      0.13      5629
          KW       0.29      0.79      0.43      8354
          LB       0.80      0.44      0.56      5500
          LY       0.65      0.57      0.61      7332
          MA       0.99      0.21      0.35      2380
          OM       0.91      0.04      0.08      3820
          PL       0.32      0.67      0.44      8736
          QA       0.51      0.38      0.43      6191
          SA       0.62      0.14      0.22      5352
          SD       0.97      0.07      0.12      2905
          SY       0.99      0.03      0.06      3259
          TN       0.93      0.02      0.04      1848
          YE       1.00    

## DL Model

In [2]:
# import tensorflow as tf
# import tensorflow_hub as hub
# from simpletransformers.classification import ClassificationModel

In [3]:
# train_df, test_df = train_test_split(df[['text', 'dialect']], test_size=0.2)

In [4]:
# # define hyperparameter
# train_args ={"num_train_epochs": 4}

# # Create a ClassificationModel
# model = ClassificationModel(
#     "bert", "asafaya/bert-base-arabic",
#     num_labels=18,
#     args=train_args
# )

In [5]:
# model.train_model(train_df)

In [4]:
# Perform a train-test split (80% training & 20% test)
X = df['text']
y = df['dialect']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [5]:
# Max word length
max_len = X_train.apply(lambda x: len(str(x).split())).max()
max_len

87

In [6]:
# Perform tokenization & padding sequence
token = Tokenizer()
token.fit_on_texts(list(X_train) + list(X_test))
X_train_seq = token.texts_to_sequences(X_train)
X_test_seq = token.texts_to_sequences(X_test)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)
word_index = token.word_index

In [9]:
# Perform one hot encoding on the target variable (dialect)
one_hot = OneHotEncoder(handle_unknown='ignore')
y_train_encoded = pd.DataFrame(one_hot.fit_transform(y_train.to_frame()).toarray())
y_text_encoded = pd.DataFrame(one_hot.fit_transform(y_test.to_frame()).toarray())

In [10]:
# Renaming the columns to be their actual names
y_train_encoded.rename(columns={0: 'AE', 1: 'BH', 2: 'DZ', 3: 'EG', 4: 'IQ', 5:'JO', 6: 'KW', 7: 'LB', 8: 'LY', 9: 'MA',
                                10: 'OM', 11: 'PL', 12: 'QA', 13: 'SA', 14: 'SD', 15: 'SY', 16: 'TN', 17: 'YE'} , inplace=True)
y_test.rename(columns={0: 'AE', 1: 'BH', 2: 'DZ', 3: 'EG', 4: 'IQ', 5:'JO', 6: 'KW', 7: 'LB', 8: 'LY', 9: 'MA',
                                10: 'OM', 11: 'PL', 12: 'QA', 13: 'SA', 14: 'SD', 15: 'SY', 16: 'TN', 17: 'YE'} , inplace=True)

In [8]:
# RNN model 
model = Sequential()
model.add(Embedding(len(word_index)+1, 300, input_length=max_len))
model.add(SimpleRNN(100))
model.add(Dense(18, activation = 'sigmoid'))

In [13]:
model.compile(loss= 'categorical_crossentropy', optimizer= 'adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 87, 300)           109046100 
                                                                 
 simple_rnn (SimpleRNN)      (None, 100)               40100     
                                                                 
 dense (Dense)               (None, 18)                1818      
                                                                 
Total params: 109,088,018
Trainable params: 109,088,018
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.fit(X_train_pad, y_train_encoded.values, epochs = 3)

Epoch 1/3
Epoch 2/3

In [77]:
tokenizer = Tokenizer(num_words=9000,lower=False)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_matrix(X_train, mode='tfidf')
X_test = tokenizer.texts_to_matrix(X_test, mode='tfidf')

In [80]:
encoder = LabelEncoder()
encoder.fit(y)
dialect_transformed = encoder.fit_transform(y)

num_classes = int((len(set(dialect_transformed))))

In [None]:
y_train = encoder.fit_transform(train_tags)
y_test = encoder.fit_transform(test_tags)
y_train= keras.utils.to_categorical(y_train,num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

In [85]:
max_words = len(tokenizer.word_index) + 1

In [None]:
def f1_metric(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [20]:
# Build the model
model = Sequential()
model.add(Dense(1024, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('sigmoid'))

In [None]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['categorical_accuracy','Recall','Precision', f1_metric,'TruePositives','TrueNegatives','FalsePositives','FalseNegatives'])
batch_size = 100
epochs = 2

history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)