In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import random

# from Text_Normalization import Text_Normalization

from keras.models import Sequential
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, Embedding, LSTM, GRU , Conv1D, MaxPool1D, Flatten
import matplotlib.pyplot as plt

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Preprocess all data and export to csv

# df = pd.read_csv('Data - Dirty.csv')
# df = Text_Normalization(df,"tweets")
# df.to_csv("Data - Cleaned.csv", index=False)

In [3]:
# Report how many physical GPU devices TensorFlow lists on this machine.

# Uncomment to log the device each op is placed on:
# tf.debugging.set_log_device_placement(True)
visibleGpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(visibleGpus))

Num GPUs Available:  1


In [7]:
# Load the cleaned tweets (columns: tweets, label) and preview the first rows.
cleanedCsvPath = "./Data - Cleaned.csv"
df = pd.read_csv(cleanedCsvPath)
df.head()

Unnamed: 0,tweets,label
0,حقوق المرأة,1
1,حقوق المرأة الإسلام,1
2,لجنة التنمية بشبرا ما زال التسجيل مستمر دورة ...,1
3,حقوق المرأة التي تضمنها وزارة العدل,1
4,ولي امر الزوجة ولي الزوجة ولي المراة الاخطاء ...,1


In [8]:
# Turn the frame into a list of [tweet, label] rows and shuffle it.
# The fixed seed keeps the row order (and therefore the train/test split)
# identical from run to run.

fullDataList = df.to_numpy().tolist()

random.seed(5)
random.shuffle(fullDataList)

print("\n" , fullDataList[:3])


 [[' نبي حل عطالة الصيادله متخرجين سنوات وبالالاف عاطلين ان احتياج ملحوظ ونشوف اغلب مراكزنا الصحيه معطين الصيدليه للتمريض خسارة التعب ٦ سنوات وزير الصحه الصوره ', 0], [' سعوديات نطلب اسقاط الولايه يا جماعة اخاطب عنده منطق ليه الولاية ما تسقط المرأة العاقلة ليه يكون أمرها تحت تصرف شخص آخر يعني صار هوس السعوديات فترة ما انها تبغى تتزوج عشان تفتك سجن ابوها وبعدين تكتشف انها بمعتقل زوجها ليه تخلون البنت تتعلق بأشخاص وتضيع طموحاتها ', 1], [' سعوديات نطلب اسقاط الولايه وحنا نطلب تحديد سن للرشد وإلغاء الولاية وش الصعب والمستحيل هذه البديهيات الحقوق تسسقط ', 1]]


In [9]:
# --- Data split and sequence shape ---
trainingSize = int(0.8 * len(fullDataList))  # 3357 from 4197

# Sequences are padded/truncated to the mean tweet length (in whitespace tokens).
wordsPerTweet = df["tweets"].str.split().str.len()
maxSentenceLength = int(wordsPerTweet.mean())  # 20
paddingType = "post"
truncType = "post"

# Width of each word vector; must match the vectors in the GloVe file.
embeddingDim = 256

# --- Optimisation ---
lossFunc = "binary_crossentropy"
learningRate = 0.0001
optimizer = Adam(learning_rate=learningRate)

epochs = 200

### Word Embedding

In [11]:

# Separate every [tweet, label] row into parallel input / target lists
sentences = [row[0] for row in fullDataList]
sentimentScores = [row[1] for row in fullDataList]


# Learn the word -> integer index mapping from the tweets
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)


# Vocabulary size: one slot per known word plus one extra slot for index 0,
# which is the value the padded sequences are filled with
wordIndex = tokenizer.word_index
vocabSize = 1 + len(wordIndex)


# Encode each tweet as a list of word indices, then pad / truncate every
# list to exactly maxSentenceLength entries
sequences = tokenizer.texts_to_sequences(sentences)
paddedSequences = pad_sequences(
    sequences,
    maxlen=maxSentenceLength,
    padding=paddingType,
    truncating=truncType,
)

In [12]:
# The first `trainingSize` (already shuffled) rows train the model;
# everything after them is held out as the test set.

trainSequences, testSequences = paddedSequences[:trainingSize], paddedSequences[trainingSize:]
trainSentiments, testSentiments = sentimentScores[:trainingSize], sentimentScores[trainingSize:]


In [15]:
# Sanity check: inputs and labels of each split must have matching sizes.
splitSummary = [
    ("Number of training Sequences: ", trainSequences),
    ("Number of training Sentiments: ", trainSentiments),
    ("Number of Testing Sequences: ", testSequences),
    ("Number of Testing Sentiments: ", testSentiments),
]
for caption, part in splitSummary:
    print(caption, len(part))

# Show one encoded tweet (row 3) to illustrate the trailing zero padding.
print("\n Random padded sequence:\n", paddedSequences[3])


Number of training Sequences:  3357
Number of training Sentiments:  3357
Number of Testing Sequences:  840
Number of Testing Sentiments:  840

 Random padded sequence:
 [  18   19  688 2019 2020 8688    0    0    0    0    0    0    0    0
    0    0    0    0    0    0]


### Applying GloVe Embedding

In [24]:
# Build a {word: vector} lookup from the pre-trained GloVe text file.
# A usable line looks like "<word> <v1> <v2> ... <vN>" with N == embeddingDim.

gloveFileName = "./Arabic_GloVe.txt"
embeddingIndex = {}

with open(gloveFileName, encoding="utf-8") as glove:

    for line in glove:
        values = line.split()

        # Blank lines carry no word; indexing values[0] on them would raise
        # IndexError.
        if not values:
            continue

        word = values[0]

        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except ValueError:
            # Non-numeric fields (e.g. a token that itself contains a space):
            # the line cannot be parsed as "<word> <floats...>", so skip it.
            continue

        # Keep only full-width vectors. A shorter row (for example a
        # "<count> <dim>" header) would otherwise be accepted here and later
        # broadcast silently across a whole row of the embedding matrix.
        if coefs.shape[0] != embeddingDim:
            continue

        embeddingIndex[word] = coefs

# Fail loudly instead of training on an all-zero embedding matrix when the
# file does not contain vectors of the configured width.
if not embeddingIndex:
    raise ValueError(
        "No {}-dimensional vectors were found in {!r}; check embeddingDim "
        "against the GloVe file.".format(embeddingDim, gloveFileName)
    )

In [25]:
# Build the (vocabSize, embeddingDim) weight matrix for the Embedding layer:
# the row at a word's tokenizer index holds that word's GloVe vector.
# Words that have no GloVe vector keep an all-zero row.

embeddingMatrix = np.zeros((vocabSize, embeddingDim))

for token, row in wordIndex.items():
    gloveVector = embeddingIndex.get(token)
    if gloveVector is None:
        continue
    embeddingMatrix[row] = gloveVector



embeddingMatrix.shape

(24041, 256)

### Building The Model

In [27]:
# Architecture: frozen GloVe embedding -> dropout -> recurrent encoder
# -> dropout -> single sigmoid unit (binary label).
model = Sequential()

# The embedding rows come from embeddingMatrix and are not trained.
model.add(Embedding(vocabSize, embeddingDim, input_length=maxSentenceLength, weights=[embeddingMatrix], trainable=False))
model.add(Dropout(0.2))


# Alternative encoders tried in this notebook; enable exactly one variant
# and point cpPath at the matching weights file.
# model.add(Conv1D(16, 3, activation="relu"))
# model.add(MaxPool1D(pool_size=4))
# model.add(LSTM(15))
model.add(GRU(15))


model.add(Dropout(0.2))

# model.add(Flatten())

model.add(Dense(1, activation="sigmoid"))

# Create a fresh optimizer for every model built by this cell. Re-running the
# cell with the shared module-level `optimizer` instance would hand a new
# model an optimizer that still carries state (step count, slot variables)
# from the previous model, which recent Keras versions reject outright.
model.compile(optimizer=Adam(learningRate), loss=lossFunc, metrics=['accuracy'])
model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 256)           6154496   
                                                                 
 dropout_12 (Dropout)        (None, 20, 256)           0         
                                                                 
 gru_6 (GRU)                 (None, 15)                12285     
                                                                 
 dropout_13 (Dropout)        (None, 15)                0         
                                                                 
 dense_6 (Dense)             (None, 1)                 16        
                                                                 
Total params: 6,166,797
Trainable params: 12,301
Non-trainable params: 6,154,496
_________________________________________________________________


In [17]:
# Weights file for the architecture currently enabled in the model cell.
# Activate the line that matches the variant being trained.
# cpPath="./lstm_weights.h5"
cpPath = "./gru_weights.h5"
# cpPath="./conv_weights.h5"
# cpPath="./hybrid_lstm_weights.h5"
# cpPath="./hybrid_gru_weights.h5"

# Save weights only (no architecture), and overwrite the file only when the
# callback's monitored quantity (left at its default here) improves.
cpCallback = ModelCheckpoint(
    filepath=cpPath,
    verbose=1,
    save_best_only=True,
    save_weights_only=True,
)



In [None]:
# NOTE(review): the held-out test split doubles as validation data, and
# cpCallback keeps the best weights according to it. The accuracy later
# reported on the same split is therefore likely optimistic; consider a
# separate validation split.
trainInputs = np.array(trainSequences)
trainTargets = np.array(trainSentiments)
validationPair = (np.array(testSequences), np.array(testSentiments))

history = model.fit(
    trainInputs,
    trainTargets,
    epochs=epochs,
    validation_data=validationPair,
    callbacks=[cpCallback],
    verbose=1,
)


### Evaluation

In [28]:
# Restore the best checkpoint written during training and score it on the
# held-out split.
testSequenceArray = np.array(testSequences)
testSentimentArray = np.array(testSentiments)

# The weight variables must exist before they can be restored. Assigning
# `model.built = True` only flips the flag without creating anything, so
# build the model for real when it has not been built yet.
if not model.built:
    model.build(input_shape=(None, maxSentenceLength))
model.load_weights(cpPath)

loss, acc = model.evaluate(testSequenceArray, testSentimentArray, verbose=2)
print("Restored model, accuracy: {:5.2f}%".format(100 * acc))


27/27 - 5s - loss: 0.3349 - accuracy: 0.8619 - 5s/epoch - 198ms/step
Restored model, accuracy: 86.19%


### Testing

In [18]:
def getAndPrintSentiment(sample):
    """Classify a tweet with the trained model and print score and label.

    Args:
        sample: Either a single text (``str``) or a non-empty sequence of
            texts. Only the first text is scored, matching the original
            behaviour for list input.

    Returns:
        The predicted label, "Negative" or "Positive".

    Raises:
        ValueError: If ``sample`` is an empty sequence.
    """
    sentimentsList = ["Negative", "Positive"]

    # texts_to_sequences iterates over its argument, so a bare string would be
    # treated as a list of one-character texts. Wrap it into a one-item list.
    texts = [sample] if isinstance(sample, str) else list(sample)
    if not texts:
        raise ValueError("sample must contain at least one text")

    predictSequence = tokenizer.texts_to_sequences(texts)
    paddedPredictSequences = pad_sequences(predictSequence, maxlen=maxSentenceLength, truncating=truncType, padding=paddingType)

    # Run inference once and reuse the score for both the label and the print.
    score = model.predict(paddedPredictSequences)[0][0]
    predictionValue = int(score.round())

    prediction = sentimentsList[predictionValue]

    print("Score:", score)
    print("Prediction:", prediction)

    return prediction

### Visual Representation


In [None]:
# Per-epoch accuracy recorded by model.fit for the training data and for the
# validation data (which is the test split in this notebook).
acc = history.history['accuracy']
valAcc = history.history['val_accuracy']

# Use a dedicated name for the x axis. Reassigning `epochs` here would
# overwrite the hyperparameter of the same name and break a later re-run
# of the training cell.
epochAxis = range(len(acc))

plt.plot(epochAxis, acc, 'r')
plt.plot(epochAxis, valAcc, 'b')
plt.title('Training and Test accuracy')
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.legend(["Training Accuracy", "Test Accuracy"])

# Render the finished plot (a trailing plt.figure() would only open a new,
# empty figure).
plt.show()

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import matplotlib.pyplot as plt


# Turn the sigmoid scores (one per test row) into hard 0/1 labels at 0.5.
prediction = model.predict(testSequenceArray)
y_pred = (prediction.ravel() > 0.5).astype(int).tolist()

# First argument is the true labels, second the predicted labels.
cm = confusion_matrix(testSentimentArray, y_pred)


labels = ["0", "1"]
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(cmap=plt.cm.Blues)
plt.show()

In [None]:
from sklearn.metrics import precision_recall_fscore_support

# Macro-averaged scores over both classes; the fourth element of the result
# (support) is not used here.
macroScores = precision_recall_fscore_support(testSentimentArray, y_pred, average='macro')
precision, recall, f1_score = macroScores[:3]

print('Recall = ', recall)
print('Precision = ', precision)
print('F1 Score = ',f1_score)