We use a product review dataset

In [1]:
!wget https://raw.githubusercontent.com/hadyelsahar/large-arabic-sentiment-analysis-resouces/master/datasets/PROD.csv

--2019-06-04 04:45:26--  https://raw.githubusercontent.com/hadyelsahar/large-arabic-sentiment-analysis-resouces/master/datasets/PROD.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 527639 (515K) [text/plain]
Saving to: ‘PROD.csv’


2019-06-04 04:45:31 (9.79 MB/s) - ‘PROD.csv’ saved [527639/527639]



## Imports

In [2]:
import tensorflow as tf
import re
import numpy as np
import csv 
from tensorflow.keras.layers import GRU, Embedding, Dense, Input, Dropout, Bidirectional, CuDNNGRU
from tensorflow.keras.models import Sequential
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from sklearn.utils import shuffle


Using TensorFlow backend.


## Read the Dataset

preprocess a review by removing special characters and long spaces

In [0]:
def process_review(review):
  out = re.sub(r"[^\w\s]", '', review)
  out = re.sub(r"[a-zA-Z]", '', out)
  out = re.sub(r"\n", '', out)
  out = re.sub(r"\s+", ' ', out)
  return out.strip()

In [4]:
with open('PROD.csv', 'r') as csv_file:
  reviews = []
  labels  = []
  all_text = ""
  count = 0
  pos_count = 0
  
  #read the data
  lines = csv.reader(csv_file, delimiter = ",")
  for i, line in enumerate(lines):
    
    #ignore the first line
    if i == 0: continue
    
    #preprocess the data
    review = process_review(line[0])
    label  = int(line[1])
    
    #only allow postiive and negative reviews, 
    #also make them the same length
    
    if label == 1:
      pos_count +=1
    elif label == 0:
      continue
    else:
      label += 1
      
    if label == 1 and pos_count > 862:
      continue
    
    if review == "":
      continue
    reviews.append(review)
    all_text += review +' \n '
    labels.append(label)
    
#shuffle the data
reviews, labels = shuffle(reviews, labels)
print(len(reviews))

1648


Look at the data

In [5]:
for i in range(0, 10):
  print(reviews[i], labels[i])

تواليت للحلاقة رائحة سلبية 0
شكرا على سرعة التجاوب في ارسال الطلبية 1
رائعه جدا وانصح باستخدامها فهي سريعه في التعقيم وتلخذ مساحه كبيره حيث تاخذ ٦ رضاعات وبحجم ٢٠٠ مل وتعقم خلال عشر دقائق فقط 1
سيئة جدا ومبالغ في سعرها والله لا أنصح فيها 0
المناديل لاتقوم بفائدتها تترك بقعة بيضاء ع الملابس ورائحتها سيئة جدا 0
لعبة تستحق التجربة وبي سعر قليل لا يضرك أن تقول اسعر كبير لكن في المحلات الأخرى في كل لعبة سعره ٣٠٠ للمعلوميه اللعبة جديدة لا مستعملة 1
ممتازة جدا ولها منظر جميل بعد لصقها على الحائط 1
غير جيدة ولا انصح بها نهائيا 0
بائع محترم انصح بالتعامل معه دقة في المواعيد وتغليف جيد 1
1 100 0


In [0]:
#save the dataset
with open('product_review.txt', 'w') as f:
  for i in range(len(reviews)):
    f.write(f"{reviews[i]}, {labels[i]} '\n'")

## Create Sequences
Create sequences by using the most repeated 500 words

In [0]:
tknzr = Tokenizer(lower=True, split=" ")
tknzr.fit_on_texts(reviews)

#making sequences:
X = tknzr.texts_to_sequences(reviews)
X = pad_sequences(X, padding='post', value=0)

## Create Numpy Arrays

In [45]:
X = np.array(X)
y = np.array(labels)

print(X.shape)

(1648, 113)


## Create the model

In [0]:
model = Sequential()
model.add(Embedding(len(tknzr.word_index), 32))
model.add(Bidirectional(GRU(units = 32)))
model.add(Dense(32, activation = 'tanh'))
model.add(Dropout(0.3))
model.add(Dense(1, activation = 'sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [94]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, None, 32)          217056    
_________________________________________________________________
bidirectional_4 (Bidirection (None, 64)                12480     
_________________________________________________________________
dense_12 (Dense)             (None, 32)                2080      
_________________________________________________________________
dropout_4 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 33        
Total params: 231,649
Trainable params: 231,649
Non-trainable params: 0
_________________________________________________________________


## Train the model

In [96]:
model.fit(X, y, validation_split = 0.1, epochs = 7, batch_size= 128, shuffle = True)

Train on 1483 samples, validate on 165 samples


<tensorflow.python.keras.callbacks.History at 0x7f9857375da0>

## Tests

In [0]:
class_names = ['سلبي' , 'إيجابي']
def classify(sentence):
  sentence = process_review(sentence)
  sequence = [tknzr.word_index[word] for word in sentence.split(' ')]
  sequence = pad_sequences([sequence], maxlen = X.shape[1], padding='post', value=0)
  #print(sequence.dtype)
  #print(sequence)
  pred = model.predict(sequence)[0][0]
  print(class_names[np.round(pred).astype('int')], pred)
  

In [98]:
classify("جميل")

إيجابي 0.545708


In [99]:
classify("السلعة كانت جيدة")

إيجابي 0.85597116


In [100]:
classify("سيء")

سلبي 0.03726467


In [101]:
classify("لا بأس بها")

سلبي 0.03799555


In [102]:
classify("تفاجأت بجودة المنتج")

سلبي 0.1514895


In [0]:
import csv
def create_csv(file, dict):
    with open(file, 'w') as csvfile:
        writer = csv.writer(csvfile)
        for key in dict.keys():
            writer.writerow([key,dict[key]])

In [0]:
create_csv("word2index.csv", tknzr.word_index)

In [0]:
model.save("keras.h5")