## Sentiment Analysis

In [None]:
!pip install tkseem

In [None]:
!wget https://raw.githubusercontent.com/ARBML/tkseem/master/tasks/sentiment_analysis/sentiment/data.txt
!wget https://raw.githubusercontent.com/ARBML/tkseem/master/tasks/sentiment_analysis/sentiment/labels.txt

### Imports

In [2]:
import numpy as np
import tkseem as tk
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import GRU, Embedding, Dense, Input, Dropout, Bidirectional

### Tokenize

In [3]:
tokenizer = tk.SentencePieceTokenizer()
tokenizer.process_data('data.txt')
tokenizer.train()

Reading the data ...
Training SentencePiece...


### Read Data

In [4]:
sentences = open('data/raw/train.txt', 'r').read().splitlines()
labels = open('labels.txt', 'r').read().splitlines()

max_length = 20 

X = tokenizer.encode_sentences(sentences, max_length = max_length)
y = np.array([int(lbl) for lbl in labels])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Model

In [5]:
model = Sequential()
model.add(Embedding(tokenizer.vocab_size, 32))
model.add(Bidirectional(GRU(units = 32)))
model.add(Dense(32, activation = 'tanh'))
model.add(Dropout(0.3))
model.add(Dense(1, activation = 'sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

### Train

In [6]:
history = model.fit(X_train, y_train, epochs = 12, validation_split = 0.1,  batch_size= 128, shuffle = True)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


### Test

In [7]:
def classify(sentence):
  sequence = tokenizer.encode_sentences([sentence], max_length = max_length)[0]
  pred = model.predict(sequence)[0][0]
  print(pred)

In [8]:
classify("سيئة جدا جدا")
classify("رائعة جدا")

0.09430933
0.85310614
