In [1]:
!pip install openpyxl
!pip install gdown
!pip install farasapy

In [2]:
# Download the dataset workbook by its file id and store it as data.xlsx
# (read later with pandas.read_excel).
!gdown 1h620Wmx1yvkTKibH6N2wCNddOakBTUzg -O data.xlsx
# Clone the AraBERT repository so that `arabert.preprocess` can be imported below.
# NOTE(review): re-running this cell when ./arabert already exists makes git
# report an error; the existing checkout is left as it is.
!git clone https://github.com/aub-mind/arabert

In [3]:
from transformers import TFBertModel, BertTokenizer, BertConfig
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
from arabert.preprocess import ArabertPreprocessor

# Checkpoint shared by the text preprocessor, the tokenizer, the config and
# the encoder, so that all four stay in sync.
MODEL_NAME = 'aubmindlab/bert-base-arabertv02-twitter'

# Fixed sequence length (in tokens) used for padding/truncation and for the
# Keras input shape. The spelling is kept because later cells use this name.
MAX_LENGHT = 32

# Text normaliser matching the checkpoint; emojis are not kept.
arabert_prep = ArabertPreprocessor(model_name=MODEL_NAME, keep_emojis=False)

# WordPiece tokenizer for the checkpoint.
pre = BertTokenizer.from_pretrained(MODEL_NAME)

# Encoder configuration: hidden states of intermediate layers are not returned.
config = BertConfig.from_pretrained(MODEL_NAME, output_hidden_states=False)

# TensorFlow encoder built from the PyTorch weights (from_pt=True); it is
# left trainable so fine-tuning updates the BERT weights too.
bert = TFBertModel.from_pretrained(
    MODEL_NAME,
    config=config,
    from_pt=True,
    trainable=True,
)

In [4]:
# sheet_name=None makes pandas return a {sheet name: DataFrame} mapping
# containing every sheet of the workbook.
df_dict = pd.read_excel("data.xlsx", sheet_name=None)

# Stack all sheets row-wise into one frame with a fresh 0..n-1 index.
df = pd.concat(df_dict, ignore_index=True)

# Distinct intent labels in order of first appearance; a label's position in
# this list is later used as its integer class id.
classes = df["intent"].unique().tolist()
print(classes)
#df.groupby('intent').nunique()

In [10]:
# Run every utterance through the AraBERT preprocessor before tokenisation.
df['text'] = df['text'].apply(arabert_prep.preprocess)

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation. `stratify` keeps the intent
# distribution the same in both splits, and the fixed `random_state` makes the
# split (and therefore the reported metrics) reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    df["text"],
    df["intent"],
    test_size=0.1,
    stratify=df["intent"],
    random_state=42,
)

In [12]:
#X = X.apply(lambda x: pre(x, return_tensors="tf"))
# Label -> integer id, where the id is the label's position in `classes`.
# A dict lookup replaces `classes.index(x)`, which rescanned the list for
# every row. An unknown label still fails loudly (KeyError) before anything
# is reassigned.
class_to_index = {label: position for position, label in enumerate(classes)}
y_train = y_train.apply(class_to_index.__getitem__)
y_valid = y_valid.apply(class_to_index.__getitem__)

# Both splits must be tokenised identically: TensorFlow tensors, every
# sequence padded or truncated to exactly MAX_LENGHT tokens.
tokenizer_kwargs = dict(
    return_tensors="tf",
    padding='max_length',
    max_length=MAX_LENGHT,
    truncation=True,
)
X_train = pre(X_train.tolist(), **tokenizer_kwargs)
X_valid = pre(X_valid.tolist(), **tokenizer_kwargs)
#print(pre_data)

In [13]:

# The model takes a single input: token ids already padded/truncated to
# MAX_LENGHT by the tokenizer.
input_text = tf.keras.Input(shape=(MAX_LENGHT,), dtype=tf.int32, name="input")

# Every sequence is padded to MAX_LENGHT, so BERT must be told which positions
# are padding; otherwise self-attention also attends to the [PAD] tokens.
# The mask is derived from the ids inside the graph (1 = real token,
# 0 = padding), which keeps the model's external interface a single tensor of
# input ids for both training and prediction.
pad_token_id = pre.pad_token_id
attention_mask = tf.keras.layers.Lambda(
    lambda ids: tf.cast(tf.not_equal(ids, pad_token_id), tf.int32),
    name="attention_mask",
)(input_text)

bert_output = bert({"input_ids": input_text, "attention_mask": attention_mask})

# Classification head on top of the pooled output:
# dropout -> dense(tanh) -> dropout -> softmax over the intent classes.
net = tf.keras.layers.Dropout(0.5, name='DropOut1')(bert_output['pooler_output'])
net = tf.keras.layers.Dense(units=768, activation='tanh', name='classifier')(net)
net = tf.keras.layers.Dropout(0.5, name='DropOut2')(net)
net = tf.keras.layers.Dense(units=len(classes), activation='softmax', name='output')(net)

model = tf.keras.Model(input_text, net)

In [14]:
# Labels are integer class ids (not one-hot vectors), hence the "sparse"
# variants of both the loss and the accuracy metric.
loss = tf.keras.losses.SparseCategoricalCrossentropy()
metrics = tf.keras.metrics.SparseCategoricalAccuracy(name="acc")

# Adam with learning rate 5e-6.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-6)

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [15]:
# Print the layer-by-layer overview of the assembled model (output shapes and
# parameter counts) as a sanity check before training.
model.summary()

In [16]:
# The model has a single input, so only the token ids of each tokenised split
# are passed to it.
train_ids = X_train['input_ids']
valid_ids = X_valid['input_ids']

# Train for 100 epochs, reshuffling the training data each epoch and
# evaluating on the held-out split after every epoch. The returned History
# object feeds the plots below.
history = model.fit(
    x=train_ids,
    y=y_train,
    validation_data=(valid_ids, y_valid),
    epochs=100,
    shuffle=True,
)

In [17]:
import matplotlib.pyplot as plt

# Show which series Keras recorded during training.
print(history.history.keys())

# Per-epoch loss on the training and validation splits.
loss_train = history.history['loss']
loss_val = history.history['val_loss']
epochs = range(len(loss_train))

# Draw both curves on one axes, save the figure to disk, then display it.
fig, ax = plt.subplots()
ax.plot(epochs, loss_train, 'g', label='Training loss')
ax.plot(epochs, loss_val, 'b', label='validation loss')
ax.set_title('Training loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
fig.savefig('fig-loss.png')
plt.show()

In [18]:
import matplotlib.pyplot as plt

# Show which series Keras recorded during training.
print(history.history.keys())

# Per-epoch accuracy on the training and validation splits ('acc' is the name
# given to the metric when the model was compiled).
acc_train = history.history['acc']
acc_val = history.history['val_acc']
epochs = range(len(acc_train))

# Draw both curves on one axes, save the figure to disk, then display it.
fig, ax = plt.subplots()
ax.plot(epochs, acc_train, 'g', label='Training Accuracy')
ax.plot(epochs, acc_val, 'b', label='validation Accuracy')
ax.set_title('Training Accuracy')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend()
fig.savefig('fig-Accuracy.png')
plt.show()

In [21]:
#print(classes)
example = "اقرا لي الايميلات."

# The training text was normalised with the AraBERT preprocessor before
# tokenisation, so the same normalisation must be applied at inference time;
# otherwise the model sees text in a form it was never trained on.
example_clean = arabert_prep.preprocess(example)
ids = pre(
    example_clean,
    return_tensors="tf",
    padding='max_length',
    max_length=MAX_LENGHT,
    truncation=True,
)['input_ids']

# `pred` holds one row of class probabilities per input; there is one input.
pred = model.predict(ids)
probabilities = pred[0]

# Most likely intent and its probability as a percentage.
index = int(tf.math.argmax(probabilities))
print(classes[index] + " with percentage: " + str(probabilities[index] * 100) + "\n")

# All intents with their percentage, from most to least likely.
scores = dict(
    sorted(
        zip(classes, probabilities * 100),
        key=lambda item: item[1],
        reverse=True,
    )
)
for k, v in scores.items():
    print(k + " with percentage: " + "{:.2f}".format(v))

In [22]:
#model.save('./intent_model.h5', save_format='h5')

In [26]:
#from IPython.display import FileLink
#FileLink(r'./intent_model.h5')

In [23]:
#!curl --upload-file './intent_model.h5' https://transfer.sh/intent_model.h5