# Sentiment Analysis from Arabic Tweets Using AraBERT

**Workflow:**
1. Import Data
2. Load AraBERT model
3. Preprocessing
4. Training and validation
5. Evaluation on the test set
6. Saving the model


In [None]:
!pip install ktrain

Collecting ktrain
  Downloading ktrain-0.28.3.tar.gz (25.3 MB)
[K     |████████████████████████████████| 25.3 MB 1.2 MB/s 
[?25hCollecting scikit-learn==0.23.2
  Downloading scikit_learn-0.23.2-cp37-cp37m-manylinux1_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 58.0 MB/s 
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[K     |████████████████████████████████| 981 kB 53.9 MB/s 
Collecting cchardet
  Downloading cchardet-2.1.7-cp37-cp37m-manylinux2010_x86_64.whl (263 kB)
[K     |████████████████████████████████| 263 kB 71.9 MB/s 
Collecting syntok
  Downloading syntok-1.3.3-py3-none-any.whl (22 kB)
Collecting seqeval==0.0.19
  Downloading seqeval-0.0.19.tar.gz (30 kB)
Collecting transformers<=4.10.3,>=4.0.0
  Downloading transformers-4.10.3-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 53.6 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x

In [None]:
# Notebook setup: (re)load the autoreload extension, reload imported modules
# automatically before each cell executes (mode 2), and render matplotlib
# figures inline in the cell output.
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import os

# Restrict CUDA to the first GPU, with devices enumerated in PCI bus order.
os.environ.update(
    CUDA_DEVICE_ORDER="PCI_BUS_ID",
    CUDA_VISIBLE_DEVICES="0",
)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ktrain
from ktrain import text
from sklearn.metrics import ConfusionMatrixDisplay
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer 

## Import Data

In [None]:
# Directory that holds the three pre-split Excel files. Kept in one place so
# the notebook can be pointed at another location by editing a single line.
DATA_DIR = '/content'

# Each split lives in its own workbook: train / held-out test / validation.
df_train = pd.read_excel(f'{DATA_DIR}/train2.xlsx')
df_test = pd.read_excel(f'{DATA_DIR}/test_data.xlsx')
df_val = pd.read_excel(f'{DATA_DIR}/val_data.xlsx')


In [None]:
# Peek at the first rows of the training split to confirm the column layout.
df_train.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Username,Timestamp,followers,Tweets,label
0,19565,19565,i_rw707,Mon Jan 10 00:22:37 +0000 2022,40,باقي ماشفت شي,pos
1,21642,21642,AliAlhosen1,Thu Oct 08 18:28:34 +0000 2020,55,برعايه اخوي عبد الرحمن الحميد مسابقه خفيفه صلو...,neg
2,27673,27673,mussic_4,Sat Oct 10 21:22:36 +0000 2020,22178,الرجل يضع أحيانا لايك علي بوست لم يعجبه بالفعل...,pos
3,7736,7736,Meedo_8,Sun Oct 11 00:31:19 +0000 2020,15221,الشيء الوحيد الذي وصلوا فيه للعالميه هو المسيا...,pos
4,28823,28823,fulla794,Mon Oct 12 19:56:57 +0000 2020,73,ايش رأيك تشهر تغريداتك لملايين المتابعين فقط ب...,pos


In [None]:
# set hyperparameters
maxlen = 64
batch_size = 16
lr = 2e-5
epochs = 3

In [None]:
# Display the training frame; pandas elides the middle rows (the "..." row in the output).
df_train

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Username,Timestamp,followers,Tweets,label
0,19565,19565,i_rw707,Mon Jan 10 00:22:37 +0000 2022,40,باقي ماشفت شي,pos
1,21642,21642,AliAlhosen1,Thu Oct 08 18:28:34 +0000 2020,55,برعايه اخوي عبد الرحمن الحميد مسابقه خفيفه صلو...,neg
2,27673,27673,mussic_4,Sat Oct 10 21:22:36 +0000 2020,22178,الرجل يضع أحيانا لايك علي بوست لم يعجبه بالفعل...,pos
3,7736,7736,Meedo_8,Sun Oct 11 00:31:19 +0000 2020,15221,الشيء الوحيد الذي وصلوا فيه للعالميه هو المسيا...,pos
4,28823,28823,fulla794,Mon Oct 12 19:56:57 +0000 2020,73,ايش رأيك تشهر تغريداتك لملايين المتابعين فقط ب...,pos
...,...,...,...,...,...,...,...
28969,17992,17992,Pure_Mind77,Mon Oct 12 20:31:30 +0000 2020,25207,ينفذ الإرشاد الأكاديمي بثانويه الخوارزمي مبادر...,neg
28970,41969,41969,HAlmoshr,Sat Oct 10 19:10:17 +0000 2020,696883,بنشكر الله انو القلب عضله مو عظم بضل التعضيل ا...,neg
28971,18949,18949,RadwaHassan21,Sat Oct 10 18:39:55 +0000 2020,254,يجعلها دايمه ياارب,pos
28972,12671,12671,suhairhayek,Sat Oct 10 20:52:09 +0000 2020,1308,س تظل الصبآحآت تشرق ب فرح يطبطب علي الروح مآ خ...,pos


In [None]:
# Display the test frame; pandas elides the middle rows (the "..." row in the output).
df_test

Unnamed: 0.1,Unnamed: 0,Username,Timestamp,followers,Tweets,label
0,31610,kasm__2010,Sun Jan 09 10:29:46 +0000 2022,74,متي ناوي تفاتحني بالموضوع,pos
1,7725,mohmedkheder1,Sat Oct 10 21:33:32 +0000 2020,229,بغيت شنو,neg
2,31906,a_lkhayr,Thu Oct 08 20:56:13 +0000 2020,856,إن شاء الله يارب,pos
3,37229,Osamatmamm21,Fri Oct 09 21:57:17 +0000 2020,583,بمناسبه فوز الهلال سحب علي آيفون رتويت وتابع ا...,pos
4,27088,Afnanelmhlawy4,Thu Oct 08 19:57:17 +0000 2020,3127,إشراقه الصباح معجزه إلهيه تفوق بجمالها سكينه ا...,pos
...,...,...,...,...,...,...
9050,32054,saharatshan,Mon Oct 12 21:46:46 +0000 2020,2190,مارح يفهمون كلاب هلال,pos
9051,251,aziiiiz56,Thu Oct 08 18:51:03 +0000 2020,1788,باتسر تقلبون عليه يالمشجعين الهلاليين المتعصبي...,pos
9052,38330,albrahim_noura,Sat Oct 10 19:50:05 +0000 2020,283,قبضوا امنجي في في نقطه تفتيش معاه مسدس و قيل م...,neg
9053,11751,Yoon73gi,Mon Jan 10 16:56:29 +0000 2022,75,هل تتفق علي ذلك بالطبع,pos


In [None]:
# Display the validation frame; pandas elides the middle rows (the "..." row in the output).
df_val

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Username,Timestamp,followers,Tweets,label
0,30960,30960,Ey68pb,Thu Oct 08 20:49:35 +0000 2020,80,حزين ان التيكت مجاش ومنه لله السبب بس صباح الت...,neg
1,10138,10138,Ahlam19928,Sun Oct 11 00:38:31 +0000 2020,0,حبيباتي صحباتي الف مبروك تخرجكم والف مبروك علي...,neg
2,13486,13486,rashtial10,Sat Oct 10 20:15:15 +0000 2020,6,ا بر هموم القلب تر ك للصلاه والا تر الدنيا مشا...,pos
3,40793,40793,iMajeed_,Mon Oct 12 21:20:18 +0000 2020,126,ايه ما تفارق تفكيرك لن تعرف الراحه الا اذا همس...,pos
4,25910,25910,H3Pax,Sat Oct 10 22:51:48 +0000 2020,327,لا يمكن ان اتعاطف مع اي سعودي من رجل الاعمال ف...,neg
...,...,...,...,...,...,...,...
7239,26938,26938,alolyyan999,Sat Oct 10 19:37:53 +0000 2020,418,واحنا ايش يبقا لنا مع هاللعيبه وكذا بدونهم ايش...,neg
7240,32483,32483,ahed6991,Sun Jan 09 23:10:18 +0000 2022,842,اجل هالكلام عام صح,neg
7241,9418,9418,fahadx_h,Fri Oct 09 21:19:32 +0000 2020,720,جيل عظيم,neg
7242,41586,41586,abuali_168,Mon Oct 12 16:54:56 +0000 2020,3396,انتم بحاجه ليوم جميل لذلك صباحكم انا,pos


In [None]:
# NOTE(review): dead cell. It splits a single frame `df` 80/20, but no `df` is
# defined in the visible notebook, which loads pre-split train/val/test files
# instead. The printed counts under this cell are left over from an earlier
# run of this code. Consider deleting the cell and clearing its output.
# X = df['Tweets'].values
# y = df['label'].values
# X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2)
# print('training data: ', X_train.shape[0])
# print('test data: ', X_test.shape[0])

training data:  36218
test data:  9055


In [None]:
# Count missing values per column in the validation split.
df_val.isna().sum()

Unnamed: 0      0
Unnamed: 0.1    0
Username        0
Timestamp       0
followers       0
Tweets          4
label           0
dtype: int64

In [None]:
# Discard validation rows whose tweet text is missing. The result is rebound
# rather than mutated with inplace=True, so the frame object that earlier
# cells displayed is not silently modified.
df_val = df_val.dropna(subset=['Tweets'])

In [None]:
# Re-check the validation split: the Tweets count should now be 0.
df_val.isna().sum()

Unnamed: 0      0
Unnamed: 0.1    0
Username        0
Timestamp       0
followers       0
Tweets          0
label           0
dtype: int64

In [None]:
# Count missing values per column in the training split.
df_train.isna().sum()

Unnamed: 0       0
Unnamed: 0.1     0
Username         0
Timestamp        0
followers        0
Tweets          23
label            0
dtype: int64

In [None]:
# Discard training rows whose tweet text is missing. The result is rebound
# rather than mutated with inplace=True, so the frame object that earlier
# cells displayed is not silently modified.
df_train = df_train.dropna(subset=['Tweets'])

In [None]:
# Re-check the training split: the Tweets count should now be 0.
df_train.isna().sum()

Unnamed: 0      0
Unnamed: 0.1    0
Username        0
Timestamp       0
followers       0
Tweets          0
label           0
dtype: int64

In [None]:
# Count missing values per column in the test split.
df_test.isna().sum()

Unnamed: 0    0
Username      0
Timestamp     0
followers     0
Tweets        6
label         0
dtype: int64

In [None]:
# Discard test rows whose tweet text is missing. The result is rebound rather
# than mutated with inplace=True, so the frame object that earlier cells
# displayed is not silently modified.
df_test = df_test.dropna(subset=['Tweets'])

In [None]:
# Re-check the test split: the Tweets count should now be 0.
df_test.isna().sum()

Unnamed: 0    0
Username      0
Timestamp     0
followers     0
Tweets        0
label         0
dtype: int64

## Load Model

In [None]:
# Identifier of the pretrained AraBERT checkpoint to fine-tune.
MODEL_NAME = 'aubmindlab/bert-base-arabertv01'
# ktrain Transformer wrapper configured with the sequence length set in the
# hyperparameter cell; it is reused below for preprocessing, for building the
# classifier, and for the predictor.
t = text.Transformer(MODEL_NAME, maxlen=maxlen)

## Preprocessing

In [None]:
# Convert each split (tweet texts + labels) into the dataset objects ktrain
# trains and evaluates on. Only the training split goes through
# preprocess_train; validation and test use preprocess_test.
trn = t.preprocess_train(df_train['Tweets'].values, df_train['label'].values)
val = t.preprocess_test(df_val['Tweets'].values, df_val['label'].values)
tst = t.preprocess_test(df_test['Tweets'].values, df_test['label'].values)

preprocessing train...
language: ar
train sequence lengths:
	mean : 12
	95percentile : 22
	99percentile : 24




Is Multi-Label? False
preprocessing test...
language: ar
test sequence lengths:
	mean : 12
	95percentile : 22
	99percentile : 24


preprocessing test...
language: ar
test sequence lengths:
	mean : 12
	95percentile : 22
	99percentile : 25


## Train the model

#### Wrap the model in a learner object

In [None]:
# Build the classification model from the Transformer wrapper (the progress
# bar below shows the checkpoint download this triggers).
model = t.get_classifier()
# Wrap model + data in a ktrain learner; `val` is used for validation during
# training, while `tst` is held back for the final evaluation.
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=batch_size)

Downloading:   0%|          | 0.00/742M [00:00<?, ?B/s]

#### Train

In [None]:
# Fine-tune with the one-cycle policy: `lr` is the maximum learning rate and
# `epochs` the number of epochs (both set in the hyperparameter cell).
history = learner.fit_onecycle(lr, epochs)



begin training using onecycle policy with max lr of 2e-05...
Epoch 1/3
Epoch 2/3
Epoch 3/3


## Evaluate

In [None]:
# Evaluate on the held-out test split. Passing the class names makes the
# classification report show the actual labels instead of the bare indices
# 0/1, so it is unambiguous which row belongs to which sentiment.
learner.validate(val_data=tst, class_names=t.get_classes())

              precision    recall  f1-score   support

           0       0.79      0.81      0.80      4514
           1       0.80      0.78      0.79      4535

    accuracy                           0.79      9049
   macro avg       0.79      0.79      0.79      9049
weighted avg       0.79      0.79      0.79      9049



array([[3643,  871],
       [ 996, 3539]])

Let's make a prediction

In [None]:
# Bundle the fine-tuned model with its preprocessor so raw strings can be
# classified directly.
p = ktrain.get_predictor(learner.model, t)

In [None]:
# Example input, roughly: "Dealing with the Arabic language is complicated and has no benefit."
p.predict("التعامل مع اللغة العربية معقد وليس له فائدة")

'neg'

In [None]:
# Example input, roughly: "Smart cities will be a dazzling leap in the development of artificial intelligence."
p.predict("المدن الذكية سوف تكون قفزة مبهرة في تطور الذكاء الاصطناعي")

'pos'

## Saving the model
To reload the saved predictor later, call `ktrain.load_predictor(path)` with the same path that is passed to `save` below.


In [None]:
# NOTE: evaluating the bare name only displays the function object (see the
# output below); nothing is loaded. It is the function to call when reloading
# a saved predictor.
ktrain.load_predictor

<function ktrain.core.load_predictor>

In [None]:
# Persist the predictor (model + preprocessor). The predictor object created
# above is bound to `p`; the name `predictor` is never defined in this
# notebook, so the original call raised NameError.
# NOTE(review): the target is under /content/drive — presumably Google Drive
# must already be mounted there; no mount cell is visible in this notebook.
p.save("/content/drive/MyDrive/models/ar-bert-model")