#Mount Drive

In [None]:
# Mount Google Drive inside the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#copy the BERT model to Colab
!mkdir arabert
!cp -r "/content/drive/My Drive/arabert/" ./

#Installing Java and pyarabic for Farasa

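To run Farasa segmentation you need the Farasa segmenter jar (`FarasaSegmenterJar.jar`, the file name used in the `mv` cell below) in the same directory as the preprocessing script (imported below as `bert.preprocess_arabert`).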

(you can get the Farasa segmenter from http://qatsdemo.cloudapp.net/farasa/register.html)

In [None]:
#install java on colab (needed for Farasa)
import os       
def install_java():
  """Install headless OpenJDK 8 with apt, set JAVA_HOME to its directory and print the Java version.

  NOTE(review): the saved output of this cell reports OpenJDK 11, so the `java`
  found on PATH may not be the JDK 8 installed here -- confirm before relying on it.
  """
  !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null      #install openjdk
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"     #set environment variable
  !java -version       #check java version
install_java()
# Python packages installed alongside Java: py4j and pyarabic (versions are not pinned).
!pip install py4j
!pip install pyarabic 

openjdk version "11.0.5" 2019-10-15
OpenJDK Runtime Environment (build 11.0.5+10-post-Ubuntu-0ubuntu1.118.04)
OpenJDK 64-Bit Server VM (build 11.0.5+10-post-Ubuntu-0ubuntu1.118.04, mixed mode, sharing)


In [None]:
# Kill any running Java process. Useful when the Java runtime hangs after a runtime restart (Colab issue).
!pkill "java"

#Clone the BERT repo that is compatible with our model

The cloned repo is made compatible with the vocab tokens that contain "\[ \]" (\[رابط\] for links, \[مستخدم\] for Twitter handles, \[بريد\] for emails) and with the "+" produced by the Farasa segmenter, e.g. "الدراسات" --> "\[ال+, دراس, +ات\]".

The preprocessing file that we used is included in the araBERT repository.

In [None]:
# Clone the BERT fork into ./bert; the Python cells below import `tokenization` and `preprocess_arabert` from it.
!git clone https://github.com/WissamAntoun/bert #this implementation also has a compatible tokenizer

Cloning into 'bert'...
remote: Enumerating objects: 32, done.[K
remote: Counting objects:   3% (1/32)[Kremote: Counting objects:   6% (2/32)[Kremote: Counting objects:   9% (3/32)[Kremote: Counting objects:  12% (4/32)[Kremote: Counting objects:  15% (5/32)[Kremote: Counting objects:  18% (6/32)[Kremote: Counting objects:  21% (7/32)[Kremote: Counting objects:  25% (8/32)[Kremote: Counting objects:  28% (9/32)[Kremote: Counting objects:  31% (10/32)[Kremote: Counting objects:  34% (11/32)[Kremote: Counting objects:  37% (12/32)[Kremote: Counting objects:  40% (13/32)[Kremote: Counting objects:  43% (14/32)[Kremote: Counting objects:  46% (15/32)[Kremote: Counting objects:  50% (16/32)[Kremote: Counting objects:  53% (17/32)[Kremote: Counting objects:  56% (18/32)[Kremote: Counting objects:  59% (19/32)[Kremote: Counting objects:  62% (20/32)[Kremote: Counting objects:  65% (21/32)[Kremote: Counting objects:  68% (22/32)[Kremote: Counting obj

In [None]:
# Move the Farasa segmenter jar from the working directory into the cloned ./bert folder.
# NOTE(review): the saved output shows "No such file or directory" -- presumably the jar
# has to be uploaded to the working directory before this cell is run; confirm.
!mv ./FarasaSegmenterJar.jar ./bert

mv: cannot stat './FarasaSegmenterJar.jar': No such file or directory


In [None]:
import tensorflow as tf
from bert import tokenization
from bert.preprocess_arabert import preprocess

# Local folder holding the araBERT files; vocab.txt, bert_config.json and
# arabert_model.ckpt are all resolved relative to this path later in the notebook.
ARABERT_PATH = "./arabert"

In [None]:
# Build the BERT tokenizer from the araBERT vocabulary file (used for the quick test below and for feature conversion later).
bert_tokenizer = tokenization.FullTokenizer(f"{ARABERT_PATH}/vocab.txt")




In [None]:
# Sample input mixing a Twitter handle, a URL and plain Arabic text.
text = " @arabert https://arabert.com الدراسات النظرية للتصميم الحديث"
# Run the araBERT preprocessing and show the result; `text_prep` is tokenized in the next cell.
text_prep = preprocess(text)
print(text_prep)

[مستخدم] [رابط] ال+ دراس +ات ال+ نظري +ة ل+ ال+ تصميم ال+ حديث


In [None]:
# Tokenize the preprocessed sample; the cell output is the resulting list of tokens.
bert_tokenizer.tokenize(text_prep)

['[مستخدم]',
 '[رابط]',
 'ال+',
 'دراس',
 '+ات',
 'ال+',
 'نظري',
 '+ة',
 'ل+',
 'ال+',
 'تصميم',
 'ال+',
 'حديث']

##Tensorflow Training

**ENABLE GPU RUNTIME if your files are on drive or colab local drive!!!**

Test Sentiment Analysis score on a dataset like the AJGT

K. M. Alomari, H. M. ElSherif, and K. Shaalan, “Arabic tweets sentimental analysis using machine learning,” in Proceedings of the International Conference on Industrial, Engineering and Other Applications of Applied Intelligent Systems, pp. 602–610, Montreal, Canada, June 2017.

In [None]:
import os
import sys
import json
import nltk
import random
import logging
import tensorflow as tf
import pandas as pd

from glob import glob
from tensorflow.keras.utils import Progbar
from tqdm import tqdm
sys.path.append("bert")

import bert
from bert import modeling, optimization, tokenization
from bert.run_classifier import input_fn_builder, model_fn_builder

from sklearn.model_selection import train_test_split
  
# Send the 'tensorflow' logger through one timestamped stream handler.
formatter = logging.Formatter('%(asctime)s :  %(message)s')

sh = logging.StreamHandler()
sh.setFormatter(formatter)
sh.setLevel(logging.INFO)

log = logging.getLogger('tensorflow')
log.setLevel(logging.INFO)
log.handlers = [sh]  # replaces whatever handlers were attached before

# The presence of COLAB_TPU_ADDR in the environment is what selects the TPU code path.
USE_TPU = 'COLAB_TPU_ADDR' in os.environ

if not USE_TPU:
  log.warning('Not connected to TPU runtime')
else:
  log.info("Using TPU runtime")
  TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']

  with tf.Session(TPU_ADDRESS) as session:
    log.info('TPU address is ' + TPU_ADDRESS)
    # Upload credentials to TPU.
    with open('/content/adc.json', 'r') as f:
      auth_info = json.load(f)
    tf.contrib.cloud.configure_gcs(session, credentials=auth_info)

2020-01-31 01:48:17,157 :  Not connected to TPU runtime


In [None]:
# --- Input data pipeline ---
TRAIN_BATCH_SIZE = 32 #@param {type:"integer"} #You can probably 
                                              #increase when using TPUS
MAX_SEQ_LENGTH = 128 #@param {type:"integer"} #512 if running on TPU

# --- Training procedure ---
EVAL_BATCH_SIZE = 64 
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 6 #@param {type:"integer"}
WARMUP_PROPORTION = 0.1 #@param {type:"number"}
NUM_TPU_CORES = 8
PREDICT_BATCH_SIZE = 8

# --- Model files (inside ARABERT_PATH) and fine-tuning output folder ---
OUTPUT_DIR_PER_MODEL = "./finetuned_model"
CONFIG_FILE = os.path.join(ARABERT_PATH, "bert_config.json")
INIT_CHECKPOINT = os.path.join(ARABERT_PATH, "arabert_model.ckpt")
bert_config = modeling.BertConfig.from_json_file(CONFIG_FILE)

log.info(f"Using checkpoint: {INIT_CHECKPOINT}")

# Echo the resolved paths so a wrong ARABERT_PATH is easy to spot.
print(f"ARABERT_PATH: {ARABERT_PATH}")
print(f"CONFIG_FILE: {CONFIG_FILE}")
print(f"INIT_CHECKPOINT: {INIT_CHECKPOINT}")

2020-01-31 01:48:22,175 :  Using checkpoint: ./arabert/arabert_model.ckpt


ARABERT_PATH: ./arabert
CONFIG_FILE: ./arabert/bert_config.json
INIT_CHECKPOINT: ./arabert/arabert_model.ckpt


In [None]:
DATA_COLUMN = 'text'
LABEL_COLUMN = 'label'

# Read the AJGT spreadsheet, keep the tweet and sentiment columns and rename
# them to the generic column names used by the rest of the notebook.
df_AJGT = (
  pd.read_excel('./bert/AJGT.xlsx', header=0)
    .loc[:, ['Feed', 'Sentiment']]
    .rename(columns={'Feed': DATA_COLUMN, 'Sentiment': LABEL_COLUMN})
)

# Apply the araBERT preprocessing to every tweet.
# NOTE(review): the second positional argument (True) presumably switches on Farasa
# segmentation -- confirm against bert/preprocess_arabert.py.
df_AJGT[DATA_COLUMN] = df_AJGT[DATA_COLUMN].apply(lambda sentence: preprocess(sentence, True))

# 80/20 split with a fixed seed so the split is reproducible.
train_AJGT, test_AJGT = train_test_split(df_AJGT, random_state=42, test_size=0.2)

In [None]:
def _to_input_examples(frame):
  """Wrap every row of `frame` in a `bert.run_classifier.InputExample`.

  Reads the "text" and "label" columns of each row and returns the Series
  produced by `frame.apply(..., axis=1)` (one InputExample per row).
  """
  return frame.apply(
    lambda row: bert.run_classifier.InputExample(
      guid=None,  # Globally unique ID for bookkeeping, unused in this example
      text_a=row["text"],
      text_b=None,
      label=row["label"]),
    axis=1)

train_InputExamples = _to_input_examples(train_AJGT)
test_InputExamples = _to_input_examples(test_AJGT)

In [None]:
# Distinct sentiment labels found in the full dataset.
labels = list(df_AJGT["label"].unique())
print(labels)

# Convert the train examples, then the test examples, into model-ready features
# using the same label list, sequence length and tokenizer for both.
train_features, test_features = (
  bert.run_classifier.convert_examples_to_features(examples, labels, MAX_SEQ_LENGTH, bert_tokenizer)
  for examples in (train_InputExamples, test_InputExamples)
)

2020-01-31 01:19:19,286 :  From /content/bert/run_classifier.py:775: The name tf.logging.info is deprecated. Please use tf.compat.v1.logging.info instead.

2020-01-31 01:19:19,288 :  Writing example 0 of 1440
2020-01-31 01:19:19,290 :  *** Example ***
2020-01-31 01:19:19,291 :  guid: None
2020-01-31 01:19:19,295 :  tokens: [CLS] سبحان الله ب+ حمد +ه عدد خلق +ه رضى نفس +ه زن +ه عرش +ه مداد كلم +ات +ه [SEP]
2020-01-31 01:19:19,296 :  input_ids: 29756 36006 12695 448 3945 129 5367 4095 129 4444 6746 129 630 129 5383 129 21336 6025 1012 129 29758 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2020-01-31 01:19:19,299 :  input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

['Positive', 'Negative']


2020-01-31 01:19:19,680 :  Writing example 0 of 360
2020-01-31 01:19:19,681 :  *** Example ***
2020-01-31 01:19:19,683 :  guid: None
2020-01-31 01:19:19,684 :  tokens: [CLS] و+ الله حرام و+ الله موتو +ه ل+ شعب ال+ اردني من و ##ين بدن +ا نجيب ال+ كو من و ##ين يا الله ارحم ##و من في ال+ ارض يرحمك ##م من في ال+ سماء الله حرام [SEP]
2020-01-31 01:19:19,685 :  input_ids: 29756 897 12695 16006 897 12695 22398 129 816 4928 3000 31462 857 117 8268 3106 124 22786 3000 813 857 117 8268 900 12695 12271 1005 857 781 3000 2889 41768 1002 857 781 3000 17867 12695 16006 29758 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2020-01-31 01:19:19,686 :  input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [None]:
# Training schedule: total optimisation steps, warm-up steps and steps per epoch.
# The per-epoch count is reused below as the checkpointing interval.
num_train_steps = int(len(train_features) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)
num_steps_per_epoch = int(len(train_features) / TRAIN_BATCH_SIZE)

print("num train steps: {}".format(num_train_steps))
print("num warmup steps: {}".format(num_warmup_steps))
print("num_steps_per_epoch: {}".format(num_steps_per_epoch))

model_fn = model_fn_builder(
  bert_config=bert_config,     # reuse the config already parsed from CONFIG_FILE
  num_labels=len(labels),      # follow the label list instead of a hard-coded 2
  init_checkpoint=INIT_CHECKPOINT,
  learning_rate=LEARNING_RATE,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps,
  use_tpu=USE_TPU,
  use_one_hot_embeddings=USE_TPU
)

# A cluster resolver is only needed (and TPU_ADDRESS only defined) on a TPU runtime.
tpu_cluster_resolver = None
if USE_TPU:
  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)

run_config = tf.contrib.tpu.RunConfig(
    cluster=tpu_cluster_resolver,
    model_dir=OUTPUT_DIR_PER_MODEL,
    save_checkpoints_steps=num_steps_per_epoch,  # one checkpoint per epoch
    keep_checkpoint_max=0,                       # 0 = keep every checkpoint; all of them are evaluated later
    tpu_config=tf.contrib.tpu.TPUConfig(
        iterations_per_loop=num_steps_per_epoch,
        num_shards=NUM_TPU_CORES,
        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

# TPUEstimator is used for both paths; use_tpu selects TPU vs. CPU/GPU execution.
estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=USE_TPU,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=TRAIN_BATCH_SIZE,
    eval_batch_size=EVAL_BATCH_SIZE,
    predict_batch_size=PREDICT_BATCH_SIZE)

# Input functions over the in-memory features; the last partial batch is dropped only on TPU.
train_input_fn = input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=USE_TPU)

test_input_fn = input_fn_builder(
  features=test_features,
  seq_length=MAX_SEQ_LENGTH,
  is_training=False,
  drop_remainder=USE_TPU)

2020-01-31 01:48:26,198 :  Estimator's model_fn (<function model_fn_builder.<locals>.model_fn at 0x7fc3dfcd1d90>) includes params argument, but params are not passed to Estimator.
2020-01-31 01:48:26,201 :  Using config: {'_model_dir': './finetuned_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 45, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 0, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fc3dfc52be0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '

num train steps: 270
num warmup steps: 27
num_steps_per_epoch: 45


##Train the model

In [None]:
# Restrict CUDA to GPU index 0 by setting CUDA_VISIBLE_DEVICES for this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
# Fine-tune the classifier; max_steps caps training at the step count computed above.
print('Beginning Training!')
estimator.train(max_steps=num_train_steps, input_fn=train_input_fn)

Beginning Training!


2020-01-31 01:50:41,886 :  Calling model_fn.
2020-01-31 01:50:41,887 :  Running train on CPU
2020-01-31 01:50:41,888 :  *** Features ***
2020-01-31 01:50:41,889 :    name = input_ids, shape = (32, 128)
2020-01-31 01:50:41,890 :    name = input_mask, shape = (32, 128)
2020-01-31 01:50:41,892 :    name = label_ids, shape = (32,)
2020-01-31 01:50:41,893 :    name = segment_ids, shape = (32, 128)
2020-01-31 01:50:44,568 :  **** Trainable Variables ****
2020-01-31 01:50:44,569 :    name = bert/embeddings/word_embeddings:0, shape = (64000, 768), *INIT_FROM_CKPT*
2020-01-31 01:50:44,569 :    name = bert/embeddings/token_type_embeddings:0, shape = (2, 768), *INIT_FROM_CKPT*
2020-01-31 01:50:44,573 :    name = bert/embeddings/position_embeddings:0, shape = (512, 768), *INIT_FROM_CKPT*
2020-01-31 01:50:44,578 :    name = bert/embeddings/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
2020-01-31 01:50:44,582 :    name = bert/embeddings/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
2020-0

<tensorflow_estimator.python.estimator.tpu.tpu_estimator.TPUEstimator at 0x7fc3dfc37eb8>

##Evaluate the model on all saved checkpoint files

In [None]:
print(f'Beginning Evaluation!')
eval_model_files = tf.gfile.Glob(os.path.join(OUTPUT_DIR_PER_MODEL,'*index'))

# On TPU the input pipeline drops the last partial batch, so a fixed step count is passed.
# Off TPU, steps=None lets the estimator consume the whole test set. The previous
# int(len(test_features)/EVAL_BATCH_SIZE) skipped the final partial batch
# (with 360 test examples and a batch size of 64 only 320 were scored).
# NOTE(review): relies on test_input_fn being finite when is_training=False, as in the
# upstream BERT input_fn_builder -- confirm for this fork.
eval_steps = int(len(test_features) / EVAL_BATCH_SIZE) if USE_TPU else None

def _checkpoint_prefix(index_file):
  """Strip the trailing '.index' (6 characters) to get the checkpoint path prefix."""
  return index_file[0:-6]

def _checkpoint_step(index_file):
  """Return the integer after the last '-' in the checkpoint prefix (used as the sort key)."""
  return int(_checkpoint_prefix(index_file).split('-')[-1])

# Evaluate every saved checkpoint in increasing step order and log its metrics.
for eval_checkpoint in tqdm(sorted(eval_model_files, key=_checkpoint_step)):
  result = estimator.evaluate(input_fn=test_input_fn, steps=eval_steps, checkpoint_path=_checkpoint_prefix(eval_checkpoint))
  tf.logging.info("***** Eval results *****")
  for key in sorted(result.keys()):
    tf.logging.info("  %s = %s", key, str(result[key]))

  0%|          | 0/7 [00:00<?, ?it/s]2020-01-31 01:56:30,066 :  Calling model_fn.
2020-01-31 01:56:30,066 :  Running eval on CPU
2020-01-31 01:56:30,067 :  *** Features ***
2020-01-31 01:56:30,069 :    name = input_ids, shape = (64, 128)
2020-01-31 01:56:30,071 :    name = input_mask, shape = (64, 128)
2020-01-31 01:56:30,073 :    name = label_ids, shape = (64,)
2020-01-31 01:56:30,074 :    name = segment_ids, shape = (64, 128)


Beginning Evaluation!


2020-01-31 01:56:32,058 :  **** Trainable Variables ****
2020-01-31 01:56:32,060 :    name = bert/embeddings/word_embeddings:0, shape = (64000, 768), *INIT_FROM_CKPT*
2020-01-31 01:56:32,060 :    name = bert/embeddings/token_type_embeddings:0, shape = (2, 768), *INIT_FROM_CKPT*
2020-01-31 01:56:32,068 :    name = bert/embeddings/position_embeddings:0, shape = (512, 768), *INIT_FROM_CKPT*
2020-01-31 01:56:32,071 :    name = bert/embeddings/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
2020-01-31 01:56:32,073 :    name = bert/embeddings/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
2020-01-31 01:56:32,075 :    name = bert/encoder/layer_0/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
2020-01-31 01:56:32,077 :    name = bert/encoder/layer_0/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
2020-01-31 01:56:32,079 :    name = bert/encoder/layer_0/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
2020-01-31 01:56:32,081 :    name =

##Results

araBERT achieved >93% accuracy on AJGT, compared to 84% for mBERT (tested previously; you can also try it using the tf_hub scripts).

We think that araBERT can get a better score with more data cleaning and preprocessing.

It also shows that it can adapt well to dialectal data (which is the most common).