## Import Required Libraries

In [None]:
# General libraries:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from tqdm import tqdm
import matplotlib.pyplot as plt
import re
# scikit-learn libraries:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
# NLP libraries:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from official.nlp import optimization
# keras & tf libraries:
import tensorflow as tf
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
import tensorflow_hub as hub
import tensorflow_text as text


## Constants

In [None]:
# General:
# NOTE(review): "/" is a directory, not a file; point this at the dataset CSV
# before running the "Read Data" cell.
DATA_PATH = "/"
MODEL_LANG_AR = "arabic"
MODEL_LANG_EN = "english"

# Train/test split:
# NOTE(review): encode_labels_and_split_data() reads these two names but they
# were never defined. The values below are conventional defaults; confirm them.
SPLIT_PERC = 0.2
SPLIT_RANDOM_STATE = 42

# Model Hyperparameters:
# The previous value `'sigmoid' | None | 'softmax'` is not valid Python (str does
# not support `|` with None) and raised TypeError. A single-unit binary head
# uses 'sigmoid'; the Precision/Recall metrics below expect probabilities in
# [0, 1], so the loss is configured for probabilities (from_logits=False) to
# stay consistent with that activation.
ACTIVATION = 'sigmoid'
LOSS = tf.keras.losses.BinaryCrossentropy(from_logits=False)
METRICS = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall')
]
EPOCHS = 100
INIT_LR = 2e-5
OPTIMIZER_TYPE = 'adamw'

## Read Data

In [None]:
# Load the raw dataset from the CSV located at DATA_PATH into a DataFrame.
data = pd.read_csv(filepath_or_buffer=DATA_PATH)

## TF Preprocessing & model

In [None]:
"""
Label languages:
"""
def isEnglish(s):
    """Label a value as English ("en") or Arabic ("ar") based on ASCII content.

    The value is converted with ``str()`` first, so non-string inputs are
    classified by their string representation.

    Args:
        s: Any value; typically the text of one row.

    Returns:
        "en" when every character is ASCII (including the empty string),
        otherwise "ar". Any non-ASCII character at all yields "ar".
    """
    # Inspect code points directly instead of an encode/decode round trip:
    # the round trip used exceptions for control flow and let an uncaught
    # UnicodeEncodeError escape for strings containing lone surrogates.
    return "en" if all(ord(ch) < 128 for ch in str(s)) else "ar"
# Tag every row with the language label derived from its text.
data['lang'] = data['Text Column'].apply(isEnglish)

In [None]:
# TF Hub identifiers of the BERT variant to fine-tune: the model name, the
# matching text-preprocessing model, and the encoder itself.
bert_model_name = "bert_en_cased_L-12_H-768_A-12"
tfhub_handle_preprocess = "https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3"
tfhub_handle_encoder = "https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3"

In [None]:
def encode_labels_and_split_data(data):
    """Integer-encode the 'label' column and split texts/labels into train and test.

    The 'label' column of ``data`` is overwritten in place with the encoded
    values. The split is shuffled and controlled by the module-level
    SPLIT_PERC (test fraction) and SPLIT_RANDOM_STATE (seed) constants.

    Args:
        data: DataFrame with a 'Text Column' column and a 'label' column.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    data['label'] = LabelEncoder().fit_transform(data['label'])
    texts = data['Text Column']
    targets = data['label'].values
    splits = train_test_split(
        texts,
        targets,
        test_size=SPLIT_PERC,
        random_state=SPLIT_RANDOM_STATE,
        shuffle=True,
    )
    return tuple(splits)
X_train, X_test, y_train, y_test = encode_labels_and_split_data(data)

In [None]:
# Load the TF Hub preprocessing and encoder layers from the handles defined above.
# NOTE(review): build_classifier_model() creates its own KerasLayer instances from
# the same handles, so `bert_encoder` is not used anywhere else in the visible cells.
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)
bert_encoder = hub.KerasLayer(tfhub_handle_encoder)

# Smoke test: run a single sentence through the preprocessing layer and print
# the returned dictionary's keys, the shape of the word-id tensor, and the
# first 12 entries of each of the three fields for the first (only) example.
test = ['This is an amazing test!']
text_preprocessed = bert_preprocess_model(test)
print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
print(f'Word Ids   : {text_preprocessed["input_word_ids"][0, :12]}')
print(f'Input Mask : {text_preprocessed["input_mask"][0, :12]}')
print(f'Type Ids   : {text_preprocessed["input_type_ids"][0, :12]}')

In [None]:
def build_classifier_model():
    """Assemble the end-to-end text classifier on top of the TF Hub BERT encoder.

    The model takes raw strings, runs them through the Hub preprocessing
    layer and the trainable BERT encoder, applies dropout (rate 0.2) to the
    encoder's 'pooled_output', and finishes with a single-unit Dense layer
    whose activation is the module-level ACTIVATION constant.

    Returns:
        An uncompiled ``tf.keras.Model`` mapping the string input to the
        classifier output.
    """
    raw_text = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

    # Raw strings -> dictionary of encoder inputs.
    to_bert_inputs = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
    bert_inputs = to_bert_inputs(raw_text)

    # The encoder is trainable so its weights are fine-tuned with the head.
    bert = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    pooled = bert(bert_inputs)['pooled_output']

    regularized = tf.keras.layers.Dropout(0.2)(pooled)
    prediction = tf.keras.layers.Dense(
        1, activation=ACTIVATION, name='classifier'
    )(regularized)
    return tf.keras.Model(raw_text, prediction)

In [None]:
def train_model(X_train, y_train, batch_size=32):
    """Build, compile and fit the BERT classifier.

    Args:
        X_train: Training texts as a sized in-memory collection; ``len()`` is
            used to derive the number of optimizer steps.
        y_train: Training labels aligned with ``X_train``.
        batch_size: Mini-batch size passed to ``fit`` and used to size the
            learning-rate schedule. Defaults to 32.

    Returns:
        Tuple ``(model, history)``: the fitted Keras model and the History
        object returned by ``fit``.
    """
    model = build_classifier_model()
    # X_train is not a tf.data.Dataset, so tf.data.experimental.cardinality
    # cannot be applied to it; derive the batches per epoch from its length
    # with a ceiling division instead.
    steps_per_epoch = (len(X_train) + batch_size - 1) // batch_size
    # Use the EPOCHS constant: the lowercase `epochs` name was never defined.
    num_train_steps = steps_per_epoch * EPOCHS
    # Warm the learning rate up over the first 10% of all training steps.
    num_warmup_steps = int(0.1 * num_train_steps)
    optimizer = optimization.create_optimizer(init_lr=INIT_LR,
                                              num_train_steps=num_train_steps,
                                              num_warmup_steps=num_warmup_steps,
                                              optimizer_type=OPTIMIZER_TYPE)
    model.compile(optimizer=optimizer,
                  loss=LOSS,
                  metrics=METRICS)
    history = model.fit(X_train, y_train, batch_size=batch_size, epochs=EPOCHS)
    # Hand both back so the trained model and its metrics are not lost.
    return model, history


model, history = train_model(X_train, y_train)