## Import Required Libraries

In [None]:
# General libraries:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from tqdm import tqdm
import matplotlib.pyplot as plt
import re
# scikit-learn libraries:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
# NLP libraries:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
# keras & tf libraries:
import tensorflow as tf
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
print(tf.__version__)

## Constants

In [None]:
# General:
# Path handed to pd.read_csv in the "Read Data" cell.
# NOTE(review): "/" looks like a placeholder — point this at the actual CSV file.
DATA_PATH = "/"
# Language names; MODEL_LANG_EN is the name the stopword-removal step is written around.
MODEL_LANG_AR = "arabic"
MODEL_LANG_EN = "english"
# Characters that clean_text is meant to strip from the text. Besides ASCII
# punctuation the list holds typographic symbols, box-drawing glyphs and a few
# accented letters (e.g. 'Â', 'à', 'é') — presumably mojibake residue; confirm
# before using this list on text where those letters are legitimate.
PUNCT = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…',
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─',
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞',
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

# Maps an English contraction to its expanded form. _get_contractions joins the
# keys into one regex and replace_contractions looks each match up here.
# Keys are case-sensitive (both "I'd" and "i'd" are listed) and use the plain
# ASCII apostrophe only.
CONTRACTION_DICT = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", 
"we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"}

# Hyperparameters:
SPLIT_PERC = 0.25  # test_size passed to train_test_split
SPLIT_RANDOM_STATE = 7  # random_state passed to train_test_split
EMBEDDING_DIMENSION = 100  # Embedding output size; also reused as the LSTM unit count and two Dense widths in build_model
VOCABULARY_SIZE = 50000  # num_words passed to the Keras Tokenizer
MAX_LENGTH = 200  # maxlen passed to pad_sequences
OOV_TOK = '<OOV>'  # oov_token passed to the Keras Tokenizer
TRUNCATE_TYPE = 'post'  # truncating= argument of pad_sequences
PADDING_TYPE = 'post'  # padding= argument of pad_sequences
NUM_EPOCHS = 20  # epochs passed to model.fit


## Read Data

In [None]:
# Load the dataset from DATA_PATH into a DataFrame.
# NOTE(review): later cells index a 'Text Column' and a 'label' column, so the
# CSV is expected to provide both — confirm against the real data file.
data = pd.read_csv(DATA_PATH)

## Preprocess Data

#### Clean text:

In [None]:
"""
Label languages:
"""
def isEnglish(s):
    """Label a value as English ("en") or Arabic ("ar").

    The value is converted with ``str`` and counted as English only when every
    character is ASCII; anything else is labelled "ar". This is a character-set
    heuristic, not real language detection.

    Args:
        s: Value to classify; non-strings are stringified first.

    Returns:
        "en" if the text is pure ASCII, otherwise "ar".
    """
    try:
        # Encoding straight to ASCII fails on any non-ASCII character.
        # UnicodeError also covers lone surrogates, which previously escaped
        # the handler as an uncaught UnicodeEncodeError.
        str(s).encode('ascii')
    except UnicodeError:
        return "ar"
    return "en"

"""
Lowercase:
"""
# Overwrite the text column with its lowercased form before the cleaning steps
# further down run.
data['Text Column'] = data['Text Column'].str.lower()


"""
Remove punctuations:
"""
def clean_text(x, puncts=None):
    """Strip punctuation and other symbol characters from a piece of text.

    Args:
        x: Value to clean; it is converted with ``str`` first.
        puncts: Optional iterable of substrings to remove. Defaults to the
            module-level ``PUNCT`` list.

    Returns:
        The text with every occurrence of each entry removed.
    """
    if puncts is None:
        puncts = PUNCT
    x = str(x)
    for punct in puncts:
        # str.replace is a no-op when punct is absent, so no membership test
        # is needed. Each entry is replaced individually (the list itself is
        # not a valid argument to str.replace).
        x = x.replace(punct, '')
    return x

"""
Expand contractions:
"""
def _get_contractions(CONTRACTION_DICT):
    contraction_re = re.compile('(%s)' % '|'.join(CONTRACTION_DICT.keys()))
    return CONTRACTION_DICT, contraction_re

# Build the lookup table and its compiled pattern once, when the cell runs.
contractions, contractions_re = _get_contractions(CONTRACTION_DICT)


def replace_contractions(text):
    """Return text with each match of contractions_re swapped for its entry in contractions."""
    return contractions_re.sub(lambda found: contractions[found.group(0)], text)
"""
Remove stopwords from a sentence. Note: pass the NLTK language name that matches the language of your model.

"""
def remove_stopwords(sentence, MODEL_LANG_EN="english"):
    """Drop stopwords and "@"-bearing tokens from a sentence.

    Args:
        sentence: Value to filter; it is converted with ``str`` first.
        MODEL_LANG_EN: NLTK stopword language name. The default equals the
            value of the module-level MODEL_LANG_EN constant, so the function
            can be called with the sentence alone.

    Returns:
        The lowercased tokens that are neither stopwords nor contain "@",
        joined with single spaces.
    """
    stop_words = _load_stopwords(MODEL_LANG_EN)
    tokens = (word.lower() for word in word_tokenize(str(sentence)))
    return ' '.join(w for w in tokens if w not in stop_words and "@" not in w)


# Stopword sets already loaded, keyed by NLTK language name.
_STOPWORDS_CACHE = {}


def _load_stopwords(language):
    """Return the NLTK stopword set for language, fetching it only once."""
    if language not in _STOPWORDS_CACHE:
        # Download and read the corpus once per language instead of once per
        # token; a frozenset makes each membership test O(1).
        nltk.download("stopwords", quiet=True)
        _STOPWORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_CACHE[language]

"""
Lemmatize text:
"""
def getLemmText(text):
    """Tokenize text, lemmatize every token with WordNet, and re-join with spaces."""
    words = word_tokenize(text)
    lemmatize = WordNetLemmatizer().lemmatize
    return ' '.join(map(lemmatize, words))

"""
Stem words:
"""
def getStemmText(text):
    """Tokenize text, Porter-stem every token, and re-join with spaces."""
    words = word_tokenize(text)
    stem = PorterStemmer().stem
    return ' '.join(map(stem, words))

# Tag each row's language, then normalise the text column step by step.
data['lang'] = data['Text Column'].apply(isEnglish)
# Contractions are expanded BEFORE punctuation is stripped: PUNCT contains the
# apostrophe, so once clean_text has run no contraction key can match.
# astype(str) keeps the regex substitution from failing on non-string cells.
data['Text Column'] = data['Text Column'].astype(str).apply(replace_contractions)
data['Text Column'] = data['Text Column'].apply(clean_text)
# remove_stopwords takes the stopword language as its second argument.
data['Text Column'] = data['Text Column'].apply(lambda x: remove_stopwords(x, MODEL_LANG_EN))
data['Text Column'] = data['Text Column'].apply(getLemmText)
data['Text Column'] = data['Text Column'].apply(getStemmText)

#### Model input preparation

In [None]:
def encode_labels_and_split_data(data):
    """Integer-encode the 'label' column and split text and labels into train/test sets.

    The 'label' column of data is overwritten in place with the encoded values.

    Args:
        data: DataFrame holding a 'Text Column' and a 'label' column.

    Returns:
        (X_train, X_test, y_train, y_test) from train_test_split, where X is
        the 'Text Column' series and y the encoded label array.
    """
    data['label'] = LabelEncoder().fit_transform(data['label'])
    features_train, features_test, labels_train, labels_test = train_test_split(
        data['Text Column'],
        data['label'].values,
        test_size=SPLIT_PERC,
        random_state=SPLIT_RANDOM_STATE,
        shuffle=True,
    )
    return features_train, features_test, labels_train, labels_test


def prepare_inputs(xtrain, xtest):
    """Turn raw train/test texts into padded integer sequences.

    Args:
        xtrain: Iterable of training texts.
        xtest: Iterable of test texts.

    Returns:
        (xtrain_pad, xtest_pad): arrays of token ids, each row padded or
        truncated to MAX_LENGTH.
    """
    tokenizer = Tokenizer(num_words=VOCABULARY_SIZE, oov_token=OOV_TOK)
    # Fit on the training texts only, so no test vocabulary leaks into the
    # model inputs; words seen only at test time map to OOV_TOK.
    tokenizer.fit_on_texts(list(xtrain))
    print('Vocabulary size:', len(tokenizer.word_index))

    def to_padded(texts):
        """Encode texts with the fitted tokenizer and pad to MAX_LENGTH."""
        return sequence.pad_sequences(
            tokenizer.texts_to_sequences(texts),
            maxlen=MAX_LENGTH,
            padding=PADDING_TYPE,
            truncating=TRUNCATE_TYPE,
        )

    return to_padded(xtrain), to_padded(xtest)

# Split the data, then tokenize and pad the two text partitions.
X_train, X_test, y_train, y_test = encode_labels_and_split_data(data)
# Pass the names the split actually produced (X_train / X_test).
xtrain_pad, xtest_pad = prepare_inputs(X_train, X_test)
# Sanity check: length of one padded row.
print(len(xtrain_pad[0]))

## Build model

In [None]:
def build_model(num_classes=None):
    """Build and compile the bidirectional-LSTM text classifier.

    Args:
        num_classes: Width of the softmax output. Defaults to the number of
            distinct values in data['label'], the column that
            encode_labels_and_split_data encodes into the training targets.

    Returns:
        A compiled Sequential model (sparse categorical cross-entropy loss,
        Adam optimizer, accuracy metric).
    """
    if num_classes is None:
        num_classes = len(data['label'].unique())
    model = Sequential()
    # The tokenizer is created with num_words=VOCABULARY_SIZE, so it never
    # emits an index >= VOCABULARY_SIZE; that bound is used as the embedding
    # input size instead of a word_index that is not visible in this scope.
    model.add(Embedding(VOCABULARY_SIZE, EMBEDDING_DIMENSION))
    model.add(SpatialDropout1D(0.25))
    model.add(Bidirectional(LSTM(EMBEDDING_DIMENSION, dropout=0.25, recurrent_dropout=0.25)))
    model.add(Dense(EMBEDDING_DIMENSION, activation='relu'))
    model.add(Dropout(0.25))
    model.add(Dense(150, activation='relu'))
    model.add(Dropout(0.25))
    model.add(Dense(EMBEDDING_DIMENSION, activation='relu'))
    model.add(Dropout(0.25))
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
model = build_model()
# Train on the padded training sequences; the label arrays are the y_train /
# y_test values returned by encode_labels_and_split_data.
history = model.fit(xtrain_pad, y_train,
                    epochs=NUM_EPOCHS,
                    validation_data=(xtest_pad, y_test), verbose=1)