### LSTM Model:

This notebook was run on Kaggle to make use of the GPU, so the cell outputs have been removed. You can find them in the screenshots folder.

In [1]:
!pip install 
!pip install tensorflow_addons 
!pip install gdown

In [2]:
#Importing libraries
import gdown
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, GridSearchCV
from sklearn.preprocessing import OneHotEncoder 
import tensorflow as tf
from keras.layers import Dense, Embedding, LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow import keras
import tensorflow_addons as tfa
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from livelossplot import PlotLossesKerasTF

### 2- Importing the clean dataset:

In [4]:
#Downloading the clean dataset from Google Drive
#(the argument is presumably the Drive file ID; the next cell expects clean_dialect_dataset.csv to exist afterwards)
!gdown 1Tjlk8QcMPLgEuOIQGO4n6WFnWoV-y0Ej

In [5]:
#Load the clean dataset; index_col=0 uses the first CSV column as the DataFrame index
data = pd.read_csv("clean_dialect_dataset.csv", index_col=0)

In [6]:
#Preview the first rows to sanity-check the load
data.head()

In [7]:
#(number of rows, number of columns) of the loaded dataset
data.shape

In [8]:
#Label encoding the dialect: learn the dialect -> integer id mapping first,
#then store the ids in a new column (the fitted encoder is kept for later use)
encoder = LabelEncoder()
encoder.fit(data["dialect"])
data["dialect_transformed"] = encoder.transform(data["dialect"])

In [9]:
#Persist the fitted label encoder for future use in the server.
#The context manager closes the file even if pickling raises.
with open('dl_dialect_encoder.pkl', 'wb') as output:
    pickle.dump(encoder, output)

In [10]:
#Features are the text column, targets are the dialect names
X, y = data["text"], data["dialect"]

In [11]:
#OneHot Encoding for the target column to work with dense layer in the model.
#The encoder reads the dialect column straight from `data` instead of from `y`, so this
#cell is idempotent: re-running it no longer one-hot-encodes an already encoded matrix.
#reshape(-1, 1) is needed because OneHotEncoder expects a 2D (n_samples, n_features) input.
y = OneHotEncoder().fit_transform(data["dialect"].to_numpy().reshape(-1, 1)).toarray()

In [12]:
#Hold out 20% of the rows as the test set; stratifying on the integer dialect ids
#keeps the dialect proportions the same in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    stratify=data["dialect_transformed"],
    random_state=77,
)

## Using a Neural Network Approach (LSTM):

In [39]:
# Length, in tokens, that every sequence is padded or truncated to
MAX_SEQUENCE_LENGTH = 140

#Splitting the training data into training and validation parts.
#NOTE: this overwrites X_train / y_train, so re-run the train/test split cell before re-running this one.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train, random_state=42)

# Initialize and fit the tokenizer on the training part only, so the vocabulary
# is not learned from validation or test text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

#Tokenizing the tweets (words the tokenizer has not seen are dropped, as no oov_token is set)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad the sequences so each sequence is the same length
X_train_seq_padded = pad_sequences(X_train_seq, maxlen=MAX_SEQUENCE_LENGTH)
X_val_seq_padded = pad_sequences(X_val_seq, maxlen=MAX_SEQUENCE_LENGTH)
X_test_seq_padded = pad_sequences(X_test_seq, maxlen=MAX_SEQUENCE_LENGTH)

In [46]:
# Construct the LSTM classifier: embedding -> LSTM -> dense hidden layer -> one output per dialect
model = Sequential()

# Vocabulary size is +1 because the tokenizer's word indices start at 1 and 0 is the padding value
model.add(Embedding(len(tokenizer.index_word)+1, 128))
model.add(LSTM(128, dropout=0.01, recurrent_dropout=0.01))
model.add(Dense(256, activation='relu'))
# One output unit per one-hot target column, instead of a hard-coded class count
model.add(Dense(y_train.shape[1], activation='sigmoid'))

In [47]:
METRICS = [
      # Share of samples whose highest-scoring class is the true class. Binary accuracy over
      # one-hot targets also counts every correctly predicted 0, so it is inflated and does
      # not match the argmax-based accuracy computed on the test set.
      keras.metrics.CategoricalAccuracy(name='accuracy'),
      keras.metrics.BinaryAccuracy(name='binary_accuracy'),
      keras.metrics.Precision(name='precision'),
      keras.metrics.Recall(name='recall'),
      keras.metrics.AUC(name='auc'),
      keras.metrics.AUC(name='prc', curve='PR'), # precision-recall curve
]

#First tried categorical_crossentropy (Adam, learning_rate=0.0001) but the performance was not that good.
#Then tried the SigmoidFocalCrossEntropy which resulted in better results but still overfitting!!
focal_loss = tfa.losses.SigmoidFocalCrossEntropy(alpha=0.20, gamma=2.0,
                                                 reduction=tf.keras.losses.Reduction.AUTO)
model.compile(optimizer=Adam(learning_rate=0.001), loss=focal_loss, metrics=METRICS)
model.summary()

In [48]:
epochs = 3

#Reduce learning rate when a metric has stopped improving.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1,
                              patience=2, min_lr=0.0001, mode='auto')

#Save the model weights whenever the validation PR-AUC improves.
#save_best_only=True is what makes `monitor`/`mode` take effect; without it the file is
#simply overwritten after every epoch.
checkpoint = ModelCheckpoint("model_weights.h5", monitor='val_prc', save_best_only=True,
                             save_weights_only=True, mode='max', verbose=1)

#Stop training when a monitored metric has stopped improving.
#NOTE: with patience=5 and epochs=3 this cannot trigger; it only matters if epochs is raised.
early_stop = tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)

#Callbacks passed to model.fit; the checkpoint was previously created but never registered,
#so no weights were ever written
callbacks = [PlotLossesKerasTF(), reduce_lr, checkpoint, early_stop]

In [49]:
#Fit the model on the padded training sequences, evaluating on the validation split each epoch
history = model.fit(
    x=X_train_seq_padded,
    y=y_train,
    validation_data=(X_val_seq_padded, y_val),
    epochs=epochs,
    batch_size=512,
    callbacks=callbacks,
)

In [50]:
#Collapse the per-class scores and the one-hot targets to class indices (position of the row maximum)
y_pred = model.predict([X_test_seq_padded]).argmax(axis=1)
y_test_class = y_test.argmax(axis=1)

In [51]:
#Per-class report followed by the overall accuracy on the test dataset
print(classification_report(y_test_class, y_pred))
print(f"Score: {accuracy_score(y_test_class, y_pred)}")