# Classification

In this notebook, we train a bidirectional LSTM to classify a speech by its speaker (one of five classes).

In [None]:
import numpy as np
import pickle

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2

from sklearn.model_selection import train_test_split

# Put the sibling ../data directory on the import path so that the
# project-local load_data module can be imported below. The append must
# run before the import.
import sys
sys.path.append("../data")
from load_data import load_data

# Directory that receives the pickled tokenizer and the trained model file.
SAVE_PATH = "../saved_models"

In [None]:
# Load the dataset through the project's load_data helper. Later cells read
# data["text"] (the documents) and data["speaker"] (the class labels).
data = load_data()

In [None]:
import os

# Each entry of data["text"] is joined with single spaces, so every document
# is expected to be an iterable of string tokens.
documents = list(data["text"])
flattened_documents = [" ".join(doc) for doc in documents]

# Build the word -> integer index vocabulary from the full corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(flattened_documents)

# Make sure the output directory exists; open(..., "wb") would otherwise
# raise FileNotFoundError on a fresh checkout.
os.makedirs(SAVE_PATH, exist_ok=True)

# Persist the fitted tokenizer next to the model (presumably so the same
# vocabulary can be reused when the saved model is loaded later).
with open(f"{SAVE_PATH}/classification_tokenizer.pickle", "wb") as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Every document is forced to exactly `maxlen` word indices.
maxlen = 5000

# Map each document to its list of word indices, then pad/truncate.
# Short documents are zero-padded at the end ("post"); documents longer than
# `maxlen` lose tokens from the front ("pre" is the pad_sequences default,
# spelled out here so the asymmetry is visible).
token_sequences = tokenizer.texts_to_sequences(flattened_documents)
X = np.asarray(
    pad_sequences(
        token_sequences,
        maxlen=maxlen,
        padding="post",
        truncating="pre",
    )
)

# One-hot encode the speaker labels into 5 columns; to_categorical expects
# integer class ids in the range [0, num_classes).
speaker_ids = list(data["speaker"])
y = np.asarray(to_categorical(speaker_ids, num_classes=5))

# Hold out 20% of the samples; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Speaker classifier: embedding -> bidirectional LSTM -> dropout -> dense head.
model = Sequential()
# The +1 leaves room for index 0, which pad_sequences uses as its fill value.
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=250, input_shape=(maxlen,)))
model.add(Bidirectional(LSTM(units=150)))
model.add(Dropout(0.5))
model.add(Dense(units=128, activation="relu", kernel_regularizer=l2(0.01)))
# One softmax unit per class, matching the 5-column one-hot labels.
model.add(Dense(units=5, activation="softmax", kernel_regularizer=l2(0.01)))

# categorical_crossentropy pairs with the one-hot targets; gradients are
# clipped element-wise to [-1, 1].
model.compile(
    optimizer=Adam(learning_rate=0.001, clipvalue=1.0),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

# Both callbacks monitor val_loss. The scheduler's patience is kept shorter
# than early stopping's: with equal patience the learning rate would only be
# lowered on the very epoch training stops, so the reduction could never help.
early_stopping = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=1, min_lr=1e-6)

# Validation uses a slice of the training data (Keras takes the last 20% of
# X_train/y_train before shuffling) rather than the test split, so that early
# stopping, best-weight restoration and LR scheduling never see X_test/y_test
# and the final test metrics stay unbiased.
# NOTE(review): with epochs=1 neither callback can fire; raise epochs for
# them to take effect.
model.fit(
    X_train,
    y_train,
    epochs=1,
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stopping, lr_scheduler],
)

model.save(f"{SAVE_PATH}/classification_model.keras")

In [None]:
# Score the trained model on the X_test/y_test split. evaluate() returns the
# loss followed by the compiled metrics, here just accuracy.
held_out_scores = model.evaluate(X_test, y_test)
test_loss, test_accuracy = held_out_scores

print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Loss: {test_loss:.4f}")