# Categorical Embedding

## Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import functools

from keras import backend as K
from itertools import chain
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from keras.metrics import top_k_categorical_accuracy
from keras.models import Model, Sequential
from keras.layers import Dense, Dropout, Input, Embedding,Reshape, Concatenate, Conv1D, BatchNormalization, GlobalMaxPooling1D, MaxPooling1D
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, recall_score, precision_score, accuracy_score

## Data

In [None]:
# Load the semicolon-separated dataset: the first row supplies the column
# names and the first column becomes the row index. Sampling with frac=1
# returns every row in a random (unseeded) order, i.e. a full shuffle.
data = pd.read_csv(
    "./data/splited_full_RASFF_DATA.csv",
    sep=";",
    header=0,
    index_col=0,
).sample(frac=1)

# Display the first row as a quick look at the loaded columns.
data.head(1)

In [None]:
class Stage:
    """One-hot encoded train/validation/test split of the global ``data`` frame.

    The feature and target columns are selected from ``data`` by position,
    one-hot encoded, and split 60% / 20% / 20% into train / validation / test
    (20% is held out for test, then 25% of the remainder for validation).

    Attributes set by the constructor:
        input, output: the positional column selections passed in.
        x, y: the encoded full feature matrix (sparse) and target matrix (dense).
        x_train, y_train, x_val, y_val, x_test, y_test: the three splits.
    """

    def __init__(self, input, output):
        """Select columns from the global ``data`` frame, then encode and split.

        Args:
            input: positional column indices (as accepted by ``DataFrame.iloc``)
                used as features.
            output: positional column indices used as the target.
        """
        self.input = input
        self.output = output

        self.x = data.iloc[:, input]
        self.y = data.iloc[:, output]

        self.x_train, self.y_train = None, None
        self.x_val, self.y_val = None, None
        self.x_test, self.y_test = None, None

        self.__transform()

    @staticmethod
    def _make_dense_encoder():
        """Return a OneHotEncoder that yields dense arrays on any scikit-learn version."""
        try:
            # scikit-learn >= 1.2 spells the option `sparse_output`; the old
            # `sparse` keyword was removed in 1.4.
            return OneHotEncoder(handle_unknown="ignore", sparse_output=False)
        except TypeError:
            # Older scikit-learn releases only know the `sparse` keyword.
            return OneHotEncoder(handle_unknown="ignore", sparse=False)

    def __transform(self):
        """One-hot encode ``x`` and ``y`` and populate the train/val/test splits."""
        # Features keep the encoder's default sparse output; targets are dense
        # so that argmax can be taken over them in get_metrics().
        strategy_x = OneHotEncoder(handle_unknown="ignore")
        strategy_y = self._make_dense_encoder()

        self.x = strategy_x.fit_transform(self.x.values)
        self.y = strategy_y.fit_transform(self.y.values)

        # NOTE(review): this first split has no random_state while the second
        # one is seeded, so the test split differs between runs - confirm
        # whether that is intended.
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            self.x, self.y, test_size=0.2
        )
        self.x_train, self.x_val, self.y_train, self.y_val = train_test_split(
            self.x_train, self.y_train, test_size=0.25, random_state=42, shuffle=True
        )

    def get_metrics(self, estimator=None):
        """Print test-split scores and show the confusion matrix.

        Args:
            estimator: object with a Keras-style ``predict(x, batch_size=...)``
                method returning per-class scores. When omitted, the
                module-level ``model`` is used, as before.

        Relies on the module-level ``get_specifity`` function being defined by
        the time this method is called.
        """
        classifier = model if estimator is None else estimator

        # Convert one-hot targets and per-class scores to class indices once.
        y_true = np.argmax(self.y_test, axis=-1)
        y_pred = np.argmax(classifier.predict(self.x_test, batch_size=64), axis=-1)

        print(f"- Accuracy: {round(accuracy_score(y_true, y_pred)*100, 2)}%")
        print(f"- Specifity: {round(get_specifity(y_true, y_pred)*100, 2)}%")
        print(f"- Sensitivity: {round(recall_score(y_true, y_pred, average='macro', zero_division=0)*100, 2)}%")
        print(f"- Precision: {round(precision_score(y_true, y_pred, average='macro', zero_division=0)*100, 2)}%")

        # zero_division=0 keeps the report consistent with the macro scores
        # printed above (the previous value, True, counted undefined scores as 1).
        print(classification_report(y_true, y_pred, zero_division=0))

        # Pass the full label set explicitly: otherwise classes missing from
        # the test split are dropped from the matrix and the 0..k-1 tick
        # labels no longer match the real class indices.
        class_ids = np.arange(self.y_test.shape[1])
        cm = confusion_matrix(y_true, y_pred, labels=class_ids)
        cm = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=list(class_ids))

        _, ax = plt.subplots(figsize=(10, 10))
        cm.plot(ax=ax)

        plt.show()

## Preprocessing

In [None]:
# Drop rows without a case date BEFORE casting to str: astype(str) turns NaN
# into the literal string "nan", after which dropna() can no longer detect
# the missing values and would remove nothing.
data = data.dropna(subset=['DATE_CASE'])

# Cast the categorical columns to str so each distinct value is treated as a
# category label. Missing values in the three columns other than DATE_CASE
# still become the string "nan", exactly as before.
data = data.astype({
    "DATE_CASE": str,
    "HAZARDS_CAT": str,
    "COUNT_DESTIN": str,
    "COUNT_CONCERN": str,
})

In [None]:
def get_specifity(y_actual, y_pred):
    """Return the micro-averaged one-vs-rest specificity, TN / (TN + FP).

    For every label that occurs in either input, a sample counts as a
    false positive when it is predicted as that label but actually belongs
    to another one, and as a true negative when it neither belongs to nor is
    predicted as that label. The counts are summed over all labels before
    the ratio is taken.

    Args:
        y_actual: 1-D sequence of true class labels.
        y_pred: 1-D sequence of predicted class labels, same length.

    Returns:
        The specificity as a float in [0, 1]; 0.0 when there are no negatives
        at all (e.g. empty input), instead of raising ZeroDivisionError.
    """
    y_actual = np.asarray(y_actual)
    y_pred = np.asarray(y_pred)

    true_negatives = 0
    false_positives = 0

    for label in np.union1d(y_actual, y_pred):
        is_label = y_actual == label
        predicted_label = y_pred == label

        false_positives += int(np.sum(predicted_label & ~is_label))
        # A true negative only requires that the sample is neither labelled
        # nor predicted as `label`; it does not have to be classified
        # correctly among the remaining classes.
        true_negatives += int(np.sum(~predicted_label & ~is_label))

    negatives = true_negatives + false_positives
    if negatives == 0:
        return 0.0

    return true_negatives / negatives

## Data Mining

In [None]:
# Stage 1: columns 0, 1, 6 and 8 (by position) are the features and
# column 2 is the target.
stage1 = Stage(input=[0, 1, 6, 8], output=[2])

In [None]:
# Clear Keras' global state so re-running this cell builds a fresh model.
K.clear_session()

# Derive the layer widths from the encoded data instead of hard-coding them:
# the input width is the number of one-hot feature columns and the output
# width is the number of one-hot target classes.
n_features = stage1.x_train.shape[1]
n_classes = stage1.y_train.shape[1]

# Fully connected classifier: three ReLU layers of decreasing width, each
# followed by dropout, and a softmax output over the target classes.
model = Sequential()
model.add(Dense(2048, activation="relu", input_dim=n_features))
model.add(Dropout(0.5))
model.add(Dense(1024, activation="relu"))
model.add(Dropout(0.4))
model.add(Dense(512, activation="relu"))
model.add(Dropout(0.3))
model.add(Dense(n_classes, activation="softmax"))

In [None]:
# Configure training for one-hot targets: categorical cross-entropy loss,
# Adam optimizer, and categorical accuracy as the tracked metric.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["categorical_accuracy"],
)

# Print the layer-by-layer overview of the compiled model.
model.summary()

In [23]:
# Train on the stage-1 training split and evaluate on its validation split
# after every epoch; the returned history is kept in `hist`.
hist = model.fit(
    stage1.x_train,
    stage1.y_train,
    validation_data=(stage1.x_val, stage1.y_val),
    epochs=170,
    batch_size=500,
)

In [None]:
# NOTE(review): `preproc` is not defined in the visible part of this
# notebook - confirm where it comes from and that it returns its results in
# (train, test, val) order, matching the argument order used here.
(
    stage1.input_list_train,
    stage1.input_list_test,
    stage1.input_list_testval,
) = preproc(stage1.x_train, stage1.x_test, stage1.x_val)

# Fit the same `model` object again on the preprocessed inputs; this
# overwrites `hist` from any earlier training run.
# NOTE(review): presumably a leftover from a multi-input model - verify that
# these inputs are compatible with the current network.
hist = model.fit(
    stage1.input_list_train,
    stage1.y_train,
    validation_data=(stage1.input_list_testval, stage1.y_val),
    epochs=5,
    batch_size=64,
    verbose=1,
)

In [None]:
# Plot the per-epoch training and validation loss/accuracy stored in `hist`.
plt.style.use("ggplot")

plt.figure()

# History keys follow the metric names given to compile(). The model is
# compiled with 'categorical_accuracy', so "accuracy" does not exist and
# would raise KeyError; fall back to "accuracy" only when the model was
# compiled with that name instead.
acc_key = "categorical_accuracy" if "categorical_accuracy" in hist.history else "accuracy"

plt.plot(hist.history["loss"], label="train_loss")
plt.plot(hist.history["val_loss"], label="val_loss")
plt.plot(hist.history[acc_key], label="train_acc")
plt.plot(hist.history[f"val_{acc_key}"], label="val_acc")

plt.title("Training Loss and Accuracy")
plt.xlabel("Epoch #")
plt.ylabel("Loss/Accuracy")
plt.legend(loc="lower left")

In [None]:
# Evaluate on stage1's held-out test split: prints accuracy, specificity,
# sensitivity, precision and the classification report, then shows the
# confusion matrix.
stage1.get_metrics()