# Categorical embedding

## Libraries

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, plot_confusion_matrix
import matplotlib.pyplot as plt
from keras import metrics
from keras.models import Model
from keras.layers import Dense, Dropout, Input, Embedding,Reshape, Concatenate

## Training data

In [None]:
# Load the RASFF dataset: semicolon-separated, first row is the header and the
# first column is used as the index.
data = pd.read_csv("../splited_full_RASFF_DATA.csv", sep=";", header=0, index_col=0)

# Shuffle all rows. The shuffle is seeded so that the row order -- and with it
# the content of the seeded train/val/test splits made later -- is the same on
# every run.
data = data.sample(frac=1, random_state=42)

data.head(5)

## Basic pre-processing

In [None]:
# Row count before the date clean-up.
print("Initial length:", len(data))

# Hazard categories are handled as strings.
data["HAZARDS_CAT"] = data["HAZARDS_CAT"].astype(str)

# Reduce DATE_CASE to its month number. Values that cannot be parsed as a date
# are coerced to NaT and therefore end up with a missing month.
data["DATE_CASE"] = pd.to_datetime(
    data["DATE_CASE"].astype(str), errors="coerce"
).dt.month

# Discard the rows whose date could not be parsed.
data.dropna(subset=["DATE_CASE"], inplace=True)

# Row count after the clean-up.
print("Final length:", len(data))

## Feature selection

In [None]:
# Positional (iloc) indices of the input feature columns and of the target
# column in `data`.
categorical_vars = [0, 1, 6, 8]
target_vars = [2]

# X holds the selected feature columns; Y holds the single target column and
# stays a DataFrame because it is selected with a list of positions.
X = data.iloc[:, categorical_vars]
Y = data.iloc[:, target_vars]

In [None]:
# One-hot encode the target column into a dense 2-D array with one column per
# class found in Y.
#
# The encoder is left at its default (sparse) output and the result is
# densified explicitly: the `sparse=False` keyword was deprecated in
# scikit-learn 1.2 and removed in 1.4, whereas `.toarray()` on the sparse
# result yields the same dense array on old and new releases alike.
ency = OneHotEncoder(handle_unknown="ignore")

ency.fit(Y.values)

y_one_hot = ency.transform(Y.values).toarray()

## Split train-val-test

In [None]:
# Step 1: hold out 20% of all rows as the test set.
x_training_data, x_test_data, y_training_data, y_test_data = train_test_split(
    X,
    y_one_hot,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

# Step 2: hold out 20% of the remaining rows as the validation set, which
# leaves roughly 64% / 16% / 20% of the data for train / validation / test.
x_training_data, x_val_data, y_training_data, y_val_data = train_test_split(
    x_training_data,
    y_training_data,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

## Encoding and conversion to lists so the data can be fed into the model

In [None]:
# From here on `categorical_vars` holds the feature column *labels* instead of
# their positions: the code below indexes the DataFrames by column label.
categorical_vars = data.columns[categorical_vars]

def preproc(X_train, X_test, X_val):
    """Encode every categorical column as integer codes, one array per column.

    For each column label in the module-level ``categorical_vars`` a code
    table is built from the sorted unique values found in ``X_train`` and is
    then applied to all three frames.

    Values of ``X_test`` / ``X_val`` that do not occur in ``X_train`` have no
    entry in the table; they are replaced by code 0 and therefore share the
    code of the first training category.

    Args:
        X_train: Frame the code tables are derived from.
        X_test: Test frame, encoded with the training code tables.
        X_val: Validation frame, encoded with the training code tables.

    Returns:
        A tuple ``(train, test, val)`` of lists; each list contains one 1-D
        array per categorical column, in ``categorical_vars`` order.
    """
    encoded_train, encoded_test, encoded_val = [], [], []

    for column in categorical_vars:
        # np.unique returns the distinct values sorted, so the codes follow
        # the sort order of the training values.
        codes = {
            value: code
            for code, value in enumerate(np.unique(X_train[column]))
        }

        encoded_train.append(X_train[column].map(codes).values)
        # Unseen values map to NaN and fall back to code 0.
        encoded_test.append(X_test[column].map(codes).fillna(0).values)
        encoded_val.append(X_val[column].map(codes).fillna(0).values)

    return encoded_train, encoded_test, encoded_val

In [None]:
# Encode the train / test / validation feature frames into lists of per-column
# code arrays (one array per categorical feature).
input_list_train, input_list_test, input_list_testval = preproc(x_training_data, x_test_data, x_val_data)

## Metrics definition

In [None]:
def top_1_categorical_accuracy(y_true, y_pred):
    """Top-1 categorical accuracy.

    Thin wrapper that calls ``metrics.top_k_categorical_accuracy`` with
    ``k=1``; having it as a named function lets it be listed in the
    ``metrics`` argument of ``model.compile``.
    """
    return metrics.top_k_categorical_accuracy(y_true, y_pred, k=1)

def top_2_categorical_accuracy(y_true, y_pred):
    """Top-2 categorical accuracy.

    Thin wrapper that calls ``metrics.top_k_categorical_accuracy`` with
    ``k=2``; having it as a named function lets it be listed in the
    ``metrics`` argument of ``model.compile``.
    """
    return metrics.top_k_categorical_accuracy(y_true, y_pred, k=2)

def top_3_categorical_accuracy(y_true, y_pred):
    """Top-3 categorical accuracy.

    Thin wrapper that calls ``metrics.top_k_categorical_accuracy`` with
    ``k=3``; having it as a named function lets it be listed in the
    ``metrics`` argument of ``model.compile``.
    """
    return metrics.top_k_categorical_accuracy(y_true, y_pred, k=3)

## Embeddings + MLP Models (cases 1 and 3)

In [None]:
# Build one Input -> Embedding branch per categorical feature, concatenate the
# branches and classify the result with a three-layer MLP.
input_models = []
output_embeddings = []

for categorical_var in categorical_vars:
    # Spaces are stripped from the column label before it is used in the
    # layer names.
    cat_emb_name = categorical_var.replace(" ", "") + "_Embedding"
    input_name = "Input_" + categorical_var.replace(" ", "")

    # Vocabulary size = number of distinct training values of this feature.
    no_of_unique_cat = x_training_data[categorical_var].nunique()
    # Embedding width: half the cardinality (rounded up), capped at 50.
    embedding_size = int(min(np.ceil(no_of_unique_cat / 2), 50))

    input_model = Input(shape=(1,), name=input_name)
    output_model = Embedding(no_of_unique_cat, embedding_size, name=cat_emb_name)(input_model)
    # Drop the length-1 axis introduced by the (1,)-shaped input.
    output_model = Reshape(target_shape=(embedding_size,))(output_model)

    input_models.append(input_model)
    output_embeddings.append(output_model)

output = Concatenate()(output_embeddings)

# Hidden stack: (units, dropout rate) per Dense + Dropout pair.
for units, dropout_rate in ((2048, 0.3), (1024, 0.2), (512, 0.2)):
    output = Dense(units, activation="relu")(output)
    output = Dropout(dropout_rate)(output)

# The softmax layer needs exactly one unit per one-hot target column, so its
# width is read from the training targets instead of being hard-coded.
n_classes = y_training_data.shape[1]
output = Dense(n_classes, activation="softmax")(output)

model = Model(inputs=input_models, outputs=output)

In [None]:
# Optimise categorical cross-entropy with Adam; besides plain accuracy, track
# the top-1 / top-2 / top-3 accuracy functions.
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=[
        "accuracy",
        top_1_categorical_accuracy,
        top_2_categorical_accuracy,
        top_3_categorical_accuracy,
    ],
)

# Print the layer-by-layer overview of the network.
model.summary()

In [None]:
# Train for 5 epochs in batches of 64, passing the validation split as
# validation_data; the returned History object is kept in `hist`.
# (A previous, since disabled, variant of this call validated on the test
# split instead; the old "no validation data" review remark no longer applies.)

hist = model.fit(input_list_train, y_training_data, validation_data=(input_list_testval, y_val_data), epochs=5, batch_size=64, verbose=1)

In [None]:
def get_metrics():
    """Evaluate the global ``model`` on the test split and report the results.

    Prints the loss and the four compiled metrics (accuracy, top-1, top-2,
    top-3), prints scikit-learn's classification report and shows the
    confusion matrix in a 20x20-inch figure.

    Reads ``model``, ``input_list_test`` and ``y_test_data`` from module
    scope and returns nothing.
    """
    # Class index with the highest predicted probability / the one-hot target.
    predicted_classes = np.argmax(
        model.predict(input_list_test, batch_size=64), axis=-1
    )
    true_classes = np.argmax(y_test_data, axis=-1)

    # The targets must be passed as well: without them evaluate() has nothing
    # to compare the predictions against when computing loss and metrics.
    test_loss, test_accuracy, acc1, acc2, acc3 = model.evaluate(
        input_list_test, y_test_data
    )

    print("Loss:", test_loss)
    print("Accuracy:", test_accuracy)
    print("Top-1 Accuracy:", acc1)
    print("Top-2 Accuracy:", acc2)
    print("Top-3 Accuracy:", acc3)

    # zero_division=1: score classes without predicted/true samples as 1
    # instead of emitting a warning.
    print(classification_report(true_classes, predicted_classes, zero_division=1))

    cm = confusion_matrix(true_classes, predicted_classes)
    cm = ConfusionMatrixDisplay(confusion_matrix=cm)

    fig, ax = plt.subplots(figsize=(20, 20))
    cm.plot(ax=ax)

    plt.show()

In [None]:
# Report the test-set performance of the current model.
get_metrics()

In [None]:
# result = model.predict(input_list_test, batch_size=64)
# result = np.argmax(result, axis=1)

# valid_loss, valid_accuracy, acc1, acc2, acc3 = model.evaluate(input_list_test)

In [None]:
# print("Loss:", valid_loss)
# print("Accuracy:", valid_accuracy)
# print("Top-1 Accuracy:", acc1)
# print("Top-2 Accuracy:", acc2)
# print("Top-3 Accuracy:", acc3)

In [None]:
# print(classification_report(np.argmax(y_test_data, axis=-1), result, zero_division=True))

In [None]:
# cm = confusion_matrix(np.argmax(y_test_data, axis=-1), result)
# cm = ConfusionMatrixDisplay(confusion_matrix=cm)

# fig, ax = plt.subplots(figsize=(20, 20))
# cm.plot(ax=ax)

In [None]:
# Plot the per-epoch training / validation loss and accuracy stored in `hist`.
# The number of epochs is read from the history itself, so the x-axis always
# matches the recorded curves whatever `epochs` value the last fit() used.
N = len(hist.history["loss"])
epoch_axis = np.arange(0, N)

plt.style.use("ggplot")

plt.figure()

# (history key, legend label) for each curve, in drawing order.
for history_key, curve_label in (
    ("loss", "train_loss"),
    ("val_loss", "val_loss"),
    ("accuracy", "train_acc"),
    ("val_accuracy", "val_acc"),
):
    plt.plot(epoch_axis, hist.history[history_key], label=curve_label)

plt.title("Training Loss and Accuracy")
plt.xlabel("Epoch #")
plt.ylabel("Loss/Accuracy")
plt.legend(loc="lower left")

plt.show()

In [None]:
# Five consecutive 25-epoch training rounds. After each round the fit history
# and the test-set evaluation are recorded, the model is saved to
# model<round>.h5 and the full metric report is printed.
# NOTE(review): `model` is the same object in every round and is only
# re-compiled, never rebuilt, so each round appears to continue from the
# weights of the previous one rather than start from scratch -- confirm that
# this is intended.
historials = []
evaluations = []

for i in range(1, 6):
    model.compile(
        loss="categorical_crossentropy",
        optimizer="adam",
        metrics=[
            "accuracy",
            top_1_categorical_accuracy,
            top_2_categorical_accuracy,
            top_3_categorical_accuracy,
        ],
    )

    hist = model.fit(
        input_list_train,
        y_training_data,
        validation_data=(input_list_testval, y_val_data),
        epochs=25,
        batch_size=64,
        verbose=0,
    )
    historials.append(hist)

    evaluation = model.evaluate(x=input_list_test, y=y_test_data)
    evaluations.append(evaluation)

    model.save(f"model{i}.h5")

    get_metrics()

    print("\n\n-----------------------\n")