# Categorical embedding

## Libraries

In [6]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, plot_confusion_matrix
import matplotlib.pyplot as plt
from keras import metrics
from keras.models import Model
from keras.layers import Dense, Dropout, Input, Embedding,Reshape, Concatenate
from itertools import chain

## Training data

In [2]:
data = pd.read_csv("./data/full_RASFF_DATA.csv", sep=";", header=0, index_col=0)

data.head(3)

Unnamed: 0,CLASSIF,DATE_CASE,REF,NOT_COUNTRY,SUBJET,PROD_CAT,TYPE,RISK_DECISION,ACTION_TAKEN,DISTRIBUTION_STAT,PRODUCT,HAZARDS,HAZARDS_CAT,COUNT_ORIGEN,COUNT_DESTIN,COUNT_CONCERN,NUMBER
0,alert,2020-10-16,2020.4364,France,Listeria monocytogenes (presence) in ham trimm...,meat and meat products (other than poultry),food,serious,recall from consumers,distribution to other member countries,ham trimmings,listeria monocytogenes,microbial contaminants (other),France,"Czech Republic,United Kingdom",,
1,border rejection,2020-10-16,2020.4349,Bulgaria,prochloraz (0.696 mg/kg - ppm) in mandarins fr...,fruits and vegetables,food,serious,destruction,product not (yet) placed on the market,mandarins,prochloraz,pesticide residues,Turkey,,Bulgaria,
2,border rejection,2020-10-16,2020.435,Bulgaria,fenvalerate (0.357 mg/kg - ppm) in chilled man...,fruits and vegetables,food,serious,re-dispatch,product not (yet) placed on the market,chilled mandarins,fenvalerate,pesticide residues,Turkey,,Bulgaria,


## Basic pre-processing

In [3]:
data.DATE_CASE = data.DATE_CASE.astype(str)
data.HAZARDS_CAT = data.HAZARDS_CAT.astype(str)
data.COUNT_DESTIN = data.COUNT_DESTIN.astype(str)
data.COUNT_CONCERN = data.COUNT_CONCERN.astype(str)

data.dropna(subset=data.columns[[1, 3, 5, 8, 9, 12, 13]], inplace=True)

data = data.sample(frac=1)

In [4]:
def chainer(s):
    return list(chain.from_iterable(s.str.split(',')))

In [7]:
lens = data['HAZARDS_CAT'].str.split(',').map(len)
split1 = pd.DataFrame({'DATE_CASE': np.repeat(data['DATE_CASE'], lens),
                    'NOT_COUNTRY': np.repeat(data['NOT_COUNTRY'], lens),
                    'PROD_CAT': np.repeat(data['PROD_CAT'], lens),
                    'TYPE': np.repeat(data['TYPE'], lens),
                    'RISK_DECISION': np.repeat(data['RISK_DECISION'], lens),
                    'ACTION_TAKEN': np.repeat(data['ACTION_TAKEN'], lens),
                    'DISTRIBUTION_STAT': np.repeat(data['DISTRIBUTION_STAT'], lens),
                    'HAZARDS_CAT': chainer(data['HAZARDS_CAT']),
                    'COUNT_ORIGEN': np.repeat(data['COUNT_ORIGEN'], lens),
                    'COUNT_DESTIN': np.repeat(data['COUNT_DESTIN'], lens),
                    'COUNT_CONCERN': np.repeat(data['COUNT_CONCERN'], lens)})

lens = split1['COUNT_ORIGEN'].str.split(',').map(len)
split2 = pd.DataFrame({'DATE_CASE': np.repeat(split1['DATE_CASE'], lens),
                    'NOT_COUNTRY': np.repeat(split1['NOT_COUNTRY'], lens),
                    'PROD_CAT': np.repeat(split1['PROD_CAT'], lens),
                    'TYPE': np.repeat(split1['TYPE'], lens),
                    'RISK_DECISION': np.repeat(split1['RISK_DECISION'], lens),
                    'ACTION_TAKEN': np.repeat(split1['ACTION_TAKEN'], lens),
                    'DISTRIBUTION_STAT': np.repeat(split1['DISTRIBUTION_STAT'], lens),
                    'HAZARDS_CAT': np.repeat(split1['HAZARDS_CAT'], lens),
                    'COUNT_ORIGEN': chainer(split1['COUNT_ORIGEN']),
                    'COUNT_DESTIN': np.repeat(split1['COUNT_DESTIN'], lens),
                    'COUNT_CONCERN': np.repeat(split1['COUNT_CONCERN'], lens)})

lens = split2['COUNT_DESTIN'].str.split(',').map(len)
split3 = pd.DataFrame({'DATE_CASE': np.repeat(split2['DATE_CASE'], lens),
                    'NOT_COUNTRY': np.repeat(split2['NOT_COUNTRY'], lens),
                    'PROD_CAT': np.repeat(split2['PROD_CAT'], lens),
                    'TYPE': np.repeat(split2['TYPE'], lens),
                    'RISK_DECISION': np.repeat(split2['RISK_DECISION'], lens),
                    'ACTION_TAKEN': np.repeat(split2['ACTION_TAKEN'], lens),
                    'DISTRIBUTION_STAT': np.repeat(split2['DISTRIBUTION_STAT'], lens),
                    'HAZARDS_CAT': np.repeat(split2['HAZARDS_CAT'], lens),
                    'COUNT_ORIGEN': np.repeat(split2['COUNT_ORIGEN'], lens),
                    'COUNT_DESTIN': chainer(split2['COUNT_DESTIN']),
                    'COUNT_CONCERN': np.repeat(split2['COUNT_CONCERN'], lens)})

lens = split3['COUNT_CONCERN'].str.split(',').map(len)
split4 = pd.DataFrame({'DATE_CASE': np.repeat(split3['DATE_CASE'], lens),
                    'NOT_COUNTRY': np.repeat(split3['NOT_COUNTRY'], lens),
                    'PROD_CAT': np.repeat(split3['PROD_CAT'], lens),
                    'TYPE': np.repeat(split3['TYPE'], lens),
                    'RISK_DECISION': np.repeat(split3['RISK_DECISION'], lens),
                    'ACTION_TAKEN': np.repeat(split3['ACTION_TAKEN'], lens),
                    'DISTRIBUTION_STAT': np.repeat(split3['DISTRIBUTION_STAT'], lens),
                    'HAZARDS_CAT': np.repeat(split3['HAZARDS_CAT'], lens),
                    'COUNT_ORIGEN': np.repeat(split3['COUNT_ORIGEN'], lens),
                    'COUNT_DESTIN': np.repeat(split3['COUNT_DESTIN'], lens),
                    'COUNT_CONCERN': chainer(split3['COUNT_CONCERN'])})

split4 = split4.reset_index(drop = True)
split4 = split4.dropna(subset = ['DATE_CASE'])

data = split4.copy()

In [8]:
data.head(3)

Unnamed: 0,DATE_CASE,NOT_COUNTRY,PROD_CAT,TYPE,RISK_DECISION,ACTION_TAKEN,DISTRIBUTION_STAT,HAZARDS_CAT,COUNT_ORIGEN,COUNT_DESTIN,COUNT_CONCERN
0,2003-06-18,Italy,fruits and vegetables,food,undecided,re-dispatch,,food additives and flavourings,Syria,,Italy
1,2003-06-18,Italy,fruits and vegetables,food,undecided,re-dispatch,,food additives and flavourings,Syria,,Spain
2,2019-11-14,Finland,other food product / mixed,food,undecided,recall from consumers,distribution restricted to notifying country,mycotoxins,United Kingdom,Finland,


## Features selection

In [9]:
categorical_vars = [0, 1, 6, 8]
target_vars = [2]

X = data.iloc[:, categorical_vars]
Y = data.iloc[:, target_vars]

In [14]:
ency = OneHotEncoder(handle_unknown="ignore", sparse=False)

ency.fit(Y.values)

OneHotEncoder(handle_unknown='ignore', sparse=False)

## Split train-val-test

In [15]:
# x_training_data, x_test_data, y_training_data, y_test_data = train_test_split(X, y_one_hot, test_size=0.2, random_state=42, shuffle=True)

train_mask = (data.DATE_CASE >= "2004-01-01") & (data.DATE_CASE <= "2018-12-31")
test_mask = (data.DATE_CASE >= "2019-01-01") & (data.DATE_CASE <= "2019-12-31")

x_training_data = X.loc[train_mask]
y_training_data = Y.loc[train_mask]
x_test_data = X.loc[test_mask]
y_test_data = Y.loc[test_mask]

x_training_data = ency.transform(x_training_data.values)
x_test_data = ency.transform(x_test_data.values)

x_training_data, x_val_data, y_training_data, y_val_data = train_test_split(x_training_data, y_training_data, test_size=0.2, random_state=42, shuffle=True)

ValueError: The number of features in X is different to the number of features of the fitted data. The fitted data had 1 features and the X has 4 features.

## Coding and conversion to lists for beign able to introduce it into the model

In [None]:
categorical_vars = data.iloc[:, categorical_vars].columns

def preproc(X_train, X_test, X_val):
    input_list_train = []
    input_list_test = []
    input_list_testval = []
    
    for c in categorical_vars:
        raw_vals = np.unique(X_train[c])
        val_map = {}
        for i in range(len(raw_vals)):
            val_map[raw_vals[i]] = i       
        
        input_list_train.append(X_train[c].map(val_map).values)
        input_list_test.append(X_test[c].map(val_map).fillna(0).values)
        input_list_testval.append(X_val[c].map(val_map).fillna(0).values)

    return input_list_train, input_list_test,input_list_testval

In [None]:
input_list_train, input_list_test, input_list_testval = preproc(x_training_data, x_test_data, x_val_data)

## Metrics definition

In [None]:
def top_1_categorical_accuracy(y_true, y_pred):
	return metrics.top_k_categorical_accuracy(y_true, y_pred, k=1)

def top_2_categorical_accuracy(y_true, y_pred):
    return metrics.top_k_categorical_accuracy(y_true, y_pred, k=2)

def top_3_categorical_accuracy(y_true, y_pred):
    return metrics.top_k_categorical_accuracy(y_true, y_pred, k=3)

## Embeddings + MLP Models (cases 1 and 3)

In [None]:
input_models = []
output_embeddings = []

for categorical_var in categorical_vars:
    cat_emb_name = categorical_var.replace(" ", "") + "_Embedding"
    input_name = "Input_" + categorical_var.replace(" ", "")
    no_of_unique_cat = x_training_data[categorical_var].nunique()
    embedding_size = int(min(np.ceil((no_of_unique_cat)/2), 50))
   
    input_model = Input(shape=(1, ), name=input_name)
    output_model = Embedding(no_of_unique_cat, embedding_size, name=cat_emb_name)(input_model)
    output_model = Reshape(target_shape=(embedding_size, ))(output_model)    
    
    input_models.append(input_model)
    output_embeddings.append(output_model)
  
output = Concatenate()(output_embeddings)
output = Dense(2048,activation="relu")(output)
output = Dropout(0.3)(output)
output = Dense(1024,activation="relu")(output)
output = Dropout(0.2)(output)
output = Dense(512,activation="relu")(output)
output = Dropout(0.2)(output)
output = Dense(42, activation="softmax")(output)

model = Model(inputs=input_models, outputs=output)

In [None]:
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy", top_1_categorical_accuracy,top_2_categorical_accuracy,top_3_categorical_accuracy])

model.summary()

In [None]:
# REVIEW: No validation data has been provided
 
# hist = model.fit(input_list_train, y_training_data, validation_data=(input_list_test, y_test_data), epochs=5 , batch_size=64, verbose=1)
hist = model.fit(input_list_train, y_training_data, validation_data=(input_list_testval, y_val_data), epochs=5, batch_size=64, verbose=1)

In [None]:
def get_metrics():
	result = model.predict(input_list_test, batch_size=64)
	result = np.argmax(result, axis=-1)

	valid_loss, valid_accuracy, acc1, acc2, acc3 = model.evaluate(input_list_test)

	print("Loss:", valid_loss)
	print("Accuracy:", valid_accuracy)
	print("Top-1 Accuracy:", acc1)
	print("Top-2 Accuracy:", acc2)
	print("Top-3 Accuracy:", acc3)

	print(classification_report(np.argmax(y_test_data, axis=-1), result, zero_division=True))

	cm = confusion_matrix(np.argmax(y_test_data, axis=-1), result)
	cm = ConfusionMatrixDisplay(confusion_matrix=cm)

	fig, ax = plt.subplots(figsize=(20, 20))
	cm.plot(ax=ax)

	plt.show()

In [None]:
get_metrics()

In [None]:
# result = model.predict(input_list_test, batch_size=64)
# result = np.argmax(result, axis=1)

# valid_loss, valid_accuracy, acc1, acc2, acc3 = model.evaluate(input_list_test)

In [None]:
# print("Loss:", valid_loss)
# print("Accuracy:", valid_accuracy)
# print("Top-1 Accuracy:", acc1)
# print("Top-2 Accuracy:", acc2)
# print("Top-3 Accuracy:", acc3)

In [None]:
# print(classification_report(np.argmax(y_test_data, axis=-1), result, zero_division=True))

In [None]:
# cm = confusion_matrix(np.argmax(y_test_data, axis=-1), result)
# cm = ConfusionMatrixDisplay(confusion_matrix=cm)

# fig, ax = plt.subplots(figsize=(20, 20))
# cm.plot(ax=ax)

In [None]:
N = 5

plt.style.use("ggplot")

plt.figure()

plt.plot(np.arange(0, N), hist.history["loss"], label="train_loss")
plt.plot(np.arange(0, N), hist.history["val_loss"], label="val_loss")
plt.plot(np.arange(0, N), hist.history["accuracy"], label="train_acc")
plt.plot(np.arange(0, N), hist.history["val_accuracy"], label="val_acc")

plt.title("Training Loss and Accuracy")
plt.xlabel("Epoch #")
plt.ylabel("Loss/Accuracy")
plt.legend(loc="lower left")

plt.show()

In [None]:
historials = []
evaluations = []

for i in range (1, 6):
    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy", top_1_categorical_accuracy, top_2_categorical_accuracy, top_3_categorical_accuracy])
    
    hist = model.fit(input_list_train, y_training_data, validation_data=(input_list_testval, y_val_data), epochs=25, batch_size=64, verbose=0)
    historials.append(hist)
    
    evaluation = model.evaluate(x=input_list_test, y=y_test_data)
    evaluations.append(evaluation)
    
    model.save("model" + str(i) + ".h5")

    get_metrics()

    print("\n\n-----------------------\n")