In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn import tree
from sklearn.feature_extraction import FeatureHasher
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MultiLabelBinarizer
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, confusion_matrix, ConfusionMatrixDisplay

# Non-neural models

In [2]:
# Load the RASFF export: semicolon-separated, header on the first row,
# first column used as the row index.
data = pd.read_csv(
	"./data/splited_RASFF_DATA_16092019.csv",
	sep=";",
	header=0,
	index_col=0,
)

# Preview the first three rows.
data.head(3)

Unnamed: 0,DATE_CASE,NOT_COUNTRY,PROD_CAT,TYPE,RISK_DECISION,ACTION_TAKEN,DISTRIBUTION_STAT,HAZARDS_CAT,COUNT_ORIGEN,COUNT_DESTIN,COUNT_CONCERN
0,2019-09-13,France,meat and meat products (other than poultry),food,serious,no stock left,no distribution from notifying country,pathogenic micro-organisms,Romania,France,Belgium
1,2019-09-13,Denmark,"cocoa and cocoa preparations, coffee and tea",food,serious,recall from consumers,no distribution from notifying country,natural toxins (other),Germany,Denmark,
2,2019-09-13,Poland,confectionery,food,serious,informing authorities,distribution restricted to notifying country,food additives and flavourings,Croatia,Poland,


In [3]:
class Stage:
	"""One prediction stage: a set of feature columns and a target column.

	Columns are picked positionally from the notebook-level ``data`` frame at
	construction time, so ``data`` must already be loaded (and preprocessed)
	before a ``Stage`` is created.

	Attributes:
		input: positional indices of the feature columns.
		output: positional indices of the target column(s).
		x: feature columns selected from ``data``.
		y: target column(s) selected from ``data``.
		x_train, x_test, y_train, y_test: ``None`` until ``transform`` runs.
		classifier: empty namespace object on which fitted models are attached.
	"""

	def __init__(self, input, output):
		"""Select the feature and target columns from ``data``.

		Args:
			input: list of positional column indices used as features.
			output: list of positional column indices used as target.
		"""
		# ``input`` shadows the builtin, but callers pass it by keyword,
		# so the parameter name is part of the interface and is kept.
		self.input = input
		self.output = output

		self.x = data.iloc[:, input]
		self.y = data.iloc[:, output]

		self.x_train, self.x_test, self.y_train, self.y_test = None, None, None, None

		self.classifier = self.Classifier()

	def transform(self, test_size=0.2, random_state=42):
		"""Encode the features and split them into train and test sets.

		Fills ``x_train``, ``x_test``, ``y_train`` and ``y_test``; returns nothing.

		Args:
			test_size: forwarded to ``train_test_split`` (default keeps the
				previous 80/20 split).
			random_state: seed forwarded to ``train_test_split`` so the split
				is the same on every run. Pass ``None`` for an unseeded split.
		"""
		# Alternative encoders that were tried:
		# strategy = OneHotEncoder(handle_unknown="ignore") # One Hot Encoder
		# strategy = OrdinalEncoder() # Integer
		# strategy = FeatureHasher(n_features=25, input_type="string") # Hashing
		strategy = MultiLabelBinarizer() # Binary

		# NOTE(review): MultiLabelBinarizer treats each row as an unordered set
		# of labels, so the same string appearing in two different columns is
		# mapped to one shared output column. Confirm this is intended.
		raw_rows = self.x.values
		strategy.fit(raw_rows)
		encoded_rows = strategy.transform(raw_rows)

		self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
			encoded_rows,
			self.y,
			test_size=test_size,
			random_state=random_state,
		)

	class Classifier:
		"""Empty holder; fitted models are attached to it as attributes."""
		pass

## Preprocessing

In [4]:
# Cast both columns to str (presumably so the encoder only ever sees
# string labels in them — TODO confirm).
for _column in ("DATE_CASE", "HAZARDS_CAT"):
	data[_column] = data[_column].astype(str)

In [5]:
# Shuffle all rows. The fixed seed makes the row order, and everything
# derived from it, identical on every Restart & Run All.
data = data.sample(frac=1, random_state=42)

## Transformation

In [6]:
# Stage 1: target is column 2 (product category).
stage1 = Stage(input=[0, 1, 6, 8], output=[2])

# Stage 2: target is column 7 (hazard category); column 2 joins the features.
stage2 = Stage(input=[0, 1, 2, 6, 8], output=[7])

# Stage 3: target is column 5 (decision taken); columns 2 and 7 are both features.
stage3 = Stage(input=[0, 1, 2, 6, 7, 8], output=[5])

## Data mining

In [7]:
# Encode and split every stage, in order.
for _stage in (stage1, stage2, stage3):
	_stage.transform()

### Decision trees

In [None]:
class DecisionTree:
	"""Gini decision tree fitted on a stage's training split.

	The tree is fitted in the constructor; ``predict`` stores predictions for
	the stage's test split and ``get_metrics`` reports on them.

	Attributes:
		stage: the stage object providing ``x_train``, ``y_train``,
			``x_test`` and ``y_test``.
		classifier: the fitted ``DecisionTreeClassifier``.
		y_predict: predictions for ``stage.x_test``; ``None`` until
			``predict`` has run.
	"""

	def __init__(self, stage, random_state=42):
		"""Fit the tree on ``stage.x_train`` / ``stage.y_train``.

		Args:
			stage: object exposing the train/test splits as attributes.
			random_state: seed forwarded to ``DecisionTreeClassifier`` so the
				fitted tree is the same on every run. Pass ``None`` for an
				unseeded tree.
		"""
		self.stage = stage

		self.classifier = tree.DecisionTreeClassifier(criterion="gini", random_state=random_state)
		self.classifier.fit(stage.x_train, stage.y_train)

		self.y_predict = None

	def predict(self):
		"""Predict the stage's test split and store the result in ``y_predict``."""
		self.y_predict = self.classifier.predict(self.stage.x_test)

	@staticmethod
	def _macro_specificity(cm):
		"""Return the unweighted mean of the per-class specificity TN / (TN + FP).

		Args:
			cm: square confusion matrix, rows = true class, columns = predicted.

		Returns:
			A float in [0, 1]. A class with no negative samples contributes 0,
			matching the ``zero_division=0`` used for the other metrics.
		"""
		cm = np.asarray(cm, dtype=float)
		tp = np.diag(cm)
		fp = cm.sum(axis=0) - tp
		fn = cm.sum(axis=1) - tp
		tn = cm.sum() - (tp + fp + fn)

		negatives = tn + fp
		per_class = np.divide(tn, negatives, out=np.zeros_like(tn), where=negatives > 0)
		return float(per_class.mean()) if per_class.size else 0.0

	def get_metrics(self):
		"""Print summary metrics and a classification report, then plot the confusion matrix.

		Runs ``predict`` first if no predictions have been stored yet.
		"""
		if self.y_predict is None:
			self.predict()

		y_true = self.stage.y_test
		y_pred = self.y_predict

		# Computed once; used for both the specificity figure and the plot.
		cm = confusion_matrix(y_true, y_pred)

		print(f"- Accuracy: {round(accuracy_score(y_true, y_pred)*100, 2)}%")
		# Specificity comes from the confusion matrix; ``classifier.score``
		# is mean accuracy and would only repeat the line above.
		print(f"- Specifity: {round(self._macro_specificity(cm)*100, 2)}%")
		print(f"- Sensitivity: {round(recall_score(y_true, y_pred, average='macro', zero_division=0)*100, 2)}%")
		print(f"- Precission: {round(precision_score(y_true, y_pred, average='macro', zero_division=0)*100, 2)}%")

		print(classification_report(y_true, y_pred, zero_division=0))

		cm_display = ConfusionMatrixDisplay(confusion_matrix=cm)

		_, ax = plt.subplots(figsize=(10, 10))
		cm_display.plot(ax=ax)

In [None]:
# Fit one decision tree per stage and attach it to that stage's classifier holder.
for _number, _stage in enumerate((stage1, stage2, stage3), start=1):
	_stage.classifier.decision_tree = DecisionTree(_stage)
	print(f"Stage {_number} completed")

In [None]:
# Store test-split predictions on every stage's decision tree.
for _stage in (stage1, stage2, stage3):
	_stage.classifier.decision_tree.predict()

In [None]:
# Metrics and confusion matrix for the stage 1 tree.
_stage1_tree = stage1.classifier.decision_tree
print("Stage 1")
_stage1_tree.get_metrics()

In [None]:
# Metrics and confusion matrix for the stage 2 tree.
_stage2_tree = stage2.classifier.decision_tree
print("Stage 2")
_stage2_tree.get_metrics()

In [None]:
# Metrics and confusion matrix for the stage 3 tree.
_stage3_tree = stage3.classifier.decision_tree
print("Stage 3")
_stage3_tree.get_metrics()

In [None]:
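# Scratch derivation of macro-averaged metrics from the confusion matrix (kept for reference, not executed).
# NOTE: the TN line back-solves TN from the overall accuracy A, which presumes every class has the same
# one-vs-rest accuracy and divides by zero when A == 1; TN = cm.sum() - (TP + FP + FN) gives it directly.
# cm = confusion_matrix(stage1.y_test, stage1.classifier.decision_tree.y_predict)
# A = accuracy_score(stage1.y_test, stage1.classifier.decision_tree.y_predict)

# FP = cm.sum(axis=0) - np.diag(cm)
# FN = cm.sum(axis=1) - np.diag(cm)
# TP = np.diag(cm)
# TN = (A*TP + A*FP + A*FN - TP)/(1 - A)

# print(np.mean((TP + TN)/(TP + FP + FN + TN))) # Accuracy
# print(np.mean(TN/(TN+FP))) # Specificity
# print(np.mean(TP/(TP+FN))) # Sensitivity
# print(np.mean(TP/(TP+FP))) # Precision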