In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier,AdaBoostClassifier, RandomForestClassifier
from sklearn.feature_extraction import FeatureHasher
from sklearn.linear_model import LogisticRegression as SklearnLogisticRegression
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MultiLabelBinarizer

# Non-neural models

In [2]:
RASFF_CSV = "./data/full_RASFF_DATA.csv"

# Semicolon-separated file: the first row holds the column names and the
# first column is used as the row index.
data = pd.read_csv(RASFF_CSV, sep=";", index_col=0, header=0)

data.head(3)

Unnamed: 0,CLASSIF,DATE_CASE,REF,NOT_COUNTRY,SUBJET,PROD_CAT,TYPE,RISK_DECISION,ACTION_TAKEN,DISTRIBUTION_STAT,PRODUCT,HAZARDS,HAZARDS_CAT,COUNT_ORIGEN,COUNT_DESTIN,COUNT_CONCERN,NUMBER
0,alert,2020-10-16,2020.4364,France,Listeria monocytogenes (presence) in ham trimm...,meat and meat products (other than poultry),food,serious,recall from consumers,distribution to other member countries,ham trimmings,listeria monocytogenes,microbial contaminants (other),France,"Czech Republic,United Kingdom",,
1,border rejection,2020-10-16,2020.4349,Bulgaria,prochloraz (0.696 mg/kg - ppm) in mandarins fr...,fruits and vegetables,food,serious,destruction,product not (yet) placed on the market,mandarins,prochloraz,pesticide residues,Turkey,,Bulgaria,
2,border rejection,2020-10-16,2020.435,Bulgaria,fenvalerate (0.357 mg/kg - ppm) in chilled man...,fruits and vegetables,food,serious,re-dispatch,product not (yet) placed on the market,chilled mandarins,fenvalerate,pesticide residues,Turkey,,Bulgaria,


In [3]:
# Chronological split on the DATE_CASE strings (both bounds inclusive):
# 2004-2018 is used for training, 2019 for evaluation.
train_mask = data.DATE_CASE.between("2004-01-01", "2018-12-31")
test_mask = data.DATE_CASE.between("2019-01-01", "2019-12-31")

class Stage:
	"""One prediction stage: categorical input columns and a target column.

	Columns are selected by position from the module-level ``data`` frame, rows
	are split with the module-level ``train_mask`` / ``test_mask`` and the
	inputs are one-hot encoded.

	Args:
		input: positional indices of the feature columns.
		output: positional indices of the target column(s).

	Attributes:
		x, y: un-encoded feature / target frames covering every row of ``data``.
		x_train, x_test: encoded feature matrices for the two date windows.
		y_train, y_test: target frames for the same rows.
		classifier: empty namespace object on which fitted models are stored.
	"""

	# ``input`` shadows the builtin, but callers pass it by keyword, so the
	# parameter name has to stay.
	def __init__(self, input, output):
		self.input = input
		self.output = output

		self.x = data.iloc[:, input]
		self.y = data.iloc[:, output]

		self.x_train, self.x_test, self.y_train, self.y_test = None, None, None, None

		self.classifier = self.Classifier()

		self.__transform()

	def __transform(self):
		"""Split the rows by date window and one-hot encode the features."""
		# Categories that only occur in the evaluation rows are encoded as
		# all-zero vectors thanks to handle_unknown="ignore".
		# Other encoders that were tried: OrdinalEncoder(),
		# FeatureHasher(n_features=25, input_type="string"), MultiLabelBinarizer().
		encoder = OneHotEncoder(handle_unknown="ignore")

		self.x_train = self.x.loc[train_mask]
		self.y_train = self.y.loc[train_mask]

		self.x_test = self.x.loc[test_mask]
		self.y_test = self.y.loc[test_mask]

		print(self.x_train.shape, self.y_train.shape)
		print(self.x_test.shape, self.y_test.shape)

		# Learn the category vocabulary from the training rows only, so nothing
		# from the evaluation period leaks into the fitted preprocessing.
		self.x_train = encoder.fit_transform(self.x_train.values)
		self.x_test = encoder.transform(self.x_test.values)

		# A random split was used previously instead of the date windows:
		# train_test_split(<encoded x>, self.y, test_size=0.2)

	class Classifier:
		"""Plain attribute container for the models fitted on this stage."""
		pass

## Preprocessing

In [4]:
# Drop incomplete rows *before* casting to str: ``astype(str)`` turns NaN into
# the literal string "nan", which ``dropna`` no longer recognises as missing
# (two of the required columns, DATE_CASE and HAZARDS_CAT, are cast below).
required_columns = data.columns[[1, 3, 5, 8, 9, 12, 13]]
data = data.dropna(subset=required_columns)

data = data.astype({"DATE_CASE": str, "HAZARDS_CAT": str})

In [5]:
# Shuffle the rows with a fixed seed: the row order decides which rows land in
# each cross-validation fold, so an unseeded shuffle makes every run differ.
data = data.sample(frac=1, random_state=42)

## Transformation

In [6]:
# Positional feature columns shared by every stage. Each later stage also
# receives the target columns of the stages before it.
# Earlier column selections, kept for reference:
#   stage 1: [0, 1, 6, 8] · stage 2: [0, 1, 2, 6, 8] · stage 3: [0, 1, 2, 6, 7, 8]
base_inputs = [1, 3, 9, 13]

# Product category
stage1 = Stage(input=[*base_inputs], output=[5])

# Hazard category
stage2 = Stage(input=[*base_inputs, 5], output=[12])

# Decision taken
stage3 = Stage(input=[*base_inputs, 5, 12], output=[8])

(47537, 4) (47537, 1)
(3989, 4) (3989, 1)
(47537, 5) (47537, 1)
(3989, 5) (3989, 1)
(47537, 6) (47537, 1)
(3989, 6) (3989, 1)


## Data mining

### Decision trees

In [None]:
class DecisionTree:
	"""Grid-searched decision tree classifier for one ``Stage``.

	Args:
		stage: object exposing ``x_train``, ``y_train``, ``x_test`` and ``y_test``.
		params: parameter grid handed to ``GridSearchCV``.

	Attributes:
		best_params: best parameter combination found by the search.
		classifier: tree fitted on the whole training set with ``best_params``.
		y_predict: predictions for ``stage.x_test`` (``None`` until computed).
	"""

	def __init__(self, stage, params):
		self.stage = stage
		self.params = params

		# The stage keeps its target as a one-column frame; estimators expect
		# a flat label vector.
		y_train = stage.y_train.values.ravel()

		search = GridSearchCV(tree.DecisionTreeClassifier(random_state=42), self.params, cv=3)
		search.fit(stage.x_train, y_train)

		self.best_params = search.best_params_

		# With the default refit=True the search has already retrained the
		# winning configuration on the full training set, so fitting a second
		# identical model by hand is unnecessary.
		self.classifier = search.best_estimator_

		self.y_predict = None

	def predict(self):
		"""Store the predictions for the stage's test features in ``y_predict``."""
		self.y_predict = self.classifier.predict(self.stage.x_test)

	@staticmethod
	def _macro_specificity(cm):
		"""Return the unweighted mean of the per-class true-negative rates.

		Args:
			cm: square confusion matrix (rows = true class, columns = predicted).

		Returns:
			Mean of TN / (TN + FP) over the classes; a class without any
			negative sample contributes 0.
		"""
		cm = np.asarray(cm, dtype=float)
		true_pos = np.diag(cm)
		false_pos = cm.sum(axis=0) - true_pos
		false_neg = cm.sum(axis=1) - true_pos
		true_neg = cm.sum() - true_pos - false_pos - false_neg
		negatives = true_neg + false_pos
		per_class = np.divide(true_neg, negatives, out=np.zeros_like(true_neg), where=negatives > 0)
		return float(per_class.mean()) if per_class.size else 0.0

	def get_metrics(self):
		"""Print the test-set metrics and report, then plot the confusion matrix."""
		if self.y_predict is None:
			# Allow calling this directly without a prior predict().
			self.predict()

		y_true = self.stage.y_test
		y_pred = self.y_predict
		cm = confusion_matrix(y_true, y_pred)

		print(f"- Accuracy: {round(accuracy_score(y_true, y_pred)*100, 2)}%")
		# classifier.score() is mean accuracy, not specificity, so the value is
		# derived from the confusion matrix instead.
		print(f"- Specificity: {round(self._macro_specificity(cm)*100, 2)}%")
		print(f"- Sensitivity: {round(recall_score(y_true, y_pred, average='macro', zero_division=0)*100, 2)}%")
		print(f"- Precision: {round(precision_score(y_true, y_pred, average='macro', zero_division=0)*100, 2)}%")

		print(classification_report(y_true, y_pred, zero_division=0))

		_, ax = plt.subplots(figsize=(10, 10))
		ConfusionMatrixDisplay(confusion_matrix=cm).plot(ax=ax)

In [None]:
# "auto" is no longer listed for max_features: for classification trees it was
# only an alias of "sqrt" and scikit-learn 1.3 removed it, so every grid point
# using it would fail to fit.
params = {
	"criterion": ["gini", "entropy"],
	"splitter": ["best", "random"],
	"max_features": ["sqrt", "log2"]
}

for stage_number, current_stage in enumerate((stage1, stage2, stage3), start=1):
	current_stage.classifier.decision_tree = DecisionTree(current_stage, params)
	print(f"Stage {stage_number} completed: {current_stage.classifier.decision_tree.best_params}")

In [None]:
# Compute the test-set predictions of every stage's decision tree.
for current_stage in (stage1, stage2, stage3):
	current_stage.classifier.decision_tree.predict()

In [None]:
# Metrics, classification report and confusion matrix of the stage-1 decision tree.
print("Stage 1")
stage1.classifier.decision_tree.get_metrics()

In [None]:
# Metrics, classification report and confusion matrix of the stage-2 decision tree.
print("Stage 2")
stage2.classifier.decision_tree.get_metrics()

In [None]:
# Metrics, classification report and confusion matrix of the stage-3 decision tree.
print("Stage 3")
stage3.classifier.decision_tree.get_metrics()

### Boosted trees

In [7]:
class BoostedTrees:
	"""Grid-searched gradient boosting classifier for one ``Stage``.

	Args:
		stage: object exposing ``x_train``, ``y_train``, ``x_test`` and ``y_test``.
		params: parameter grid handed to ``GridSearchCV``.

	Attributes:
		best_params: best parameter combination found by the search.
		classifier: model fitted on the whole training set with ``best_params``.
		y_predict: predictions for ``stage.x_test`` (``None`` until computed).
	"""

	def __init__(self, stage, params):
		self.stage = stage
		self.params = params

		# Flat label vector instead of the stage's one-column target frame.
		y_train = stage.y_train.values.ravel()

		search = GridSearchCV(GradientBoostingClassifier(max_features="sqrt", subsample=0.8, random_state=10), self.params, n_jobs=4, cv=3)
		search.fit(stage.x_train, y_train)

		self.best_params = search.best_params_

		# refit=True (the default) already retrained the best configuration on
		# the full training set with the same fixed seed; reusing it saves a
		# complete extra boosting run.
		self.classifier = search.best_estimator_

		self.y_predict = None

	def predict(self):
		"""Store the predictions for the stage's test features in ``y_predict``."""
		self.y_predict = self.classifier.predict(self.stage.x_test)

	@staticmethod
	def _macro_specificity(cm):
		"""Return the unweighted mean of the per-class true-negative rates.

		Args:
			cm: square confusion matrix (rows = true class, columns = predicted).

		Returns:
			Mean of TN / (TN + FP) over the classes; a class without any
			negative sample contributes 0.
		"""
		cm = np.asarray(cm, dtype=float)
		true_pos = np.diag(cm)
		false_pos = cm.sum(axis=0) - true_pos
		false_neg = cm.sum(axis=1) - true_pos
		true_neg = cm.sum() - true_pos - false_pos - false_neg
		negatives = true_neg + false_pos
		per_class = np.divide(true_neg, negatives, out=np.zeros_like(true_neg), where=negatives > 0)
		return float(per_class.mean()) if per_class.size else 0.0

	def get_metrics(self):
		"""Print the test-set metrics and report, then plot the confusion matrix."""
		if self.y_predict is None:
			# Allow calling this directly without a prior predict().
			self.predict()

		y_true = self.stage.y_test
		y_pred = self.y_predict
		cm = confusion_matrix(y_true, y_pred)

		print(f"- Accuracy: {round(accuracy_score(y_true, y_pred)*100, 2)}%")
		# classifier.score() is mean accuracy, not specificity, so the value is
		# derived from the confusion matrix instead.
		print(f"- Specificity: {round(self._macro_specificity(cm)*100, 2)}%")
		print(f"- Sensitivity: {round(recall_score(y_true, y_pred, average='macro', zero_division=0)*100, 2)}%")
		print(f"- Precision: {round(precision_score(y_true, y_pred, average='macro', zero_division=0)*100, 2)}%")

		print(classification_report(y_true, y_pred, zero_division=0))

		_, ax = plt.subplots(figsize=(10, 10))
		ConfusionMatrixDisplay(confusion_matrix=cm).plot(ax=ax)

In [8]:
# Search space for the gradient boosting models of all three stages.
params = dict(
	n_estimators=range(20, 51, 10),
	learning_rate=[1, 0.1, 0.01],
	max_depth=range(5, 10, 2),
	min_samples_split=range(200, 601, 200),
)

for stage_number, current_stage in enumerate((stage1, stage2, stage3), start=1):
	current_stage.classifier.boosted_trees = BoostedTrees(current_stage, params)
	print(f"Stage {stage_number} completed: {current_stage.classifier.boosted_trees.best_params}")

In [None]:
# Compute the test-set predictions of every stage's boosted model.
for current_stage in (stage1, stage2, stage3):
	current_stage.classifier.boosted_trees.predict()

In [None]:
# Metrics, classification report and confusion matrix of the stage-1 boosted model.
print("Stage 1")
stage1.classifier.boosted_trees.get_metrics()

In [None]:
# Metrics, classification report and confusion matrix of the stage-2 boosted model.
print("Stage 2")
stage2.classifier.boosted_trees.get_metrics()

In [None]:
# Metrics, classification report and confusion matrix of the stage-3 boosted model.
print("Stage 3")
stage3.classifier.boosted_trees.get_metrics()

### Random Forest

In [None]:
class RandomForest:
	"""Grid-searched random forest classifier for one ``Stage``.

	Args:
		stage: object exposing ``x_train``, ``y_train``, ``x_test`` and ``y_test``.
		params: parameter grid handed to ``GridSearchCV``.

	Attributes:
		best_params: best parameter combination found by the search.
		classifier: forest fitted on the whole training set with ``best_params``.
		y_predict: predictions for ``stage.x_test`` (``None`` until computed).
	"""

	def __init__(self, stage, params):
		self.stage = stage
		self.params = params

		# A flat label vector avoids scikit-learn's DataConversionWarning for
		# column-vector targets.
		y_train = stage.y_train.values.ravel()

		# Fixed seed so the search and the resulting forest are reproducible.
		rf = RandomForestClassifier(random_state=42)

		search = GridSearchCV(estimator=rf, param_grid=self.params, cv=3, n_jobs=-1, verbose=2)
		search.fit(stage.x_train, y_train)

		self.best_params = search.best_params_

		# refit=True (the default) already retrained the best configuration on
		# the full training set, so the forest is not fitted a second time.
		self.classifier = search.best_estimator_

		self.y_predict = None

	def predict(self):
		"""Store the predictions for the stage's test features in ``y_predict``."""
		self.y_predict = self.classifier.predict(self.stage.x_test)

	@staticmethod
	def _macro_specificity(cm):
		"""Return the unweighted mean of the per-class true-negative rates.

		Args:
			cm: square confusion matrix (rows = true class, columns = predicted).

		Returns:
			Mean of TN / (TN + FP) over the classes; a class without any
			negative sample contributes 0.
		"""
		cm = np.asarray(cm, dtype=float)
		true_pos = np.diag(cm)
		false_pos = cm.sum(axis=0) - true_pos
		false_neg = cm.sum(axis=1) - true_pos
		true_neg = cm.sum() - true_pos - false_pos - false_neg
		negatives = true_neg + false_pos
		per_class = np.divide(true_neg, negatives, out=np.zeros_like(true_neg), where=negatives > 0)
		return float(per_class.mean()) if per_class.size else 0.0

	def get_metrics(self):
		"""Print the test-set metrics and report, then plot the confusion matrix."""
		if self.y_predict is None:
			# Allow calling this directly without a prior predict().
			self.predict()

		y_true = self.stage.y_test
		y_pred = self.y_predict
		cm = confusion_matrix(y_true, y_pred)

		print(f"- Accuracy: {round(accuracy_score(y_true, y_pred)*100, 2)}%")
		# classifier.score() is mean accuracy, not specificity, so the value is
		# derived from the confusion matrix instead.
		print(f"- Specificity: {round(self._macro_specificity(cm)*100, 2)}%")
		print(f"- Sensitivity: {round(recall_score(y_true, y_pred, average='macro', zero_division=0)*100, 2)}%")
		print(f"- Precision: {round(precision_score(y_true, y_pred, average='macro', zero_division=0)*100, 2)}%")

		print(classification_report(y_true, y_pred, zero_division=0))

		_, ax = plt.subplots(figsize=(10, 10))
		ConfusionMatrixDisplay(confusion_matrix=cm).plot(ax=ax)

In [None]:
# Search space for the random forests of all three stages.
params = dict(
    bootstrap=[True],
    max_depth=[100, 110],
    max_features=[2, 3],
    min_samples_leaf=[4, 5],
    min_samples_split=[10, 12],
    n_estimators=[200, 300, 500],
)

for stage_number, current_stage in enumerate((stage1, stage2, stage3), start=1):
    current_stage.classifier.Random_forest = RandomForest(current_stage, params)
    print(f"Stage {stage_number} completed: {current_stage.classifier.Random_forest.best_params}")

In [None]:
# Compute the test-set predictions of every stage's random forest.
for current_stage in (stage1, stage2, stage3):
	current_stage.classifier.Random_forest.predict()

In [None]:
# Metrics, classification report and confusion matrix of the stage-1 random forest.
print("Stage 1")
stage1.classifier.Random_forest.get_metrics()

In [None]:
# Metrics, classification report and confusion matrix of the stage-2 random forest.
print("Stage 2")
stage2.classifier.Random_forest.get_metrics()

In [None]:
# Metrics, classification report and confusion matrix of the stage-3 random forest.
print("Stage 3")
stage3.classifier.Random_forest.get_metrics()

### Logistic Regression

In [None]:
class LogisticRegression:
	"""Grid-searched logistic regression for one ``Stage``.

	This wrapper has the same name as scikit-learn's estimator, which is why the
	estimator is referenced through the ``SklearnLogisticRegression`` alias.

	Args:
		stage: object exposing ``x_train``, ``y_train``, ``x_test`` and ``y_test``.
		params: parameter grid handed to ``GridSearchCV``.

	Attributes:
		best_params: best parameter combination found by the search.
		classifier: model fitted on the whole training set with ``best_params``.
		y_predict: predictions for ``stage.x_test`` (``None`` until computed).
	"""

	def __init__(self, stage, params):
		self.stage = stage
		self.params = params

		# Flat label vector instead of the stage's one-column target frame.
		y_train = stage.y_train.values.ravel()

		# The estimator takes no ``average`` argument; averaging belongs to the
		# scorer below. A higher iteration cap gives the solver room to converge
		# on the wide one-hot encoded matrix.
		lr = SklearnLogisticRegression(max_iter=1000)

		# The stage targets look multi-class (many categories per column), and
		# the plain "recall" scorer only supports binary targets, so the
		# micro-averaged variant is used.
		search = GridSearchCV(estimator=lr, param_grid=self.params, scoring="recall_micro")
		search.fit(stage.x_train, y_train)

		self.best_params = search.best_params_

		# refit=True (the default) leaves the best configuration fitted on the
		# full training set.
		self.classifier = search.best_estimator_

		self.y_predict = None

	def predict(self):
		"""Store the predictions for the stage's test features in ``y_predict``."""
		self.y_predict = self.classifier.predict(self.stage.x_test)

	@staticmethod
	def _macro_specificity(cm):
		"""Return the unweighted mean of the per-class true-negative rates.

		Args:
			cm: square confusion matrix (rows = true class, columns = predicted).

		Returns:
			Mean of TN / (TN + FP) over the classes; a class without any
			negative sample contributes 0.
		"""
		cm = np.asarray(cm, dtype=float)
		true_pos = np.diag(cm)
		false_pos = cm.sum(axis=0) - true_pos
		false_neg = cm.sum(axis=1) - true_pos
		true_neg = cm.sum() - true_pos - false_pos - false_neg
		negatives = true_neg + false_pos
		per_class = np.divide(true_neg, negatives, out=np.zeros_like(true_neg), where=negatives > 0)
		return float(per_class.mean()) if per_class.size else 0.0

	def get_metrics(self):
		"""Print the test-set metrics and report, then plot the confusion matrix."""
		if self.y_predict is None:
			# Allow calling this directly without a prior predict().
			self.predict()

		y_true = self.stage.y_test
		y_pred = self.y_predict
		cm = confusion_matrix(y_true, y_pred)

		print(f"- Accuracy: {round(accuracy_score(y_true, y_pred)*100, 2)}%")
		# classifier.score() is mean accuracy, not specificity, so the value is
		# derived from the confusion matrix instead.
		print(f"- Specificity: {round(self._macro_specificity(cm)*100, 2)}%")
		print(f"- Sensitivity: {round(recall_score(y_true, y_pred, average='macro', zero_division=0)*100, 2)}%")
		print(f"- Precision: {round(precision_score(y_true, y_pred, average='macro', zero_division=0)*100, 2)}%")

		print(classification_report(y_true, y_pred, zero_division=0))

		_, ax = plt.subplots(figsize=(10, 10))
		ConfusionMatrixDisplay(confusion_matrix=cm).plot(ax=ax)

In [None]:
# Search space for the logistic regressions of all three stages.
params = dict(penalty=['l2'], C=[0.001, .009, 0.01, .09, 1, 5, 10, 25])

for stage_number, current_stage in enumerate((stage1, stage2, stage3), start=1):
	current_stage.classifier.Logistic_regression = LogisticRegression(current_stage, params)
	print(f"Stage {stage_number} completed: {current_stage.classifier.Logistic_regression.best_params}")

In [None]:
# Predict with the logistic regression models; this cell previously re-ran the
# random forests by mistake.
stage1.classifier.Logistic_regression.predict()
stage2.classifier.Logistic_regression.predict()
stage3.classifier.Logistic_regression.predict()

In [None]:
# Report the stage-1 logistic regression (the cell used to show the random forest).
print("Stage 1")
stage1.classifier.Logistic_regression.get_metrics()

In [None]:
# Report the stage-2 logistic regression (the cell used to show the random forest).
print("Stage 2")
stage2.classifier.Logistic_regression.get_metrics()

In [None]:
# Report the stage-3 logistic regression (the cell used to show the random forest).
print("Stage 3")
stage3.classifier.Logistic_regression.get_metrics()