## **Machine Learning Algorithms Benchmark - Texts**

This is the application of benchmarking algorithms to the provided texts of the training dataset.

Import some required python modules

In [3]:
# Importing some packages

import pandas as pd
import numpy as np
import spacy, torch, re
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from datasets import Dataset
from xgboost import XGBClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (
    classification_report,
    f1_score,
    precision_recall_fscore_support,
    accuracy_score,
)
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    StratifiedKFold,
)
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
)


# Loading the English spaCy model; process_text() below calls it to tokenize
# the texts and reads its stop-word/punctuation flags and lemmas
eng_model = spacy.load("en_core_web_sm")

Load the training dataset and process the columns

In [None]:
# Reading the 'food incidents' training dataset, discarding the columns that
# are not used in this notebook and replacing the hyphens in the category
# column names with underscores
food_dataset = (
    pd.read_csv("../data/train/incidents_train.csv")
    .drop(columns=["Unnamed: 0", "year", "month", "day", "country", "title"])
    .rename(
        columns={
            "hazard-category": "hazard_category",
            "product-category": "product_category",
        }
    )
)

# Displaying two random rows as a quick check
food_dataset.sample(2)

Unnamed: 0,text,hazard_category,product_category,hazard,product
1502,Garden of Life LLC is voluntarily withdrawing ...,biological,"dietetic foods, food supplements, fortified foods",salmonella,dietary supplement
1531,"Boulder Brands UK Ltd, Asda and Tesco are reca...",allergens,cereals and bakery products,sesame seeds and products thereof,bakery products


Encode all the classes into numeric values

In [5]:
# Dictionary to hold the fitted label encoders, keyed by encoded column name
encoders = {}

# Only the raw label columns are encoded. The text columns and any column
# created by a previous run of this cell are skipped, so re-running the cell
# (e.g. after 'text_processed' has been added) does not produce
# 'text_processed_encoded' or '*_encoded_encoded' columns
label_columns = [
    column
    for column in food_dataset.columns
    if column not in ("text", "text_processed") and not column.endswith("_encoded")
]

# Iterating through the labels
for label in label_columns:

    # Encoding labels into numeric values
    label_encoder = LabelEncoder()
    food_dataset[f"{label}_encoded"] = label_encoder.fit_transform(food_dataset[label])

    # Storing the fitted encoder under the name of the column it produced
    encoders[f"{label}_encoded"] = label_encoder

# Displaying two random rows to inspect the new '*_encoded' columns
food_dataset.sample(2)

Unnamed: 0,text,hazard_category,product_category,hazard,product,hazard_category_encoded,product_category_encoded,hazard_encoded,product_encoded
4107,Food Recall Warning - Certain Kawartha Dairy b...,foreign bodies,ices and desserts,metal fragment,ice cream,4,12,57,530
1468,"American Pure Whey, New Bern, NC is recalling ...",allergens,"dietetic foods, food supplements, fortified foods",milk and products thereof,whey protein,0,4,59,1007


Process the texts by removing redundant terms. After exploring the texts, I noticed that the main information is mentioned at the beginning. Therefore, after removing some terms (URLs, dates, e-mail addresses and numbers), only the first 100 tokens of each text are kept and then filtered.

In [None]:
def process_text(text):
    """
    Clean a raw text and reduce it to a space-separated string of lemmas.

    URLs, dd/dd/dddd dates, digit/digit fractions, e-mail addresses and any
    remaining numbers are removed first. The cleaned text is then tokenized
    with the spaCy model, only its first 100 tokens are inspected, and the
    lower-cased lemmas of the tokens that pass the filter are joined.

    :param text: Text to process
    :return: The processed text
    """

    # Removal patterns, applied one after the other in this exact order
    # (dates and fractions must go before the generic number pattern)
    removal_patterns = (
        r"https?://\S+|www\.\S+",
        r"\b\d{2}/\d{2}/\d{4}\b",
        r"\b\d+/\d+\b",
        r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
        r"\d+([.,]\d+)?",
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, "", text)

    def is_informative(token):
        # Stop words, punctuation and whitespace tokens carry no content
        if token.is_stop or token.is_punct or token.is_space:
            return False

        # Non-ASCII tokens and single-character tokens are dropped as well
        return token.is_ascii and len(token) != 1

    # The 100-token window is taken before filtering, so the result may hold
    # fewer than 100 lemmas
    window = eng_model(text)[:100]

    return " ".join(tok.lemma_.lower() for tok in window if is_informative(tok))


# Cleaning every text and placing the result in a new 'text_processed'
# column at position 1
food_dataset.insert(
    loc=1,
    column="text_processed",
    value=food_dataset["text"].apply(process_text),
)

In [7]:
# Displaying two random rows to inspect the new 'text_processed' column
food_dataset.sample(2)

Unnamed: 0,text,text_processed,hazard_category,product_category,hazard,product,hazard_category_encoded,product_category_encoded,hazard_encoded,product_encoded
1866,Errington Cheese has instigated a precautionar...,errington cheese instigate precautionary recal...,biological,"meat, egg and dairy products",escherichia coli,blue cheese,1,13,36,80
835,"FOR IMMEDIATE RELEASE - July 10, 2009 - Haifa ...",immediate release july haifa smoked fish inc v...,biological,seafood,listeria monocytogenes,Fishes not identified,1,19,55,2


### **Classification Task - Hazard Categories**

Keep only the columns 'text_processed' and 'hazard_category_encoded' from the initial dataset

In [None]:
# Selecting the model input ('text_processed') and the encoded
# hazard category target
hazard_category_dataset = food_dataset[["text_processed", "hazard_category_encoded"]]

# Displaying two random rows
hazard_category_dataset.sample(2)

Unnamed: 0,text_processed,hazard_category_encoded
2010,houston tx ron home style foods inc. recall tr...,1
229,pra date publish jun product description glass...,4


**Logistic Regression**

Below is the main function for training and implementing the Logistic Regression classifier. It performs nested cross validation for model and hyperparameter selection.

In [None]:
def train_evaluate_model_lr(dataset, label, outer_cv, inner_cv, pipeline, param_grid):
    """
    Evaluate a text classification pipeline with nested cross validation.

    For every outer fold, a grid search (scored with macro f1 on the inner
    folds) is fitted on the training part, the refitted best estimator predicts
    the held-out part, and a classification report for that fold is printed.

    :param dataset: Dataset to use; must contain 'text_processed' and the label column
    :param label: Name of the label column to predict
    :param outer_cv: Outer cross validation splitter
    :param inner_cv: Inner cross validation splitter used by the grid search
    :param pipeline: Pipeline to tune and evaluate
    :param param_grid: Hyperparameter grid to use
    :return: Macro f1 scores and best hyperparameters, one entry per outer fold
    """

    texts = dataset["text_processed"]
    targets = dataset[label]

    # One entry per outer fold
    f1_scores, hyperparameters = [], []

    for fit_rows, eval_rows in outer_cv.split(texts, targets):

        # Positional indices from the splitter select the rows of this fold
        x_fit, y_fit = texts.iloc[fit_rows], targets.iloc[fit_rows]
        x_eval, y_eval = texts.iloc[eval_rows], targets.iloc[eval_rows]

        # Hyperparameter tuning only ever sees the training part of the fold
        search = GridSearchCV(
            estimator=pipeline,
            param_grid=param_grid,
            scoring="f1_macro",
            cv=inner_cv,
            n_jobs=-1,
            verbose=0,
        )
        search.fit(x_fit, y_fit)

        # Scoring the best configuration on the held-out part
        y_hat = search.best_estimator_.predict(x_eval)

        f1_scores.append(f1_score(y_eval, y_hat, average="macro"))
        hyperparameters.append(search.best_params_)

        print(classification_report(y_eval, y_hat, zero_division=0))

    return f1_scores, hyperparameters

Below, all the steps for training and evaluating the Logistic Regression model for hazard categories are presented

- Setting the desired pipeline
- Setting the hyperparameter grid for tuning
- Setting the outer and inner cross validation
- Training and evaluating the model

In [None]:
# Outer 3-fold cross validation: each fold is held out once for evaluation
outer_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Inner 2-fold cross validation: used by the grid search for tuning
inner_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

# TF-IDF vectorizer followed by a logistic regression classifier
pipeline = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("logreg", LogisticRegression()),
    ]
)

# Hyperparameter grid, addressed as '<step name>__<parameter name>'
param_grid = {
    "logreg__C": [10, 20],
    "logreg__class_weight": ["balanced"],
    "logreg__max_iter": [2000, 3000],
    "logreg__penalty": ["l1", "l2"],
    "logreg__solver": ["liblinear"],
    "tfidf__max_df": [0.5],
    "tfidf__ngram_range": [(1, 2)],
}

# Running the nested cross validation for the hazard categories
haz_cat_macro_f1_scores, haz_cat_hyperparameters = train_evaluate_model_lr(
    dataset=hazard_category_dataset,
    label="hazard_category_encoded",
    outer_cv=outer_cv,
    inner_cv=inner_cv,
    pipeline=pipeline,
    param_grid=param_grid,
)



              precision    recall  f1-score   support

           0       0.95      0.96      0.95       618
           1       0.97      0.97      0.97       581
           2       0.83      0.86      0.85        95
           3       0.75      0.38      0.50         8
           4       0.93      0.96      0.94       187
           5       0.76      0.76      0.76       124
           6       0.00      0.00      0.00         1
           7       0.71      0.83      0.77        18
           8       0.87      0.61      0.72        44
           9       0.69      0.50      0.58        18

    accuracy                           0.93      1694
   macro avg       0.75      0.68      0.70      1694
weighted avg       0.92      0.93      0.92      1694

              precision    recall  f1-score   support

           0       0.92      0.97      0.95       618
           1       0.94      0.97      0.96       580
           2       0.81      0.79      0.80        96
           3       0.80 

The mean and standard deviation of the macro f1 in all cross-validation and hyperparameter tuning procedures are presented below

In [None]:
# Summarising the macro f1 scores of the outer folds
print("Cross-Validation Macro f1:")
print("Mean:", np.asarray(haz_cat_macro_f1_scores).mean())
print("Standard Deviation:", np.asarray(haz_cat_macro_f1_scores).std())

# Showing the hyperparameters of the fold with the highest macro f1
print()
print("The hyperparameters which gives the best results are:")
print(haz_cat_hyperparameters[int(np.argmax(haz_cat_macro_f1_scores))])

Cross-Validation Macro f1:
Mean: 0.6764134702712327
Standard Deviation: 0.02560663337452602

The hyperparameters which gives the best results are:
{'logreg__C': 20, 'logreg__class_weight': 'balanced', 'logreg__max_iter': 2000, 'logreg__penalty': 'l1', 'logreg__solver': 'liblinear', 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2)}


**BERT - RoBERTa**

Define a function to split the dataset into training, validation and testing datasets using stratification to maintain the distribution of classes in them.

In [13]:
def split_dataset(dataset, label, percentage):
    """
    Split a dataset into stratified training, validation and testing parts.

    A share of `percentage` is first held out from the training data; that
    held-out part is then cut in half to form the validation and testing
    sets. Both cuts are stratified on `label` and use a fixed random state.

    :param dataset: Dataset to split
    :param label: Label to use for stratification
    :param percentage: Share of the data held out for validation and testing together
    :return: The training, validation and testing sets
    """

    def stratified_cut(frame, fraction):
        # Same seed for both cuts keeps the whole split reproducible
        return train_test_split(
            frame,
            test_size=fraction,
            stratify=frame[label],
            random_state=42,
        )

    # First cut: training data versus everything that is held out
    train_set, held_out = stratified_cut(dataset, percentage)

    # Second cut: the held-out part is shared equally
    val_set, test_set = stratified_cut(held_out, 0.5)

    return train_set, val_set, test_set

Define a function to calculate class weights for the training dataset to be used by the classifier to pay more attention to minority classes.

In [14]:
def compute_weight(train_set, label):
    """
    Compute 'balanced' class weights for the training set as a torch tensor.

    The weights are ordered by ascending class value and there is one weight
    per class that occurs in `train_set`.
    NOTE(review): a loss that indexes this tensor by class id presumably needs
    every class to be present in `train_set` - confirm for small classes.

    :param train_set: Training set
    :param label: Label to use for computing class weights
    :return: Float tensor of class weights, on the GPU when one is available,
        otherwise on the CPU
    """

    labels = train_set[label].values

    # np.unique already returns the classes sorted in ascending order
    class_weights = compute_class_weight(
        class_weight="balanced",
        classes=np.unique(labels),
        y=labels,
    )

    # Falling back to the CPU instead of failing when CUDA is not available
    device = "cuda" if torch.cuda.is_available() else "cpu"

    return torch.tensor(class_weights, dtype=torch.float).to(device)

Define a function to tokenize the data based on the BERT tokenizer

In [15]:
def tokenize_data(train_set, val_set, test_set, label, model):
    """
    Tokenize the 'text_processed' column of each split and attach its labels.

    All three splits are encoded with the same tokenizer and the same settings
    (truncated and padded to 128 tokens).

    :param train_set: Training set
    :param val_set: Validation set
    :param test_set: Testing set
    :param label: Label column to attach as 'labels'
    :param model: Name of the pretrained model whose tokenizer is loaded
    :return: Tokenized training, validation and testing datasets
    """

    # One tokenizer shared by all three splits
    tokenizer = AutoTokenizer.from_pretrained(model)

    def encode_split(frame):
        # Fixed-length encoding of the texts of this split
        encodings = tokenizer(
            frame["text_processed"].tolist(),
            truncation=True,
            padding="max_length",
            max_length=128,
            return_tensors="pt",
        )

        # The labels travel with the encodings under the 'labels' key
        encodings["labels"] = torch.tensor(frame[label].tolist())

        return Dataset.from_dict(encodings)

    return encode_split(train_set), encode_split(val_set), encode_split(test_set)

Define a function that computes the metrics (accuracy and macro-averaged precision, recall and f1) used at each epoch to evaluate the classifier's performance on the validation dataset.

In [16]:
def compute_metrics(eval_pred):
    """
    Turn evaluation logits and labels into accuracy and macro-averaged scores.

    :param eval_pred: Pair of (logits, labels) produced during evaluation
    :return: Dictionary with 'accuracy', 'precision', 'recall' and 'f1'
    """

    logits, labels = eval_pred

    # The predicted class is the position of the largest logit in each row
    predictions = torch.tensor(logits).argmax(dim=1).numpy()

    # Macro averaging weighs every class equally; classes that are never
    # predicted contribute 0 instead of raising a warning
    macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(
        labels, predictions, average="macro", zero_division=0
    )

    return {
        "accuracy": accuracy_score(labels, predictions),
        "precision": macro_precision,
        "recall": macro_recall,
        "f1": macro_f1,
    }

Define a class that overrides the way the classifier calculates the loss. Here it takes into account the weights of the classes for the cross entropy loss function.

In [17]:
class WeightedTrainer(Trainer):
    """
    Trainer whose loss is a class-weighted cross entropy.

    The weights are read from the module-level `class_weights` tensor, which
    therefore has to be defined before training or prediction starts.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """
        Compute the cross entropy loss using class weights to handle the
        imbalanced classes.

        :param model: The model
        :param inputs: The inputs; the 'labels' entry is removed from this
            dictionary before the remaining entries are passed to the model
        :param return_outputs: Whether to return the model outputs as well
        :param kwargs: Additional keyword arguments (accepted and ignored)
        :return: The loss, or the pair (loss, outputs) if `return_outputs` is set
        """

        # Extracting labels so they are not forwarded to the model
        labels = inputs.pop("labels")

        # Passing inputs and getting results
        outputs = model(**inputs)
        logits = outputs.logits

        # The weights are moved to the device of the logits so the loss does
        # not fail when the two tensors were created on different devices
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights.to(logits.device))

        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss

Define a function to train and evaluate the model

In [None]:
def train_model(train_dataset, val_dataset, model_name, number_classes):
    """
    Fine-tune a pretrained sequence classification model.

    The model is evaluated and checkpointed after every epoch; training stops
    early after 3 evaluations without improvement, and the checkpoint with the
    highest validation 'f1' is loaded at the end.

    :param train_dataset: Training dataset
    :param val_dataset: Validation dataset
    :param model_name: Name of the pretrained model to load
    :param number_classes: Number of classes
    :return: The trainer holding the trained model
    """

    # Setting up the model and moving it to the GPU when one is available
    # (falling back to the CPU instead of failing otherwise)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=number_classes
    )
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # Setting up the training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        # Evaluation and checkpointing share the same schedule so the best
        # checkpoint can be restored at the end
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        # 'f1' is the key returned by compute_metrics
        metric_for_best_model="f1",
        greater_is_better=True,
        learning_rate=5e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        weight_decay=0.01,
        lr_scheduler_type="linear",
        warmup_ratio=0.1,
        num_train_epochs=10,
        report_to="none",
        save_total_limit=1,
    )

    # Setting up the trainer of the model (weighted loss, early stopping)
    trainer = WeightedTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )

    # Training model
    trainer.train()

    return trainer

Define a function to test the trained model on unseen data

In [19]:
def test_model(model, test_dataset):
    """
    Predict the labels of a testing dataset.

    :param model: Object exposing `predict` (the trainer returned by train_model)
    :param test_dataset: Testing dataset
    :return: Predicted labels and actual labels
    """

    # Running the model on the test dataset
    prediction_output = model.predict(test_dataset)

    # The predicted label is the position of the largest logit in each row
    labels_pred = torch.tensor(prediction_output.predictions).argmax(dim=1).numpy()

    return labels_pred, prediction_output.label_ids

Below, all the steps for training the BERT model for hazard categories are presented

- Separation of the initial dataset
- Calculation of the weights of the classes
- Data tokenization
- Training the model

In [None]:
# Splitting (a share of 0.5 was chosen so that every label keeps samples in
# the training, validation and testing datasets)
train_set, val_set, test_set = split_dataset(
    dataset=hazard_category_dataset,
    label="hazard_category_encoded",
    percentage=0.5,
)

# Calculating class weights; they are kept in the module-level
# 'class_weights' variable because WeightedTrainer.compute_loss reads it
class_weights = compute_weight(train_set=train_set, label="hazard_category_encoded")

# Tokenizing
train_dataset, val_dataset, test_dataset = tokenize_data(
    train_set=train_set,
    val_set=val_set,
    test_set=test_set,
    label="hazard_category_encoded",
    model="roberta-base",
)

# Training
haz_cat_bert_model = train_model(
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    model_name="roberta-base",
    number_classes=10,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,1.333376,0.835433,0.467477,0.44047,0.442287
2,No log,1.153068,0.874016,0.518634,0.493833,0.496115
3,No log,1.077913,0.880315,0.608035,0.578279,0.54713
4,1.294900,1.247298,0.899213,0.597959,0.595124,0.592081
5,1.294900,1.190444,0.913386,0.63306,0.601794,0.614193
6,1.294900,1.285473,0.907874,0.724983,0.616287,0.637271
7,0.539300,1.235868,0.912598,0.741433,0.647321,0.661936
8,0.539300,1.294958,0.912598,0.759214,0.653546,0.675182
9,0.539300,1.306767,0.91811,0.754317,0.655088,0.672993
10,0.206800,1.311983,0.916535,0.749841,0.652965,0.670068


Now, we test the trained model on the unseen data obtained from the initial split

In [None]:
# Predicting the held-out test split with the trained model
labels_pred, labels_actual = test_model(
    model=haz_cat_bert_model,
    test_dataset=test_dataset,
)

# Reporting the per-class scores on the test split
print("Evaluating on Test Dataset")
print(classification_report(labels_actual, labels_pred, zero_division=0))

Evaluating on Test Dataset
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       464
           1       0.97      0.96      0.96       435
           2       0.87      0.86      0.87        72
           3       1.00      0.50      0.67         6
           4       0.90      0.96      0.93       141
           5       0.65      0.70      0.67        93
           6       0.00      0.00      0.00         1
           7       0.73      0.62      0.67        13
           8       0.61      0.58      0.59        33
           9       0.75      0.46      0.57        13

    accuracy                           0.91      1271
   macro avg       0.74      0.66      0.69      1271
weighted avg       0.91      0.91      0.91      1271



**XGBoost**

Below is the main function for training and implementing the XGBoost classifier. It performs nested cross validation for model and hyperparameter selection.

In [None]:
def train_evaluate_model_xgb(dataset, label, outer_cv, inner_cv, pipeline, param_grid):
    """
    Evaluate an XGBoost text pipeline with nested cross validation.

    For every outer fold, 'balanced' class weights are derived from the
    training labels and passed to the pipeline's 'xgb' step as per-sample
    weights while a grid search (scored with macro f1 on the inner folds) is
    fitted. The refitted best estimator then predicts the held-out part and a
    classification report for that fold is printed.

    :param dataset: Dataset to use; must contain 'text_processed' and the label column
    :param label: Name of the label column to predict
    :param outer_cv: Outer cross validation splitter
    :param inner_cv: Inner cross validation splitter used by the grid search
    :param pipeline: Pipeline to tune and evaluate; its classifier step must be named 'xgb'
    :param param_grid: Hyperparameter grid to use
    :return: Macro f1 scores and best hyperparameters, one entry per outer fold
    """

    texts = dataset["text_processed"]
    targets = dataset[label]

    # One entry per outer fold
    f1_scores, hyperparameters = [], []

    for fit_rows, eval_rows in outer_cv.split(texts, targets):

        # Positional indices from the splitter select the rows of this fold
        x_fit, y_fit = texts.iloc[fit_rows], targets.iloc[fit_rows]
        x_eval, y_eval = texts.iloc[eval_rows], targets.iloc[eval_rows]

        # One 'balanced' weight per class, computed on the training part only
        # (np.unique returns the classes in ascending order)
        classes_present = np.unique(y_fit)
        balanced = compute_class_weight(
            "balanced", classes=classes_present, y=y_fit.values
        )

        # Expanding the per-class weights to one weight per training sample
        weight_of = dict(zip(classes_present, balanced))
        sample_weights = [weight_of[value] for value in y_fit.values]

        # Hyperparameter tuning only ever sees the training part of the fold
        search = GridSearchCV(
            estimator=pipeline,
            param_grid=param_grid,
            scoring="f1_macro",
            cv=inner_cv,
            n_jobs=-1,
            verbose=0,
        )

        # The 'xgb__' prefix routes the weights to the classifier step
        search.fit(x_fit, y_fit, xgb__sample_weight=sample_weights)

        # Scoring the best configuration on the held-out part
        y_hat = search.best_estimator_.predict(x_eval)

        f1_scores.append(f1_score(y_eval, y_hat, average="macro"))
        hyperparameters.append(search.best_params_)

        print(classification_report(y_eval, y_hat, zero_division=0))

    return f1_scores, hyperparameters

Below, all the steps for training and evaluating the XGBoost model for hazard categories are presented

- Setting the desired pipeline
- Setting the hyperparameter grid for tuning
- Setting the outer and inner cross validation
- Training and evaluating the model

In [None]:
# Outer 3-fold cross validation: each fold is held out once for evaluation
outer_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Inner 2-fold cross validation: used by the grid search for tuning
inner_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

# TF-IDF vectorizer followed by an XGBoost classifier; the step name 'xgb'
# is what train_evaluate_model_xgb uses to route the sample weights
pipeline = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        (
            "xgb",
            XGBClassifier(objective="multi:softmax", num_class=10, random_state=42),
        ),
    ]
)

# Hyperparameter grid, addressed as '<step name>__<parameter name>'
param_grid = {
    "tfidf__max_df": [0.5],
    "tfidf__ngram_range": [(1, 2)],
    "xgb__gamma": [0],
    "xgb__learning_rate": [0.1],
    "xgb__max_depth": [5],
    "xgb__min_child_weight": [1],
    "xgb__n_estimators": [100],
}

# Running the nested cross validation for the hazard categories
haz_cat_macro_f1_scores_2, haz_cat_hyperparameters_2 = train_evaluate_model_xgb(
    dataset=hazard_category_dataset,
    label="hazard_category_encoded",
    outer_cv=outer_cv,
    inner_cv=inner_cv,
    pipeline=pipeline,
    param_grid=param_grid,
)

              precision    recall  f1-score   support

           0       0.96      0.94      0.95       618
           1       0.97      0.93      0.95       581
           2       0.70      0.81      0.75        95
           3       0.75      0.38      0.50         8
           4       0.91      0.96      0.94       187
           5       0.62      0.78      0.69       124
           6       1.00      1.00      1.00         1
           7       0.61      0.61      0.61        18
           8       0.59      0.45      0.51        44
           9       0.69      0.50      0.58        18

    accuracy                           0.90      1694
   macro avg       0.78      0.74      0.75      1694
weighted avg       0.90      0.90      0.90      1694

              precision    recall  f1-score   support

           0       0.94      0.94      0.94       618
           1       0.98      0.93      0.95       580
           2       0.78      0.80      0.79        96
           3       0.33 

The mean and standard deviation of the macro f1 in all cross-validation and hyperparameter tuning procedures are presented below

In [None]:
# Summarising the macro f1 scores of the outer folds
print("Cross-Validation Macro f1:")
print("Mean:", np.asarray(haz_cat_macro_f1_scores_2).mean())
print("Standard Deviation:", np.asarray(haz_cat_macro_f1_scores_2).std())

# Showing the hyperparameters of the fold with the highest macro f1
print()
print("The hyperparameters which gives the best results are:")
print(haz_cat_hyperparameters_2[int(np.argmax(haz_cat_macro_f1_scores_2))])

Cross-Validation Macro f1:
Mean: 0.6962319837411263
Standard Deviation: 0.05902540965084855

The hyperparameters which gives the best results are:
{'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2), 'xgb__gamma': 0, 'xgb__learning_rate': 0.1, 'xgb__max_depth': 5, 'xgb__min_child_weight': 1, 'xgb__n_estimators': 100}


### **Classification Task - Product Categories**

Keep only the columns 'text_processed' and 'product_category_encoded' from the initial dataset

In [None]:
# Selecting the model input ('text_processed') and the encoded
# product category target
product_category_dataset = food_dataset[["text_processed", "product_category_encoded"]]

# Displaying two random rows
product_category_dataset.sample(2)

Unnamed: 0,text_processed,product_category_encoded
2922,azka impex limited recall pran muri moa puffed...,1
2354,chun yuen trading co. south el monte californi...,2


**Logistic Regression**

Below, all the steps for training and evaluating the Logistic Regression model for product categories are presented

- Setting the desired pipeline
- Setting the hyperparameter grid for tuning
- Setting the outer and inner cross validation
- Training and evaluating the model

<br>

We use the previous predefined method to train and evaluate the classifier

In [None]:
# Outer 3-fold cross validation: each fold is held out once for evaluation
outer_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Inner 2-fold cross validation: used by the grid search for tuning
inner_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

# TF-IDF vectorizer followed by a logistic regression classifier
pipeline = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("logreg", LogisticRegression()),
    ]
)

# Hyperparameter grid, addressed as '<step name>__<parameter name>'
param_grid = {
    "logreg__C": [20, 30],
    "logreg__class_weight": ["balanced"],
    "logreg__max_iter": [3000, 4000],
    "logreg__penalty": ["l1", "l2"],
    "logreg__solver": ["liblinear"],
    "tfidf__max_df": [0.5],
    "tfidf__ngram_range": [(1, 2)],
}

# Running the nested cross validation for the product categories
prod_cat_macro_f1_scores, prod_cat_hyperparameters = train_evaluate_model_lr(
    dataset=product_category_dataset,
    label="product_category_encoded",
    outer_cv=outer_cv,
    inner_cv=inner_cv,
    pipeline=pipeline,
    param_grid=param_grid,
)

              precision    recall  f1-score   support

           0       0.50      0.60      0.55        20
           1       0.63      0.67      0.65       223
           2       0.61      0.73      0.67        70
           3       0.62      0.42      0.50        57
           4       0.72      0.75      0.73        44
           5       0.29      0.67      0.40         6
           6       1.00      0.50      0.67         2
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         2
           9       0.70      0.74      0.72       178
          10       0.61      0.48      0.53        42
          11       0.67      0.67      0.67         3
          12       0.85      0.78      0.82        74
          13       0.83      0.84      0.83       478
          14       0.71      0.67      0.69        45
          15       0.59      0.66      0.62        88
          16       0.70      0.39      0.50        18
          17       0.75    



              precision    recall  f1-score   support

           0       0.73      0.42      0.53        19
           1       0.67      0.69      0.68       224
           2       0.64      0.70      0.67        70
           3       0.54      0.44      0.49        57
           4       0.69      0.66      0.67        44
           5       0.62      0.83      0.71         6
           6       0.50      0.50      0.50         2
           7       0.00      0.00      0.00         3
           8       0.00      0.00      0.00         2
           9       0.64      0.70      0.67       179
          10       0.55      0.51      0.53        41
          11       1.00      0.50      0.67         2
          12       0.90      0.88      0.89        74
          13       0.84      0.84      0.84       478
          14       0.70      0.58      0.63        45
          15       0.59      0.69      0.64        87
          16       0.67      0.33      0.44        18
          17       0.67    

The mean and standard deviation of the macro f1 in all cross-validation and hyperparameter tuning procedures are presented below

In [None]:
# Summarising the macro f1 scores of the outer folds
print("Cross-Validation Macro f1:")
print("Mean:", np.asarray(prod_cat_macro_f1_scores).mean())
print("Standard Deviation:", np.asarray(prod_cat_macro_f1_scores).std())

# Showing the hyperparameters of the fold with the highest macro f1
print()
print("The hyperparameters which gives the best results are:")
print(prod_cat_hyperparameters[int(np.argmax(prod_cat_macro_f1_scores))])

Cross-Validation Macro f1:
Mean: 0.5582280122794763
Standard Deviation: 0.006090098461327

The hyperparameters which gives the best results are:
{'logreg__C': 20, 'logreg__class_weight': 'balanced', 'logreg__max_iter': 3000, 'logreg__penalty': 'l1', 'logreg__solver': 'liblinear', 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2)}


**BERT - RoBERTa**

Below, all the steps for training the BERT model for the product categories are presented

- Separation of the initial dataset
- Calculation of the weights of the classes
- Data tokenization
- Training the model

<br>

We use all the previous predefined methods to train and evaluate the classifier

In [None]:
# Splitting (0.4 is chosen so that every label keeps samples in all three datasets)
train_set, val_set, test_set = split_dataset(
    product_category_dataset, "product_category_encoded", 0.4
)

# Calculating class weights
# NOTE(review): class_weights is never passed to train_model below; presumably
# the training helper reads it as a global - confirm against its definition.
class_weights = compute_weight(train_set, "product_category_encoded")

# Tokenizing
train_dataset, val_dataset, test_dataset = tokenize_data(
    train_set, val_set, test_set, "product_category_encoded", "roberta-base"
)

# Deriving the label count from the full dataset instead of hard-coding it, so
# it cannot drift from the encoded labels (same approach as the product task)
n_product_categories = product_category_dataset["product_category_encoded"].nunique()

# Training
prod_cat_bert_model = train_model(
    train_dataset, val_dataset, "roberta-base", n_product_categories
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,2.451057,0.480315,0.282178,0.267969,0.216991
2,No log,1.64387,0.609252,0.472684,0.507562,0.464306
3,2.220700,1.53738,0.67126,0.512428,0.546624,0.511593
4,2.220700,1.336082,0.695866,0.529028,0.586927,0.543367
5,2.220700,1.385488,0.707677,0.596855,0.614625,0.59249
6,0.953500,1.447285,0.702756,0.601943,0.607198,0.589602
7,0.953500,1.496121,0.713583,0.627944,0.654114,0.629535
8,0.381500,1.552909,0.720472,0.669406,0.701907,0.677372
9,0.381500,1.599677,0.712598,0.684464,0.700617,0.686527
10,0.381500,1.635129,0.71752,0.642977,0.655167,0.643517


Now, we test the trained model on the unseen data obtained from the initial split

In [None]:
# Getting the predicted and the actual labels for the test split
labels_pred, labels_actual = test_model(prod_cat_bert_model, test_dataset)

# Printing the per-class report (zero_division=0 scores undefined metrics as 0
# instead of emitting a warning)
print("Evaluating on Test Dataset")
prod_cat_test_report = classification_report(
    y_true=labels_actual,
    y_pred=labels_pred,
    zero_division=0,
)
print(prod_cat_test_report)

Evaluating on Test Dataset
              precision    recall  f1-score   support

           0       0.91      0.83      0.87        12
           1       0.76      0.69      0.72       134
           2       0.71      0.71      0.71        42
           3       0.66      0.62      0.64        34
           4       0.69      0.77      0.73        26
           5       0.50      0.75      0.60         4
           6       0.50      1.00      0.67         1
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         1
           9       0.78      0.80      0.79       107
          10       0.89      0.68      0.77        25
          11       1.00      1.00      1.00         1
          12       0.82      0.91      0.86        45
          13       0.94      0.83      0.88       287
          14       0.76      0.70      0.73        27
          15       0.78      0.81      0.80        53
          16       0.39      0.64      0.48        11


**XGBoost**

Below, all the steps for training and evaluating the XGBoost model for product categories are presented

- Setting the desired pipeline
- Setting the hyperparameter grid for tuning
- Setting the outer and inner cross validation
- Training and evaluating the model

<br>

We use the previous predefined method to train and evaluate the classifier

In [None]:
# Deriving the number of product-category labels from the data instead of
# hard-coding it (consistent with the product-vector XGBoost cell)
n_product_categories = product_category_dataset["product_category_encoded"].nunique()

# Defining the pipeline consisting of a vectorizer and a classifier
pipeline = Pipeline(
    [
        ("tfidf", TfidfVectorizer()),
        (
            "xgb",
            XGBClassifier(
                random_state=42,
                objective="multi:softmax",
                num_class=n_product_categories,
            ),
        ),
    ]
)

# Defining hyperparameter grid (only n_estimators has more than one candidate;
# every other entry is pinned to a single value)
param_grid = {
    "tfidf__max_df": [0.5],
    "tfidf__ngram_range": [(1, 2)],
    "xgb__max_depth": [5],
    "xgb__learning_rate": [0.1],
    "xgb__n_estimators": [100, 200],
    "xgb__gamma": [0],
    "xgb__min_child_weight": [1],
}

# Setting up the outer 3-fold cross validation for checking different train-test splits
outer_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Setting up the inner 2-fold cross validation for hyperparameter tuning
inner_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

# Implementing and evaluating the xgboost classifier
prod_cat_macro_f1_scores_2, prod_cat_hyperparameters_2 = train_evaluate_model_xgb(
    product_category_dataset,
    "product_category_encoded",
    outer_cv,
    inner_cv,
    pipeline,
    param_grid,
)

              precision    recall  f1-score   support

           0       0.46      0.55      0.50        20
           1       0.58      0.57      0.58       223
           2       0.53      0.67      0.59        70
           3       0.35      0.35      0.35        57
           4       0.63      0.66      0.64        44
           5       0.33      0.67      0.44         6
           6       0.00      0.00      0.00         2
           7       0.00      0.00      0.00         2
           8       0.33      0.50      0.40         2
           9       0.57      0.60      0.58       178
          10       0.41      0.52      0.46        42
          11       0.40      0.67      0.50         3
          12       0.87      0.72      0.79        74
          13       0.81      0.76      0.78       478
          14       0.59      0.60      0.59        45
          15       0.52      0.70      0.60        88
          16       0.33      0.33      0.33        18
          17       1.00    

The mean and standard deviation of the macro f1 in all cross-validation and hyperparameter tuning procedures are presented below

In [None]:
# Summarising the macro f1 scores collected for the XGBoost model
print("Cross-Validation Macro f1:")
prod_cat_f1_mean_2 = np.mean(prod_cat_macro_f1_scores_2)
print("Mean:", prod_cat_f1_mean_2)
prod_cat_f1_std_2 = np.std(prod_cat_macro_f1_scores_2)
print("Standard Deviation:", prod_cat_f1_std_2)

# Looking up the hyperparameters stored at the position of the highest macro f1
print("\nThe hyperparameters which gives the best results are:")
prod_cat_best_idx_2 = np.argmax(prod_cat_macro_f1_scores_2)
print(prod_cat_hyperparameters_2[prod_cat_best_idx_2])

Cross-Validation Macro f1:
Mean: 0.4588489641251509
Standard Deviation: 0.018580918911457533

The hyperparameters which gives the best results are:
{'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2), 'xgb__gamma': 0, 'xgb__learning_rate': 0.1, 'xgb__max_depth': 5, 'xgb__min_child_weight': 1, 'xgb__n_estimators': 100}


### **Classification Task - Hazard Vectors**

Keep only the columns 'text_processed' and 'hazard_encoded' from the initial dataset

In [None]:
# Keeping only the processed text and the encoded hazard label
hazard_columns = ["text_processed", "hazard_encoded"]
hazard_dataset = food_dataset[hazard_columns]

# Displaying two random rows as a quick check
hazard_dataset.sample(2)

Unnamed: 0,text_processed,hazard_encoded
2922,azka impex limited recall pran muri moa puffed...,59
2354,chun yuen trading co. south el monte californi...,59


**Logistic Regression**

Below, all the steps for training and evaluating the Logistic Regression model for hazard vectors are presented

- Setting the desired pipeline
- Setting the hyperparameter grid for tuning
- Setting the outer and inner cross validation
- Training and evaluating the model

<br>

We use the previous predefined method to train and evaluate the classifier

In [None]:
# Building the pipeline: a TF-IDF vectorizer followed by a logistic regression
pipeline = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("logreg", LogisticRegression()),
    ]
)

# Candidate values for the grid search, keyed as <step name>__<parameter>
param_grid = dict(
    tfidf__max_df=[0.5],
    tfidf__ngram_range=[(1, 2)],
    logreg__penalty=["l1", "l2"],
    logreg__solver=["liblinear"],
    logreg__max_iter=[3000, 4000],
    logreg__C=[10, 20],
    logreg__class_weight=["balanced"],
)

# Outer splitter (3 folds) for checking different train-test splits
outer_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Inner splitter (2 folds) for hyperparameter tuning
inner_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

# Training and evaluating the logistic regression classifier on the hazard labels
haz_macro_f1_scores, haz_hyperparameters = train_evaluate_model_lr(
    hazard_dataset, "hazard_encoded", outer_cv, inner_cv, pipeline, param_grid
)



              precision    recall  f1-score   support

           0       1.00      0.67      0.80         3
           1       0.00      0.00      0.00         2
           2       0.33      1.00      0.50         1
           3       1.00      1.00      1.00         1
           4       0.00      0.00      0.00         4
           5       0.71      0.91      0.80        22
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         3
           9       1.00      1.00      1.00         5
          10       0.33      0.50      0.40         2
          11       0.67      0.50      0.57         4
          12       0.00      0.00      0.00         1
          13       0.40      0.29      0.33         7
          14       0.73      0.89      0.80         9
          15       1.00      1.00      1.00         2
          16       0.00      0.00      0.00         2
          17       0.68    



The mean and standard deviation of the macro f1 in all cross-validation and hyperparameter tuning procedures are presented below

In [None]:
# Summarising the macro f1 scores collected for the logistic regression model
print("Cross-Validation Macro f1:")
haz_f1_mean = np.mean(haz_macro_f1_scores)
print("Mean:", haz_f1_mean)
haz_f1_std = np.std(haz_macro_f1_scores)
print("Standard Deviation:", haz_f1_std)

# Looking up the hyperparameters stored at the position of the highest macro f1
print("\nThe hyperparameters which gives the best results are:")
haz_best_idx = np.argmax(haz_macro_f1_scores)
print(haz_hyperparameters[haz_best_idx])

Cross-Validation Macro f1:
Mean: 0.474863408984009
Standard Deviation: 0.017328708056214937

The hyperparameters which gives the best results are:
{'logreg__C': 20, 'logreg__class_weight': 'balanced', 'logreg__max_iter': 3000, 'logreg__penalty': 'l1', 'logreg__solver': 'liblinear', 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2)}


**BERT - RoBERTa**

Below, all the steps for training the BERT model for the hazard vectors are presented

- Separation of the initial dataset
- Calculation of the weights of the classes
- Data tokenization
- Training the model

<br>

We use all the previous predefined methods to train and evaluate the classifier

In [None]:
# Splitting (0.6 is chosen so that every label keeps samples in all three datasets)
train_set, val_set, test_set = split_dataset(hazard_dataset, "hazard_encoded", 0.6)

# Calculating class weights
# NOTE(review): class_weights is never passed to train_model below; presumably
# the training helper reads it as a global - confirm against its definition.
class_weights = compute_weight(train_set, "hazard_encoded")

# Tokenizing
train_dataset, val_dataset, test_dataset = tokenize_data(
    train_set, val_set, test_set, "hazard_encoded", "roberta-base"
)

# Deriving the label count from the full dataset instead of hard-coding it, so
# it cannot drift from the encoded labels (same approach as the product task)
n_hazards = hazard_dataset["hazard_encoded"].nunique()

# Training
haz_bert_model = train_model(train_dataset, val_dataset, "roberta-base", n_hazards)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,4.846732,0.00459,3.6e-05,0.007812,7.1e-05
2,No log,4.825397,0.040656,0.000318,0.007812,0.00061
3,No log,4.818655,0.116066,0.000907,0.007812,0.001625
4,4.855800,4.819998,0.017049,0.000133,0.007812,0.000262
5,4.855800,4.821696,0.019672,0.000154,0.007812,0.000301
6,4.855800,4.817992,0.122623,0.000958,0.007812,0.001707
7,4.855800,4.818659,0.122623,0.000958,0.007812,0.001707
8,4.838500,4.821162,0.122623,0.000958,0.007812,0.001707
9,4.838500,4.823054,0.122623,0.000958,0.007812,0.001707


Now, we test the trained model on the unseen data obtained from the initial split

In [None]:
# Getting the predicted and the actual labels for the test split
labels_pred, labels_actual = test_model(haz_bert_model, test_dataset)

# Printing the per-class report (zero_division=0 scores undefined metrics as 0
# instead of emitting a warning)
print("Evaluating on Test Dataset")
haz_test_report = classification_report(
    y_true=labels_actual,
    y_pred=labels_pred,
    zero_division=0,
)
print(haz_test_report)

Evaluating on Test Dataset
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         4
           5       0.00      0.00      0.00        20
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         2
           9       0.00      0.00      0.00         4
          10       0.00      0.00      0.00         2
          11       0.00      0.00      0.00         4
          12       0.00      0.00      0.00         1
          13       0.00      0.00      0.00         6
          14       0.00      0.00      0.00         9
          15       0.00      0.00      0.00         2
          16       0.00      0.00      0.00         2


**XGBoost**

Below, all the steps for training and evaluating the XGBoost model for hazard vectors are presented

- Setting the desired pipeline
- Setting the hyperparameter grid for tuning
- Setting the outer and inner cross validation
- Training and evaluating the model

<br>

We use the previous predefined method to train and evaluate the classifier

In [None]:
# Deriving the number of hazard labels from the data instead of hard-coding it
# (consistent with the product-vector XGBoost cell)
n_hazards = hazard_dataset["hazard_encoded"].nunique()

# Defining the pipeline consisting of a vectorizer and a classifier
pipeline = Pipeline(
    [
        ("tfidf", TfidfVectorizer()),
        (
            "xgb",
            XGBClassifier(
                random_state=42,
                objective="multi:softmax",
                num_class=n_hazards,
            ),
        ),
    ]
)

# Defining hyperparameter grid (every entry is pinned to a single value, so
# the inner search evaluates exactly one configuration)
param_grid = {
    "tfidf__max_df": [0.5],
    "tfidf__ngram_range": [(1, 2)],
    "xgb__max_depth": [5],
    "xgb__learning_rate": [0.1],
    "xgb__n_estimators": [100],
    "xgb__gamma": [0],
    "xgb__min_child_weight": [1],
}

# Setting up the outer 3-fold cross validation for checking different train-test splits
outer_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Setting up the inner 2-fold cross validation for hyperparameter tuning
inner_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

# Implementing and evaluating the xgboost classifier
haz_macro_f1_scores_2, haz_hyperparameters_2 = train_evaluate_model_xgb(
    hazard_dataset,
    "hazard_encoded",
    outer_cv,
    inner_cv,
    pipeline,
    param_grid,
)

              precision    recall  f1-score   support

           0       1.00      0.67      0.80         3
           1       0.00      0.00      0.00         2
           2       0.50      1.00      0.67         1
           3       1.00      1.00      1.00         1
           4       0.00      0.00      0.00         4
           5       0.77      0.77      0.77        22
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         3
           9       1.00      1.00      1.00         5
          10       0.50      0.50      0.50         2
          11       0.75      0.75      0.75         4
          12       0.00      0.00      0.00         1
          13       0.67      0.57      0.62         7
          14       0.69      1.00      0.82         9
          15       0.50      1.00      0.67         2
          16       0.00      0.00      0.00         2
          17       0.70    

The mean and standard deviation of the macro f1 in all cross-validation and hyperparameter tuning procedures are presented below

In [None]:
# Summarising the macro f1 scores collected for the XGBoost model
print("Cross-Validation Macro f1:")
haz_f1_mean_2 = np.mean(haz_macro_f1_scores_2)
print("Mean:", haz_f1_mean_2)
haz_f1_std_2 = np.std(haz_macro_f1_scores_2)
print("Standard Deviation:", haz_f1_std_2)

# Looking up the hyperparameters stored at the position of the highest macro f1
print("\nThe hyperparameters which gives the best results are:")
haz_best_idx_2 = np.argmax(haz_macro_f1_scores_2)
print(haz_hyperparameters_2[haz_best_idx_2])

Cross-Validation Macro f1:
Mean: 0.43987976164409254
Standard Deviation: 0.02247386345071928

The hyperparameters which gives the best results are:
{'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2), 'xgb__gamma': 0, 'xgb__learning_rate': 0.1, 'xgb__max_depth': 5, 'xgb__min_child_weight': 1, 'xgb__n_estimators': 100}


### **Classification Task - Product Vectors**

Keep only the columns 'text_processed' and 'product_encoded' from the initial dataset

In [8]:
# Keeping only the processed text and the encoded product label
product_columns = ["text_processed", "product_encoded"]
product_dataset = food_dataset[product_columns]

# Displaying two random rows as a quick check
product_dataset.sample(2)

Unnamed: 0,text_processed,product_encoded
4633,high class import violation active thu current...,547
2031,washington dec. texas best proteins farm marke...,796


**Logistic Regression**

Below, all the steps for training and evaluating the Logistic Regression model for product vectors are presented

- Setting the desired pipeline
- Setting the hyperparameter grid for tuning
- Setting the outer and inner cross validation
- Training and evaluating the model

<br>

We use the previous predefined method to train and evaluate the classifier

In [None]:
# Building the pipeline: a TF-IDF vectorizer followed by a logistic regression
pipeline = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("logreg", LogisticRegression()),
    ]
)

# Candidate values for the grid search, keyed as <step name>__<parameter>
# (only C has more than one candidate here)
param_grid = dict(
    tfidf__max_df=[0.5],
    tfidf__ngram_range=[(1, 2)],
    logreg__penalty=["l1"],
    logreg__solver=["liblinear"],
    logreg__max_iter=[2000],
    logreg__C=[10, 20],
    logreg__class_weight=["balanced"],
)

# Outer splitter (3 folds) for checking different train-test splits
outer_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Inner splitter (2 folds) for hyperparameter tuning
inner_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

# Training and evaluating the logistic regression classifier on the product labels
prod_macro_f1_scores, prod_hyperparameters = train_evaluate_model_lr(
    product_dataset, "product_encoded", outer_cv, inner_cv, pipeline, param_grid
)



              precision    recall  f1-score   support

           0       0.50      0.50      0.50         4
           2       0.50      0.75      0.60        12
           3       0.00      0.00      0.00         1
           4       0.22      0.50      0.31         4
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         3
           7       0.00      0.00      0.00         1
           8       0.50      0.50      0.50         2
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         2
          12       0.80      1.00      0.89         4
          13       1.00      0.50      0.67         2
          14       1.00      1.00      1.00         1
          15       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         1
          17       0.50      1.00      0.67         1
          18       0.00      0.00      0.00         1
          19       0.75    



              precision    recall  f1-score   support

           0       0.75      1.00      0.86         3
           2       0.35      0.55      0.43        11
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         4
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         3
           8       0.20      0.50      0.29         2
           9       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         1
          12       1.00      0.67      0.80         3
          13       1.00      0.50      0.67         2
          14       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         0
          17       0.00      0.00      0.00         1
          19       0.75      1.00      0.86         3
          20       0.25      0.25      0.25         4
          22       0.00      0.00      0.00         1
          24       0.00    



              precision    recall  f1-score   support

           0       0.75      0.75      0.75         4
           1       0.00      0.00      0.00         1
           2       0.24      0.36      0.29        11
           4       0.25      0.33      0.29         3
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         3
           8       0.10      0.50      0.17         2
          11       0.00      0.00      0.00         1
          12       1.00      1.00      1.00         3
          13       1.00      0.50      0.67         2
          14       0.50      1.00      0.67         1
          16       0.00      0.00      0.00         0
          17       0.00      0.00      0.00         1
          18       1.00      1.00      1.00         1
          19       0.67      1.00      0.80         2
          20       0.00      0.00      0.00         4
          22       0.00      0.00      0.00         1
          23       0.00    

The mean and standard deviation of the macro f1 in all cross-validation and hyperparameter tuning procedures are presented below

In [None]:
# Summarising the macro f1 scores collected for the logistic regression model
print("Cross-Validation Macro f1:")
prod_f1_mean = np.mean(prod_macro_f1_scores)
print("Mean:", prod_f1_mean)
prod_f1_std = np.std(prod_macro_f1_scores)
print("Standard Deviation:", prod_f1_std)

# Looking up the hyperparameters stored at the position of the highest macro f1
print("\nThe hyperparameters which give the best results are:")
prod_best_idx = np.argmax(prod_macro_f1_scores)
print(prod_hyperparameters[prod_best_idx])

Cross-Validation Macro f1:
Mean: 0.19708794567380383
Standard Deviation: 0.0057071739303565005

The hyperparameters which give the best results are:
{'logreg__C': 20, 'logreg__class_weight': 'balanced', 'logreg__max_iter': 2000, 'logreg__penalty': 'l1', 'logreg__solver': 'liblinear', 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2)}


**BERT - RoBERTa**

After some testing I noticed that, because there are too many labels with only one or two samples, the BERT model cannot be trained, as it is not possible to split the original dataset into training, validation and testing datasets. I decided to remove all the labels with so few samples, as it is very difficult for a classifier to learn to classify such labels with so little information

In [10]:
# Getting the number of samples of each label
label_counts = product_dataset["product_encoded"].value_counts()

# Finding labels with one or two samples
labels_with_one_sample = label_counts[label_counts == 1].index
labels_with_two_samples = label_counts[label_counts == 2].index

# Removing the labels with one or two samples. The explicit .copy() makes the
# filtered frame independent of product_dataset, so adding a column to it below
# no longer raises pandas' SettingWithCopyWarning.
product_dataset_modified = product_dataset[
    (~product_dataset["product_encoded"].isin(labels_with_one_sample))
    & (~product_dataset["product_encoded"].isin(labels_with_two_samples))
].copy()

# Encoding again, so the remaining labels are renumbered from 0 without gaps
product_encoder_level_2 = LabelEncoder()
product_dataset_modified["product_encoded_level_2"] = (
    product_encoder_level_2.fit_transform(product_dataset_modified["product_encoded"])
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  product_dataset_modified["product_encoded_level_2"] = (


Below, all the steps for training the BERT model for the product vectors are presented

- Separation of the initial dataset
- Calculation of the weights of the classes
- Data tokenization
- Training the model

<br>

We use all the previous predefined methods to train and evaluate the classifier

In [20]:
# Splitting
train_set, val_set, test_set = split_dataset(
    product_dataset_modified, "product_encoded_level_2", 0.6
)

# Calculating class weights
class_weights = compute_weight(train_set, "product_encoded_level_2")

# Tokenizating
train_dataset, val_dataset, test_dataset = tokenize_data(
    train_set, val_set, test_set, "product_encoded_level_2", "roberta-base"
)

# Traning
prod_bert_model = train_model(
    train_dataset,
    val_dataset,
    "roberta-base",
    len(product_dataset_modified["product_encoded_level_2"].unique()),
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,5.990897,0.004662,1.2e-05,0.002519,2.3e-05
2,No log,5.988992,0.000777,2e-06,0.002519,4e-06
3,No log,5.987225,0.001554,4e-06,0.002519,8e-06
4,No log,5.985886,0.003885,1e-05,0.002519,1.9e-05


Now, we test the trained model on the unseen data obtained from the initial split

In [None]:
# Getting the predicted and the actual labels for the test split
labels_pred, labels_actual = test_model(prod_bert_model, test_dataset)

# Printing the per-class report (zero_division=0 scores undefined metrics as 0
# instead of emitting a warning)
print("Evaluating on Test Dataset")
prod_test_report = classification_report(
    y_true=labels_actual,
    y_pred=labels_pred,
    zero_division=0,
)
print(prod_test_report)

Evaluating on Test Dataset
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.00      0.00      0.00        10
           2       0.00      0.00      0.00         3
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         3
           5       0.00      0.00      0.00         2
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         3
           8       0.00      0.00      0.00         2
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         3
          12       0.00      0.00      0.00         4
          13       0.00      0.00      0.00         1
          14       0.00      0.00      0.00         3
          15       0.00      0.00      0.00         3
          16       0.00      0.00      0.00         5


**XGBoost**

When trying to run XGBoost as in the previous classification tasks, I got an error because there are not enough samples for some labels to split the dataset. Because of this, I decided to follow the same approach as for the BERT model and remove the labels with one or two samples, as it is not possible to learn a model with so little information

Below, all the steps for training and evaluating the XGBoost model for product vectors are presented

- Setting the desired pipeline
- Setting the hyperparameter grid for tuning
- Setting the outer and inner cross validation
- Training and evaluating the model

<br>

We use the previous predefined method to train and evaluate the classifier

In [None]:
# Defining the pipeline consisting of a vectorizer and a classifier
pipeline = Pipeline(
    [
        ("tfidf", TfidfVectorizer()),
        (
            "xgb",
            XGBClassifier(
                random_state=42,
                objective="multi:softmax",
                num_class=len(
                    product_dataset_modified["product_encoded_level_2"].unique()
                ),
            ),
        ),
    ]
)

# Defining hyperparameter grid
param_grid = {
    "tfidf__max_df": [0.5],
    "tfidf__ngram_range": [(1, 2)],
    "xgb__max_depth": [5],
    "xgb__learning_rate": [0.1],
    "xgb__n_estimators": [100],
    "xgb__gamma": [0],
    "xgb__min_child_weight": [1],
}

# Setting up the outer 3-folds cross validation for checking different train-test splits
outer_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Setting up the inner 2-fold cross validation for hyperparameter tuning
inner_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

# Implementing and evaluating the xgboost classifier
prod_macro_f1_scores_2, prod_hyperparameters_2 = train_evaluate_model_xgb(
    product_dataset_modified,
    "product_encoded_level_2",
    outer_cv,
    inner_cv,
    pipeline,
    param_grid,
)

              precision    recall  f1-score   support

           0       0.67      1.00      0.80         4
           1       0.31      0.36      0.33        11
           2       1.00      0.25      0.40         4
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         3
           5       0.20      0.50      0.29         2
           6       0.00      0.00      0.00         1
           7       0.60      1.00      0.75         3
           8       0.00      0.00      0.00         2
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         1
          11       1.00      1.00      1.00         3
          12       0.00      0.00      0.00         3
          13       0.00      0.00      0.00         2
          14       0.33      0.33      0.33         3
          15       0.67      0.67      0.67         3
          16       1.00      0.67      0.80         6
          17       1.00    

The mean and standard deviation of the macro f1 in all cross-validation and hyperparameter tuning procedures are presented below

In [26]:
# Summarising the macro f1 scores collected for the XGBoost model
print("Cross-Validation Macro f1:")
prod_f1_mean_2 = np.mean(prod_macro_f1_scores_2)
print("Mean:", prod_f1_mean_2)
prod_f1_std_2 = np.std(prod_macro_f1_scores_2)
print("Standard Deviation:", prod_f1_std_2)

# Looking up the hyperparameters stored at the position of the highest macro f1
print("\nThe hyperparameters which gives the best results are:")
prod_best_idx_2 = np.argmax(prod_macro_f1_scores_2)
print(prod_hyperparameters_2[prod_best_idx_2])

Cross-Validation Macro f1:
Mean: 0.27067282625275735
Standard Deviation: 0.006834882422954483

The hyperparameters which gives the best results are:
{'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2), 'xgb__gamma': 0, 'xgb__learning_rate': 0.1, 'xgb__max_depth': 5, 'xgb__min_child_weight': 1, 'xgb__n_estimators': 100}


### **Results**

Based on the above benchmark analyses, we have the following performance based on the macro f1.

<br>

|Classification Task|Logistic Regression|BERT - RoBERTa|XGBoost|
|-------------------|-------------------|--------------|-------|
|Hazard-Category|Mean: 0.6764 Std: 0.0256|Approximate: 0.69|Mean: 0.6962 Std: 0.0590|
|Product-Category|Mean: 0.5582 Std: 0.0060|Approximate: 0.69|Mean: 0.4588 Std: 0.0185|
|Hazard|Mean: 0.4748 Std: 0.0173|Approximate: 0.00|Mean: 0.4398 Std: 0.0224|
|Product|Mean: 0.1970 Std: 0.0057|Approximate: 0.00|Mean: 0.2706 Std: 0.0068|


- Hazard-Category

  As we can see, all three algorithms have almost the same performance on the macro f1. Based on some experiments I did to predict the hazard category using the validation dataset and considering the score obtained, I observed that the logistic regression classifier has better performance than the other two. This is probably because it has a lower standard deviation than XGBoost, even though XGBoost has a slightly better mean, and also because for the BERT model I did not apply cross-validation as I did for the others (due to computational resources), so its macro f1 seems to be a bit optimistic (that's why I wrote "approximate").

- Product-Category

  As we can see, the BERT model fits better and after a few experiments on the validation dataset and considering the score obtained, it indeed performs better.

- Hazard

  Here the best performing classifier seems to be the logistic regression, with a better mean and standard deviation than the XGBoost model. The BERT model could not be fitted here because there are many labels, and most of them have very few samples compared to the few labels that are strongly overrepresented in the dataset. A good approach to make it work better would probably be to use oversampling and undersampling to increase the visibility of the minority labels and decrease the visibility of the majority labels accordingly. I did not try something like this because I faced some obstacles applying it.

- Product

  Here the XGBoost model performs better than logistic regression in cross-validation, since it has a higher mean macro f1. However, XGBoost is not trained on the whole dataset, because I exclude some labels as I explained earlier. Logistic regression, in contrast, is trained on all labels and can therefore handle and predict them better; this was confirmed when testing both on the validation dataset, where the resulting score was higher for logistic regression. As in the hazard vector classification task, the BERT model could not be fitted, for the same reason.

Based on the above results, the outcomes obtained using the "text" column of the training dataset are better than those of the benchmark analyses using the "title" column, except for the product vector classification task, where the logistic regression classifier on "title" performs better (as you can see in the other notebook that performs this type of benchmark). The best performing models for each classification task, except the product vector task (which is covered in the other notebook, as it contains the best performing model), and their predictions on the validation dataset are presented below.

### **Classification Tasks - Hazard Category - Product Category**

Train a Logistic Regression classifier for the hazard categories based on the best hyperparameters obtained earlier

In [None]:
# Picking the hyperparameter set stored at the position of the highest macro f1
haz_cat_best_hyperparameters = haz_cat_hyperparameters[np.argmax(haz_cat_macro_f1_scores)]

# Building the TF-IDF step from the selected hyperparameters
haz_cat_tfidf_step = TfidfVectorizer(
    max_df=haz_cat_best_hyperparameters["tfidf__max_df"],
    ngram_range=haz_cat_best_hyperparameters["tfidf__ngram_range"],
)

# Building the logistic regression step from the selected hyperparameters
haz_cat_logreg_step = LogisticRegression(
    penalty=haz_cat_best_hyperparameters["logreg__penalty"],
    solver=haz_cat_best_hyperparameters["logreg__solver"],
    max_iter=haz_cat_best_hyperparameters["logreg__max_iter"],
    C=haz_cat_best_hyperparameters["logreg__C"],
    class_weight=haz_cat_best_hyperparameters["logreg__class_weight"],
)

# Chaining both steps into the final model pipeline
haz_cat_model = Pipeline([
    ("tfidf", haz_cat_tfidf_step),
    ("logreg", haz_cat_logreg_step),
])

# Fitting on the processed texts and the encoded hazard categories
haz_cat_model.fit(
    hazard_category_dataset["text_processed"],
    hazard_category_dataset["hazard_category_encoded"],
)

Load and transform the validation dataset

In [None]:
# Reading the validation dataset and discarding the columns that are not used
validation_dataset = pd.read_csv("../data/validation/incidents.csv").drop(
    columns=["Unnamed: 0", "year", "month", "day", "country", "title"]
)

# Running every text through the text processing function
validation_dataset["text"] = validation_dataset["text"].apply(process_text)

# Showing a few random rows
validation_dataset.sample(5)

Unnamed: 0,text
541,identify features barcode item code date avail...
29,pra date publish dec product description gm we...
441,h&c food inc. brooklyn ny recall oz enoki mush...
141,washington august oberto brands kent wa establ...
132,washington dec. lee bros. foodservice inc. san...


Predict hazard categories

In [None]:
# Predict the encoded hazard categories for all validation texts in a single
# batch call (instead of calling the pipeline once per row)
haz_cat_encoded_pred = haz_cat_model.predict(validation_dataset["text"])

# Translate the encoded predictions back to the original label names
validation_dataset["hazard-category"] = encoders["hazard_category_encoded"].inverse_transform(
    haz_cat_encoded_pred
)

Process the texts from the validation dataset to predict the product categories based on the best BERT model obtained earlier

In [None]:
# Loading the pretrained "roberta-base" tokenizer
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# Tokenizing all validation texts, truncated and padded to 128 tokens
tokenizer_options = {
    "truncation": True,
    "padding": "max_length",
    "max_length": 128,
    "return_tensors": "pt",
}
encodings = tokenizer(validation_dataset["text"].tolist(), **tokenizer_options)

# Wrapping the encodings in a dataset object for prediction
text_dataset = Dataset.from_dict(encodings)

# Running the prediction
# NOTE(review): prod_cat_bert_model is defined in an earlier cell; presumably a Trainer - confirm
predictions = prod_cat_bert_model.predict(text_dataset)

# Taking, for every text, the index with the highest predicted value
labels_pred = torch.tensor(predictions.predictions).argmax(dim=1).numpy()

# Mapping the predicted indices back to the product category names
translated_labels_pred = encoders["product_category_encoded"].inverse_transform(labels_pred)

# Adding the predicted product categories to the validation dataset
validation_dataset["product-category"] = translated_labels_pred

In [None]:
# Writing the two predicted category columns to the subtask 1 submission file
subtask_1_columns = ["hazard-category", "product-category"]
validation_dataset[subtask_1_columns].to_csv(
    "../submission/subtask_1/submission.csv",
    index=False,
)

### **Classification Task - Hazard**

Train a Logistic Regression classifier for the hazard vectors based on the best hyperparameters obtained earlier

In [None]:
# Picking the hyperparameter set stored at the position of the highest macro f1
haz_best_hyperparameters = haz_hyperparameters[np.argmax(haz_macro_f1_scores)]

# Building the TF-IDF step from the selected hyperparameters
haz_tfidf_step = TfidfVectorizer(
    max_df=haz_best_hyperparameters["tfidf__max_df"],
    ngram_range=haz_best_hyperparameters["tfidf__ngram_range"],
)

# Building the logistic regression step from the selected hyperparameters
haz_logreg_step = LogisticRegression(
    penalty=haz_best_hyperparameters["logreg__penalty"],
    solver=haz_best_hyperparameters["logreg__solver"],
    max_iter=haz_best_hyperparameters["logreg__max_iter"],
    C=haz_best_hyperparameters["logreg__C"],
    class_weight=haz_best_hyperparameters["logreg__class_weight"],
)

# Chaining both steps into the final model pipeline
haz_model = Pipeline([
    ("tfidf", haz_tfidf_step),
    ("logreg", haz_logreg_step),
])

# Fitting on the processed texts and the encoded hazard labels
haz_model.fit(
    hazard_dataset["text_processed"],
    hazard_dataset["hazard_encoded"],
)



Predict the hazard vectors

In [None]:
# Predict the encoded hazard labels for all validation texts in a single
# batch call (instead of calling the pipeline once per row)
haz_encoded_pred = haz_model.predict(validation_dataset["text"])

# Translate the encoded predictions back to the original label names
validation_dataset["hazard"] = encoders["hazard_encoded"].inverse_transform(haz_encoded_pred)

# Storing
validation_dataset["hazard"].to_csv("../submission/subtask_2/submission_hazard.csv", index=False)