## **Machine Learning Algorithms Benchmark - Titles**

This is the application of benchmarking algorithms to the provided titles of the training dataset.

Import some required python modules

In [2]:
# Importing some packages

import pandas as pd
import numpy as np
import spacy, torch
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from datasets import Dataset
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (
    classification_report,
    f1_score,
    precision_recall_fscore_support,
    accuracy_score,
)
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    StratifiedKFold,
)
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
)

# Loading the small English spaCy pipeline once at module level;
# process_title calls it to tokenize each title and read token lemmas
eng_model = spacy.load("en_core_web_sm")

Load the training dataset and process the columns

In [None]:
# Reading the 'food incidents' training data, discarding the columns that are
# not needed for title-based classification and normalising the hyphenated
# label names to valid snake_case identifiers
food_dataset = (
    pd.read_csv("../data/train/incidents_train.csv")
    .drop(columns=["Unnamed: 0", "year", "month", "day", "country", "text"])
    .rename(
        columns={
            "hazard-category": "hazard_category",
            "product-category": "product_category",
        }
    )
)

# Displaying two random rows as a sanity check
food_dataset.sample(2)

Unnamed: 0,title,hazard_category,product_category,hazard,product
1024,New York Firm Recalls Pork and Poultry Product...,fraud,"meat, egg and dairy products",inspection issues,poultry meat and poultry meat products
265,United Biscuits recalls certain batches of its...,foreign bodies,confectionery,metal fragment,fruit snacks


Encode all the classes into numeric values

In [4]:
# Mapping "<label>_encoded" -> fitted LabelEncoder, kept so that numeric
# predictions can be translated back to the original class names
encoders = {}

# Every column except the title is a label column
for label in food_dataset.columns.drop("title"):

    # Learning the class -> integer mapping for this label
    label_encoder = LabelEncoder().fit(food_dataset[label])

    # Remembering the encoder and adding the encoded column to the dataset
    encoders[f"{label}_encoded"] = label_encoder
    food_dataset[f"{label}_encoded"] = label_encoder.transform(food_dataset[label])

# Displaying two random rows as a sanity check
food_dataset.sample(2)

Unnamed: 0,title,hazard_category,product_category,hazard,product,hazard_category_encoded,product_category_encoded,hazard_encoded,product_encoded
3672,Wang Korea brand fish sausage recalled due to ...,allergens,seafood,eggs and products thereof,Fishes not identified,0,19,34,2
4778,FSIS Issues Public Health Alert for Ready-To-E...,biological,"meat, egg and dairy products",listeria monocytogenes,chicken based products,1,13,55,167


Process the titles by removing redundant terms

In [None]:
def process_title(title):
    """
    Clean a title by dropping redundant tokens and lemmatizing the rest.

    A token is discarded when it is a stop word, punctuation, whitespace,
    contains non-ASCII characters or is a single character long. The lemmas
    of the remaining tokens are lower-cased and joined with single spaces.

    :param title: Title to process
    :return: The processed title
    """

    # Lower-cased lemmas of the tokens that carry information
    kept_lemmas = [
        token.lemma_.lower()
        for token in eng_model(title)
        if not (
            token.is_stop
            or token.is_punct
            or token.is_space
            or not token.is_ascii
            or len(token) == 1
        )
    ]

    return " ".join(kept_lemmas)


# Cleaning every title and storing the result as a new column at position 1
food_dataset.insert(
    loc=1,
    column="title_processed",
    value=food_dataset["title"].map(process_title),
)

# Displaying two random rows as a sanity check
food_dataset.sample(2)

Unnamed: 0,title,title_processed,hazard_category,product_category,hazard,product,hazard_category_encoded,product_category_encoded,hazard_encoded,product_encoded
4820,The Third Synthesis Inc Issues Allergy Alert o...,synthesis inc issues allergy alert undeclared ...,allergens,cereals and bakery products,eggs and products thereof,bakery products,0,1,34,41
627,Illinois Firm Recalls Stuffed Beef Products fo...,illinois firm recall stuffed beef products pos...,biological,"meat, egg and dairy products",listeria monocytogenes,precooked cooked beef meat products,1,13,55,742


#### **Classification Task - Hazard Categories**

Keep only the columns 'title_processed' and 'hazard_category_encoded' from the initial dataset

In [None]:
# Keeping only the model input (processed title) and the hazard category target
hazard_category_dataset = food_dataset.loc[
    :, ["title_processed", "hazard_category_encoded"]
]

# Displaying two random rows as a sanity check
hazard_category_dataset.sample(2)

Unnamed: 0,title_processed,hazard_category_encoded
4507,je dois boutique issues voluntary nationwide r...,2
3964,frank brand milk chocolate covered raisins rec...,0


**Logistic Regression**

Below is the main function for training and implementing the Logistic Regression classifier. It performs nested cross validation for model and hyperparameter selection.

In [None]:
def train_evaluate_model_lr(dataset, label, outer_cv, inner_cv, pipeline, param_grid):
    """
    Run nested cross validation for a text classification pipeline.

    For each outer fold a grid search (scored by macro f1 on the inner folds)
    is fitted on the training part, the refitted best estimator predicts the
    held-out part, and a classification report for that fold is printed.

    :param dataset: Dataset to use; must contain 'title_processed' and the label column
    :param label: Name of the label column
    :param outer_cv: Outer cross validation splitter (train/test splits)
    :param inner_cv: Inner cross validation splitter (hyperparameter tuning)
    :param pipeline: Pipeline to tune
    :param param_grid: Hyperparameter grid to search
    :return: Per-fold macro f1 scores and per-fold best hyperparameters
    """

    titles = dataset["title_processed"]
    targets = dataset[label]

    # One entry per outer fold
    f1_scores, hyperparameters = [], []

    for train_idx, test_idx in outer_cv.split(titles, targets):

        # Slicing out the training and held-out parts of this fold
        title_train, label_train = titles.iloc[train_idx], targets.iloc[train_idx]
        title_test, label_test = titles.iloc[test_idx], targets.iloc[test_idx]

        # Tuning the hyperparameters on the training part only
        grid_search = GridSearchCV(
            estimator=pipeline,
            param_grid=param_grid,
            scoring="f1_macro",
            cv=inner_cv,
            n_jobs=-1,
            verbose=0,
        )
        grid_search.fit(title_train, label_train)

        # Scoring the best configuration on the held-out part
        label_pred = grid_search.best_estimator_.predict(title_test)

        f1_scores.append(f1_score(label_test, label_pred, average="macro"))
        hyperparameters.append(grid_search.best_params_)

        print(classification_report(label_test, label_pred, zero_division=0))

    return f1_scores, hyperparameters

Below, all the steps for training and evaluating the Logistic Regression model for hazard categories are presented

- Setting the desired pipeline
- Setting the hyperparameter grid for tuning
- Setting the outer and inner cross validation
- Training and evaluating the model

In [None]:
# TF-IDF features feeding a logistic regression classifier
pipeline = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("logreg", LogisticRegression()),
    ]
)

# Candidate hyperparameters explored by the inner grid search
param_grid = {
    "tfidf__max_df": [0.5, 0.7],
    "tfidf__ngram_range": [(1, 1), (1, 2)],
    "logreg__penalty": ["l1", "l2"],
    "logreg__solver": ["liblinear"],
    "logreg__max_iter": [2000, 3000],
    "logreg__C": [10, 20],
    "logreg__class_weight": ["balanced"],
}

# Outer 3-fold split: estimates performance on different train-test splits
outer_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Inner 2-fold split: used only for hyperparameter tuning
inner_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

# Running the nested cross validation for the hazard categories
haz_cat_macro_f1_scores, haz_cat_hyperparameters = train_evaluate_model_lr(
    dataset=hazard_category_dataset,
    label="hazard_category_encoded",
    outer_cv=outer_cv,
    inner_cv=inner_cv,
    pipeline=pipeline,
    param_grid=param_grid,
)

  _data = np.array(data, dtype=dtype, copy=copy,


              precision    recall  f1-score   support

           0       0.86      0.91      0.88       618
           1       0.87      0.90      0.89       581
           2       0.68      0.71      0.69        95
           3       0.50      0.38      0.43         8
           4       0.81      0.73      0.77       187
           5       0.71      0.68      0.69       124
           6       1.00      1.00      1.00         1
           7       0.43      0.50      0.46        18
           8       0.74      0.39      0.51        44
           9       0.89      0.44      0.59        18

    accuracy                           0.83      1694
   macro avg       0.75      0.66      0.69      1694
weighted avg       0.83      0.83      0.83      1694



  _data = np.array(data, dtype=dtype, copy=copy,


              precision    recall  f1-score   support

           0       0.86      0.90      0.88       618
           1       0.88      0.91      0.90       580
           2       0.74      0.65      0.69        96
           3       0.57      0.50      0.53         8
           4       0.76      0.78      0.77       187
           5       0.66      0.56      0.60       124
           6       0.00      0.00      0.00         1
           7       0.73      0.47      0.57        17
           8       0.71      0.53      0.61        45
           9       0.62      0.44      0.52        18

    accuracy                           0.83      1694
   macro avg       0.65      0.57      0.61      1694
weighted avg       0.83      0.83      0.83      1694

              precision    recall  f1-score   support

           0       0.84      0.91      0.87       618
           1       0.88      0.90      0.89       580
           2       0.64      0.71      0.67        96
           3       0.50 

The mean and standard deviation of the macro f1 in all cross-validation and hyperparameter tuning procedures are presented below

In [None]:
# Summarising the macro f1 over the outer folds
print("Cross-Validation Macro f1:")
print("Mean:", np.asarray(haz_cat_macro_f1_scores).mean())
print("Standard Deviation:", np.asarray(haz_cat_macro_f1_scores).std())

# Reporting the hyperparameters of the best-scoring outer fold
print("\nThe hyperparameters which give the best results are:")
print(haz_cat_hyperparameters[np.asarray(haz_cat_macro_f1_scores).argmax()])

Cross-Validation Macro f1:
Mean: 0.6491637686017375
Standard Deviation: 0.034544471561658185

The hyperparameters which give the best results are:
{'logreg__C': 20, 'logreg__class_weight': 'balanced', 'logreg__max_iter': 2000, 'logreg__penalty': 'l2', 'logreg__solver': 'liblinear', 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 1)}


**BERT - RoBERTa**

Define a function to split the dataset into training, validation and testing datasets using stratification to maintain the distribution of classes in them.

In [None]:
def split_dataset(dataset, label, percentage):
    """
    Split a dataset into stratified training, validation and testing sets.

    The fraction ``percentage`` is first held out from the training set and
    then divided equally between validation and testing. Both splits are
    stratified on the label column and use a fixed random state of 42.

    :param dataset: Dataset to split
    :param label: Label to use for stratification
    :param percentage: Fraction of data held out for validation and testing together
    :return: Training set, validation set and testing set
    """

    seed = 42

    # Holding out the validation + testing portion
    train_set, held_out = train_test_split(
        dataset,
        test_size=percentage,
        random_state=seed,
        stratify=dataset[label],
    )

    # Dividing the held-out portion equally into validation and testing
    val_set, test_set = train_test_split(
        held_out,
        test_size=0.5,
        random_state=seed,
        stratify=held_out[label],
    )

    return train_set, val_set, test_set

Define a function to calculate class weights for the training dataset to be used by the classifier to pay more attention to minority classes.

In [None]:
def compute_weight(train_set, label):
    """
    Compute 'balanced' class weights for the labels of the training set.

    The weights are returned as a float tensor ordered by ascending class
    value. The tensor is placed on the GPU when CUDA is available and stays
    on the CPU otherwise, so the function also works on machines without a GPU.

    :param train_set: Training set
    :param label: Label to use for computing class weights
    :return: Class weights as a torch float tensor
    """

    label_values = train_set[label].values

    # np.unique already returns the classes in sorted order
    class_weights = compute_class_weight(
        class_weight="balanced",
        classes=np.unique(label_values),
        y=label_values,
    )

    # Using the GPU only when one is actually present
    device = "cuda" if torch.cuda.is_available() else "cpu"

    return torch.tensor(class_weights, dtype=torch.float).to(device)

Define a function to tokenize the data based on the BERT tokenizer

In [None]:
def tokenize_data(train_set, val_set, test_set, label, model):
    """
    Tokenize the processed titles of each split and attach the labels.

    Every title is truncated/padded to 128 tokens and the result of each
    split is wrapped in a ``Dataset`` that also holds a 'labels' column.

    :param train_set: Training set
    :param val_set: Validation set
    :param test_set: Testing set
    :param label: Label to use
    :param model: Name or path of the pretrained model whose tokenizer is used
    :return: Tokenized training, validation and testing datasets
    """

    # One tokenizer shared by all three splits
    tokenizer = AutoTokenizer.from_pretrained(model)

    def _build_dataset(split):
        # Tokenizing the titles to fixed-length PyTorch tensors
        encodings = tokenizer(
            split["title_processed"].tolist(),
            truncation=True,
            padding="max_length",
            max_length=128,
            return_tensors="pt",
        )

        # Attaching the labels as a PyTorch tensor
        encodings["labels"] = torch.tensor(split[label].tolist())

        return Dataset.from_dict(encodings)

    return _build_dataset(train_set), _build_dataset(val_set), _build_dataset(test_set)

Define a function that computes the evaluation metrics (accuracy and macro precision, recall and f1) used at each epoch to assess the classifier's performance on the validation dataset.

In [None]:
def compute_metrics(eval_pred):
    """
    Compute the evaluation metrics for one evaluation pass.

    The predicted class of each sample is the index of its largest logit.
    Precision, recall and f1 are macro-averaged, with undefined values
    counted as 0.

    :param eval_pred: Pair of (logits, labels)
    :return: Dictionary with 'accuracy', 'precision', 'recall' and 'f1'
    """

    logits, labels = eval_pred

    # Predicted class = position of the highest logit in each row
    predictions = torch.tensor(logits).argmax(dim=1).numpy()

    # Macro-averaged scores come back as (precision, recall, f1, support)
    macro_scores = precision_recall_fscore_support(
        labels, predictions, average="macro", zero_division=0
    )

    return {
        "accuracy": accuracy_score(labels, predictions),
        "precision": macro_scores[0],
        "recall": macro_scores[1],
        "f1": macro_scores[2],
    }

Define a class that overrides the way the classifier calculates the loss. Here it takes into account the weights of the classes for the cross entropy loss function.

In [None]:
class WeightedTrainer(Trainer):
    """
    Trainer whose loss is a class-weighted cross entropy.

    The weights are read from the module-level ``class_weights`` tensor,
    which must be assigned before training starts.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """
        Compute the cross entropy loss weighted by the class weights, to
        handle the imbalanced classes.

        :param model: The model
        :param inputs: The inputs; the 'labels' entry is removed from them
        :param return_outputs: Whether to return the model outputs as well
        :param kwargs: Extra keyword arguments passed by the Trainer (ignored)
        :return: The loss, or a (loss, outputs) tuple if return_outputs is set
        """

        # Extracting labels so that the model does not compute its own loss
        labels = inputs.pop("labels")

        # Passing inputs and getting results
        outputs = model(**inputs)
        logits = outputs.logits

        # The weight tensor must live on the same device as the logits;
        # aligning it here avoids a device mismatch error when the global
        # tensor was created on another device than the one the model runs on
        loss_fct = torch.nn.CrossEntropyLoss(
            weight=class_weights.to(logits.device)
        )

        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss

Define a function to train and evaluate the model

In [None]:
def train_model(train_dataset, val_dataset, model_name, number_classes):
    """
    Fine-tune a pretrained sequence classification model.

    The model is evaluated and checkpointed after every epoch; the checkpoint
    with the highest validation f1 is loaded at the end, and training stops
    early after 3 evaluations without improvement.

    :param train_dataset: Training dataset
    :param val_dataset: Validation dataset
    :param model_name: Model name
    :param number_classes: Number of classes
    :return: The trainer holding the trained model
    """

    # Setting up the model to use
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=number_classes
    )

    # Moving the model to the GPU only when one is present, so that the
    # function does not raise on CPU-only machines
    if torch.cuda.is_available():
        model.to("cuda")

    # Setting up the training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        learning_rate=5e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        weight_decay=0.01,
        lr_scheduler_type="linear",
        warmup_ratio=0.1,
        num_train_epochs=10,
        report_to="none",
        save_total_limit=1,
    )

    # Setting up the trainer of the model
    trainer = WeightedTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )

    # Training model
    trainer.train()

    return trainer

Define a function to test the trained model on unseen data

In [None]:
def test_model(model, test_dataset):
    """
    Run a trained model on the testing dataset.

    :param model: Object exposing ``predict`` (the trainer returned by train_model)
    :param test_dataset: Testing dataset
    :return: Predicted labels and actual labels
    """

    # Running the model over the whole testing dataset
    output = model.predict(test_dataset)

    # Predicted class = position of the highest logit in each row
    labels_pred = torch.tensor(output.predictions).argmax(dim=1).numpy()

    return labels_pred, output.label_ids

Below, all the steps for training the BERT model for hazard categories are presented

- Separation of the initial dataset
- Calculation of the weights of the classes
- Data tokenization
- Training the model

In [None]:
# Holding out 50% of the data, shared equally between validation and testing
train_set, val_set, test_set = split_dataset(
    dataset=hazard_category_dataset,
    label="hazard_category_encoded",
    percentage=0.5,
)

# Class weights of the training part (read by WeightedTrainer as a global)
class_weights = compute_weight(train_set=train_set, label="hazard_category_encoded")

# Tokenizing the three splits with the RoBERTa tokenizer
train_dataset, val_dataset, test_dataset = tokenize_data(
    train_set=train_set,
    val_set=val_set,
    test_set=test_set,
    label="hazard_category_encoded",
    model="roberta-base",
)

# Fine-tuning RoBERTa on the 10 hazard categories
haz_cat_bert_model = train_model(
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    model_name="roberta-base",
    number_classes=10,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,1.548941,0.684252,0.463759,0.383295,0.360137
2,No log,1.404142,0.688189,0.575158,0.403286,0.38588
3,No log,1.305243,0.782677,0.514804,0.443466,0.468716
4,1.572100,1.29805,0.795276,0.608624,0.492229,0.50962
5,1.572100,1.226816,0.788976,0.560773,0.534506,0.533887
6,1.572100,1.388559,0.787402,0.566212,0.538497,0.539809
7,0.711400,1.639688,0.80315,0.61055,0.528299,0.556862
8,0.711400,1.667347,0.800787,0.599756,0.537876,0.558237
9,0.711400,1.804566,0.801575,0.561767,0.534444,0.544423
10,0.288000,1.861332,0.803937,0.576283,0.535748,0.552179


Now, we test the trained model on the unseen data obtained from the initial split

In [None]:
# Predicting the held-out testing split with the fine-tuned model
labels_pred, labels_actual = test_model(
    model=haz_cat_bert_model, test_dataset=test_dataset
)

# Reporting the per-class and averaged scores
print("Evaluating on Test Dataset")
print(
    classification_report(y_true=labels_actual, y_pred=labels_pred, zero_division=0)
)

Evaluating on Test Dataset
              precision    recall  f1-score   support

           0       0.86      0.88      0.87       464
           1       0.89      0.89      0.89       435
           2       0.75      0.68      0.72        72
           3       0.67      0.67      0.67         6
           4       0.74      0.77      0.75       141
           5       0.65      0.63      0.64        93
           6       0.00      0.00      0.00         1
           7       0.40      0.31      0.35        13
           8       0.62      0.55      0.58        33
           9       0.53      0.69      0.60        13

    accuracy                           0.82      1271
   macro avg       0.61      0.61      0.61      1271
weighted avg       0.82      0.82      0.82      1271



**XGBoost**

Below is the main function for training and implementing the XGBoost classifier. It performs nested cross validation for model and hyperparameter selection.

In [None]:
def train_evaluate_model_xgb(dataset, label, outer_cv, inner_cv, pipeline, param_grid):
    """
    Run nested cross validation for an XGBoost text classification pipeline.

    For each outer fold, 'balanced' class weights are computed on the
    training part and passed to the 'xgb' pipeline step as per-sample
    weights. A grid search (scored by macro f1 on the inner folds) is fitted,
    the refitted best estimator predicts the held-out part, and a
    classification report for that fold is printed.

    :param dataset: Dataset to use; must contain 'title_processed' and the label column
    :param label: Name of the label column
    :param outer_cv: Outer cross validation splitter (train/test splits)
    :param inner_cv: Inner cross validation splitter (hyperparameter tuning)
    :param pipeline: Pipeline to tune; its classifier step must be named 'xgb'
    :param param_grid: Hyperparameter grid to search
    :return: Per-fold macro f1 scores and per-fold best hyperparameters
    """

    titles = dataset["title_processed"]
    targets = dataset[label]

    # One entry per outer fold
    f1_scores, hyperparameters = [], []

    for train_idx, test_idx in outer_cv.split(titles, targets):

        # Slicing out the training and held-out parts of this fold
        title_train, label_train = titles.iloc[train_idx], targets.iloc[train_idx]
        title_test, label_test = titles.iloc[test_idx], targets.iloc[test_idx]

        # Balanced weight of every class present in the training part
        # (np.unique returns the classes already sorted)
        fold_classes = np.unique(label_train)
        fold_weights = compute_class_weight(
            class_weight="balanced", classes=fold_classes, y=label_train.values
        )
        weight_of_class = dict(zip(fold_classes, fold_weights))

        # Expanding the class weights to one weight per training sample
        sample_weights = [weight_of_class[value] for value in label_train.values]

        # Tuning the hyperparameters on the training part only
        grid_search = GridSearchCV(
            estimator=pipeline,
            param_grid=param_grid,
            scoring="f1_macro",
            cv=inner_cv,
            n_jobs=-1,
            verbose=0,
        )
        grid_search.fit(title_train, label_train, xgb__sample_weight=sample_weights)

        # Scoring the best configuration on the held-out part
        label_pred = grid_search.best_estimator_.predict(title_test)

        f1_scores.append(f1_score(label_test, label_pred, average="macro"))
        hyperparameters.append(grid_search.best_params_)

        print(classification_report(label_test, label_pred, zero_division=0))

    return f1_scores, hyperparameters

Below, all the steps for training and evaluating the XGBoost model for hazard categories are presented

- Setting the desired pipeline
- Setting the hyperparameter grid for tuning
- Setting the outer and inner cross validation
- Training and evaluating the model

In [None]:
# TF-IDF features feeding an XGBoost classifier over the 10 hazard categories
pipeline = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        (
            "xgb",
            XGBClassifier(random_state=42, objective="multi:softmax", num_class=10),
        ),
    ]
)

# Candidate hyperparameters explored by the inner grid search
param_grid = {
    "tfidf__max_df": [0.5],
    "tfidf__ngram_range": [(1, 2)],
    "xgb__max_depth": [5],
    "xgb__learning_rate": [0.1, 0.3],
    "xgb__n_estimators": [100, 300],
    "xgb__gamma": [0],
    "xgb__min_child_weight": [1],
}

# Outer 3-fold split: estimates performance on different train-test splits
outer_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Inner 2-fold split: used only for hyperparameter tuning
inner_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

# Running the nested cross validation for the hazard categories
haz_cat_macro_f1_scores_2, haz_cat_hyperparameters_2 = train_evaluate_model_xgb(
    dataset=hazard_category_dataset,
    label="hazard_category_encoded",
    outer_cv=outer_cv,
    inner_cv=inner_cv,
    pipeline=pipeline,
    param_grid=param_grid,
)

              precision    recall  f1-score   support

           0       0.89      0.80      0.84       618
           1       0.84      0.86      0.85       581
           2       0.39      0.55      0.45        95
           3       0.50      0.25      0.33         8
           4       0.67      0.71      0.69       187
           5       0.53      0.56      0.54       124
           6       1.00      1.00      1.00         1
           7       0.31      0.50      0.38        18
           8       0.48      0.32      0.38        44
           9       0.47      0.39      0.42        18

    accuracy                           0.76      1694
   macro avg       0.61      0.59      0.59      1694
weighted avg       0.77      0.76      0.76      1694

              precision    recall  f1-score   support

           0       0.91      0.80      0.85       618
           1       0.85      0.88      0.87       580
           2       0.57      0.67      0.61        96
           3       0.67 

The mean and standard deviation of the macro f1 in all cross-validation and hyperparameter tuning procedures are presented below

In [None]:
# Summarising the macro f1 over the outer folds
print("Cross-Validation Macro f1:")
print("Mean:", np.asarray(haz_cat_macro_f1_scores_2).mean())
print("Standard Deviation:", np.asarray(haz_cat_macro_f1_scores_2).std())

# Reporting the hyperparameters of the best-scoring outer fold
print("\nThe hyperparameters which gives the best results are:")
print(haz_cat_hyperparameters_2[np.asarray(haz_cat_macro_f1_scores_2).argmax()])

Cross-Validation Macro f1:
Mean: 0.5624407645590868
Standard Deviation: 0.02252841624651751

The hyperparameters which gives the best results are:
{'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2), 'xgb__gamma': 0, 'xgb__learning_rate': 0.3, 'xgb__max_depth': 5, 'xgb__min_child_weight': 1, 'xgb__n_estimators': 100}


#### **Classification Task - Product Categories**

Keep only the columns 'title_processed' and 'product_category_encoded' from the initial dataset

In [None]:
# Keeping only the model input (processed title) and the product category target
product_category_dataset = food_dataset.loc[
    :, ["title_processed", "product_category_encoded"]
]

# Displaying two random rows as a sanity check
product_category_dataset.sample(2)

Unnamed: 0,title_processed,product_category_encoded
2922,azka impex recall pran muri moa milk correctly...,1
2354,chun yuen trading co. issues allergy alert und...,2


**Logistic Regression**

Below, all the steps for training and evaluating the Logistic Regression model for product categories are presented

- Setting the desired pipeline
- Setting the hyperparameter grid for tuning
- Setting the outer and inner cross validation
- Training and evaluating the model

<br>

We use the previous predefined method to train and evaluate the classifier

In [None]:
# TF-IDF features feeding a logistic regression classifier
pipeline = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("logreg", LogisticRegression()),
    ]
)

# Candidate hyperparameters explored by the inner grid search
param_grid = {
    "tfidf__max_df": [0.5, 0.7],
    "tfidf__ngram_range": [(1, 1), (1, 2)],
    "logreg__penalty": ["l1", "l2"],
    "logreg__solver": ["liblinear"],
    "logreg__max_iter": [2000, 3000],
    "logreg__C": [10, 20],
    "logreg__class_weight": ["balanced"],
}

# Outer 3-fold split: estimates performance on different train-test splits
outer_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Inner 2-fold split: used only for hyperparameter tuning
inner_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

# Running the nested cross validation for the product categories
prod_cat_macro_f1_scores, prod_cat_hyperparameters = train_evaluate_model_lr(
    dataset=product_category_dataset,
    label="product_category_encoded",
    outer_cv=outer_cv,
    inner_cv=inner_cv,
    pipeline=pipeline,
    param_grid=param_grid,
)

  _data = np.array(data, dtype=dtype, copy=copy,


              precision    recall  f1-score   support

           0       0.75      0.60      0.67        20
           1       0.74      0.69      0.71       223
           2       0.70      0.73      0.71        70
           3       0.54      0.53      0.53        57
           4       0.74      0.73      0.74        44
           5       0.50      0.67      0.57         6
           6       0.00      0.00      0.00         2
           7       0.00      0.00      0.00         2
           8       1.00      0.50      0.67         2
           9       0.69      0.79      0.74       178
          10       0.52      0.57      0.55        42
          11       0.50      0.33      0.40         3
          12       0.91      0.80      0.85        74
          13       0.81      0.86      0.84       478
          14       0.86      0.71      0.78        45
          15       0.69      0.72      0.70        88
          16       0.62      0.44      0.52        18
          17       0.50    



              precision    recall  f1-score   support

           0       0.67      0.42      0.52        19
           1       0.70      0.72      0.71       224
           2       0.72      0.73      0.72        70
           3       0.62      0.49      0.55        57
           4       0.65      0.59      0.62        44
           5       0.62      0.83      0.71         6
           6       0.00      0.00      0.00         2
           7       1.00      0.33      0.50         3
           8       0.00      0.00      0.00         2
           9       0.68      0.78      0.73       179
          10       0.73      0.46      0.57        41
          11       1.00      1.00      1.00         2
          12       0.89      0.80      0.84        74
          13       0.82      0.88      0.85       478
          14       0.80      0.71      0.75        45
          15       0.70      0.78      0.74        87
          16       0.60      0.33      0.43        18
          17       0.38    



The mean and standard deviation of the macro f1 in all cross-validation and hyperparameter tuning procedures are presented below

In [None]:
# Summarising the macro f1 over the outer folds
print("Cross-Validation Macro f1:")
print("Mean:", np.asarray(prod_cat_macro_f1_scores).mean())
print("Standard Deviation:", np.asarray(prod_cat_macro_f1_scores).std())

# Reporting the hyperparameters of the best-scoring outer fold
print("\nThe hyperparameters which give the best results are:")
print(prod_cat_hyperparameters[np.asarray(prod_cat_macro_f1_scores).argmax()])

Cross-Validation Macro f1:
Mean: 0.5675576066700403
Standard Deviation: 0.0030204728795336956

The hyperparameters which give the best results are:
{'logreg__C': 20, 'logreg__class_weight': 'balanced', 'logreg__max_iter': 2000, 'logreg__penalty': 'l1', 'logreg__solver': 'liblinear', 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2)}


**BERT - RoBERTa**

Below, all the steps for training the BERT model for the product categories are presented

- Separation of the initial dataset
- Calculation of the weights of the classes
- Data tokenization
- Training the model

<br>

We use all the previous predefined methods to train and evaluate the classifier

In [None]:
# Splitting the product-category data into train / validation / test sets
# (0.4 is handed straight to split_dataset; presumably the held-out
# fraction -- confirm against its definition)
train_set, val_set, test_set = split_dataset(
    product_category_dataset, "product_category_encoded", 0.4
)

# Calculating class weights from the training split only
# NOTE(review): class_weights is not passed to any call in this cell;
# presumably train_model reads it as a notebook-level variable -- confirm
class_weights = compute_weight(train_set, "product_category_encoded")

# Tokenizing the three splits with the "roberta-base" checkpoint name
train_dataset, val_dataset, test_dataset = tokenize_data(
    train_set, val_set, test_set, "product_category_encoded", "roberta-base"
)

# Training (the last argument is presumably the number of classes -- confirm)
prod_cat_bert_model = train_model(train_dataset, val_dataset, "roberta-base", 22)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,2.070974,0.503937,0.333474,0.316481,0.27998
2,No log,1.585214,0.683071,0.470472,0.514848,0.469845
3,2.032000,1.54598,0.66437,0.496808,0.504697,0.487349
4,2.032000,1.488669,0.693898,0.520218,0.554816,0.520885
5,2.032000,1.486558,0.720472,0.596109,0.590636,0.584886
6,0.874200,1.498174,0.731299,0.604035,0.601078,0.585298
7,0.874200,1.537236,0.741142,0.637903,0.630401,0.619863
8,0.341800,1.564776,0.74311,0.650129,0.632806,0.627971
9,0.341800,1.644411,0.752953,0.660355,0.657995,0.642409
10,0.341800,1.67432,0.759843,0.66738,0.657502,0.647065


Now, we test the trained model on the unseen data obtained from the initial split

In [None]:
# Running the trained product-category model over the test split
labels_pred, labels_actual = test_model(prod_cat_bert_model, test_dataset)

# Building the per-class report first and printing it under its header;
# zero_division=0 scores undefined metrics as 0
print("Evaluating on Test Dataset")
prod_cat_test_report = classification_report(
    labels_actual, labels_pred, zero_division=0
)
print(prod_cat_test_report)

Evaluating on Test Dataset
              precision    recall  f1-score   support

           0       0.89      0.67      0.76        12
           1       0.74      0.66      0.70       134
           2       0.74      0.60      0.66        42
           3       0.54      0.56      0.55        34
           4       0.70      0.88      0.78        26
           5       1.00      0.75      0.86         4
           6       0.33      1.00      0.50         1
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         1
           9       0.81      0.76      0.78       107
          10       0.74      0.68      0.71        25
          11       0.00      0.00      0.00         1
          12       0.91      0.93      0.92        45
          13       0.88      0.86      0.87       287
          14       0.78      0.78      0.78        27
          15       0.66      0.75      0.70        53
          16       0.50      0.55      0.52        11


**XGBoost**

Below, all the steps for training and evaluating the XGBoost model for product categories are presented

- Setting the desired pipeline
- Setting the hyperparameter grid for tuning
- Setting the outer and inner cross validation
- Training and evaluating the model

<br>

We use the previous predefined method to train and evaluate the classifier

In [None]:
# Two-step pipeline: TF-IDF features feeding an XGBoost classifier
tfidf_step = ("tfidf", TfidfVectorizer())
xgb_step = (
    "xgb",
    XGBClassifier(random_state=42, objective="multi:softmax", num_class=22),
)
pipeline = Pipeline([tfidf_step, xgb_step])

# Candidate values for tuning, keyed as "<step name>__<parameter>"
param_grid = {
    "tfidf__max_df": [0.5],
    "tfidf__ngram_range": [(1, 2)],
    "xgb__max_depth": [5],
    "xgb__learning_rate": [0.1, 0.3],
    "xgb__n_estimators": [100, 300],
    "xgb__gamma": [0],
    "xgb__min_child_weight": [1],
}

# Seeded, shuffled stratified splitters: 3 folds for the outer loop
# (different train-test splits) and 2 folds for the inner loop
# (hyperparameter tuning)
outer_cv, inner_cv = (
    StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    for n_folds in (3, 2)
)

# Training and evaluating the xgboost classifier, then unpacking the
# returned scores and hyperparameters
prod_cat_xgb_results = train_evaluate_model_xgb(
    product_category_dataset,
    "product_category_encoded",
    outer_cv,
    inner_cv,
    pipeline,
    param_grid,
)
prod_cat_macro_f1_scores_2, prod_cat_hyperparameters_2 = prod_cat_xgb_results

              precision    recall  f1-score   support

           0       0.33      0.45      0.38        20
           1       0.54      0.53      0.54       223
           2       0.51      0.70      0.59        70
           3       0.47      0.32      0.38        57
           4       0.69      0.55      0.61        44
           5       0.42      0.83      0.56         6
           6       0.00      0.00      0.00         2
           7       0.00      0.00      0.00         2
           8       0.50      0.50      0.50         2
           9       0.60      0.63      0.62       178
          10       0.35      0.50      0.41        42
          11       0.38      1.00      0.55         3
          12       0.64      0.74      0.69        74
          13       0.83      0.70      0.76       478
          14       0.78      0.64      0.71        45
          15       0.44      0.70      0.54        88
          16       0.50      0.44      0.47        18
          17       0.50    

The mean and standard deviation of the macro f1 in all cross-validation and hyperparameter tuning procedures are presented below

In [None]:
# Summarising the macro f1 scores collected for the product-category
# XGBoost run: average value and spread
prod_cat_f1_mean_2 = np.mean(prod_cat_macro_f1_scores_2)
prod_cat_f1_std_2 = np.std(prod_cat_macro_f1_scores_2)
print("Cross-Validation Macro f1:")
print("Mean:", prod_cat_f1_mean_2)
print("Standard Deviation:", prod_cat_f1_std_2)

# Reporting the hyperparameters stored at the position of the highest score
prod_cat_best_idx_2 = np.argmax(prod_cat_macro_f1_scores_2)
print("\nThe hyperparameters which gives the best results are:")
print(prod_cat_hyperparameters_2[prod_cat_best_idx_2])

Cross-Validation Macro f1:
Mean: 0.46560035892954316
Standard Deviation: 0.012399625741603748

The hyperparameters which gives the best results are:
{'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2), 'xgb__gamma': 0, 'xgb__learning_rate': 0.1, 'xgb__max_depth': 5, 'xgb__min_child_weight': 1, 'xgb__n_estimators': 300}


#### **Classification Task - Hazard Vectors**

Keep only the columns 'title_processed' and 'hazard_encoded' from the initial dataset

In [None]:
# Keeping only the processed title and the encoded hazard label
hazard_columns = ["title_processed", "hazard_encoded"]
hazard_dataset = food_dataset[hazard_columns]

# Displaying two random rows as a quick sanity check
hazard_dataset.sample(2)

Unnamed: 0,title_processed,hazard_encoded
2922,azka impex recall pran muri moa milk correctly...,59
2354,chun yuen trading co. issues allergy alert und...,59


**Logistic Regression**

Below, all the steps for training and evaluating the Logistic Regression model for hazard vectors are presented

- Setting the desired pipeline
- Setting the hyperparameter grid for tuning
- Setting the outer and inner cross validation
- Training and evaluating the model

<br>

We use the previous predefined method to train and evaluate the classifier

In [None]:
# Two-step pipeline: TF-IDF features feeding a logistic regression classifier
tfidf_step = ("tfidf", TfidfVectorizer())
logreg_step = ("logreg", LogisticRegression())
pipeline = Pipeline([tfidf_step, logreg_step])

# Candidate values for tuning, keyed as "<step name>__<parameter>"
param_grid = {
    "tfidf__max_df": [0.5],
    "tfidf__ngram_range": [(1, 2)],
    "logreg__penalty": ["l1", "l2"],
    "logreg__solver": ["liblinear"],
    "logreg__max_iter": [2000, 3000],
    "logreg__C": [10, 20],
    "logreg__class_weight": ["balanced"],
}

# Seeded, shuffled stratified splitters: 3 folds for the outer loop
# (different train-test splits) and 2 folds for the inner loop
# (hyperparameter tuning)
outer_cv, inner_cv = (
    StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    for n_folds in (3, 2)
)

# Training and evaluating the logistic regression classifier, then
# unpacking the returned scores and hyperparameters
haz_lr_results = train_evaluate_model_lr(
    hazard_dataset,
    "hazard_encoded",
    outer_cv,
    inner_cv,
    pipeline,
    param_grid,
)
haz_macro_f1_scores, haz_hyperparameters = haz_lr_results

              precision    recall  f1-score   support

           0       0.67      0.67      0.67         3
           1       0.00      0.00      0.00         2
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         4
           5       0.54      0.64      0.58        22
           6       1.00      1.00      1.00         1
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         3
           9       0.40      0.40      0.40         5
          10       0.50      0.50      0.50         2
          11       1.00      0.50      0.67         4
          12       0.00      0.00      0.00         1
          13       0.83      0.71      0.77         7
          14       0.44      0.44      0.44         9
          15       0.00      0.00      0.00         2
          16       0.00      0.00      0.00         2
          17       0.51    

  _data = np.array(data, dtype=dtype, copy=copy,


              precision    recall  f1-score   support

           0       0.57      1.00      0.73         4
           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         4
           5       0.54      0.64      0.58        22
           6       0.00      0.00      0.00         1
           7       1.00      1.00      1.00         1
           8       0.00      0.00      0.00         2
           9       0.40      0.40      0.40         5
          10       1.00      1.00      1.00         2
          11       1.00      0.25      0.40         4
          12       0.00      0.00      0.00         1
          13       0.38      0.71      0.50         7
          14       0.40      0.40      0.40        10
          15       0.75      1.00      0.86         3
          16       0.00      0.00      0.00         2
          17       0.41    



The mean and standard deviation of the macro f1 in all cross-validation and hyperparameter tuning procedures are presented below

In [None]:
# Summarising the macro f1 scores collected for the hazard logistic
# regression: average value and spread
haz_f1_mean = np.mean(haz_macro_f1_scores)
haz_f1_std = np.std(haz_macro_f1_scores)
print("Cross-Validation Macro f1:")
print("Mean:", haz_f1_mean)
print("Standard Deviation:", haz_f1_std)

# Reporting the hyperparameters stored at the position of the highest score
haz_best_idx = np.argmax(haz_macro_f1_scores)
print("\nThe hyperparameters which give the best results are:")
print(haz_hyperparameters[haz_best_idx])

Cross-Validation Macro f1:
Mean: 0.3849847631507545
Standard Deviation: 0.016502427918966087

The hyperparameters which give the best results are:
{'logreg__C': 20, 'logreg__class_weight': 'balanced', 'logreg__max_iter': 3000, 'logreg__penalty': 'l1', 'logreg__solver': 'liblinear', 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2)}


**BERT - RoBERTa**

Below, all the steps for training the BERT model for the hazard vectors are presented

- Separation of the initial dataset
- Calculation of the weights of the classes
- Data tokenization
- Training the model

<br>

We use all the previous predefined methods to train and evaluate the classifier

In [None]:
# Splitting the hazard data into train / validation / test sets
# (0.6 is handed straight to split_dataset; presumably the held-out
# fraction -- confirm against its definition)
train_set, val_set, test_set = split_dataset(hazard_dataset, "hazard_encoded", 0.6)

# Calculating class weights from the training split only
# NOTE(review): class_weights is not passed to any call in this cell;
# presumably train_model reads it as a notebook-level variable -- confirm
class_weights = compute_weight(train_set, "hazard_encoded")

# Tokenizing the three splits with the "roberta-base" checkpoint name
train_dataset, val_dataset, test_dataset = tokenize_data(
    train_set, val_set, test_set, "hazard_encoded", "roberta-base"
)

# Training (the last argument is presumably the number of classes -- confirm)
haz_bert_model = train_model(train_dataset, val_dataset, "roberta-base", 128)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,4.83462,0.04459,0.000543,0.007498,0.000825
2,No log,4.824948,0.040656,0.000318,0.007812,0.00061
3,No log,4.818899,0.116066,0.000907,0.007812,0.001625
4,4.855400,4.820378,0.017049,0.000133,0.007812,0.000262
5,4.855400,4.821419,0.122623,0.000958,0.007812,0.001707
6,4.855400,4.818181,0.122623,0.000958,0.007812,0.001707
7,4.855400,4.81917,0.122623,0.000958,0.007812,0.001707
8,4.839400,4.821813,0.122623,0.000958,0.007812,0.001707


Now, we test the trained model on the unseen data obtained from the initial split

In [None]:
# Running the trained hazard model over the test split
labels_pred, labels_actual = test_model(haz_bert_model, test_dataset)

# Building the per-class report first and printing it under its header;
# zero_division=0 scores undefined metrics as 0
print("Evaluating on Test Dataset")
haz_test_report = classification_report(
    labels_actual, labels_pred, zero_division=0
)
print(haz_test_report)

Evaluating on Test Dataset
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         4
           5       0.00      0.00      0.00        20
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         2
           9       0.00      0.00      0.00         4
          10       0.00      0.00      0.00         2
          11       0.00      0.00      0.00         4
          12       0.00      0.00      0.00         1
          13       0.00      0.00      0.00         6
          14       0.00      0.00      0.00         9
          15       0.00      0.00      0.00         2
          16       0.00      0.00      0.00         2


**XGBoost**

Below, all the steps for training and evaluating the XGBoost model for hazard vectors are presented

- Setting the desired pipeline
- Setting the hyperparameter grid for tuning
- Setting the outer and inner cross validation
- Training and evaluating the model

<br>

We use the previous predefined method to train and evaluate the classifier

In [None]:
# Two-step pipeline: TF-IDF features feeding an XGBoost classifier
tfidf_step = ("tfidf", TfidfVectorizer())
xgb_step = (
    "xgb",
    XGBClassifier(random_state=42, objective="multi:softmax", num_class=128),
)
pipeline = Pipeline([tfidf_step, xgb_step])

# Candidate values for tuning, keyed as "<step name>__<parameter>"
param_grid = {
    "tfidf__max_df": [0.5],
    "tfidf__ngram_range": [(1, 2)],
    "xgb__max_depth": [5],
    "xgb__learning_rate": [0.1, 0.3],
    "xgb__n_estimators": [100, 300],
    "xgb__gamma": [0],
    "xgb__min_child_weight": [1, 2],
}

# Seeded, shuffled stratified splitters: 3 folds for the outer loop
# (different train-test splits) and 2 folds for the inner loop
# (hyperparameter tuning)
outer_cv, inner_cv = (
    StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    for n_folds in (3, 2)
)

# Training and evaluating the xgboost classifier, then unpacking the
# returned scores and hyperparameters
haz_xgb_results = train_evaluate_model_xgb(
    hazard_dataset,
    "hazard_encoded",
    outer_cv,
    inner_cv,
    pipeline,
    param_grid,
)
haz_macro_f1_scores_2, haz_hyperparameters_2 = haz_xgb_results

              precision    recall  f1-score   support

           0       0.40      0.67      0.50         3
           1       0.00      0.00      0.00         2
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         4
           5       0.41      0.64      0.50        22
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         3
           9       0.10      0.20      0.13         5
          10       0.50      0.50      0.50         2
          11       1.00      0.25      0.40         4
          12       0.00      0.00      0.00         1
          13       0.67      0.57      0.62         7
          14       0.23      0.56      0.32         9
          15       1.00      0.50      0.67         2
          16       0.00      0.00      0.00         2
          17       0.53    

The mean and standard deviation of the macro f1 in all cross-validation and hyperparameter tuning procedures are presented below

In [None]:
# Summarising the macro f1 scores collected for the hazard XGBoost run:
# average value and spread
haz_f1_mean_2 = np.mean(haz_macro_f1_scores_2)
haz_f1_std_2 = np.std(haz_macro_f1_scores_2)
print("Cross-Validation Macro f1:")
print("Mean:", haz_f1_mean_2)
print("Standard Deviation:", haz_f1_std_2)

# Reporting the hyperparameters stored at the position of the highest score
haz_best_idx_2 = np.argmax(haz_macro_f1_scores_2)
print("\nThe hyperparameters which gives the best results are:")
print(haz_hyperparameters_2[haz_best_idx_2])

Cross-Validation Macro f1:
Mean: 0.3040555585067703
Standard Deviation: 0.0045353924479881015

The hyperparameters which gives the best results are:
{'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2), 'xgb__gamma': 0, 'xgb__learning_rate': 0.1, 'xgb__max_depth': 5, 'xgb__min_child_weight': 1, 'xgb__n_estimators': 100}


#### **Classification Task - Product Vectors**

Keep only the columns 'title_processed' and 'product_encoded' from the initial dataset

In [6]:
# Keeping only the processed title and the encoded product label
product_columns = ["title_processed", "product_encoded"]
product_dataset = food_dataset[product_columns]

# Displaying two random rows as a quick sanity check
product_dataset.sample(2)

Unnamed: 0,title_processed,product_encoded
1492,pinnacle foods inc. recall chili products misb...,204
1151,texas firm recall chicken beef products misbra...,61


**Logistic Regression**

Below, all the steps for training and evaluating the Logistic Regression model for product vectors are presented

- Setting the desired pipeline
- Setting the hyperparameter grid for tuning
- Setting the outer and inner cross validation
- Training and evaluating the model

<br>

We use the previous predefined method to train and evaluate the classifier

In [None]:
# Two-step pipeline: TF-IDF features feeding a logistic regression classifier
tfidf_step = ("tfidf", TfidfVectorizer())
logreg_step = ("logreg", LogisticRegression())
pipeline = Pipeline([tfidf_step, logreg_step])

# Candidate values for tuning, keyed as "<step name>__<parameter>"
param_grid = {
    "tfidf__max_df": [0.5],
    "tfidf__ngram_range": [(1, 2)],
    "logreg__penalty": ["l1"],
    "logreg__solver": ["liblinear"],
    "logreg__max_iter": [2000],
    "logreg__C": [10, 20],
    "logreg__class_weight": ["balanced"],
}

# Seeded, shuffled stratified splitters: 3 folds for the outer loop
# (different train-test splits) and 2 folds for the inner loop
# (hyperparameter tuning)
outer_cv, inner_cv = (
    StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    for n_folds in (3, 2)
)

# Training and evaluating the logistic regression classifier, then
# unpacking the returned scores and hyperparameters
prod_lr_results = train_evaluate_model_lr(
    product_dataset,
    "product_encoded",
    outer_cv,
    inner_cv,
    pipeline,
    param_grid,
)
prod_macro_f1_scores, prod_hyperparameters = prod_lr_results



              precision    recall  f1-score   support

           0       0.50      1.00      0.67         4
           2       0.53      0.75      0.62        12
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         4
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         3
           7       0.00      0.00      0.00         1
           8       0.40      1.00      0.57         2
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         2
          12       1.00      1.00      1.00         4
          13       0.67      1.00      0.80         2
          14       0.50      1.00      0.67         1
          15       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         1
          17       1.00      1.00      1.00         1
          18       1.00      1.00      1.00         1
          19       1.00    



              precision    recall  f1-score   support

           0       0.75      1.00      0.86         3
           2       0.38      0.55      0.44        11
           4       0.00      0.00      0.00         4
           5       1.00      1.00      1.00         1
           6       0.00      0.00      0.00         3
           8       1.00      0.50      0.67         2
           9       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         1
          12       1.00      0.67      0.80         3
          13       1.00      1.00      1.00         2
          14       0.00      0.00      0.00         1
          17       1.00      1.00      1.00         1
          19       1.00      1.00      1.00         3
          20       0.30      0.75      0.43         4
          22       0.00      0.00      0.00         1
          24       0.00      0.00      0.00         1
          26       0.00      0.00      0.00         1
          28       0.67    



              precision    recall  f1-score   support

           0       0.80      1.00      0.89         4
           1       0.00      0.00      0.00         1
           2       0.38      0.45      0.42        11
           4       0.00      0.00      0.00         3
           5       1.00      1.00      1.00         1
           6       0.00      0.00      0.00         3
           8       1.00      0.50      0.67         2
          11       0.00      0.00      0.00         1
          12       1.00      1.00      1.00         3
          13       0.67      1.00      0.80         2
          14       0.00      0.00      0.00         1
          17       1.00      1.00      1.00         1
          18       1.00      1.00      1.00         1
          19       0.67      1.00      0.80         2
          20       0.17      0.25      0.20         4
          22       0.00      0.00      0.00         1
          23       0.00      0.00      0.00         1
          24       0.00    



The mean and standard deviation of the macro f1 in all cross-validation and hyperparameter tuning procedures are presented below

In [None]:
# Summarising the macro f1 scores collected for the product logistic
# regression: average value and spread
prod_f1_mean = np.mean(prod_macro_f1_scores)
prod_f1_std = np.std(prod_macro_f1_scores)
print("Cross-Validation Macro f1:")
print("Mean:", prod_f1_mean)
print("Standard Deviation:", prod_f1_std)

# Reporting the hyperparameters stored at the position of the highest score
prod_best_idx = np.argmax(prod_macro_f1_scores)
print("\nThe hyperparameters which give the best results are:")
print(prod_hyperparameters[prod_best_idx])

Cross-Validation Macro f1:
Mean: 0.2375661472317091
Standard Deviation: 0.008035176656578437

The hyperparameters which give the best results are:
{'logreg__C': 20, 'logreg__class_weight': 'balanced', 'logreg__max_iter': 2000, 'logreg__penalty': 'l1', 'logreg__solver': 'liblinear', 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2)}


**BERT - RoBERTa**

After some testing I noticed that, because there are too many labels with only one or two samples, the BERT model cannot be trained: it is not possible to split the original dataset into training, validation and testing datasets. I decided to remove all the labels with so few samples, as it is very difficult for a classifier to learn such labels from so little information

In [None]:
# Counting how many samples each product label has
label_counts = product_dataset["product_encoded"].value_counts()

# Finding labels with one or two samples
labels_with_one_sample = label_counts[label_counts == 1].index
labels_with_two_samples = label_counts[label_counts == 2].index

# Removing the labels with one or two samples.
# The explicit .copy() makes the filtered frame an independent DataFrame, so
# adding a column to it below no longer raises pandas' SettingWithCopyWarning
# ("A value is trying to be set on a copy of a slice from a DataFrame")
is_rare_label = product_dataset["product_encoded"].isin(
    labels_with_one_sample
) | product_dataset["product_encoded"].isin(labels_with_two_samples)
product_dataset_modified = product_dataset[~is_rare_label].copy()

# Encoding again, so the remaining labels are renumbered after the removal
product_encoder_level_2 = LabelEncoder()
product_dataset_modified["product_encoded_level_2"] = (
    product_encoder_level_2.fit_transform(product_dataset_modified["product_encoded"])
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  product_dataset_modified["product_encoded_level_2"] = product_encoder_level_2.fit_transform(product_dataset_modified["product_encoded"])


Below, all the steps for training the BERT model for the product vectors are presented

- Separation of the initial dataset
- Calculation of the weights of the classes
- Data tokenization
- Training the model

<br>

We use all the previous predefined methods to train and evaluate the classifier

In [None]:
# Splitting the filtered product data into train / validation / test sets
# (0.6 is handed straight to split_dataset; presumably the held-out
# fraction -- confirm against its definition)
train_set, val_set, test_set = split_dataset(
    product_dataset_modified, "product_encoded_level_2", 0.6
)

# Calculating class weights from the training split only
# NOTE(review): class_weights is not passed to any call in this cell;
# presumably train_model reads it as a notebook-level variable -- confirm
class_weights = compute_weight(train_set, "product_encoded_level_2")

# Tokenizing the three splits with the "roberta-base" checkpoint name
train_dataset, val_dataset, test_dataset = tokenize_data(
    train_set, val_set, test_set, "product_encoded_level_2", "roberta-base"
)

# Training; the last argument is the number of distinct re-encoded labels
# left after the rare labels were removed
prod_bert_model = train_model(
    train_dataset,
    val_dataset,
    "roberta-base",
    len(product_dataset_modified["product_encoded_level_2"].unique()),
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,5.988778,0.002331,1.3e-05,0.002879,2.6e-05
2,No log,6.00142,0.000777,2e-06,0.002519,4e-06
3,No log,5.992783,0.001554,4e-06,0.002519,8e-06
4,No log,5.989104,0.001554,4e-06,0.002519,8e-06


Now, we test the trained model on the unseen data obtained from the initial split

In [None]:
# Running the trained product model over the test split
labels_pred, labels_actual = test_model(prod_bert_model, test_dataset)

# Building the per-class report first and printing it under its header;
# zero_division=0 scores undefined metrics as 0
print("Evaluating on Test Dataset")
prod_test_report = classification_report(
    labels_actual, labels_pred, zero_division=0
)
print(prod_test_report)

Evaluating on Test Dataset
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.00      0.00      0.00        10
           2       0.00      0.00      0.00         3
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         3
           5       0.00      0.00      0.00         2
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         3
           8       0.00      0.00      0.00         2
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         3
          12       0.00      0.00      0.00         4
          13       0.00      0.00      0.00         1
          14       0.00      0.00      0.00         3
          15       0.00      0.00      0.00         3
          16       0.00      0.00      0.00         5


**XGBoost**

Here, when trying to run XGBoost as in the previous classification tasks, I got an error because some labels do not have enough samples to split the dataset. Because of this, I decided to keep the same approach as in the BERT model and remove the labels with one or two samples, as it is not possible to learn a model with so little information

Below, all the steps for training and evaluating the XGBoost model for product vectors are presented

- Setting the desired pipeline
- Setting the hyperparameter grid for tuning
- Setting the outer and inner cross validation
- Training and evaluating the model

<br>

We use the previous predefined method to train and evaluate the classifier

In [None]:
# Number of distinct re-encoded product labels, handed to XGBoost as num_class
n_product_classes = product_dataset_modified["product_encoded_level_2"].nunique()

# Two-step pipeline: TF-IDF features feeding an XGBoost classifier
tfidf_step = ("tfidf", TfidfVectorizer())
xgb_step = (
    "xgb",
    XGBClassifier(
        random_state=42,
        objective="multi:softmax",
        num_class=n_product_classes,
    ),
)
pipeline = Pipeline([tfidf_step, xgb_step])

# Candidate values for tuning, keyed as "<step name>__<parameter>"
param_grid = {
    "tfidf__max_df": [0.5],
    "tfidf__ngram_range": [(1, 2)],
    "xgb__max_depth": [5],
    "xgb__learning_rate": [0.1, 0.3],
    "xgb__n_estimators": [100, 300],
    "xgb__gamma": [0],
    "xgb__min_child_weight": [1],
}

# Seeded, shuffled stratified splitters: 3 folds for the outer loop
# (different train-test splits) and 2 folds for the inner loop
# (hyperparameter tuning)
outer_cv, inner_cv = (
    StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    for n_folds in (3, 2)
)

# Training and evaluating the xgboost classifier, then unpacking the
# returned scores and hyperparameters
prod_xgb_results = train_evaluate_model_xgb(
    product_dataset_modified,
    "product_encoded_level_2",
    outer_cv,
    inner_cv,
    pipeline,
    param_grid,
)
prod_macro_f1_scores_2, prod_hyperparameters_2 = prod_xgb_results

              precision    recall  f1-score   support

           0       0.50      0.25      0.33         4
           1       0.29      0.45      0.36        11
           2       0.00      0.00      0.00         4
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         3
           5       0.00      0.00      0.00         2
           6       0.00      0.00      0.00         1
           7       0.60      1.00      0.75         3
           8       0.00      0.00      0.00         2
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         1
          11       1.00      1.00      1.00         3
          12       0.00      0.00      0.00         3
          13       0.00      0.00      0.00         2
          14       0.67      0.67      0.67         3
          15       0.60      1.00      0.75         3
          16       0.60      0.50      0.55         6
          17       1.00    

The mean and standard deviation of the macro f1 in all cross-validation and hyperparameter tuning procedures are presented below

In [None]:
# Summarising the macro f1 scores collected for the product XGBoost run:
# average value and spread
prod_f1_mean_2 = np.mean(prod_macro_f1_scores_2)
prod_f1_std_2 = np.std(prod_macro_f1_scores_2)
print("Cross-Validation Macro f1:")
print("Mean:", prod_f1_mean_2)
print("Standard Deviation:", prod_f1_std_2)

# Reporting the hyperparameters stored at the position of the highest score
prod_best_idx_2 = np.argmax(prod_macro_f1_scores_2)
print("\nThe hyperparameters which gives the best results are:")
print(prod_hyperparameters_2[prod_best_idx_2])

Cross-Validation Macro f1:
Mean: 0.26848145148323993
Standard Deviation: 0.005098106193364899

The hyperparameters which gives the best results are:
{'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2), 'xgb__gamma': 0, 'xgb__learning_rate': 0.3, 'xgb__max_depth': 5, 'xgb__min_child_weight': 1, 'xgb__n_estimators': 300}


### **Results**

Based on the above benchmark analyses, we have the following performance in terms of macro f1.

<br>

|Classification Task|Logistic Regression|BERT - RoBERTa|XGBoost|
|-------------------|-------------------|--------------|-------|
|Hazard-Category|Mean: 0.6491 Std: 0.0345|Approximate: 0.61 |Mean: 0.5624 Std: 0.0225|
|Product-Category|Mean: 0.5675 Std: 0.0030|Approximate: 0.65|Mean: 0.4656 Std: 0.0123|
|Hazard|Mean: 0.3849 Std: 0.0165|Approximate: 0.00|Mean: 0.3040 Std: 0.0045|
|Product|Mean: 0.2375 Std: 0.0080|Approximate: 0.00|Mean: 0.2684 Std: 0.0050|


- Hazard-Category

  As we can see, the logistic regression model fits the data better and seems to handle both the classification task and the class imbalance somewhat better.

- Product-Category

  As we can see, the BERT model fits better than the other algorithms and after a few experiments on the validation dataset and considering the score obtained, it indeed performs better.

- Hazard

  Here the best performing classifier is again logistic regression, with a better mean than the XGBoost model. The BERT model could not be fitted here because there are many labels, most of which have very few samples, while a handful of labels are heavily overrepresented in the dataset. A good way to improve this would probably be to combine oversampling and undersampling, increasing the visibility of the minority labels and decreasing that of the majority labels accordingly.

- Product

  Here the logistic regression classifier has a lower macro f1 score than XGBoost. However, to run XGBoost I had to remove some labels, which means that the logistic regression model, although it has slightly lower performance, has been trained on the entire dataset and can predict all labels, unlike the XGBoost model. This was confirmed by experiments on the validation dataset, where the resulting score was higher for the logistic regression model. As in the hazard vector classification task, the BERT model could not be fitted, for the same reason.

Based on the above results, the outcomes obtained using the "title" column of the training dataset are not as good as those of the benchmark analyses using the "text" column, except for the product vector classification task, where the logistic regression classifier on the "title" column performs better than the algorithms using the "text" column (as you can see in the other notebook that performs this type of benchmark). The best performing model for the product vector classification task and its predictions on the validation dataset are presented below.

### **Classification Task - Product**

Train a Logistic Regression classifier for the product vectors based on the best hyperparameters obtained earlier

In [7]:
# Picking the hyperparameter set recorded at the position of the highest macro f1
prod_best_hyperparameters = prod_hyperparameters[np.argmax(prod_macro_f1_scores)]

# Assembling the TF-IDF + logistic regression pipeline; each step receives the
# tuned values stored under its "<step>__<parameter>" key
prod_model = Pipeline(
    steps=[
        (
            "tfidf",
            TfidfVectorizer(
                **{
                    name: prod_best_hyperparameters[f"tfidf__{name}"]
                    for name in ("max_df", "ngram_range")
                }
            ),
        ),
        (
            "logreg",
            LogisticRegression(
                **{
                    name: prod_best_hyperparameters[f"logreg__{name}"]
                    for name in ("penalty", "solver", "max_iter", "C", "class_weight")
                }
            ),
        ),
    ]
)

# Fitting on the processed titles against the encoded product labels
prod_model.fit(product_dataset["title_processed"], product_dataset["product_encoded"])



Load and transform the validation dataset

In [8]:
# Reading the validation incidents from disk
validation_dataset = pd.read_csv("../data/validation/incidents.csv")

# Removing, in place, the columns that are not used for the title-based prediction
validation_dataset.drop(
    labels=["Unnamed: 0", "year", "month", "day", "country", "text"],
    axis=1,
    inplace=True,
)

# Passing every title through process_title
# NOTE(review): presumably the same preprocessing used for the training titles - confirm
validation_dataset["title"] = validation_dataset["title"].apply(process_title)

# Showing a few random rows as a sanity check
validation_dataset.sample(5)

Unnamed: 0,title
65,2009 meadow gold dairy recall 56 ounce scround...
57,charles sturt university bidgee cheese lemon m...
491,hu products conduct nationwide voluntary recal...
346,creme recall assorted chocolate contain undecl...
530,welsh specialty foods recall welsh chunky picc...


Predict the product vectors

In [9]:
# Predict product
# The whole title column is passed to the pipeline in one batched call and the
# numeric predictions are decoded back to product names in one go. This replaces
# a per-row predict/inverse_transform inside `apply`, which repeated the pipeline
# overhead for every title while producing the same labels.
validation_dataset["product"] = encoders["product_encoded"].inverse_transform(
    prod_model.predict(validation_dataset["title"])
)

# Storing
# Only the predicted "product" column is written, without the index
validation_dataset["product"].to_csv("../submission/subtask_2/submission_product.csv", index=False)