<a href="https://colab.research.google.com/github/23f2002620/New/blob/main/LogicLoom.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Logistic Regression (baseline)

---



In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Load the labelled training set and the unlabelled test set from the working directory.
try:
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')
except FileNotFoundError:
    print("Make sure 'train.csv' and 'test.csv' are in the same directory.")
    # Re-raise rather than call exit(): inside a notebook exit() takes the kernel
    # down with it, whereas an exception simply stops this cell (and "Run All")
    # with a traceback that names the missing file.
    raise

# Merge title and body into one text field per row; missing values become empty
# strings so the concatenation never produces NaN.
train_df['combined_text'] = train_df['title'].fillna('') + " " + train_df['text'].fillna('')
test_df['combined_text'] = test_df['title'].fillna('') + " " + test_df['text'].fillna('')

# Inputs and the two classification targets used by every model in this notebook.
X_train = train_df['combined_text']
y_train_hazard = train_df['hazard-type']
y_train_product = train_df['product-category']
X_test = test_df['combined_text']

# TF-IDF features: the vocabulary is learned from the training text only and then
# reused unchanged to transform the test text.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# One independent logistic-regression classifier per target.
model_hazard = LogisticRegression(max_iter=1000)
model_hazard.fit(X_train_tfidf, y_train_hazard)
predictions_hazard = model_hazard.predict(X_test_tfidf)

model_product = LogisticRegression(max_iter=1000)
model_product.fit(X_train_tfidf, y_train_product)
predictions_product = model_product.predict(X_test_tfidf)

# Only the hazard predictions are exported; predictions_product stays in memory.
output_df = pd.DataFrame({
    'ID': test_df['ID'],
    'hazard': predictions_hazard
})

output_df.to_csv('output.csv', index=False)

print("Predictions saved to 'output.csv'")

# NOTE: the metrics below are computed on the same rows the models were fitted
# on, so they measure fit quality, not performance on unseen data.
# Predict once per model and reuse the result for both the report and the accuracy.
train_predictions_hazard = model_hazard.predict(X_train_tfidf)
train_predictions_product = model_product.predict(X_train_tfidf)

print("\nEvaluation on Training Data (Metrics for Hazard Type):")
print(classification_report(y_train_hazard, train_predictions_hazard, zero_division=0))
print("Accuracy for Hazard Type:", accuracy_score(y_train_hazard, train_predictions_hazard))

print("\nEvaluation on Training Data (Metrics for Product Category):")
print(classification_report(y_train_product, train_predictions_product, zero_division=0))
print("Accuracy for Product Category:", accuracy_score(y_train_product, train_predictions_product))


Predictions saved to 'output.csv'

Evaluation on Training Data (Metrics for Hazard Type):
                                precision    recall  f1-score   support

                     allergens       0.94      0.99      0.96      1854
                    biological       0.90      0.99      0.95      1741
                      chemical       0.92      0.82      0.87       287
food additives and flavourings       0.83      0.42      0.56        24
                foreign bodies       0.94      0.97      0.95       561
                         fraud       0.89      0.66      0.76       371
                     migration       0.00      0.00      0.00         3
          organoleptic aspects       1.00      0.09      0.17        53
                  other hazard       0.93      0.43      0.58       134
              packaging defect       1.00      0.28      0.43        54

                      accuracy                           0.92      5082
                     macro avg       0.84   

KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit one 5-nearest-neighbour classifier per target on the shared TF-IDF features.
knn_hazard = KNeighborsClassifier(n_neighbors=5).fit(X_train_tfidf, y_train_hazard)
knn_product = KNeighborsClassifier(n_neighbors=5).fit(X_train_tfidf, y_train_product)

# Test-set predictions for both targets.
predictions_hazard_knn = knn_hazard.predict(X_test_tfidf)
predictions_product_knn = knn_product.predict(X_test_tfidf)

# Export ID + predicted hazard; the product predictions are not written to disk.
output_df_knn = test_df[['ID']].assign(hazard=predictions_hazard_knn)
output_df_knn.to_csv('output_knn.csv', index=False)

print("\nKNN Predictions saved to 'output_knn.csv'")

# The scores below are measured on the training rows themselves.
train_predictions_hazard_knn = knn_hazard.predict(X_train_tfidf)
train_predictions_product_knn = knn_product.predict(X_train_tfidf)

print("\nEvaluation on Training Data (Metrics for Hazard Type with KNN):")
print(classification_report(y_train_hazard, train_predictions_hazard_knn, zero_division=0))
print("Accuracy for Hazard Type with KNN:", accuracy_score(y_train_hazard, train_predictions_hazard_knn))

print("\nEvaluation on Training Data (Metrics for Product Category with KNN):")
print(classification_report(y_train_product, train_predictions_product_knn, zero_division=0))
print("Accuracy for Product Category with KNN:", accuracy_score(y_train_product, train_predictions_product_knn))



KNN Predictions saved to 'output_knn.csv'

Evaluation on Training Data (Metrics for Hazard Type with KNN):
                                precision    recall  f1-score   support

                     allergens       0.82      0.95      0.88      1854
                    biological       0.86      0.94      0.90      1741
                      chemical       0.88      0.66      0.75       287
food additives and flavourings       0.83      0.42      0.56        24
                foreign bodies       0.83      0.66      0.73       561
                         fraud       0.81      0.55      0.66       371
                     migration       0.00      0.00      0.00         3
          organoleptic aspects       0.86      0.23      0.36        53
                  other hazard       0.79      0.36      0.49       134
              packaging defect       0.71      0.37      0.49        54

                      accuracy                           0.84      5082
                     macro

### Logistic Regression (liblinear solver)

In [None]:

# Refit the two logistic-regression models, this time with the liblinear solver.
# NOTE: this rebinds model_hazard / model_product / predictions_* from the
# baseline cell, so those names now refer to the liblinear models.
model_hazard = LogisticRegression(max_iter=1000, solver='liblinear').fit(X_train_tfidf, y_train_hazard)
model_product = LogisticRegression(max_iter=1000, solver='liblinear').fit(X_train_tfidf, y_train_product)

predictions_hazard = model_hazard.predict(X_test_tfidf)
predictions_product = model_product.predict(X_test_tfidf)

# Export ID + predicted hazard only.
output_df_lr = test_df[['ID']].assign(hazard=predictions_hazard)
output_df_lr.to_csv('output_lr.csv', index=False)

print("\nLogistic Regression Predictions saved to 'output_lr.csv'")

# Training-set predictions, computed once and shared by report and accuracy.
train_predictions_hazard_lr = model_hazard.predict(X_train_tfidf)
train_predictions_product_lr = model_product.predict(X_train_tfidf)

print("\nEvaluation on Training Data (Metrics for Hazard Type with Logistic Regression):")
print(classification_report(y_train_hazard, train_predictions_hazard_lr, zero_division=0))
print("Accuracy for Hazard Type with Logistic Regression:", accuracy_score(y_train_hazard, train_predictions_hazard_lr))

print("\nEvaluation on Training Data (Metrics for Product Category with Logistic Regression):")
print(classification_report(y_train_product, train_predictions_product_lr, zero_division=0))
print("Accuracy for Product Category with Logistic Regression:", accuracy_score(y_train_product, train_predictions_product_lr))



Logistic Regression Predictions saved to 'output_lr.csv'

Evaluation on Training Data (Metrics for Hazard Type with Logistic Regression):
                                precision    recall  f1-score   support

                     allergens       0.92      0.99      0.95      1854
                    biological       0.88      0.99      0.93      1741
                      chemical       0.93      0.78      0.85       287
food additives and flavourings       0.85      0.46      0.59        24
                foreign bodies       0.94      0.96      0.95       561
                         fraud       0.87      0.59      0.70       371
                     migration       0.00      0.00      0.00         3
          organoleptic aspects       0.00      0.00      0.00        53
                  other hazard       0.95      0.26      0.41       134
              packaging defect       1.00      0.11      0.20        54

                      accuracy                           0.90      

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# One decision tree per target; random_state is pinned so refits are repeatable.
dt_hazard = DecisionTreeClassifier(random_state=42)
dt_hazard.fit(X_train_tfidf, y_train_hazard)
predictions_hazard_dt = dt_hazard.predict(X_test_tfidf)

dt_product = DecisionTreeClassifier(random_state=42)
dt_product.fit(X_train_tfidf, y_train_product)
predictions_product_dt = dt_product.predict(X_test_tfidf)

# Only the hazard predictions are exported.
output_df_dt = pd.DataFrame({
    'ID': test_df['ID'],
    'hazard': predictions_hazard_dt
})

# Fixed: the file used to be written as 'output-dt.csv' while the message below
# (and every other cell's naming) says 'output_dt.csv'.
output_df_dt.to_csv('output_dt.csv', index=False)

print("\nDecision Tree Predictions saved to 'output_dt.csv'")

# Evaluate Decision Tree on Training Data.
# These are the rows the trees were fitted on, so the scores are not a
# generalisation estimate.
train_predictions_hazard_dt = dt_hazard.predict(X_train_tfidf)
train_predictions_product_dt = dt_product.predict(X_train_tfidf)

print("\nEvaluation on Training Data (Metrics for Hazard Type with Decision Tree):")
print(classification_report(y_train_hazard, train_predictions_hazard_dt, zero_division=0))
print("Accuracy for Hazard Type with Decision Tree:", accuracy_score(y_train_hazard, train_predictions_hazard_dt))

print("\nEvaluation on Training Data (Metrics for Product Category with Decision Tree):")
print(classification_report(y_train_product, train_predictions_product_dt, zero_division=0))
print("Accuracy for Product Category with Decision Tree:", accuracy_score(y_train_product, train_predictions_product_dt))


Decision Tree Predictions saved to 'output_dt.csv'

Evaluation on Training Data (Metrics for Hazard Type with Decision Tree):
                                precision    recall  f1-score   support

                     allergens       1.00      1.00      1.00      1854
                    biological       1.00      1.00      1.00      1741
                      chemical       1.00      1.00      1.00       287
food additives and flavourings       1.00      1.00      1.00        24
                foreign bodies       1.00      1.00      1.00       561
                         fraud       1.00      1.00      1.00       371
                     migration       1.00      1.00      1.00         3
          organoleptic aspects       1.00      1.00      1.00        53
                  other hazard       1.00      0.99      1.00       134
              packaging defect       1.00      1.00      1.00        54

                      accuracy                           1.00      5082
       

Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Two 100-tree random forests (fixed seed), one per target.
rf_hazard = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_tfidf, y_train_hazard)
rf_product = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_tfidf, y_train_product)

# Predictions on the test features.
predictions_hazard_rf = rf_hazard.predict(X_test_tfidf)
predictions_product_rf = rf_product.predict(X_test_tfidf)

# Write ID + predicted hazard; product predictions are kept in memory only.
output_df_rf = test_df[['ID']].assign(hazard=predictions_hazard_rf)
output_df_rf.to_csv('output_rf.csv', index=False)

print("\nRandom Forest Predictions saved to 'output_rf.csv'")

# Score the forests on the rows they were trained on.
train_predictions_hazard_rf = rf_hazard.predict(X_train_tfidf)
train_predictions_product_rf = rf_product.predict(X_train_tfidf)

print("\nEvaluation on Training Data (Metrics for Hazard Type with Random Forest):")
print(classification_report(y_train_hazard, train_predictions_hazard_rf, zero_division=0))
print("Accuracy for Hazard Type with Random Forest:", accuracy_score(y_train_hazard, train_predictions_hazard_rf))

print("\nEvaluation on Training Data (Metrics for Product Category with Random Forest):")
print(classification_report(y_train_product, train_predictions_product_rf, zero_division=0))
print("Accuracy for Product Category with Random Forest:", accuracy_score(y_train_product, train_predictions_product_rf))



Random Forest Predictions saved to 'output_rf.csv'

Evaluation on Training Data (Metrics for Hazard Type with Random Forest):
                                precision    recall  f1-score   support

                     allergens       1.00      1.00      1.00      1854
                    biological       1.00      1.00      1.00      1741
                      chemical       1.00      1.00      1.00       287
food additives and flavourings       1.00      1.00      1.00        24
                foreign bodies       1.00      1.00      1.00       561
                         fraud       1.00      1.00      1.00       371
                     migration       1.00      1.00      1.00         3
          organoleptic aspects       1.00      1.00      1.00        53
                  other hazard       1.00      0.99      1.00       134
              packaging defect       1.00      1.00      1.00        54

                      accuracy                           1.00      5082
       

### SVM

In [None]:
from sklearn.svm import SVC

# Linear-kernel support vector classifiers, one per target.
svm_hazard = SVC(kernel='linear', random_state=42).fit(X_train_tfidf, y_train_hazard)
svm_product = SVC(kernel='linear', random_state=42).fit(X_train_tfidf, y_train_product)

# Test-set predictions for both targets.
predictions_hazard_svm = svm_hazard.predict(X_test_tfidf)
predictions_product_svm = svm_product.predict(X_test_tfidf)

# Export ID + predicted hazard only.
output_df_svm = test_df[['ID']].assign(hazard=predictions_hazard_svm)
output_df_svm.to_csv('output_svm.csv', index=False)

print("\nSVM Predictions saved to 'output_svm.csv'")

# Training-set predictions, reused for the report and the accuracy figure.
train_predictions_hazard_svm = svm_hazard.predict(X_train_tfidf)
train_predictions_product_svm = svm_product.predict(X_train_tfidf)

print("\nEvaluation on Training Data (Metrics for Hazard Type with SVM):")
print(classification_report(y_train_hazard, train_predictions_hazard_svm, zero_division=0))
print("Accuracy for Hazard Type with SVM:", accuracy_score(y_train_hazard, train_predictions_hazard_svm))

print("\nEvaluation on Training Data (Metrics for Product Category with SVM):")
print(classification_report(y_train_product, train_predictions_product_svm, zero_division=0))
print("Accuracy for Product Category with SVM:", accuracy_score(y_train_product, train_predictions_product_svm))



SVM Predictions saved to 'output_svm.csv'

Evaluation on Training Data (Metrics for Hazard Type with SVM):
                                precision    recall  f1-score   support

                     allergens       0.95      0.99      0.97      1854
                    biological       0.97      1.00      0.98      1741
                      chemical       0.96      0.94      0.95       287
food additives and flavourings       0.87      0.54      0.67        24
                foreign bodies       0.97      0.98      0.98       561
                         fraud       0.92      0.74      0.82       371
                     migration       1.00      0.67      0.80         3
          organoleptic aspects       1.00      0.64      0.78        53
                  other hazard       0.94      0.79      0.86       134
              packaging defect       1.00      0.81      0.90        54

                      accuracy                           0.96      5082
                     macro

## Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default settings, one model per target.
nb_hazard = MultinomialNB().fit(X_train_tfidf, y_train_hazard)
nb_product = MultinomialNB().fit(X_train_tfidf, y_train_product)

# Predict both targets for the test rows.
predictions_hazard_nb = nb_hazard.predict(X_test_tfidf)
predictions_product_nb = nb_product.predict(X_test_tfidf)

# Save ID + predicted hazard; product predictions are not exported.
output_df_nb = test_df[['ID']].assign(hazard=predictions_hazard_nb)
output_df_nb.to_csv('output_nb.csv', index=False)

print("\nNaive Bayes Predictions saved to 'output_nb.csv'")

# Metrics are reported on the training rows.
train_predictions_hazard_nb = nb_hazard.predict(X_train_tfidf)
train_predictions_product_nb = nb_product.predict(X_train_tfidf)

print("\nEvaluation on Training Data (Metrics for Hazard Type with Naive Bayes):")
print(classification_report(y_train_hazard, train_predictions_hazard_nb, zero_division=0))
print("Accuracy for Hazard Type with Naive Bayes:", accuracy_score(y_train_hazard, train_predictions_hazard_nb))

print("\nEvaluation on Training Data (Metrics for Product Category with Naive Bayes):")
print(classification_report(y_train_product, train_predictions_product_nb, zero_division=0))
print("Accuracy for Product Category with Naive Bayes:", accuracy_score(y_train_product, train_predictions_product_nb))



Naive Bayes Predictions saved to 'output_nb.csv'

Evaluation on Training Data (Metrics for Hazard Type with Naive Bayes):
                                precision    recall  f1-score   support

                     allergens       0.74      0.98      0.85      1854
                    biological       0.87      0.96      0.91      1741
                      chemical       0.89      0.43      0.58       287
food additives and flavourings       0.00      0.00      0.00        24
                foreign bodies       0.92      0.47      0.62       561
                         fraud       0.63      0.49      0.55       371
                     migration       0.00      0.00      0.00         3
          organoleptic aspects       0.00      0.00      0.00        53
                  other hazard       1.00      0.01      0.01       134
              packaging defect       0.00      0.00      0.00        54

                      accuracy                           0.80      5082
           

### Neural Network

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# and batch shuffling are repeatable, in line with the random_state=42 used by
# the other models in this notebook.
tf.keras.utils.set_random_seed(42)

# Shared trunk: TF-IDF vector -> Dense(128) -> Dropout -> Dense(64) -> Dropout.
input_layer = Input(shape=(X_train_tfidf.shape[1],))
dense1 = Dense(128, activation='relu')(input_layer)
dropout1 = Dropout(0.5)(dense1)
dense2 = Dense(64, activation='relu')(dropout1)
dropout2 = Dropout(0.5)(dense2)

# Two softmax heads on the shared trunk, one per target, each sized to the
# number of distinct labels in the training data.
num_hazard_classes = y_train_hazard.nunique()
hazard_output = Dense(num_hazard_classes, activation='softmax', name='hazard_output')(dropout2)

num_product_classes = y_train_product.nunique()
product_output = Dense(num_product_classes, activation='softmax', name='product_output')(dropout2)

model = Model(inputs=input_layer, outputs=[hazard_output, product_output])

# The sparse categorical loss takes integer class codes directly (no one-hot).
model.compile(optimizer=Adam(learning_rate=0.001),
              loss={'hazard_output': 'sparse_categorical_crossentropy',
                    'product_output': 'sparse_categorical_crossentropy'},
              metrics={'hazard_output': 'accuracy',
                       'product_output': 'accuracy'})

# Integer-encode the string labels; hazard_labels[code] / product_labels[code]
# map a code back to its original label. The encoded arrays and label indexes
# are reused by the XGBoost, LightGBM and stacking cells further down.
y_train_hazard_encoded, hazard_labels = pd.factorize(y_train_hazard)
y_train_product_encoded, product_labels = pd.factorize(y_train_product)

# Train both heads jointly; 20% of the training rows are held out for the
# per-epoch validation metrics.
history = model.fit(X_train_tfidf, {'hazard_output': y_train_hazard_encoded, 'product_output': y_train_product_encoded},
                    epochs=10,
                    batch_size=32,
                    validation_split=0.2)

# predictions_nn[0] holds the hazard probabilities, predictions_nn[1] the product
# probabilities (same order as `outputs=` above).
predictions_nn = model.predict(X_test_tfidf)

# Most probable class code per row, then decode back to the original labels.
predictions_hazard_nn_encoded = tf.argmax(predictions_nn[0], axis=1).numpy()
predictions_product_nn_encoded = tf.argmax(predictions_nn[1], axis=1).numpy()

predictions_hazard_nn = [hazard_labels[i] for i in predictions_hazard_nn_encoded]
predictions_product_nn = [product_labels[i] for i in predictions_product_nn_encoded]

# Only the hazard predictions are exported.
output_df_nn = pd.DataFrame({
    'ID': test_df['ID'],
    'hazard': predictions_hazard_nn
})

output_df_nn.to_csv('output_nn.csv', index=False)

print("\nNeural Network Predictions saved to 'output_nn.csv'")

print("\nNeural Network Training History:")
print(history.history.keys())

Epoch 1/10
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 20ms/step - hazard_output_accuracy: 0.4000 - hazard_output_loss: 1.8661 - loss: 4.6371 - product_output_accuracy: 0.2174 - product_output_loss: 2.7708 - val_hazard_output_accuracy: 0.6509 - val_hazard_output_loss: 1.1452 - val_loss: 3.6019 - val_product_output_accuracy: 0.2035 - val_product_output_loss: 2.4588
Epoch 2/10
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - hazard_output_accuracy: 0.7051 - hazard_output_loss: 0.9483 - loss: 3.0558 - product_output_accuracy: 0.3649 - product_output_loss: 2.1073 - val_hazard_output_accuracy: 0.7168 - val_hazard_output_loss: 0.8542 - val_loss: 3.0954 - val_product_output_accuracy: 0.3156 - val_product_output_loss: 2.2433
Epoch 3/10
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step - hazard_output_accuracy: 0.7806 - hazard_output_loss: 0.7009 - loss: 2.5966 - product_output_accuracy: 0.4121 - product_output_loss: 1

### XGBoost

In [None]:
!pip install xgboost

import xgboost as xgb

# XGBoost is trained on the integer codes produced by pd.factorize in the neural
# network cell; hazard_labels / product_labels decode predictions back to strings.
# `use_label_encoder` is no longer passed: the installed XGBoost ignores it and
# prints 'Parameters: { "use_label_encoder" } are not used.' on every fit.
xgb_hazard = xgb.XGBClassifier(objective='multi:softmax', num_class=y_train_hazard.nunique(), eval_metric='mlogloss')
xgb_hazard.fit(X_train_tfidf, y_train_hazard_encoded)
predictions_hazard_xgb_encoded = xgb_hazard.predict(X_test_tfidf)
predictions_hazard_xgb = [hazard_labels[i] for i in predictions_hazard_xgb_encoded]

xgb_product = xgb.XGBClassifier(objective='multi:softmax', num_class=y_train_product.nunique(), eval_metric='mlogloss')
xgb_product.fit(X_train_tfidf, y_train_product_encoded)
predictions_product_xgb_encoded = xgb_product.predict(X_test_tfidf)
predictions_product_xgb = [product_labels[i] for i in predictions_product_xgb_encoded]

# Only the (decoded) hazard predictions are exported.
output_df_xgb = pd.DataFrame({
    'ID': test_df['ID'],
    'hazard': predictions_hazard_xgb
})

output_df_xgb.to_csv('output_xgb.csv', index=False)

print("\nXGBoost Predictions saved to 'output_xgb.csv'")

# Training-set predictions, computed once per model. target_names maps the
# integer codes back to readable labels in the report (code order == label order).
train_predictions_hazard_xgb = xgb_hazard.predict(X_train_tfidf)
train_predictions_product_xgb = xgb_product.predict(X_train_tfidf)

print("\nEvaluation on Training Data (Metrics for Hazard Type with XGBoost):")
print(classification_report(y_train_hazard_encoded, train_predictions_hazard_xgb, zero_division=0, target_names=hazard_labels))
print("Accuracy for Hazard Type with XGBoost:", accuracy_score(y_train_hazard_encoded, train_predictions_hazard_xgb))

print("\nEvaluation on Training Data (Metrics for Product Category with XGBoost):")
print(classification_report(y_train_product_encoded, train_predictions_product_xgb, zero_division=0, target_names=product_labels))
print("Accuracy for Product Category with XGBoost:", accuracy_score(y_train_product_encoded, train_predictions_product_xgb))



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




XGBoost Predictions saved to 'output_xgb.csv'

Evaluation on Training Data (Metrics for Hazard Type with XGBoost):
                                precision    recall  f1-score   support

                    biological       1.00      1.00      1.00      1741
                foreign bodies       1.00      1.00      1.00       561
                      chemical       1.00      1.00      1.00       287
                         fraud       1.00      1.00      1.00       371
          organoleptic aspects       1.00      1.00      1.00        53
                     allergens       1.00      1.00      1.00      1854
              packaging defect       1.00      1.00      1.00        54
                  other hazard       1.00      0.99      1.00       134
food additives and flavourings       1.00      1.00      1.00        24
                     migration       1.00      1.00      1.00         3

                      accuracy                           1.00      5082
                  

## LightGBM

In [None]:
!pip install lightgbm
import lightgbm as lgb

# LightGBM multiclass models, fitted on the integer-encoded targets from the
# neural network cell (pd.factorize codes).
lgb_hazard = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=y_train_hazard.nunique(),
    random_state=42,
)
lgb_hazard.fit(X_train_tfidf, y_train_hazard_encoded)

lgb_product = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=y_train_product.nunique(),
    random_state=42,
)
lgb_product.fit(X_train_tfidf, y_train_product_encoded)

# Predict integer codes for the test rows, then decode them to label strings.
predictions_hazard_lgb_encoded = lgb_hazard.predict(X_test_tfidf)
predictions_product_lgb_encoded = lgb_product.predict(X_test_tfidf)

predictions_hazard_lgb = [hazard_labels[code] for code in predictions_hazard_lgb_encoded]
predictions_product_lgb = [product_labels[code] for code in predictions_product_lgb_encoded]

# Export ID + decoded hazard prediction only.
output_df_lgb = test_df[['ID']].assign(hazard=predictions_hazard_lgb)
output_df_lgb.to_csv('output_lgb.csv', index=False)

print("\nLightGBM Predictions saved to 'output_lgb.csv'")

# Training-set predictions shared by the report and the accuracy line.
train_predictions_hazard_lgb = lgb_hazard.predict(X_train_tfidf)
train_predictions_product_lgb = lgb_product.predict(X_train_tfidf)

print("\nEvaluation on Training Data (Metrics for Hazard Type with LightGBM):")
print(classification_report(y_train_hazard_encoded, train_predictions_hazard_lgb, zero_division=0, target_names=hazard_labels))
print("Accuracy for Hazard Type with LightGBM:", accuracy_score(y_train_hazard_encoded, train_predictions_hazard_lgb))

print("\nEvaluation on Training Data (Metrics for Product Category with LightGBM):")
print(classification_report(y_train_product_encoded, train_predictions_product_lgb, zero_division=0, target_names=product_labels))
print("Accuracy for Product Category with LightGBM:", accuracy_score(y_train_product_encoded, train_predictions_product_lgb))





[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.257370 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 180386
[LightGBM] [Info] Number of data points in the train set: 5082, number of used features: 2863
[LightGBM] [Info] Start training from score -1.071245
[LightGBM] [Info] Start training from score -2.203739
[LightGBM] [Info] Start training from score -2.873978
[LightGBM] [Info] Start training from score -2.617258
[LightGBM] [Info] Start training from score -4.563168
[LightGBM] [Info] Start training from score -1.008359
[LightGBM] [Info] Start training from score -4.544476
[LightGBM] [Info] Start training from score -3.635620
[LightGBM] [Info] Start training from score -5.355406
[LightGBM] [Info] Start training from score -7.434848




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.211185 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 180386
[LightGBM] [Info] Number of data points in the train set: 5082, number of used features: 2863
[LightGBM] [Info] Start training from score -1.265237
[LightGBM] [Info] Start training from score -2.382857
[LightGBM] [Info] Start training from score -2.024691
[LightGBM] [Info] Start training from score -3.397662
[LightGBM] [Info] Start training from score -3.130783
[LightGBM] [Info] Start training from score -4.455923
[LightGBM] [Info] Start training from score -2.251193
[LightGBM] [Info] Start training from score -4.526127
[LightGBM] [Info] Start training from score -3.186353
[LightGBM] [Info] Start training from score -2.965116
[LightGBM] [Info] Start training from score -2.942473
[LightGBM] [Info] Start training from score -2.957




LightGBM Predictions saved to 'output_lgb.csv'

Evaluation on Training Data (Metrics for Hazard Type with LightGBM):




                                precision    recall  f1-score   support

                    biological       0.96      0.90      0.93      1741
                foreign bodies       0.79      0.65      0.72       561
                      chemical       0.71      0.60      0.65       287
                         fraud       0.46      0.35      0.40       371
          organoleptic aspects       0.13      0.21      0.16        53
                     allergens       0.77      0.92      0.84      1854
              packaging defect       0.19      0.15      0.17        54
                  other hazard       0.46      0.34      0.39       134
food additives and flavourings       0.31      0.42      0.36        24
                     migration       0.00      0.00      0.00         3

                      accuracy                           0.79      5082
                     macro avg       0.48      0.45      0.46      5082
                  weighted avg       0.79      0.79      0.78 



Accuracy for Hazard Type with LightGBM: 0.79004329004329

Evaluation on Training Data (Metrics for Product Category with LightGBM):




                                                   precision    recall  f1-score   support

                     meat, egg and dairy products       1.00      1.00      1.00      1434
                       prepared dishes and snacks       1.00      1.00      1.00       469
                      cereals and bakery products       1.00      1.00      1.00       671
                                    confectionery       0.99      1.00      1.00       170
                                ices and desserts       1.00      1.00      1.00       222
                              alcoholic beverages       1.00      1.00      1.00        59
                            fruits and vegetables       1.00      1.00      1.00       535
                       other food product / mixed       0.98      0.98      0.98        55
     cocoa and cocoa preparations, coffee and tea       1.00      1.00      1.00       210
                     nuts, nut products and seeds       1.00      1.00      1.00       26



Accuracy for Product Category with LightGBM: 0.999409681227863


# Stacking

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
import xgboost as xgb
import lightgbm as lgb


# Hold-out splits for the stacking experiments.
# The two calls stratify on different label vectors, so they are not guaranteed
# to select the same rows even with the same random_state. Each target therefore
# gets its own feature split, so features and labels of one target stay aligned.
# NOTE(review): these splits are not used below; the models are fitted and
# evaluated on the full training matrix.
X_train_hazard_stack, X_val_hazard_stack, y_train_hazard_stack, y_val_hazard_stack = train_test_split(
    X_train_tfidf, y_train_hazard_encoded, test_size=0.2, random_state=42, stratify=y_train_hazard_encoded)

X_train_product_stack, X_val_product_stack, y_train_product_stack, y_val_product_stack = train_test_split(
    X_train_tfidf, y_train_product_encoded, test_size=0.2, random_state=42, stratify=y_train_product_encoded)

# Backward-compatible aliases: both calls used to write to these two names, so
# they ended up holding the product-stratified split. Pair them only with the
# product labels.
X_train_stack, X_val_stack = X_train_product_stack, X_val_product_stack

# Base learners for the hazard-type stack.
# `use_label_encoder` is no longer passed to XGBoost: the recorded output of this
# cell shows it being reported as an unused parameter on every fit.
# `verbose=-1` keeps LightGBM from printing its per-fit info lines for the full
# fit and for each cross-validation fold.
estimators_hazard = [
    ('lr', LogisticRegression(max_iter=1000, solver='liblinear')),
    ('dt', DecisionTreeClassifier(random_state=42)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('svm', SVC(kernel='linear', probability=True, random_state=42)),
    ('nb', MultinomialNB()),
    ('xgb', xgb.XGBClassifier(objective='multi:softmax', num_class=y_train_hazard.nunique(), eval_metric='mlogloss')),
    ('lgb', lgb.LGBMClassifier(objective='multiclass', num_class=y_train_hazard.nunique(), random_state=42, verbose=-1))
]

stk_hazard = StackingClassifier(estimators=estimators_hazard, final_estimator=LogisticRegression(max_iter=1000, solver='liblinear'))

stk_hazard.fit(X_train_tfidf, y_train_hazard_encoded)
predictions_hazard_stk_encoded = stk_hazard.predict(X_test_tfidf)
# Map the integer class ids back to the original hazard label strings.
predictions_hazard_stk = [hazard_labels[i] for i in predictions_hazard_stk_encoded]

# Base learners for the product-category stack (same line-up as for hazard).
estimators_product = [
    ('lr', LogisticRegression(max_iter=1000, solver='liblinear')),
    ('dt', DecisionTreeClassifier(random_state=42)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('svm', SVC(kernel='linear', probability=True, random_state=42)),
    ('nb', MultinomialNB()),
    ('xgb', xgb.XGBClassifier(objective='multi:softmax', num_class=y_train_product.nunique(), eval_metric='mlogloss')),
    ('lgb', lgb.LGBMClassifier(objective='multiclass', num_class=y_train_product.nunique(), random_state=42, verbose=-1))
]

stk_product = StackingClassifier(estimators=estimators_product, final_estimator=LogisticRegression(max_iter=1000, solver='liblinear'))

stk_product.fit(X_train_tfidf, y_train_product_encoded)
predictions_product_stk_encoded = stk_product.predict(X_test_tfidf)
predictions_product_stk = [product_labels[i] for i in predictions_product_stk_encoded]

# Only the hazard predictions are written to the CSV; the product predictions
# stay in memory.
output_df_stk = pd.DataFrame({
    'ID': test_df['ID'],
    'hazard': predictions_hazard_stk
})

output_df_stk.to_csv('output_stk.csv', index=False)

print("\nStacking Predictions saved to 'output_stk.csv'")

# The metrics below are computed on the same rows the models were fitted on, so
# they describe fit quality, not generalization.
# Predict once per model and reuse the result for both the report and the
# accuracy instead of running the whole stack twice.
print("\nEvaluation on Training Data (Metrics for Hazard Type with Stacking):")
train_pred_hazard_stk = stk_hazard.predict(X_train_tfidf)
print(classification_report(y_train_hazard_encoded, train_pred_hazard_stk, zero_division=0, target_names=hazard_labels))
print("Accuracy for Hazard Type with Stacking:", accuracy_score(y_train_hazard_encoded, train_pred_hazard_stk))

print("\nEvaluation on Training Data (Metrics for Product Category with Stacking):")
train_pred_product_stk = stk_product.predict(X_train_tfidf)
print(classification_report(y_train_product_encoded, train_pred_product_stk, zero_division=0, target_names=product_labels))
print("Accuracy for Product Category with Stacking:", accuracy_score(y_train_product_encoded, train_pred_product_stk))


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.205302 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 180386
[LightGBM] [Info] Number of data points in the train set: 5082, number of used features: 2863
[LightGBM] [Info] Start training from score -1.071245
[LightGBM] [Info] Start training from score -2.203739
[LightGBM] [Info] Start training from score -2.873978
[LightGBM] [Info] Start training from score -2.617258
[LightGBM] [Info] Start training from score -4.563168
[LightGBM] [Info] Start training from score -1.008359
[LightGBM] [Info] Start training from score -4.544476
[LightGBM] [Info] Start training from score -3.635620
[LightGBM] [Info] Start training from score -5.355406
[LightGBM] [Info] Start training from score -7.434848


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.124360 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 157766
[LightGBM] [Info] Number of data points in the train set: 4065, number of used features: 2513
[LightGBM] [Info] Start training from score -1.071672
[LightGBM] [Info] Start training from score -2.203146
[LightGBM] [Info] Start training from score -2.872090
[LightGBM] [Info] Start training from score -2.616437
[LightGBM] [Info] Start training from score -4.572499
[LightGBM] [Info] Start training from score -1.008347
[LightGBM] [Info] Start training from score -4.548969
[LightGBM] [Info] Start training from score -3.628038
[LightGBM] [Info] Start training from score -5.365730
[LightGBM] [Info] Start training from score -7.617022




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.140564 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 152176
[LightGBM] [Info] Number of data points in the train set: 4065, number of used features: 2492
[LightGBM] [Info] Start training from score -1.070954
[LightGBM] [Info] Start training from score -2.205376
[LightGBM] [Info] Start training from score -2.872090
[LightGBM] [Info] Start training from score -2.616437
[LightGBM] [Info] Start training from score -4.572499
[LightGBM] [Info] Start training from score -1.008347
[LightGBM] [Info] Start training from score -4.525979
[LightGBM] [Info] Start training from score -3.637340
[LightGBM] [Info] Start training from score -5.365730
[LightGBM] [Info] Start training from score -7.617022




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.136543 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 152575
[LightGBM] [Info] Number of data points in the train set: 4066, number of used features: 2527
[LightGBM] [Info] Start training from score -1.071200
[LightGBM] [Info] Start training from score -2.203392
[LightGBM] [Info] Start training from score -2.876693
[LightGBM] [Info] Start training from score -2.616683
[LightGBM] [Info] Start training from score -4.572745
[LightGBM] [Info] Start training from score -1.007919
[LightGBM] [Info] Start training from score -4.549215
[LightGBM] [Info] Start training from score -3.637586
[LightGBM] [Info] Start training from score -5.365976
[LightGBM] [Info] Start training from score -7.211803




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.139299 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 152026
[LightGBM] [Info] Number of data points in the train set: 4066, number of used features: 2478
[LightGBM] [Info] Start training from score -1.071200
[LightGBM] [Info] Start training from score -2.203392
[LightGBM] [Info] Start training from score -2.876693
[LightGBM] [Info] Start training from score -2.616683
[LightGBM] [Info] Start training from score -4.549215
[LightGBM] [Info] Start training from score -1.008593
[LightGBM] [Info] Start training from score -4.549215
[LightGBM] [Info] Start training from score -3.637586
[LightGBM] [Info] Start training from score -5.365976
[LightGBM] [Info] Start training from score -7.211803




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.146748 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 154403
[LightGBM] [Info] Number of data points in the train set: 4066, number of used features: 2506
[LightGBM] [Info] Start training from score -1.071200
[LightGBM] [Info] Start training from score -2.203392
[LightGBM] [Info] Start training from score -2.872336
[LightGBM] [Info] Start training from score -2.620056
[LightGBM] [Info] Start training from score -4.549215
[LightGBM] [Info] Start training from score -1.008593
[LightGBM] [Info] Start training from score -4.549215
[LightGBM] [Info] Start training from score -3.637586
[LightGBM] [Info] Start training from score -5.314683
[LightGBM] [Info] Start training from score -7.617268


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.219159 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 180386
[LightGBM] [Info] Number of data points in the train set: 5082, number of used features: 2863
[LightGBM] [Info] Start training from score -1.265237
[LightGBM] [Info] Start training from score -2.382857
[LightGBM] [Info] Start training from score -2.024691
[LightGBM] [Info] Start training from score -3.397662
[LightGBM] [Info] Start training from score -3.130783
[LightGBM] [Info] Start training from score -4.455923
[LightGBM] [Info] Start training from score -2.251193
[LightGBM] [Info] Start training from score -4.526127
[LightGBM] [Info] Start training from score -3.186353
[LightGBM] [Info] Start training from score -2.965116
[LightGBM] [Info] Start training from score -2.942473
[LightGBM] [Info] Start training from score -2.957

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.160053 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 158351
[LightGBM] [Info] Number of data points in the train set: 4065, number of used features: 2516
[LightGBM] [Info] Start training from score -1.265264
[LightGBM] [Info] Start training from score -2.383243
[LightGBM] [Info] Start training from score -2.024171
[LightGBM] [Info] Start training from score -3.397514
[LightGBM] [Info] Start training from score -3.134019
[LightGBM] [Info] Start training from score -4.438968
[LightGBM] [Info] Start training from score -2.251046
[LightGBM] [Info] Start training from score -4.525979
[LightGBM] [Info] Start training from score -3.186205
[LightGBM] [Info] Start training from score -2.967835
[LightGBM] [Info] Start training from score -2.939531
[LightGBM] [Info] Start training from score -2.958311
[LightGBM] [Info] Start training from score -5.602119
[Light



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.133340 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 152158
[LightGBM] [Info] Number of data points in the train set: 4065, number of used features: 2480
[LightGBM] [Info] Start training from score -1.265264
[LightGBM] [Info] Start training from score -2.383243
[LightGBM] [Info] Start training from score -2.024171
[LightGBM] [Info] Start training from score -3.397514
[LightGBM] [Info] Start training from score -3.128385
[LightGBM] [Info] Start training from score -4.460021
[LightGBM] [Info] Start training from score -2.251046
[LightGBM] [Info] Start training from score -4.525979
[LightGBM] [Info] Start training from score -3.186205
[LightGBM] [Info] Start training from score -2.967835
[LightGBM] [Info] Start training from score -2.939531
[LightGBM] [Info] Start training from score -2.958



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.137472 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 151653
[LightGBM] [Info] Number of data points in the train set: 4066, number of used features: 2513
[LightGBM] [Info] Start training from score -1.265510
[LightGBM] [Info] Start training from score -2.383489
[LightGBM] [Info] Start training from score -2.024417
[LightGBM] [Info] Start training from score -3.397760
[LightGBM] [Info] Start training from score -3.128631
[LightGBM] [Info] Start training from score -4.460267
[LightGBM] [Info] Start training from score -2.251292
[LightGBM] [Info] Start training from score -4.526225
[LightGBM] [Info] Start training from score -3.186451
[LightGBM] [Info] Start training from score -2.963307
[LightGBM] [Info] Start training from score -2.944439
[LightGBM] [Info] Start training from score -2.958



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.123607 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 152119
[LightGBM] [Info] Number of data points in the train set: 4066, number of used features: 2482
[LightGBM] [Info] Start training from score -1.265510
[LightGBM] [Info] Start training from score -2.380826
[LightGBM] [Info] Start training from score -2.026281
[LightGBM] [Info] Start training from score -3.397760
[LightGBM] [Info] Start training from score -3.128631
[LightGBM] [Info] Start training from score -4.460267
[LightGBM] [Info] Start training from score -2.251292
[LightGBM] [Info] Start training from score -4.526225
[LightGBM] [Info] Start training from score -3.186451
[LightGBM] [Info] Start training from score -2.963307
[LightGBM] [Info] Start training from score -2.944439
[LightGBM] [Info] Start training from score -2.958



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.147553 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 154607
[LightGBM] [Info] Number of data points in the train set: 4066, number of used features: 2532
[LightGBM] [Info] Start training from score -1.264638
[LightGBM] [Info] Start training from score -2.383489
[LightGBM] [Info] Start training from score -2.024417
[LightGBM] [Info] Start training from score -3.397760
[LightGBM] [Info] Start training from score -3.134265
[LightGBM] [Info] Start training from score -4.460267
[LightGBM] [Info] Start training from score -2.251292
[LightGBM] [Info] Start training from score -4.526225
[LightGBM] [Info] Start training from score -3.186451
[LightGBM] [Info] Start training from score -2.963307
[LightGBM] [Info] Start training from score -2.944439
[LightGBM] [Info] Start training from score -2.953829
[LightGBM] [Info] Start training from score -5.602365
[Light




Stacking Predictions saved to 'output_stk.csv'

Evaluation on Training Data (Metrics for Hazard Type with Stacking):




                                precision    recall  f1-score   support

                    biological       1.00      1.00      1.00      1741
                foreign bodies       1.00      1.00      1.00       561
                      chemical       1.00      1.00      1.00       287
                         fraud       0.99      1.00      1.00       371
          organoleptic aspects       1.00      0.98      0.99        53
                     allergens       1.00      1.00      1.00      1854
              packaging defect       1.00      0.98      0.99        54
                  other hazard       0.98      0.95      0.97       134
food additives and flavourings       1.00      1.00      1.00        24
                     migration       0.00      0.00      0.00         3

                      accuracy                           1.00      5082
                     macro avg       0.90      0.89      0.89      5082
                  weighted avg       1.00      1.00      1.00 



Accuracy for Hazard Type with Stacking: 0.9976387249114522

Evaluation on Training Data (Metrics for Product Category with Stacking):




                                                   precision    recall  f1-score   support

                     meat, egg and dairy products       0.91      0.99      0.95      1434
                       prepared dishes and snacks       0.89      0.72      0.80       469
                      cereals and bakery products       0.95      0.97      0.96       671
                                    confectionery       0.98      0.95      0.96       170
                                ices and desserts       0.99      1.00      1.00       222
                              alcoholic beverages       0.98      1.00      0.99        59
                            fruits and vegetables       0.92      0.86      0.89       535
                       other food product / mixed       1.00      0.49      0.66        55
     cocoa and cocoa preparations, coffee and tea       0.98      0.98      0.98       210
                     nuts, nut products and seeds       0.84      0.94      0.89       26



Accuracy for Product Category with Stacking: 0.9295552931916569


## Stacking Advanced

In [None]:
from sklearn.ensemble import StackingClassifier

# Base learners for the "advanced" hazard-type stack. Compared with the basic
# stacking cell, the random forest, XGBoost and LightGBM members use 200
# estimators.
# `use_label_encoder` is no longer passed to XGBoost: the recorded output of this
# cell shows it being reported as an unused parameter on every fit.
# `verbose=-1` keeps LightGBM from printing its per-fit info lines for the full
# fit and for each cross-validation fold.
estimators_hazard_advanced = [
    ('lr', LogisticRegression(max_iter=1000, solver='liblinear')),
    ('dt', DecisionTreeClassifier(random_state=42)),
    ('rf', RandomForestClassifier(n_estimators=200, random_state=42)),
    ('svm', SVC(kernel='linear', probability=True, random_state=42)),
    ('nb', MultinomialNB()),
    ('xgb', xgb.XGBClassifier(objective='multi:softmax', num_class=y_train_hazard.nunique(), eval_metric='mlogloss', n_estimators=200)),
    ('lgb', lgb.LGBMClassifier(objective='multiclass', num_class=y_train_hazard.nunique(), random_state=42, n_estimators=200, verbose=-1))
]

meta_model_hazard = LogisticRegression(max_iter=1000, solver='liblinear')

# cv=5 is spelled out for readability; it matches the StackingClassifier default.
stk_hazard_advanced = StackingClassifier(estimators=estimators_hazard_advanced, final_estimator=meta_model_hazard, cv=5)

print("\nTraining Advanced Stacking Model for Hazard Type...")
stk_hazard_advanced.fit(X_train_tfidf, y_train_hazard_encoded)
predictions_hazard_stk_advanced_encoded = stk_hazard_advanced.predict(X_test_tfidf)
# Map the integer class ids back to the original hazard label strings.
predictions_hazard_stk_advanced = [hazard_labels[i] for i in predictions_hazard_stk_advanced_encoded]

# Base learners for the "advanced" product-category stack (same line-up).
estimators_product_advanced = [
    ('lr', LogisticRegression(max_iter=1000, solver='liblinear')),
    ('dt', DecisionTreeClassifier(random_state=42)),
    ('rf', RandomForestClassifier(n_estimators=200, random_state=42)),
    ('svm', SVC(kernel='linear', probability=True, random_state=42)),
    ('nb', MultinomialNB()),
    ('xgb', xgb.XGBClassifier(objective='multi:softmax', num_class=y_train_product.nunique(), eval_metric='mlogloss', n_estimators=200)),
    ('lgb', lgb.LGBMClassifier(objective='multiclass', num_class=y_train_product.nunique(), random_state=42, n_estimators=200, verbose=-1))
]

meta_model_product = LogisticRegression(max_iter=1000, solver='liblinear')

stk_product_advanced = StackingClassifier(estimators=estimators_product_advanced, final_estimator=meta_model_product, cv=5)

print("Training Advanced Stacking Model for Product Category...")
stk_product_advanced.fit(X_train_tfidf, y_train_product_encoded)
predictions_product_stk_advanced_encoded = stk_product_advanced.predict(X_test_tfidf)
predictions_product_stk_advanced = [product_labels[i] for i in predictions_product_stk_advanced_encoded]

# Only the hazard predictions are written to the CSV; the product predictions
# stay in memory.
output_df_stk_advanced = pd.DataFrame({
    'ID': test_df['ID'],
    'hazard': predictions_hazard_stk_advanced
})

output_df_stk_advanced.to_csv('output_stk_advanced.csv', index=False)

print("\nAdvanced Stacking Predictions saved to 'output_stk_advanced.csv'")

# The metrics below are computed on the same rows the models were fitted on, so
# they describe fit quality, not generalization.
# Predict once per model and reuse the result for both the report and the
# accuracy instead of running the whole stack twice.
print("\nEvaluation on Training Data (Metrics for Hazard Type with Advanced Stacking):")
train_pred_hazard_stk_advanced = stk_hazard_advanced.predict(X_train_tfidf)
print(classification_report(y_train_hazard_encoded, train_pred_hazard_stk_advanced, zero_division=0, target_names=hazard_labels))
print("Accuracy for Hazard Type with Advanced Stacking:", accuracy_score(y_train_hazard_encoded, train_pred_hazard_stk_advanced))

print("\nEvaluation on Training Data (Metrics for Product Category with Advanced Stacking):")
train_pred_product_stk_advanced = stk_product_advanced.predict(X_train_tfidf)
print(classification_report(y_train_product_encoded, train_pred_product_stk_advanced, zero_division=0, target_names=product_labels))
print("Accuracy for Product Category with Advanced Stacking:", accuracy_score(y_train_product_encoded, train_pred_product_stk_advanced))



Training Advanced Stacking Model for Hazard Type...


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.240855 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 180386
[LightGBM] [Info] Number of data points in the train set: 5082, number of used features: 2863
[LightGBM] [Info] Start training from score -1.071245
[LightGBM] [Info] Start training from score -2.203739
[LightGBM] [Info] Start training from score -2.873978
[LightGBM] [Info] Start training from score -2.617258
[LightGBM] [Info] Start training from score -4.563168
[LightGBM] [Info] Start training from score -1.008359
[LightGBM] [Info] Start training from score -4.544476
[LightGBM] [Info] Start training from score -3.635620
[LightGBM] [Info] Start training from score -5.355406
[LightGBM] [Info] Start training from score -7.434848


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.143607 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 157766
[LightGBM] [Info] Number of data points in the train set: 4065, number of used features: 2513
[LightGBM] [Info] Start training from score -1.071672
[LightGBM] [Info] Start training from score -2.203146
[LightGBM] [Info] Start training from score -2.872090
[LightGBM] [Info] Start training from score -2.616437
[LightGBM] [Info] Start training from score -4.572499
[LightGBM] [Info] Start training from score -1.008347
[LightGBM] [Info] Start training from score -4.548969
[LightGBM] [Info] Start training from score -3.628038
[LightGBM] [Info] Start training from score -5.365730
[LightGBM] [Info] Start training from score -7.617022




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.143962 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 152176
[LightGBM] [Info] Number of data points in the train set: 4065, number of used features: 2492
[LightGBM] [Info] Start training from score -1.070954
[LightGBM] [Info] Start training from score -2.205376
[LightGBM] [Info] Start training from score -2.872090
[LightGBM] [Info] Start training from score -2.616437
[LightGBM] [Info] Start training from score -4.572499
[LightGBM] [Info] Start training from score -1.008347
[LightGBM] [Info] Start training from score -4.525979
[LightGBM] [Info] Start training from score -3.637340
[LightGBM] [Info] Start training from score -5.365730
[LightGBM] [Info] Start training from score -7.617022




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.205386 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 152575
[LightGBM] [Info] Number of data points in the train set: 4066, number of used features: 2527
[LightGBM] [Info] Start training from score -1.071200
[LightGBM] [Info] Start training from score -2.203392
[LightGBM] [Info] Start training from score -2.876693
[LightGBM] [Info] Start training from score -2.616683
[LightGBM] [Info] Start training from score -4.572745
[LightGBM] [Info] Start training from score -1.007919
[LightGBM] [Info] Start training from score -4.549215
[LightGBM] [Info] Start training from score -3.637586
[LightGBM] [Info] Start training from score -5.365976
[LightGBM] [Info] Start training from score -7.211803




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.245890 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 152026
[LightGBM] [Info] Number of data points in the train set: 4066, number of used features: 2478
[LightGBM] [Info] Start training from score -1.071200
[LightGBM] [Info] Start training from score -2.203392
[LightGBM] [Info] Start training from score -2.876693
[LightGBM] [Info] Start training from score -2.616683
[LightGBM] [Info] Start training from score -4.549215
[LightGBM] [Info] Start training from score -1.008593
[LightGBM] [Info] Start training from score -4.549215
[LightGBM] [Info] Start training from score -3.637586
[LightGBM] [Info] Start training from score -5.365976
[LightGBM] [Info] Start training from score -7.211803




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.200371 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 154403
[LightGBM] [Info] Number of data points in the train set: 4066, number of used features: 2506
[LightGBM] [Info] Start training from score -1.071200
[LightGBM] [Info] Start training from score -2.203392
[LightGBM] [Info] Start training from score -2.872336
[LightGBM] [Info] Start training from score -2.620056
[LightGBM] [Info] Start training from score -4.549215
[LightGBM] [Info] Start training from score -1.008593
[LightGBM] [Info] Start training from score -4.549215
[LightGBM] [Info] Start training from score -3.637586
[LightGBM] [Info] Start training from score -5.314683
[LightGBM] [Info] Start training from score -7.617268




Training Advanced Stacking Model for Product Category...


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.226318 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 180386
[LightGBM] [Info] Number of data points in the train set: 5082, number of used features: 2863
[LightGBM] [Info] Start training from score -1.265237
[LightGBM] [Info] Start training from score -2.382857
[LightGBM] [Info] Start training from score -2.024691
[LightGBM] [Info] Start training from score -3.397662
[LightGBM] [Info] Start training from score -3.130783
[LightGBM] [Info] Start training from score -4.455923
[LightGBM] [Info] Start training from score -2.251193
[LightGBM] [Info] Start training from score -4.526127
[LightGBM] [Info] Start training from score -3.186353
[LightGBM] [Info] Start training from score -2.965116
[LightGBM] [Info] Start training from score -2.942473
[LightGBM] [Info] Start training from score -2.957511
[LightGBM] [Info] Start training from score -5.589021
[Light

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.148506 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 158351
[LightGBM] [Info] Number of data points in the train set: 4065, number of used features: 2516
[LightGBM] [Info] Start training from score -1.265264
[LightGBM] [Info] Start training from score -2.383243
[LightGBM] [Info] Start training from score -2.024171
[LightGBM] [Info] Start training from score -3.397514
[LightGBM] [Info] Start training from score -3.134019
[LightGBM] [Info] Start training from score -4.438968
[LightGBM] [Info] Start training from score -2.251046
[LightGBM] [Info] Start training from score -4.525979
[LightGBM] [Info] Start training from score -3.186205
[LightGBM] [Info] Start training from score -2.967835
[LightGBM] [Info] Start training from score -2.939531
[LightGBM] [Info] Start training from score -2.958



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.161294 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 152158
[LightGBM] [Info] Number of data points in the train set: 4065, number of used features: 2480
[LightGBM] [Info] Start training from score -1.265264
[LightGBM] [Info] Start training from score -2.383243
[LightGBM] [Info] Start training from score -2.024171
[LightGBM] [Info] Start training from score -3.397514
[LightGBM] [Info] Start training from score -3.128385
[LightGBM] [Info] Start training from score -4.460021
[LightGBM] [Info] Start training from score -2.251046
[LightGBM] [Info] Start training from score -4.525979
[LightGBM] [Info] Start training from score -3.186205
[LightGBM] [Info] Start training from score -2.967835
[LightGBM] [Info] Start training from score -2.939531
[LightGBM] [Info] Start training from score -2.958



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.159781 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 151653
[LightGBM] [Info] Number of data points in the train set: 4066, number of used features: 2513
[LightGBM] [Info] Start training from score -1.265510
[LightGBM] [Info] Start training from score -2.383489
[LightGBM] [Info] Start training from score -2.024417
[LightGBM] [Info] Start training from score -3.397760
[LightGBM] [Info] Start training from score -3.128631
[LightGBM] [Info] Start training from score -4.460267
[LightGBM] [Info] Start training from score -2.251292
[LightGBM] [Info] Start training from score -4.526225
[LightGBM] [Info] Start training from score -3.186451
[LightGBM] [Info] Start training from score -2.963307
[LightGBM] [Info] Start training from score -2.944439
[LightGBM] [Info] Start training from score -2.958



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.157862 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 152119
[LightGBM] [Info] Number of data points in the train set: 4066, number of used features: 2482
[LightGBM] [Info] Start training from score -1.265510
[LightGBM] [Info] Start training from score -2.380826
[LightGBM] [Info] Start training from score -2.026281
[LightGBM] [Info] Start training from score -3.397760
[LightGBM] [Info] Start training from score -3.128631
[LightGBM] [Info] Start training from score -4.460267
[LightGBM] [Info] Start training from score -2.251292
[LightGBM] [Info] Start training from score -4.526225
[LightGBM] [Info] Start training from score -3.186451
[LightGBM] [Info] Start training from score -2.963307
[LightGBM] [Info] Start training from score -2.944439
[LightGBM] [Info] Start training from score -2.958



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.275308 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 154607
[LightGBM] [Info] Number of data points in the train set: 4066, number of used features: 2532
[LightGBM] [Info] Start training from score -1.264638
[LightGBM] [Info] Start training from score -2.383489
[LightGBM] [Info] Start training from score -2.024417
[LightGBM] [Info] Start training from score -3.397760
[LightGBM] [Info] Start training from score -3.134265
[LightGBM] [Info] Start training from score -4.460267
[LightGBM] [Info] Start training from score -2.251292
[LightGBM] [Info] Start training from score -4.526225
[LightGBM] [Info] Start training from score -3.186451
[LightGBM] [Info] Start training from score -2.963307
[LightGBM] [Info] Start training from score -2.944439
[LightGBM] [Info] Start training from score -2.953




Advanced Stacking Predictions saved to 'output_stk_advanced.csv'

Evaluation on Training Data (Metrics for Hazard Type with Advanced Stacking):




                                precision    recall  f1-score   support

                    biological       1.00      1.00      1.00      1741
                foreign bodies       1.00      1.00      1.00       561
                      chemical       1.00      1.00      1.00       287
                         fraud       0.98      1.00      0.99       371
          organoleptic aspects       1.00      0.98      0.99        53
                     allergens       1.00      1.00      1.00      1854
              packaging defect       1.00      0.98      0.99        54
                  other hazard       1.00      0.96      0.98       134
food additives and flavourings       1.00      0.96      0.98        24
                     migration       0.00      0.00      0.00         3

                      accuracy                           1.00      5082
                     macro avg       0.90      0.89      0.89      5082
                  weighted avg       1.00      1.00      1.00 



Accuracy for Hazard Type with Advanced Stacking: 0.9976387249114522

Evaluation on Training Data (Metrics for Product Category with Advanced Stacking):




                                                   precision    recall  f1-score   support

                     meat, egg and dairy products       0.90      0.99      0.94      1434
                       prepared dishes and snacks       0.87      0.67      0.76       469
                      cereals and bakery products       0.95      0.97      0.96       671
                                    confectionery       0.98      0.95      0.97       170
                                ices and desserts       0.99      1.00      1.00       222
                              alcoholic beverages       0.98      1.00      0.99        59
                            fruits and vegetables       0.91      0.86      0.89       535
                       other food product / mixed       0.95      0.36      0.53        55
     cocoa and cocoa preparations, coffee and tea       0.98      0.98      0.98       210
                     nuts, nut products and seeds       0.85      0.94      0.89       26



Accuracy for Product Category with Advanced Stacking: 0.9228650137741047


# Stacking Optimization

In [None]:
# Optimized stacking: train one StackingClassifier per target (hazard type and
# product category) on the TF-IDF features, predict the test set, write the
# hazard predictions to CSV, and print training-set metrics.


def _build_optimized_estimators(n_classes):
    """Return the base learners shared by both optimized stacking ensembles.

    Parameters
    ----------
    n_classes : int
        Number of target classes; forwarded as ``num_class`` to the XGBoost
        and LightGBM learners.

    Returns
    -------
    list of (str, estimator)
        Freshly constructed, unfitted ``(name, estimator)`` pairs in the form
        expected by ``StackingClassifier(estimators=...)``.
    """
    return [
        ('lr', LogisticRegression(max_iter=2000, solver='liblinear', C=0.1)),
        ('dt', DecisionTreeClassifier(random_state=42, max_depth=50)),
        ('rf', RandomForestClassifier(n_estimators=300, random_state=42, max_depth=None)),
        ('svm', SVC(kernel='linear', probability=True, random_state=42, C=0.5)),
        ('nb', MultinomialNB(alpha=0.5)),
        # `use_label_encoder` is intentionally not passed: the installed xgboost
        # reports it as an unused parameter on every fit.
        ('xgb', xgb.XGBClassifier(objective='multi:softmax', num_class=n_classes, eval_metric='mlogloss', n_estimators=300, learning_rate=0.1)),
        # verbose=-1 silences LightGBM's per-fit [Info] lines, which otherwise
        # flood the cell output for every cross-validation fold.
        ('lgb', lgb.LGBMClassifier(objective='multiclass', num_class=n_classes, random_state=42, n_estimators=300, learning_rate=0.1, verbose=-1)),
    ]


# --- Hazard type -----------------------------------------------------------
estimators_hazard_optimized = _build_optimized_estimators(y_train_hazard.nunique())

meta_model_hazard_optimized = LogisticRegression(max_iter=2000, solver='liblinear', C=1.0)

stk_hazard_optimized = StackingClassifier(estimators=estimators_hazard_optimized, final_estimator=meta_model_hazard_optimized, cv=5)

print("\nTraining Optimized Stacking Model for Hazard Type...")
stk_hazard_optimized.fit(X_train_tfidf, y_train_hazard_encoded)
predictions_hazard_stk_optimized_encoded = stk_hazard_optimized.predict(X_test_tfidf)
# Map the integer predictions back to label strings by indexing hazard_labels.
predictions_hazard_stk_optimized = [hazard_labels[i] for i in predictions_hazard_stk_optimized_encoded]

# --- Product category ------------------------------------------------------
estimators_product_optimized = _build_optimized_estimators(y_train_product.nunique())

meta_model_product_optimized = LogisticRegression(max_iter=2000, solver='liblinear', C=1.0)

stk_product_optimized = StackingClassifier(estimators=estimators_product_optimized, final_estimator=meta_model_product_optimized, cv=5)

print("Training Optimized Stacking Model for Product Category...")
stk_product_optimized.fit(X_train_tfidf, y_train_product_encoded)
predictions_product_stk_optimized_encoded = stk_product_optimized.predict(X_test_tfidf)
predictions_product_stk_optimized = [product_labels[i] for i in predictions_product_stk_optimized_encoded]

# --- Persist predictions ---------------------------------------------------
# NOTE(review): only the hazard predictions are written; the product
# predictions computed above are not saved -- confirm this is intended.
output_df_stk_optimized = pd.DataFrame({
    'ID': test_df['ID'],
    'hazard': predictions_hazard_stk_optimized
})

output_df_stk_optimized.to_csv('output_stk_optimized.csv', index=False)

print("\nOptimized Stacking Predictions saved to 'output_stk_optimized.csv'")

# --- Training-set evaluation -----------------------------------------------
# These metrics are computed on the same data the models were fitted on, so
# they measure fit to the training set rather than held-out performance.
# Each ensemble's training predictions are computed once and reused for both
# the report and the accuracy, since predicting with the full stack is costly.
print("\nEvaluation on Training Data (Metrics for Hazard Type with Optimized Stacking):")
train_pred_hazard_stk_optimized = stk_hazard_optimized.predict(X_train_tfidf)
print(classification_report(y_train_hazard_encoded, train_pred_hazard_stk_optimized, zero_division=0, target_names=hazard_labels))
print("Accuracy for Hazard Type with Optimized Stacking:", accuracy_score(y_train_hazard_encoded, train_pred_hazard_stk_optimized))

print("\nEvaluation on Training Data (Metrics for Product Category with Optimized Stacking):")
train_pred_product_stk_optimized = stk_product_optimized.predict(X_train_tfidf)
print(classification_report(y_train_product_encoded, train_pred_product_stk_optimized, zero_division=0, target_names=product_labels))
print("Accuracy for Product Category with Optimized Stacking:", accuracy_score(y_train_product_encoded, train_pred_product_stk_optimized))



Training Optimized Stacking Model for Hazard Type...


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.220936 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 180386
[LightGBM] [Info] Number of data points in the train set: 5082, number of used features: 2863
[LightGBM] [Info] Start training from score -1.071245
[LightGBM] [Info] Start training from score -2.203739
[LightGBM] [Info] Start training from score -2.873978
[LightGBM] [Info] Start training from score -2.617258
[LightGBM] [Info] Start training from score -4.563168
[LightGBM] [Info] Start training from score -1.008359
[LightGBM] [Info] Start training from score -4.544476
[LightGBM] [Info] Start training from score -3.635620
[LightGBM] [Info] Start training from score -5.355406
[LightGBM] [Info] Start training from score -7.434848


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.160608 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 157766
[LightGBM] [Info] Number of data points in the train set: 4065, number of used features: 2513
[LightGBM] [Info] Start training from score -1.071672
[LightGBM] [Info] Start training from score -2.203146
[LightGBM] [Info] Start training from score -2.872090
[LightGBM] [Info] Start training from score -2.616437
[LightGBM] [Info] Start training from score -4.572499
[LightGBM] [Info] Start training from score -1.008347
[LightGBM] [Info] Start training from score -4.548969
[LightGBM] [Info] Start training from score -3.628038
[LightGBM] [Info] Start training from score -5.365730
[LightGBM] [Info] Start training from score -7.617022




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.160330 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 152176
[LightGBM] [Info] Number of data points in the train set: 4065, number of used features: 2492
[LightGBM] [Info] Start training from score -1.070954
[LightGBM] [Info] Start training from score -2.205376
[LightGBM] [Info] Start training from score -2.872090
[LightGBM] [Info] Start training from score -2.616437
[LightGBM] [Info] Start training from score -4.572499
[LightGBM] [Info] Start training from score -1.008347
[LightGBM] [Info] Start training from score -4.525979
[LightGBM] [Info] Start training from score -3.637340
[LightGBM] [Info] Start training from score -5.365730
[LightGBM] [Info] Start training from score -7.617022




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.159693 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 152575
[LightGBM] [Info] Number of data points in the train set: 4066, number of used features: 2527
[LightGBM] [Info] Start training from score -1.071200
[LightGBM] [Info] Start training from score -2.203392
[LightGBM] [Info] Start training from score -2.876693
[LightGBM] [Info] Start training from score -2.616683
[LightGBM] [Info] Start training from score -4.572745
[LightGBM] [Info] Start training from score -1.007919
[LightGBM] [Info] Start training from score -4.549215
[LightGBM] [Info] Start training from score -3.637586
[LightGBM] [Info] Start training from score -5.365976
[LightGBM] [Info] Start training from score -7.211803




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.144745 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 152026
[LightGBM] [Info] Number of data points in the train set: 4066, number of used features: 2478
[LightGBM] [Info] Start training from score -1.071200
[LightGBM] [Info] Start training from score -2.203392
[LightGBM] [Info] Start training from score -2.876693
[LightGBM] [Info] Start training from score -2.616683
[LightGBM] [Info] Start training from score -4.549215
[LightGBM] [Info] Start training from score -1.008593
[LightGBM] [Info] Start training from score -4.549215
[LightGBM] [Info] Start training from score -3.637586
[LightGBM] [Info] Start training from score -5.365976
[LightGBM] [Info] Start training from score -7.211803




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.241493 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 154403
[LightGBM] [Info] Number of data points in the train set: 4066, number of used features: 2506
[LightGBM] [Info] Start training from score -1.071200
[LightGBM] [Info] Start training from score -2.203392
[LightGBM] [Info] Start training from score -2.872336
[LightGBM] [Info] Start training from score -2.620056
[LightGBM] [Info] Start training from score -4.549215
[LightGBM] [Info] Start training from score -1.008593
[LightGBM] [Info] Start training from score -4.549215
[LightGBM] [Info] Start training from score -3.637586
[LightGBM] [Info] Start training from score -5.314683
[LightGBM] [Info] Start training from score -7.617268




Training Optimized Stacking Model for Product Category...


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.226024 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 180386
[LightGBM] [Info] Number of data points in the train set: 5082, number of used features: 2863
[LightGBM] [Info] Start training from score -1.265237
[LightGBM] [Info] Start training from score -2.382857
[LightGBM] [Info] Start training from score -2.024691
[LightGBM] [Info] Start training from score -3.397662
[LightGBM] [Info] Start training from score -3.130783
[LightGBM] [Info] Start training from score -4.455923
[LightGBM] [Info] Start training from score -2.251193
[LightGBM] [Info] Start training from score -4.526127
[LightGBM] [Info] Start training from score -3.186353
[LightGBM] [Info] Start training from score -2.965116
[LightGBM] [Info] Start training from score -2.942473
[LightGBM] [Info] Start training from score -2.957511
[LightGBM] [Info] Start training from score -5.589021
[Light

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.252413 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 158351
[LightGBM] [Info] Number of data points in the train set: 4065, number of used features: 2516
[LightGBM] [Info] Start training from score -1.265264
[LightGBM] [Info] Start training from score -2.383243
[LightGBM] [Info] Start training from score -2.024171
[LightGBM] [Info] Start training from score -3.397514
[LightGBM] [Info] Start training from score -3.134019
[LightGBM] [Info] Start training from score -4.438968
[LightGBM] [Info] Start training from score -2.251046
[LightGBM] [Info] Start training from score -4.525979
[LightGBM] [Info] Start training from score -3.186205
[LightGBM] [Info] Start training from score -2.967835
[LightGBM] [Info] Start training from score -2.939531
[LightGBM] [Info] Start training from score -2.958311
[LightGBM] [Info] Start training from score -5.602119
[Light



[1;30;43mStreaming output truncated to the last 5000 lines.[0m




[1;30;43mStreaming output truncated to the last 5000 lines.[0m




[1;30;43mStreaming output truncated to the last 5000 lines.[0m




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.158724 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 154607
[LightGBM] [Info] Number of data points in the train set: 4066, number of used features: 2532
[LightGBM] [Info] Start training from score -1.264638
[LightGBM] [Info] Start training from score -2.383489
[LightGBM] [Info] Start training from score -2.024417
[LightGBM] [Info] Start training from score -3.397760
[LightGBM] [Info] Start training from score -3.134265
[LightGBM] [Info] Start training from score -4.460267
[LightGBM] [Info] Start training from score -2.251292
[LightGBM] [Info] Start training from score -4.526225
[LightGBM] [Info] Start training from score -3.186451
[LightGBM] [Info] Start training from score -2.963307
[LightGBM] [Info] Start training from score -2.944439
[LightGBM] [Info] Start training from score -2.953829
[LightGBM] [Info] Start training from score -5.602365
[Light




Optimized Stacking Predictions saved to 'output_stk_optimized.csv'

Evaluation on Training Data (Metrics for Hazard Type with Optimized Stacking):




                                precision    recall  f1-score   support

                    biological       1.00      1.00      1.00      1741
                foreign bodies       1.00      1.00      1.00       561
                      chemical       1.00      1.00      1.00       287
                         fraud       0.98      1.00      0.99       371
          organoleptic aspects       1.00      0.98      0.99        53
                     allergens       1.00      1.00      1.00      1854
              packaging defect       1.00      0.98      0.99        54
                  other hazard       1.00      0.96      0.98       134
food additives and flavourings       1.00      0.92      0.96        24
                     migration       0.00      0.00      0.00         3

                      accuracy                           1.00      5082
                     macro avg       0.90      0.88      0.89      5082
                  weighted avg       1.00      1.00      1.00 



Accuracy for Hazard Type with Optimized Stacking: 0.9974419519874065

Evaluation on Training Data (Metrics for Product Category with Optimized Stacking):




                                                   precision    recall  f1-score   support

                     meat, egg and dairy products       0.87      0.98      0.92      1434
                       prepared dishes and snacks       0.85      0.60      0.70       469
                      cereals and bakery products       0.96      0.99      0.97       671
                                    confectionery       1.00      0.94      0.97       170
                                ices and desserts       0.99      1.00      1.00       222
                              alcoholic beverages       0.98      1.00      0.99        59
                            fruits and vegetables       0.93      0.78      0.85       535
                       other food product / mixed       1.00      0.40      0.57        55
     cocoa and cocoa preparations, coffee and tea       0.97      0.99      0.98       210
                     nuts, nut products and seeds       0.77      0.97      0.86       26



Accuracy for Product Category with Optimized Stacking: 0.910271546635183
