### Import necessary libraries

In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


### Load the dataset

In [42]:
data = pd.read_csv('../Dataset/SafeBite_preprocessed_data_loo_enc_price.csv', keep_default_na=False)

In [43]:
#### Split the dataset into features (X) and target variable (Y)

In [44]:
# Split the dataset into features (X) and target variable (Y)
X = data.drop(columns=["Is_Allergen"])
Y = data["Is_Allergen"]
print("Shape of X:", X.shape)
print("Shape of Y:", Y.shape)

Shape of X: (399, 8)
Shape of Y: (399,)


#### Perform an 80:20 train-test split

In [45]:
X_train_80, X_test_80, Y_train_80, Y_test_80 = train_test_split(X, Y, test_size=0.2, random_state=42)

In [46]:
#Print the shape of the 80:20 split data
print("\n80:20 Split:")
print("X_train_80 shape:", X_train_80.shape)
print("X_test_80 shape:", X_test_80.shape)
print("Y_train_80 shape:", Y_train_80.shape)
print("Y_test_80 shape:", Y_test_80.shape)


80:20 Split:
X_train_80 shape: (319, 8)
X_test_80 shape: (80, 8)
Y_train_80 shape: (319,)
Y_test_80 shape: (80,)


#### Perform an 70:30 train-test split

In [47]:
X_train_70, X_test_70, Y_train_70, Y_test_70 = train_test_split(X, Y, test_size=0.3, random_state=42)

In [48]:
# Print the shape of the 70:30 split data
print("\n70:30 Split:")
print("X_train_70 shape:", X_train_70.shape)
print("X_test_70 shape:", X_test_70.shape)
print("Y_train_70 shape:", Y_train_70.shape)
print("Y_test_70 shape:", Y_test_70.shape)


70:30 Split:
X_train_70 shape: (279, 8)
X_test_70 shape: (120, 8)
Y_train_70 shape: (279,)
Y_test_70 shape: (120,)


### Logistic Regression

In [49]:
# Initialize Logistic Regression
logreg = LogisticRegression(max_iter=500)

In [50]:
# Train and evaluate on 80:20 split
logreg.fit(X_train_80, Y_train_80)
logreg_train_pred_80 = logreg.predict(X_train_80)
logreg_test_pred_80 = logreg.predict(X_test_80)
# Train and evaluate on 70:30 split
logreg.fit(X_train_70, Y_train_70)
logreg_train_pred_70 = logreg.predict(X_train_70)
logreg_test_pred_70 = logreg.predict(X_test_70)

In [51]:
# Calculate metrics 80:20 split
logreg_train_acc_80 = accuracy_score(Y_train_80, logreg_train_pred_80)
logreg_test_acc_80 = accuracy_score(Y_test_80, logreg_test_pred_80)
logreg_precision_80 = precision_score(Y_test_80, logreg_test_pred_80)
logreg_recall_80 = recall_score(Y_test_80, logreg_test_pred_80)
logreg_f1_80 = f1_score(Y_test_80, logreg_test_pred_80)

# Calculate metrics 70:30 split
logreg_train_acc_70 = accuracy_score(Y_train_70, logreg_train_pred_70)
logreg_test_acc_70 = accuracy_score(Y_test_70, logreg_test_pred_70)
logreg_precision_70 = precision_score(Y_test_70, logreg_test_pred_70)
logreg_recall_70 = recall_score(Y_test_70, logreg_test_pred_70)
logreg_f1_70 = f1_score(Y_test_70, logreg_test_pred_70)

In [52]:
# Print results for Logistic Regression
print("Logistic Regression (80:20 split):")
print("Accuracy:", logreg_test_acc_80)
print("Precision:", logreg_precision_80)
print("Recall:", logreg_recall_80)
print("F1 Score:", logreg_f1_80)
print("\nLogistic Regression (70:30 split):")
print("Accuracy:", logreg_test_acc_70)
print("Precision:", logreg_precision_70)
print("Recall:", logreg_recall_70)
print("F1 Score:", logreg_f1_70)

Logistic Regression (80:20 split):
Accuracy: 0.65
Precision: 0.7142857142857143
Recall: 0.8620689655172413
F1 Score: 0.78125

Logistic Regression (70:30 split):
Accuracy: 0.6083333333333333
Precision: 0.7
Recall: 0.8045977011494253
F1 Score: 0.7486631016042781


### Random Forest

In [53]:
# Initialize Random Forest
random_forest = RandomForestClassifier()


In [54]:
# Train and evaluate on 80:20 split
random_forest.fit(X_train_80, Y_train_80)
rf_train_pred_80 = random_forest.predict(X_train_80)
rf_test_pred_80 = random_forest.predict(X_test_80)
# Train and evaluate for 70:30 split
random_forest.fit(X_train_70, Y_train_70)
rf_train_pred_70 = random_forest.predict(X_train_70)
rf_test_pred_70 = random_forest.predict(X_test_70)

In [55]:
# Calculate metrics 80:20 split
rf_train_acc_80 = accuracy_score(Y_train_80, rf_train_pred_80)
rf_test_acc_80 = accuracy_score(Y_test_80, rf_test_pred_80)
rf_precision_80 = precision_score(Y_test_80, rf_test_pred_80)
rf_recall_80 = recall_score(Y_test_80, rf_test_pred_80)
rf_f1_80 = f1_score(Y_test_80, rf_test_pred_80)

# Calculate metrics 70:30 split
rf_train_acc_70 = accuracy_score(Y_train_70, rf_train_pred_70)
rf_test_acc_70 = accuracy_score(Y_test_70, rf_test_pred_70)
rf_precision_70 = precision_score(Y_test_70, rf_test_pred_70)
rf_recall_70 = recall_score(Y_test_70, rf_test_pred_70)
rf_f1_70 = f1_score(Y_test_70, rf_test_pred_70)

In [56]:
# Print results for Random Forest
print("\nRandom Forest (80:20 split):")
print("Accuracy:", rf_test_acc_80)
print("Precision:", rf_precision_80)
print("Recall:", rf_recall_80)
print("F1 Score:", rf_f1_80)
print("\nRandom Forest (70:30 split):")
print("Accuracy:", rf_test_acc_70)
print("Precision:", rf_precision_70)
print("Recall:", rf_recall_70)
print("F1 Score:", rf_f1_70)


Random Forest (80:20 split):
Accuracy: 0.9875
Precision: 1.0
Recall: 0.9827586206896551
F1 Score: 0.991304347826087

Random Forest (70:30 split):
Accuracy: 0.975
Precision: 0.9772727272727273
Recall: 0.9885057471264368
F1 Score: 0.9828571428571429


### Ada Boost

In [57]:
# Initialize AdaBoost
adaboost = AdaBoostClassifier(algorithm='SAMME')

In [58]:
# Train and evaluate on 80:20 split
adaboost.fit(X_train_80, Y_train_80)
adaboost_train_pred_80 = adaboost.predict(X_train_80)
adaboost_test_pred_80 = adaboost.predict(X_test_80)
# Train and evaluate on 70:30 split
adaboost.fit(X_train_70, Y_train_70)
adaboost_train_pred_70 = adaboost.predict(X_train_70)
adaboost_test_pred_70 = adaboost.predict(X_test_70)

In [59]:
# Calculate metrics 80:20 split
adaboost_train_acc_80 = accuracy_score(Y_train_80, adaboost_train_pred_80)
adaboost_test_acc_80 = accuracy_score(Y_test_80, adaboost_test_pred_80)
adaboost_precision_80 = precision_score(Y_test_80, adaboost_test_pred_80)
adaboost_recall_80 = recall_score(Y_test_80, adaboost_test_pred_80)
adaboost_f1_80 = f1_score(Y_test_80, adaboost_test_pred_80)

# Calculate metrics 70:30 split
adaboost_train_acc_70 = accuracy_score(Y_train_70, adaboost_train_pred_70)
adaboost_test_acc_70 = accuracy_score(Y_test_70, adaboost_test_pred_70)
adaboost_precision_70 = precision_score(Y_test_70, adaboost_test_pred_70)
adaboost_recall_70 = recall_score(Y_test_70, adaboost_test_pred_70)
adaboost_f1_70 = f1_score(Y_test_70, adaboost_test_pred_70)

In [60]:
# Print results for AdaBoost
print("\nAdaBoost (80:20 split):")
print("Accuracy:", adaboost_test_acc_80)
print("Precision:", adaboost_precision_80)
print("Recall:", adaboost_recall_80)
print("F1 Score:", adaboost_f1_80)
print("\nAdaBoost (70:30 split):")
print("Accuracy:", adaboost_test_acc_70)
print("Precision:", adaboost_precision_70)
print("Recall:", adaboost_recall_70)
print("F1 Score:", adaboost_f1_70)


AdaBoost (80:20 split):
Accuracy: 0.9875
Precision: 1.0
Recall: 0.9827586206896551
F1 Score: 0.991304347826087

AdaBoost (70:30 split):
Accuracy: 0.9666666666666667
Precision: 0.9770114942528736
Recall: 0.9770114942528736
F1 Score: 0.9770114942528736


### Decision Tree

In [61]:
# Initialize Decision Tree
decision_tree = DecisionTreeClassifier()

In [62]:
# Train and evaluate on 80:20 split
decision_tree.fit(X_train_80, Y_train_80)
dt_train_pred_80 = decision_tree.predict(X_train_80)
dt_test_pred_80 = decision_tree.predict(X_test_80)
# Train and evaluate on 70:30 split
decision_tree.fit(X_train_70, Y_train_70)
dt_train_pred_70 = decision_tree.predict(X_train_70)
dt_test_pred_70 = decision_tree.predict(X_test_70)

In [63]:
# Calculate metrics 80:20 split
dt_train_acc_80 = accuracy_score(Y_train_80, dt_train_pred_80)
dt_test_acc_80 = accuracy_score(Y_test_80, dt_test_pred_80)
dt_precision_80 = precision_score(Y_test_80, dt_test_pred_80)
dt_recall_80 = recall_score(Y_test_80, dt_test_pred_80)
dt_f1_80 = f1_score(Y_test_80, dt_test_pred_80)

# Calculate metrics 80:20 split
dt_train_acc_70 = accuracy_score(Y_train_70, dt_train_pred_70)
dt_test_acc_70 = accuracy_score(Y_test_70, dt_test_pred_70)
dt_precision_70 = precision_score(Y_test_70, dt_test_pred_70)
dt_recall_70 = recall_score(Y_test_70, dt_test_pred_70)
dt_f1_70 = f1_score(Y_test_70, dt_test_pred_70)

In [64]:
# Print results for Decision Tree
print("\nDecision Tree (80:20 split):")
print("Accuracy:", dt_test_acc_80)
print("Precision:", dt_precision_80)
print("Recall:", dt_recall_80)
print("F1 Score:", dt_f1_80)
print("\nDecision Tree (70:30 split):")
print("Accuracy:", dt_test_acc_70)
print("Precision:", dt_precision_70)
print("Recall:", dt_recall_70)
print("F1 Score:", dt_f1_70)


Decision Tree (80:20 split):
Accuracy: 0.9375
Precision: 0.9818181818181818
Recall: 0.9310344827586207
F1 Score: 0.9557522123893806

Decision Tree (70:30 split):
Accuracy: 0.975
Precision: 1.0
Recall: 0.9655172413793104
F1 Score: 0.9824561403508771


###  XGBoost

In [65]:
# Initialize XGBoost
xgboost = XGBClassifier(eval_metric='logloss')

In [66]:
# Train and evaluate on 80:20 split
xgboost.fit(X_train_80, Y_train_80)
xgboost_train_pred_80 = xgboost.predict(X_train_80)
xgboost_test_pred_80 = xgboost.predict(X_test_80)

# Train and evaluate on 70:30 split
xgboost.fit(X_train_70, Y_train_70)
xgboost_train_pred_70 = xgboost.predict(X_train_70)
xgboost_test_pred_70 = xgboost.predict(X_test_70)

In [67]:
# Calculate metrics 80:20 split
xgboost_train_acc_80 = accuracy_score(Y_train_80, xgboost_train_pred_80)
xgboost_test_acc_80 = accuracy_score(Y_test_80, xgboost_test_pred_80)
xgboost_precision_80 = precision_score(Y_test_80, xgboost_test_pred_80)
xgboost_recall_80 = recall_score(Y_test_80, xgboost_test_pred_80)
xgboost_f1_80 = f1_score(Y_test_80, xgboost_test_pred_80)

# Calculate metrics 70:30 split
xgboost_train_acc_70 = accuracy_score(Y_train_70, xgboost_train_pred_70)
xgboost_test_acc_70 = accuracy_score(Y_test_70, xgboost_test_pred_70)
xgboost_precision_70 = precision_score(Y_test_70, xgboost_test_pred_70)
xgboost_recall_70 = recall_score(Y_test_70, xgboost_test_pred_70)
xgboost_f1_70 = f1_score(Y_test_70, xgboost_test_pred_70)

In [68]:
# Print results for XGBoost
print("\nXGBoost (80:20 split):")
print("Accuracy:", xgboost_test_acc_80)
print("Precision:", xgboost_precision_80)
print("Recall:", xgboost_recall_80)
print("F1 Score:", xgboost_f1_80)
print("\nXGBoost (70:30 split):")
print("Accuracy:", xgboost_test_acc_70)
print("Precision:", xgboost_precision_70)
print("Recall:", xgboost_recall_70)
print("F1 Score:", xgboost_f1_70)


XGBoost (80:20 split):
Accuracy: 0.9875
Precision: 1.0
Recall: 0.9827586206896551
F1 Score: 0.991304347826087

XGBoost (70:30 split):
Accuracy: 0.9916666666666667
Precision: 1.0
Recall: 0.9885057471264368
F1 Score: 0.9942196531791907


### K-Nearest Neighbors (KNN)

In [69]:
# Initialize KNN
knn = KNeighborsClassifier()

# Train and evaluate on 80:20 split
knn.fit(X_train_80, Y_train_80)
knn_train_pred_80 = knn.predict(X_train_80)
knn_test_pred_80 = knn.predict(X_test_80)

# Calculate metrics
knn_train_acc_80 = accuracy_score(Y_train_80, knn_train_pred_80)
knn_test_acc_80 = accuracy_score(Y_test_80, knn_test_pred_80)
knn_precision_80 = precision_score(Y_test_80, knn_test_pred_80)
knn_recall_80 = recall_score(Y_test_80, knn_test_pred_80)
knn_f1_80 = f1_score(Y_test_80, knn_test_pred_80)

# Train and evaluate on 70:30 split
knn.fit(X_train_70, Y_train_70)
knn_train_pred_70 = knn.predict(X_train_70)
knn_test_pred_70 = knn.predict(X_test_70)

# Calculate metrics
knn_train_acc_70 = accuracy_score(Y_train_70, knn_train_pred_70)
knn_test_acc_70 = accuracy_score(Y_test_70, knn_test_pred_70)
knn_precision_70 = precision_score(Y_test_70, knn_test_pred_70)
knn_recall_70 = recall_score(Y_test_70, knn_test_pred_70)
knn_f1_70 = f1_score(Y_test_70, knn_test_pred_70)

# Print results for K-Nearest Neighbors (KNN)
print("\nKNN (80:20 split):")
print("Accuracy:", knn_test_acc_80)
print("Precision:", knn_precision_80)
print("Recall:", knn_recall_80)
print("F1 Score:", knn_f1_80)
print("\nKNN (70:30 split):")
print("Accuracy:", knn_test_acc_70)
print("Precision:", knn_precision_70)
print("Recall:", knn_recall_70)
print("F1 Score:", knn_f1_70)


KNN (80:20 split):
Accuracy: 0.7125
Precision: 0.8571428571428571
Recall: 0.7241379310344828
F1 Score: 0.7850467289719626

KNN (70:30 split):
Accuracy: 0.6416666666666667
Precision: 0.8666666666666667
Recall: 0.5977011494252874
F1 Score: 0.7074829931972789


In [70]:
import pandas as pd

# Prepare data for the table: Model, Train Accuracy (80:20), Test Accuracy (80:20), Train Accuracy (70:30), Test Accuracy (70:30), Total Accuracy
model_comparison_data = {
    'Model': ['Logistic Regression', 'Random Forest', 'XGBoost', 'AdaBoost', 'Decision Tree', 'KNN'],
    'Train Accuracy (80:20)': [logreg_train_acc_80, rf_train_acc_80, xgboost_train_acc_80, adaboost_train_acc_80, dt_train_acc_80, knn_train_acc_80],
    'Test Accuracy (80:20)': [logreg_test_acc_80, rf_test_acc_80, xgboost_test_acc_80, adaboost_test_acc_80, dt_test_acc_80, knn_test_acc_80],
    'Train Accuracy (70:30)': [logreg_train_acc_70, rf_train_acc_70, xgboost_train_acc_70, adaboost_train_acc_70, dt_train_acc_70, knn_train_acc_70],
    'Test Accuracy (70:30)': [logreg_test_acc_70, rf_test_acc_70, xgboost_test_acc_70, adaboost_test_acc_70, dt_test_acc_70, knn_test_acc_70],
    'Total Accuracy': [
        (logreg_train_acc_80 + logreg_test_acc_80 + logreg_train_acc_70 + logreg_test_acc_70) / 4,
        (rf_train_acc_80 + rf_test_acc_80 + rf_train_acc_70 + rf_test_acc_70) / 4,
        (xgboost_train_acc_80 + xgboost_test_acc_80 + xgboost_train_acc_70 + xgboost_test_acc_70) / 4,
        (adaboost_train_acc_80 + adaboost_test_acc_80 + adaboost_train_acc_70 + adaboost_test_acc_70) / 4,
        (dt_train_acc_80 + dt_test_acc_80 + dt_train_acc_70 + dt_test_acc_70) / 4,
        (knn_train_acc_80 + knn_test_acc_80 + knn_train_acc_70 + knn_test_acc_70) / 4
        
    ]
}

# Create a DataFrame from the data
model_comparison_df = pd.DataFrame(model_comparison_data)

# Display the comparison table
model_comparison_df


Unnamed: 0,Model,Train Accuracy (80:20),Test Accuracy (80:20),Train Accuracy (70:30),Test Accuracy (70:30),Total Accuracy
0,Logistic Regression,0.589342,0.65,0.594982,0.608333,0.610664
1,Random Forest,1.0,0.9875,1.0,0.975,0.990625
2,XGBoost,1.0,0.9875,1.0,0.991667,0.994792
3,AdaBoost,0.981191,0.9875,0.982079,0.966667,0.979359
4,Decision Tree,1.0,0.9375,1.0,0.975,0.978125
5,KNN,0.711599,0.7125,0.731183,0.641667,0.699237


In [71]:
# Example structure to store performance metrics for all models
results = {
    "Logistic Regression": {
        "80:20": {"Accuracy": logreg_test_acc_80, "Precision": logreg_precision_80, "Recall": logreg_recall_80, "F1": logreg_f1_80},
        "70:30": {"Accuracy": logreg_test_acc_70, "Precision": logreg_precision_70, "Recall": logreg_recall_70, "F1": logreg_f1_70}
    },
    "Random Forest": {
        "80:20": {"Accuracy": rf_test_acc_80, "Precision": rf_precision_80, "Recall": rf_recall_80, "F1": rf_f1_80},
        "70:30": {"Accuracy": rf_test_acc_70, "Precision": rf_precision_70, "Recall": rf_recall_70, "F1": rf_f1_70}
    },
    "AdaBoost": {
        "80:20": {"Accuracy": adaboost_test_acc_80, "Precision": adaboost_precision_80, "Recall": adaboost_recall_80, "F1": adaboost_f1_80},
        "70:30": {"Accuracy": adaboost_test_acc_70, "Precision": adaboost_precision_70, "Recall": adaboost_recall_70, "F1": adaboost_f1_70}
    },
    "Decision Tree": {
        "80:20": {"Accuracy": dt_test_acc_80, "Precision": dt_precision_80, "Recall": dt_recall_80, "F1": dt_f1_80},
        "70:30": {"Accuracy": dt_test_acc_70, "Precision": dt_precision_70, "Recall": dt_recall_70, "F1": dt_f1_70}
    },
    "XGBoost": {
        "80:20": {"Accuracy": xgboost_test_acc_80, "Precision": xgboost_precision_80, "Recall": xgboost_recall_80, "F1": xgboost_f1_80},
        "70:30": {"Accuracy": xgboost_test_acc_70, "Precision": xgboost_precision_70, "Recall": xgboost_recall_70, "F1": xgboost_f1_70}
    },
    "KNN": {
        "80:20": {"Accuracy": knn_test_acc_80, "Precision": knn_precision_80, "Recall": knn_recall_80, "F1": knn_f1_80},
        "70:30": {"Accuracy": knn_test_acc_70, "Precision": knn_precision_70, "Recall": knn_recall_70, "F1": knn_f1_70}
    },
}

# Function to determine the better split
def compare_splits(results):
    better_split = {}
    for model, metrics in results.items():
        scores_80 = metrics["80:20"]
        scores_70 = metrics["70:30"]
        # Count metrics where 80:20 performs better and vice versa
        better_80 = sum([scores_80[m] > scores_70[m] for m in scores_80])
        better_70 = sum([scores_70[m] > scores_80[m] for m in scores_70])
        better_split[model] = "80:20" if better_80 > better_70 else "70:30"
    return better_split

# Determine the better split for each model
better_splits = compare_splits(results)

# Print results
for model, split in better_splits.items():
    print(f"{model}: Better split is {split}")


Logistic Regression: Better split is 80:20
Random Forest: Better split is 80:20
AdaBoost: Better split is 80:20
Decision Tree: Better split is 70:30
XGBoost: Better split is 70:30
KNN: Better split is 80:20


## Hyperparameter Tuning

### Random Forest Hyperparameter Tuning

In [72]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report 

In [73]:
# Define the model
rf = RandomForestClassifier()

# Define parameter grid for hyperparameter tuning
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Setup GridSearchCV
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=3, n_jobs=-1, verbose=2)

# Fit the model to training data
grid_search_rf.fit(X_train_70, Y_train_70)

# Best parameters from grid search
best_rf_params = grid_search_rf.best_params_

# Print the best parameters
print(f"Best parameters for Random Forest: {best_rf_params}")

# Evaluate on the test data
best_rf_model = grid_search_rf.best_estimator_
Y_pred_rf = best_rf_model.predict(X_test_70)
print(f"Random Forest Classification Report (70:30 Split):\n{classification_report(Y_test_70, Y_pred_rf)}")

Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best parameters for Random Forest: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Random Forest Classification Report (70:30 Split):
              precision    recall  f1-score   support

           0       0.97      0.97      0.97        33
           1       0.99      0.99      0.99        87

    accuracy                           0.98       120
   macro avg       0.98      0.98      0.98       120
weighted avg       0.98      0.98      0.98       120



### XGBoost Hyperparameter Tuning 

In [74]:
from xgboost import XGBClassifier

# Define the model
xgb = XGBClassifier()

# Define parameter grid for hyperparameter tuning
param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 10],
    'subsample': [0.8, 0.9, 1.0]
}

# Setup GridSearchCV
grid_search_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid_xgb, cv=3, n_jobs=-1, verbose=2)

# Fit the model to training data
grid_search_xgb.fit(X_train_70, Y_train_70)

# Best parameters from grid search
best_xgb_params = grid_search_xgb.best_params_

# Print the best parameters
print(f"Best parameters for XGBoost: {best_xgb_params}")

# Evaluate on the test data
best_xgb_model = grid_search_xgb.best_estimator_
Y_pred_xgb = best_xgb_model.predict(X_test_70)
print(f"XGBoost Classification Report (70:30 Split):\n{classification_report(Y_test_70, Y_pred_xgb)}")


Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
XGBoost Classification Report (70:30 Split):
              precision    recall  f1-score   support

           0       0.97      1.00      0.99        33
           1       1.00      0.99      0.99        87

    accuracy                           0.99       120
   macro avg       0.99      0.99      0.99       120
weighted avg       0.99      0.99      0.99       120



###  Logistic Regression Hyperparameter Tuning

In [75]:
from sklearn.linear_model import LogisticRegression

# Define the model
log_reg = LogisticRegression(max_iter=1000)

# Define parameter grid for hyperparameter tuning
param_grid_log_reg = {
    'C': [0.01, 0.1, 1, 10],  # Regularization strength
    'solver': ['liblinear', 'saga']  # Solvers to use
}

# Setup GridSearchCV
grid_search_log_reg = GridSearchCV(estimator=log_reg, param_grid=param_grid_log_reg, cv=3, n_jobs=-1, verbose=2)

# Fit the model to training data
grid_search_log_reg.fit(X_train_70, Y_train_70)

# Best parameters from grid search
best_log_reg_params = grid_search_log_reg.best_params_

# Print the best parameters
print(f"Best parameters for Logistic Regression: {best_log_reg_params}")

# Evaluate on the test data
best_log_reg_model = grid_search_log_reg.best_estimator_
Y_pred_log_reg = best_log_reg_model.predict(X_test_70)
print(f"Logistic Regression Classification Report (70:30 Split):\n{classification_report(Y_test_70, Y_pred_log_reg)}")


Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best parameters for Logistic Regression: {'C': 0.01, 'solver': 'liblinear'}
Logistic Regression Classification Report (70:30 Split):
              precision    recall  f1-score   support

           0       0.14      0.03      0.05        33
           1       0.72      0.93      0.81        87

    accuracy                           0.68       120
   macro avg       0.43      0.48      0.43       120
weighted avg       0.56      0.68      0.60       120



### AdaBoost Hyperparameter Tuning

In [76]:
from sklearn.ensemble import AdaBoostClassifier

# Define the model
ada_boost = AdaBoostClassifier(algorithm='SAMME')

# Define parameter grid for hyperparameter tuning
param_grid_ada_boost = {
    'n_estimators': [50, 100, 200],  # Number of boosting rounds
    'learning_rate': [0.01, 0.1, 1.0]  # Learning rate
}

# Setup GridSearchCV
grid_search_ada_boost = GridSearchCV(estimator=ada_boost, param_grid=param_grid_ada_boost, cv=3, n_jobs=-1, verbose=2)

# Fit the model to training data
grid_search_ada_boost.fit(X_train_70, Y_train_70)

# Best parameters from grid search
best_ada_boost_params = grid_search_ada_boost.best_params_

# Print the best parameters
print(f"Best parameters for AdaBoost: {best_ada_boost_params}")

# Evaluate on the test data
best_ada_boost_model = grid_search_ada_boost.best_estimator_
Y_pred_ada_boost = best_ada_boost_model.predict(X_test_70)
print(f"AdaBoost Classification Report (70:30 Split):\n{classification_report(Y_test_70, Y_pred_ada_boost)}")


Fitting 3 folds for each of 9 candidates, totalling 27 fits
Best parameters for AdaBoost: {'learning_rate': 1.0, 'n_estimators': 200}
AdaBoost Classification Report (70:30 Split):
              precision    recall  f1-score   support

           0       0.97      0.97      0.97        33
           1       0.99      0.99      0.99        87

    accuracy                           0.98       120
   macro avg       0.98      0.98      0.98       120
weighted avg       0.98      0.98      0.98       120



### Decision Tree Hyperparameter Tuning 

In [77]:
from sklearn.tree import DecisionTreeClassifier

# Define the model
dt = DecisionTreeClassifier()

# Define parameter grid for hyperparameter tuning
param_grid_dt = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

# Setup GridSearchCV
grid_search_dt = GridSearchCV(estimator=dt, param_grid=param_grid_dt, cv=3, n_jobs=-1, verbose=2)

# Fit the model to training data
grid_search_dt.fit(X_train_70, Y_train_70)

# Best parameters from grid search
best_dt_params = grid_search_dt.best_params_

# Print the best parameters
print(f"Best parameters for Decision Tree: {best_dt_params}")

# Evaluate on the test data
best_dt_model = grid_search_dt.best_estimator_
Y_pred_dt = best_dt_model.predict(X_test_70)
print(f"Decision Tree Classification Report (70:30 Split):\n{classification_report(Y_test_70, Y_pred_dt)}")


Fitting 3 folds for each of 72 candidates, totalling 216 fits
Best parameters for Decision Tree: {'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2}
Decision Tree Classification Report (70:30 Split):
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        33
           1       1.00      0.98      0.99        87

    accuracy                           0.98       120
   macro avg       0.97      0.99      0.98       120
weighted avg       0.98      0.98      0.98       120



### KNN Hyperparameter Tuning

In [78]:
from sklearn.neighbors import KNeighborsClassifier

# Define the model
knn = KNeighborsClassifier()

# Define parameter grid for hyperparameter tuning
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

# Setup GridSearchCV
grid_search_knn = GridSearchCV(estimator=knn, param_grid=param_grid_knn, cv=3, n_jobs=-1, verbose=2)

# Fit the model to training data
grid_search_knn.fit(X_train_70, Y_train_70)

# Best parameters from grid search
best_knn_params = grid_search_knn.best_params_

# Print the best parameters
print(f"Best parameters for KNN: {best_knn_params}")

# Evaluate on the test data
best_knn_model = grid_search_knn.best_estimator_
Y_pred_knn = best_knn_model.predict(X_test_70)
print(f"KNN Classification Report (70:30 Split):\n{classification_report(Y_test_70, Y_pred_knn)}")


Fitting 3 folds for each of 32 candidates, totalling 96 fits
Best parameters for KNN: {'algorithm': 'auto', 'n_neighbors': 3, 'weights': 'distance'}
KNN Classification Report (70:30 Split):
              precision    recall  f1-score   support

           0       0.39      0.64      0.48        33
           1       0.82      0.62      0.71        87

    accuracy                           0.62       120
   macro avg       0.60      0.63      0.59       120
weighted avg       0.70      0.62      0.64       120



In [79]:
import pandas as pd
from sklearn.metrics import accuracy_score

# Initialize an empty list to store results
results = []

# 1. Logistic Regression: Before and After Hyperparameter Tuning

# Default Logistic Regression
log_reg_default = LogisticRegression(max_iter=1000)
log_reg_default.fit(X_train_70, Y_train_70)
y_pred_log_reg_default = log_reg_default.predict(X_test_70)
acc_log_reg_default = accuracy_score(Y_test_70, y_pred_log_reg_default)

# Hyperparameter tuned Logistic Regression
best_log_reg_model = grid_search_log_reg.best_estimator_
y_pred_log_reg_tuned = best_log_reg_model.predict(X_test_70)
acc_log_reg_tuned = accuracy_score(Y_test_70, y_pred_log_reg_tuned)

# Add to results
results.append(['Logistic Regression', acc_log_reg_default, acc_log_reg_tuned])

# 2. Random Forest: Before and After Hyperparameter Tuning

# Default Random Forest
rf_default = RandomForestClassifier()
rf_default.fit(X_train_70, Y_train_70)
y_pred_rf_default = rf_default.predict(X_test_70)
acc_rf_default = accuracy_score(Y_test_70, y_pred_rf_default)

# Hyperparameter tuned Random Forest
best_rf_model = grid_search_rf.best_estimator_
y_pred_rf_tuned = best_rf_model.predict(X_test_70)
acc_rf_tuned = accuracy_score(Y_test_70, y_pred_rf_tuned)

# Add to results
results.append(['Random Forest', acc_rf_default, acc_rf_tuned])

# 3. AdaBoost: Before and After Hyperparameter Tuning

# Default AdaBoost
ada_boost_default = AdaBoostClassifier(algorithm='SAMME')
ada_boost_default.fit(X_train_70, Y_train_70)
y_pred_ada_boost_default = ada_boost_default.predict(X_test_70)
acc_ada_boost_default = accuracy_score(Y_test_70, y_pred_ada_boost_default)

# Hyperparameter tuned AdaBoost
best_ada_boost_model = grid_search_ada_boost.best_estimator_
y_pred_ada_boost_tuned = best_ada_boost_model.predict(X_test_70)
acc_ada_boost_tuned = accuracy_score(Y_test_70, y_pred_ada_boost_tuned)

# Add to results
results.append(['AdaBoost', acc_ada_boost_default, acc_ada_boost_tuned])

# 4. Decision Tree: Before and After Hyperparameter Tuning

# Default Decision Tree
dt_default = DecisionTreeClassifier()
dt_default.fit(X_train_70, Y_train_70)
y_pred_dt_default = dt_default.predict(X_test_70)
acc_dt_default = accuracy_score(Y_test_70, y_pred_dt_default)

# Hyperparameter tuned Decision Tree
best_dt_model = grid_search_dt.best_estimator_
y_pred_dt_tuned = best_dt_model.predict(X_test_70)
acc_dt_tuned = accuracy_score(Y_test_70, y_pred_dt_tuned)

# Add to results
results.append(['Decision Tree', acc_dt_default, acc_dt_tuned])

# 5. XGBoost: Before and After Hyperparameter Tuning

# Default XGBoost
xgb_default = XGBClassifier()
xgb_default.fit(X_train_70, Y_train_70)
y_pred_xgb_default = xgb_default.predict(X_test_70)
acc_xgb_default = accuracy_score(Y_test_70, y_pred_xgb_default)

# Hyperparameter tuned XGBoost
best_xgb_model = grid_search_xgb.best_estimator_
y_pred_xgb_tuned = best_xgb_model.predict(X_test_70)
acc_xgb_tuned = accuracy_score(Y_test_70, y_pred_xgb_tuned)

# Add to results
results.append(['XGBoost', acc_xgb_default, acc_xgb_tuned])

# 6. KNN: Before and After Hyperparameter Tuning

# Default KNN
knn_default = KNeighborsClassifier()
knn_default.fit(X_train_70, Y_train_70)
y_pred_knn_default = knn_default.predict(X_test_70)
acc_knn_default = accuracy_score(Y_test_70, y_pred_knn_default)

# Hyperparameter tuned KNN
best_knn_model = grid_search_knn.best_estimator_
y_pred_knn_tuned = best_knn_model.predict(X_test_70)
acc_knn_tuned = accuracy_score(Y_test_70, y_pred_knn_tuned)

# Add to results
results.append(['KNN', acc_knn_default, acc_knn_tuned])

# Convert results to a DataFrame for easy comparison
results_df = pd.DataFrame(results, columns=['Model', 'Accuracy (Before Tuning)', 'Accuracy (After Tuning)'])

# Display the result table
print(results_df)


                 Model  Accuracy (Before Tuning)  Accuracy (After Tuning)
0  Logistic Regression                  0.608333                 0.683333
1        Random Forest                  0.983333                 0.983333
2             AdaBoost                  0.966667                 0.983333
3        Decision Tree                  0.950000                 0.983333
4              XGBoost                  0.991667                 0.991667
5                  KNN                  0.641667                 0.625000


In [80]:
import joblib
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier


X = data.drop(columns=["Is_Allergen"]) # Feature columns
Y = data["Is_Allergen"] # Target column


# Split the dataset (70:30 split)
X_train, X_test, Y_train,Y_test = train_test_split(X, Y, test_size=0.30, random_state=42)

# Initialize the best model 
best_model = XGBClassifier(max_depth=3, learning_rate=0.01, n_estimators=100,subsample= 0.8) 

# Retrain the model
best_model.fit(X_train, Y_train)

# Save the trained model to a file
joblib.dump(best_model, 'best_xgboost_model.pkl')

# Verify the model performance
train_accuracy = best_model.score(X_train, Y_train)
test_accuracy = best_model.score(X_test, Y_test)

print(f"Train Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")


Train Accuracy: 0.974910394265233
Test Accuracy: 0.9666666666666667
