## Model Training and Testing

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
df=pd.read_csv('C:/Users/Admin/Desktop/food_allergen_detection/dataset/output_dataset.csv')

In [3]:
df.head()

Unnamed: 0,Price,Customer rating,Prediction,Food Product,Main Ingredient,Sweetener,Fat/Oil,Seasoning,Allergens
0,10.15,3.1,0,6.17,6.17,12.343736,12.471786,13.408,6.17
1,6.17,4.5,0,10.15,10.15,12.387473,12.519167,13.806,10.15
2,19.65,4.1,0,12.5,17.48,12.371511,12.031228,11.584,17.48
3,17.48,4.7,0,13.223333,19.65,12.379317,12.069298,11.801,19.65
4,10.83,3.7,0,12.397588,17.925,12.403237,12.185965,12.466,12.072073


In [4]:
# Define the target variable
target = 'Prediction'

# Separate the features and target variable
X = df.drop(columns=[target])
y = df[target]

In [5]:
# Print the shape of X and y
print("Shape of features (X):", X.shape)
print("Shape of target (y):", y.shape)

Shape of features (X): (398, 8)
Shape of target (y): (398,)


In [6]:
# Splitting into training and testing sets in 70:30 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [7]:
#Printing the shapes of the resulting datasets
print(f'Shape of X_train: {X_train.shape}')
print(f'Shape of X_test: {X_test.shape}')
print(f'Shape of y_train: {y_train.shape}')
print(f'Shape of y_test: {y_test.shape}')

Shape of X_train: (278, 8)
Shape of X_test: (120, 8)
Shape of y_train: (278,)
Shape of y_test: (120,)


## Models

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier
import joblib
import os

In [9]:
# Initialize models
models = {
    'Logistic Regression': Pipeline([
              ('scaler',StandardScaler()),
              ('model',LogisticRegression(max_iter=1000))
               ]),  
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Naive Bayes': GaussianNB(),
    'XGBoost': xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False)
}

# Store results
results = []


# Train and evaluate each model
for model_name, model in models.items():
    print(f"--- {model_name} ---")
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predictions
    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)
    
    # Compute accuracies
    train_accuracy = accuracy_score(y_train, train_preds)
    test_accuracy = accuracy_score(y_test, test_preds)
    overall_accuracy = (train_accuracy * len(y_train) + test_accuracy * len(y_test)) / (len(y_train) + len(y_test))

    
    # Append results
    results.append({
        'Model': model_name,
        'Training Accuracy': train_accuracy,
        'Testing Accuracy': test_accuracy,
        'Overall Accuracy': overall_accuracy
    })
    
    # Print classification report and confusion matrix
    print("Classification Report:")
    print(classification_report(y_test, test_preds))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, test_preds))
    print("\n")

# Create a DataFrame with results
results_df = pd.DataFrame(results)


--- Logistic Regression ---
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.84      0.76        87
           1       0.07      0.03      0.04        33

    accuracy                           0.62       120
   macro avg       0.38      0.43      0.40       120
weighted avg       0.52      0.62      0.56       120


Confusion Matrix:
[[73 14]
 [32  1]]


--- Decision Tree ---
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.98        87
           1       0.94      0.97      0.96        33

    accuracy                           0.97       120
   macro avg       0.96      0.97      0.97       120
weighted avg       0.98      0.97      0.98       120


Confusion Matrix:
[[85  2]
 [ 1 32]]


--- Random Forest ---
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99        87
           1    



Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99        87
           1       0.97      0.97      0.97        33

    accuracy                           0.98       120
   macro avg       0.98      0.98      0.98       120
weighted avg       0.98      0.98      0.98       120


Confusion Matrix:
[[86  1]
 [ 1 32]]


--- Naive Bayes ---
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.86      0.91        87
           1       0.72      0.94      0.82        33

    accuracy                           0.88       120
   macro avg       0.85      0.90      0.87       120
weighted avg       0.90      0.88      0.89       120


Confusion Matrix:
[[75 12]
 [ 2 31]]


--- XGBoost ---
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        87
           1       0.94      1.00      0.97        3

Parameters: { "use_label_encoder" } are not used.



In [10]:
results_df

Unnamed: 0,Model,Training Accuracy,Testing Accuracy,Overall Accuracy
0,Logistic Regression,0.586331,0.616667,0.595477
1,Decision Tree,1.0,0.975,0.992462
2,Random Forest,1.0,0.983333,0.994975
3,AdaBoost,1.0,0.983333,0.994975
4,Naive Bayes,0.899281,0.883333,0.894472
5,XGBoost,1.0,0.983333,0.994975


## Hyperparameter tuning for RandomForest Model

In [11]:
from sklearn.ensemble import RandomForestClassifier

# Define hyperparameters to tune for Random Forest
rf_param_grid = {
    'model__n_estimators': [50, 100, 200],  # Number of trees
    'model__max_depth': [None, 10, 20, 30],  # Maximum depth of trees
    'model__min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
    'model__min_samples_leaf': [1, 2, 4],  # Minimum number of samples required at each leaf node
    'model__bootstrap': [True, False]  # Whether bootstrap samples are used when building trees
}

# Initialize the Random Forest model inside the pipeline
rf_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier())
])

# Initialize GridSearchCV for Random Forest
rf_grid_search = GridSearchCV(estimator=rf_pipeline, param_grid=rf_param_grid, 
                              scoring='accuracy', cv=5, n_jobs=-1, verbose=1)

# Fit GridSearchCV to training data
rf_grid_search.fit(X_train, y_train)

# Get the best parameters and best score for Random Forest
best_rf_params = rf_grid_search.best_params_
best_rf_score = rf_grid_search.best_score_

print(f"Best Parameters for Random Forest: {best_rf_params}")
print(f"Best Cross-validation Accuracy for Random Forest: {best_rf_score}\n")

# Use the best estimator to predict on the test set
rf_model = rf_grid_search.best_estimator_
rf_train_preds =rf_model.predict(X_train)
rf_test_preds =rf_model.predict(X_test)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters for Random Forest: {'model__bootstrap': True, 'model__max_depth': None, 'model__min_samples_leaf': 2, 'model__min_samples_split': 2, 'model__n_estimators': 50}
Best Cross-validation Accuracy for Random Forest: 0.9675324675324676



In [12]:
# Compute accuracies for Random Forest
rf_train_accuracy = accuracy_score(y_train, rf_train_preds)
rf_test_accuracy = accuracy_score(y_test, rf_test_preds)
rf_overall_accuracy = (rf_train_accuracy * len(y_train) + rf_test_accuracy * len(y_test)) / (len(y_train) + len(y_test))

# Print accuracies for Random Forest
print(f"Random Forest Training Accuracy: {rf_train_accuracy}")
print(f"Random Forest Test Accuracy: {rf_test_accuracy}")
print(f"Random Forest Overall Accuracy: {rf_overall_accuracy}")

# Classification report and confusion matrix for Random Forest
print("\nRandom Forest Classification Report:")
print(classification_report(y_test, rf_test_preds))
print("\nRandom Forest Confusion Matrix:")
print(confusion_matrix(y_test, rf_test_preds))

Random Forest Training Accuracy: 0.9964028776978417
Random Forest Test Accuracy: 0.9916666666666667
Random Forest Overall Accuracy: 0.9949748743718593

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99        87
           1       0.97      1.00      0.99        33

    accuracy                           0.99       120
   macro avg       0.99      0.99      0.99       120
weighted avg       0.99      0.99      0.99       120


Random Forest Confusion Matrix:
[[86  1]
 [ 0 33]]


In [13]:
model_folder=r"C:\Users\Admin\Desktop\food_allergen_detection\model"
model_filename="rf_model.pkl"
model_path = os.path.join(model_folder,model_filename)

In [14]:
# Save the model to a file
joblib.dump(rf_model, model_path)

['C:\\Users\\Admin\\Desktop\\food_allergen_detection\\model\\rf_model.pkl']