In [1]:
import pandas as pd
from category_encoders import LeaveOneOutEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [3]:
# Load the dataset
file_path = 'E:/my files/project/food-allergen/Allergen_Status_of_Food_Products.csv'  
data = pd.read_csv(file_path, keep_default_na=False, na_values="")

# Split the data into training and test sets
train, test = train_test_split(data)

In [4]:
test

Unnamed: 0,Food Product,Main Ingredient,Sweetener,Fat/Oil,Seasoning,Allergens,Price ($),Customer rating (Out of 5),Prediction
83,Gulab Jamun,Milk solids,Sugar,Ghee,Cardamom syrup,Dairy,5.94,4.0,Contains
227,Roasted Brussels Sprouts,Brussels sprouts,,Olive oil,Balsamic glaze,,13.61,4.7,Does not contain
284,Rice Pudding,Rice,Sugar,Milk,"Cinnamon, raisins","Rice, Dairy",16.09,3.8,Contains
382,Apple Pie,Apples,Sugar,Butter,"Cinnamon, pastry","Wheat, Dairy",18.70,1.8,Contains
355,Cinnamon Rolls,Dough,Sugar,Butter,"Cinnamon, icing","Wheat, Dairy",7.93,3.8,Contains
...,...,...,...,...,...,...,...,...,...
186,Quinoa Stuffed Peppers,Quinoa,,Olive oil,"Vegetables, spices",,11.91,3.5,Does not contain
181,Stuffed Mushrooms,Mushrooms,,,"Garlic, herbs",,19.36,1.8,Does not contain
0,Almond Cookies,Almonds,Sugar,Butter,Flour,"Almonds, Wheat, Dairy",10.15,3.1,Contains
92,Sweet Potato Casserole,Sweet potatoes,Brown sugar,Butter,"Cinnamon, nutmeg","Dairy, Nuts",13.47,2.6,Contains


In [5]:
# Drop rows with missing values in the training set
train = train.dropna()

In [6]:
# Map the 'Prediction' column to numerical values
train['Prediction'] = train['Prediction'].map({'Contains': 1, 'Does not contain': 0})
test['Prediction'] = test['Prediction'].map({'Contains': 1, 'Does not contain': 0})

In [7]:
# Identify categorical columns in the training set
categorical_columns_train = train.select_dtypes(include=['object']).columns

# Initialize and fit the Leave-One-Out Encoder on the training data
encoder = LeaveOneOutEncoder(cols=categorical_columns_train)
train_encoded = encoder.fit_transform(train[categorical_columns_train], train['Price ($)'])
test_encoded = encoder.transform(test[categorical_columns_train])

# Add the encoded columns to the training and test data
train = pd.concat([train.drop(categorical_columns_train, axis=1), train_encoded], axis=1)
test = pd.concat([test.drop(categorical_columns_train, axis=1), test_encoded], axis=1)

In [8]:
test

Unnamed: 0,Price ($),Customer rating (Out of 5),Prediction,Food Product,Main Ingredient,Sweetener,Fat/Oil,Seasoning,Allergens
83,5.94,4.0,1,12.305671,12.305671,11.997581,11.952381,12.305671,12.163016
227,13.61,4.7,0,12.305671,11.475000,12.350507,13.463231,11.475000,12.116348
284,16.09,3.8,1,12.305671,7.695000,11.997581,11.950000,12.305671,12.305671
382,18.70,1.8,1,12.185000,11.550000,11.997581,12.261864,12.305671,12.575385
355,7.93,3.8,1,16.280000,12.305671,11.997581,12.261864,12.305671,12.575385
...,...,...,...,...,...,...,...,...,...
186,11.91,3.5,0,12.305671,14.685000,12.350507,13.463231,12.305671,12.116348
181,19.36,1.8,0,12.305671,12.087500,12.350507,11.834762,8.646667,12.116348
0,10.15,3.1,1,12.305671,12.305671,11.997581,12.261864,12.225000,12.305671
92,13.47,2.6,1,12.305671,12.305671,14.300000,12.261864,12.305671,12.305671


In [9]:
# Split features and target variable
X_train = train.drop(['Prediction'], axis=1)
y_train = train['Prediction']
X_test = test.drop(['Prediction'], axis=1)
y_test = test['Prediction']

In [10]:
# Define a dictionary of models
models = {
    "Logistic Regression": Pipeline([
        ('scaler', StandardScaler()), 
        ('model', LogisticRegression(max_iter=1000))
    ]),
    "KNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Support Vector Classifier": Pipeline([
        ('scaler', StandardScaler()), 
        ('model', SVC(probability=True))
    ]),
    "AdaBoost": AdaBoostClassifier(),
    "XGBoost": XGBClassifier()
}

In [11]:
# Store results in a dictionary
results = {}
confusion_matrices_train = {}
confusion_matrices_test = {}

# Evaluate each model

In [12]:
for model_name, model in models.items():
    # Perform cross-validation for a more reliable performance estimate
    cv_scores = cross_val_score(model, X_train, y_train, cv=5)
    
    # Train the model on the full training set
    model.fit(X_train, y_train)
    
    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Calculate accuracy for train and test
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    # Get classification reports for train and test sets
    train_report = classification_report(y_train, y_train_pred, output_dict=True)
    test_report = classification_report(y_test, y_test_pred, output_dict=True)
    
    # Get confusion matrices for train and test sets
    train_confusion = confusion_matrix(y_train, y_train_pred)
    test_confusion = confusion_matrix(y_test, y_test_pred)

    # Store confusion matrices in dictionaries
    confusion_matrices_train[model_name] = pd.DataFrame(train_confusion, 
                                                        index=[f"Actual_{i}" for i in range(len(train_confusion))], 
                                                        columns=[f"Predicted_{i}" for i in range(len(train_confusion))])
    
    confusion_matrices_test[model_name] = pd.DataFrame(test_confusion, 
                                                       index=[f"Actual_{i}" for i in range(len(test_confusion))], 
                                                       columns=[f"Predicted_{i}" for i in range(len(test_confusion))])
    
    # Store results
    results[model_name] = {
        "Cross-Validation Mean Accuracy": cv_scores.mean(),
        "Train Accuracy": train_accuracy,
        "Test Accuracy": test_accuracy,
        "Train Precision": train_report['weighted avg']['precision'],
        "Test Precision": test_report['weighted avg']['precision'],
        "Train Recall": train_report['weighted avg']['recall'],
        "Test Recall": test_report['weighted avg']['recall'],
        "Train F1-Score": train_report['weighted avg']['f1-score'],
        "Test F1-Score": test_report['weighted avg']['f1-score']
    }



In [13]:
# Convert results to a DataFrame for easier viewing
result_df = pd.DataFrame(results).T

# Display the results
result_df

Unnamed: 0,Cross-Validation Mean Accuracy,Train Accuracy,Test Accuracy,Train Precision,Test Precision,Train Recall,Test Recall,Train F1-Score,Test F1-Score
Logistic Regression,0.563729,0.597315,0.65,0.513528,0.444388,0.597315,0.65,0.513725,0.527879
KNN,0.587514,0.768456,0.51,0.765465,0.513717,0.768456,0.51,0.766225,0.511815
Decision Tree,0.916328,1.0,0.76,1.0,0.754094,1.0,0.76,1.0,0.755736
Random Forest,0.879209,1.0,0.83,1.0,0.845167,1.0,0.83,1.0,0.833466
Gradient Boosting,0.909492,1.0,0.72,1.0,0.709528,1.0,0.72,1.0,0.712
Support Vector Classifier,0.661299,0.785235,0.65,0.782054,0.622436,0.785235,0.65,0.781196,0.628276
AdaBoost,0.929661,1.0,0.78,1.0,0.77491,1.0,0.78,1.0,0.776092
XGBoost,0.926271,1.0,0.77,1.0,0.762651,1.0,0.77,1.0,0.762065


In [14]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'max_features': ['sqrt', 'log2', None]
}

# Initialize the Random Forest model
rf = RandomForestClassifier()

# Perform grid search with cross-validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)


  _data = np.array(data, dtype=dtype, copy=copy,


Best Hyperparameters: {'max_depth': 10, 'max_features': None, 'min_samples_leaf': 5, 'min_samples_split': 10, 'n_estimators': 300}


In [15]:
# Initialize the Random Forest model with the best parameters
final_model = RandomForestClassifier(
    max_depth=7,                
    max_features='log2',        
    min_samples_leaf=5,         
    min_samples_split=10,      
    n_estimators=100,        
    random_state=42
)

# Train the model on the entire training dataset
final_model.fit(X_train, y_train)

# Make predictions on the test set
y_train_pred = final_model.predict(X_train)
y_test_pred = final_model.predict(X_test)

# Evaluate the model
train_accuracy = accuracy_score(y_train,y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Train Accuracy:",train_accuracy)
print("Test Accuracy:", test_accuracy)

# Print the classification report
print("\nClassification Report:\n", classification_report(y_test, y_test_pred))

# Print the confusion matrix
conf_matrix = confusion_matrix(y_test, y_test_pred)
print("\nConfusion Matrix:\n", conf_matrix)

cv_scores = cross_val_score(final_model, X_train, y_train, cv=5)
print("Cross-Validation Accuracy: ", cv_scores.mean())

Train Accuracy: 0.9697986577181208
Test Accuracy: 0.83

Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.79      0.75        33
           1       0.89      0.85      0.87        67

    accuracy                           0.83       100
   macro avg       0.81      0.82      0.81       100
weighted avg       0.84      0.83      0.83       100


Confusion Matrix:
 [[26  7]
 [10 57]]
Cross-Validation Accuracy:  0.8691525423728814


In [16]:
import joblib

# Save the trained Random Forest model
joblib.dump(final_model, 'random_forest_model_updated.pkl')

# Save the fitted Leave-One-Out Encoder
joblib.dump(encoder, 'leave_one_out_encoder_updated.pkl')

print("Model and encoder have been saved successfully!")


Model and encoder have been saved successfully!


In [18]:
feature_importances = final_model.feature_importances_
feature_names = X_train.columns
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
print("\nFeature Importances:\n", importance_df)



Feature Importances:
                       Feature  Importance
7                   Allergens    0.595914
5                     Fat/Oil    0.133250
4                   Sweetener    0.098956
0                   Price ($)    0.047962
1  Customer rating (Out of 5)    0.040581
6                   Seasoning    0.033310
3             Main Ingredient    0.031740
2                Food Product    0.018288
