### Import packages needed for the project

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.figure_factory as ff

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, mean_squared_error, accuracy_score, precision_score, recall_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score, make_scorer


from IPython.display import display

from sklearn.svm import OneClassSVM
from numpy import where

### Preprocess the data from the csv file

In [4]:
# Single seed shared by every stochastic step in this cell so that
# Restart Kernel -> Run All reproduces the same split and the same synthetic samples.
RANDOM_STATE = 21

# Load the dataset
data = pd.read_csv('Allergen_Status_of_Food_Products.csv')

# Rows without a label in 'Prediction' cannot be used for supervised training; drop them.
data_cleaned = data.dropna(subset=['Prediction'])

# Dropping columns that are not used as model features
data_cleaned = data_cleaned.drop(['Price ($)', 'Customer rating (Out of 5)', 'Allergens'], axis=1)

# Categorical columns fed to the model. Declared once so that fitting and
# transforming are guaranteed to use the same columns in the same order.
feature_columns = ['Food Product', 'Main Ingredient', 'Sweetener', 'Fat/Oil', 'Seasoning']

# One-hot encode the categorical features.
# handle_unknown='ignore' makes categories unseen at fit time encode as all zeros
# instead of raising when new data is transformed later.
encoder = OneHotEncoder(handle_unknown='ignore')
data_encoded = encoder.fit_transform(data_cleaned[feature_columns])

# Feature matrix and target labels
X = data_encoded
y = data_cleaned['Prediction']

# Split the data (80% train / 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

# Apply SMOTE to the training split only, so the test set keeps its original class balance.
# Seeded: an unseeded SMOTE generates different synthetic samples on every run.
smote = SMOTE(random_state=RANDOM_STATE)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)



### Create a logistic regression model for the data

In [5]:
# Candidate values of the LogisticRegression `C` hyper-parameter to search over
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

# Rank candidates by the F1 score of the 'Contains' class
f1_scorer = make_scorer(f1_score, average='binary', pos_label='Contains')

# 5-fold grid search over C, fitted on the SMOTE-resampled training data.
# NOTE(review): cross-validation runs on already-resampled data, so synthetic samples
# may sit in a validation fold next to the real samples they were derived from;
# best_score_ is therefore possibly optimistic -- confirm against the held-out test set.
base_model = LogisticRegression(max_iter=10000, penalty='l2', class_weight='balanced')
grid_search = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    cv=5,
    scoring=f1_scorer,
)
grid_search.fit(X_train_resampled, y_train_resampled)

# Report the winning hyper-parameters and their cross-validated F1
print("Best parameters:", grid_search.best_params_)
print("Best cross-validated score:", grid_search.best_score_)

# Predict on the untouched test split
y_pred = grid_search.predict(X_test)

# Per-class precision and recall, treating each label in turn as the positive class
recall_contains = recall_score(y_test, y_pred, pos_label='Contains')
recall_does_not_contain = recall_score(y_test, y_pred, pos_label='Does not contain')
precision_contains = precision_score(y_test, y_pred, pos_label='Contains')
precision_does_not_contain = precision_score(y_test, y_pred, pos_label='Does not contain')

# Overall accuracy and the full per-class report
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print the metrics
print("Precision of 'Contains':", precision_contains)
print("Recall of 'Contains':", recall_contains)
print("Precision of 'Does not contain':", precision_does_not_contain)
print("Recall of 'Does not contain':", recall_does_not_contain)
print("Accuracy", accuracy)
print("Classification Report")
print(classification_rep)



Best parameters: {'C': 100}
Best cross-validated score: 0.921801343659942
Precision of 'Contains': 0.92
Recall of 'Contains': 0.92
Precision of 'Does not contain': 0.8666666666666667
Recall of 'Does not contain': 0.8666666666666667
Accuracy 0.9
Classification Report
                  precision    recall  f1-score   support

        Contains       0.92      0.92      0.92        50
Does not contain       0.87      0.87      0.87        30

        accuracy                           0.90        80
       macro avg       0.89      0.89      0.89        80
    weighted avg       0.90      0.90      0.90        80



## Using a custom decision threshold on the predicted probabilities

In [6]:
# After fitting the grid search, work with the single best estimator it selected
T_model = grid_search.best_estimator_

# predict_proba returns one column per class, ordered like T_model.classes_.
# Look the 'Contains' column up by label instead of hard-coding its position,
# so the code stays correct whatever order the labels end up in.
contains_idx = list(T_model.classes_).index('Contains')
y_pred_prob = T_model.predict_proba(X_test)[:, contains_idx]

# Decision threshold on P('Contains'); values below 0.5 favour predicting 'Contains'.
# NOTE(review): if 0.23 was picked by inspecting the test-set metrics below,
# those metrics are optimistic -- consider tuning it on a validation split instead.
threshold = 0.23

# Apply threshold to get new predictions
y_pred_new = np.where(y_pred_prob > threshold, 'Contains', 'Does not contain')

# Calculate precision and recall for both classes under the custom threshold
precision_contains_T = precision_score(y_test, y_pred_new, pos_label='Contains')
precision_does_not_contain_T = precision_score(y_test, y_pred_new, pos_label='Does not contain')
recall_contains_T = recall_score(y_test, y_pred_new, pos_label='Contains')
recall_does_not_contain_T = recall_score(y_test, y_pred_new, pos_label='Does not contain')

# Print the precision and recall
print("Precision of 'Contains':", precision_contains_T)
print("Recall of 'Contains':", recall_contains_T)
print("Precision of 'Does not contain':", precision_does_not_contain_T)
print("Recall of 'Does not contain':", recall_does_not_contain_T)

# Overall accuracy and the full per-class report
accuracy_T = accuracy_score(y_test, y_pred_new)
classification_rep_T = classification_report(y_test, y_pred_new)
print("Accuracy", accuracy_T)
print("Classification Report")
print(classification_rep_T)


Precision of 'Contains': 0.9215686274509803
Recall of 'Contains': 0.94
Precision of 'Does not contain': 0.896551724137931
Recall of 'Does not contain': 0.8666666666666667
Accuracy 0.9125
Classification Report
                  precision    recall  f1-score   support

        Contains       0.92      0.94      0.93        50
Does not contain       0.90      0.87      0.88        30

        accuracy                           0.91        80
       macro avg       0.91      0.90      0.91        80
    weighted avg       0.91      0.91      0.91        80



In [71]:
from joblib import dump

# Persist the whole fitted GridSearchCV object to disk.
# The cell's value is the list of file names joblib wrote.
dump(value=grid_search, filename='grid_search_model.joblib')


['grid_search_model.joblib']

In [97]:
# Persist only the best estimator selected by the grid search.
dump(value=T_model, filename='T_model.joblib')

['T_model.joblib']

### Enter the data to predict the output

In [7]:
from joblib import load

# Restore the GridSearchCV object saved to 'grid_search_model.joblib'.
# NOTE(review): joblib files are pickles -- only load files from a trusted source.
grid_model = load(filename='grid_search_model.joblib')

In [8]:
# Restore the best estimator saved to 'T_model.joblib'.
TT_model = load(filename='T_model.joblib')

In [9]:

def encode_input(input_data, encoder):
    """Encode one record of raw feature values with an already-fitted encoder.

    Parameters
    ----------
    input_data : dict
        Mapping of column name to raw value for a single sample.
    encoder : object
        Fitted transformer whose ``transform`` result exposes ``toarray()``
        (e.g. an encoder that returns a sparse matrix).

    Returns
    -------
    The dense array produced by calling ``toarray()`` on the transformed row.
    """
    # Wrap the dict in a list so pandas builds a one-row frame keyed by column name.
    single_row = pd.DataFrame(data=[input_data])
    transformed = encoder.transform(single_row)
    # Densify the encoder output before handing it back.
    return transformed.toarray()

def predict_allergen_status(model, encoder):
    """Prompt for one product's features on stdin and return the predicted label.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    encoder : object
        Fitted encoder passed through to ``encode_input``.

    Returns
    -------
    The first (and only) element of ``model.predict`` for the entered product.
    """
    # (column name, prompt text) pairs, in the order the questions are asked.
    prompts = (
        ('Food Product', "Enter Food Product Name: "),
        ('Main Ingredient', "Enter Main Ingredient: "),
        ('Sweetener', "Enter Sweetener (or None): "),
        ('Fat/Oil', "Enter Fat/Oil (or None): "),
        ('Seasoning', "Enter Seasoning (or None): "),
    )

    # The comprehension evaluates left to right, so the user is prompted in the
    # listed order and the dict keeps that column order.
    input_data = {column: input(message) for column, message in prompts}

    # Encode the single record, then classify it.
    features = encode_input(input_data, encoder)
    predicted_labels = model.predict(features)
    return predicted_labels[0]

if __name__ == "__main__":
    # `grid_model` is the GridSearchCV object reloaded from 'grid_search_model.joblib';
    # `encoder` is the OneHotEncoder fitted in the preprocessing cell.
    prediction = predict_allergen_status(model=grid_model, encoder=encoder)
    print(f"The predicted allergen status is: {prediction}")


The predicted allergen status is: Contains


In [10]:

# encode_input and predict_allergen_status are already defined in the previous cell.
# Reuse them here rather than redefining identical copies, which would silently
# shadow the earlier definitions and let the two drift apart.
if __name__ == "__main__":
    # `TT_model` is the best estimator reloaded from 'T_model.joblib';
    # `encoder` is the OneHotEncoder fitted in the preprocessing cell.
    # NOTE(review): predict_allergen_status calls model.predict, so the custom 0.23
    # probability threshold explored earlier is NOT applied on this path -- confirm
    # whether that is intended.
    prediction = predict_allergen_status(TT_model, encoder)
    print(f"The predicted allergen status is: {prediction}")


The predicted allergen status is: Contains
