### Import packages needed for the project

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.figure_factory as ff

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, mean_squared_error, accuracy_score, precision_score, recall_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from IPython.display import display

from sklearn.svm import OneClassSVM
from numpy import where

### Preprocess the data from the CSV file

In [32]:
# Categorical feature columns, listed once so that fitting and transforming
# always see exactly the same columns in the same order.
FEATURE_COLUMNS = ['Food Product', 'Main Ingredient', 'Sweetener', 'Fat/Oil', 'Seasoning']

# Load the dataset
data = pd.read_csv('Allergen_Status_of_Food_Products.csv')

# Build the cleaned frame in one chain (re-running the cell always starts from `data`):
#   1. rows without a 'Prediction' label cannot be used for supervised training;
#   2. price, rating and the raw allergen list are not used as features;
#   3. missing feature values become the explicit category 'None'.
# NOTE(review): recent pandas versions treat the literal CSV text "None" as a
# missing value by default, so without step 3 the encoder would learn NaN as the
# category, while the interactive prediction cell asks the user to type "None".
# That typed value would then be an unknown category and be encoded as all
# zeros without any error. Confirm against the CSV contents.
data_cleaned = (
    data
    .dropna(subset=['Prediction'])
    .drop(columns=['Price ($)', 'Customer rating (Out of 5)', 'Allergens'])
    .fillna({column: 'None' for column in FEATURE_COLUMNS})
)

# Encoding categorical variables
# handle_unknown='ignore' lets categories never seen during fit (for example
# from user input) be transformed without raising.
encoder = OneHotEncoder(handle_unknown='ignore')
data_encoded = encoder.fit_transform(data_cleaned[FEATURE_COLUMNS])

# Split the data (fixed random_state keeps the split reproducible)
X = data_encoded
y = data_cleaned['Prediction']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)


### Create a logistic regression model for the data

In [38]:
# Fit an L2-penalised logistic regression on the training split;
# max_iter=10000 gives the solver a generous iteration budget.
logreg = LogisticRegression(penalty='l2', max_iter=10000)
logreg.fit(X_train, y_train)

# Predicted labels for the held-out test split
y_pred = logreg.predict(X_test)

# Per-class metrics: each label is taken in turn as the positive class
recall_contains = recall_score(y_test, y_pred, pos_label='Contains')
precision_contains = precision_score(y_test, y_pred, pos_label='Contains')
recall_does_not_contain = recall_score(y_test, y_pred, pos_label='Does not contain')
precision_does_not_contain = precision_score(y_test, y_pred, pos_label='Does not contain')

# Overall accuracy and the full per-class text report
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Report everything: 'Contains' first, then 'Does not contain', then the summary
print("Precision of 'Contains':", precision_contains)
print("Recall of 'Contains':", recall_contains)
print("Precision of 'Does not contain':", precision_does_not_contain)
print("Recall of 'Does not contain':", recall_does_not_contain)
print("Accuracy", accuracy)
print("Classification Report")
print(classification_rep)


Precision of 'Contains': 0.8913043478260869
Recall of 'Contains': 0.82
Precision of 'Does not contain': 0.7352941176470589
Recall of 'Does not contain': 0.8333333333333334
Accuracy 0.825
Classification Report
                  precision    recall  f1-score   support

        Contains       0.89      0.82      0.85        50
Does not contain       0.74      0.83      0.78        30

        accuracy                           0.82        80
       macro avg       0.81      0.83      0.82        80
    weighted avg       0.83      0.82      0.83        80



### Enter a product's details to predict its allergen status

In [39]:

def encode_input(input_data, encoder):
    """Encode a single record with an already-fitted encoder.

    Args:
        input_data: mapping of feature name -> value describing one sample.
        encoder: fitted object exposing ``transform``. If it also exposes
            ``feature_names_in_``, the record's columns are selected and
            ordered to match it, so the dict's key order does not matter.

    Returns:
        A dense array holding the encoded row.

    Raises:
        ValueError: if the encoder advertises feature names and one or more
            of them are absent from ``input_data``.
    """
    # Wrap the mapping in a list so it becomes one row with one column per key
    input_df = pd.DataFrame([input_data])

    expected = getattr(encoder, 'feature_names_in_', None)
    if expected is not None:
        missing = [name for name in expected if name not in input_df.columns]
        if missing:
            raise ValueError(f"input_data is missing feature(s): {missing}")
        # Present the columns in the order the encoder advertises
        input_df = input_df[list(expected)]

    encoded = encoder.transform(input_df)
    # Sparse results need densifying; dense results are passed through as an array
    if hasattr(encoded, 'toarray'):
        return encoded.toarray()
    return np.asarray(encoded)

def predict_allergen_status(model, encoder):
    """Prompt on stdin for one product's features and return the predicted label.

    Args:
        model: fitted estimator exposing ``predict``.
        encoder: fitted encoder, passed through to ``encode_input``.

    Returns:
        The first element of ``model.predict`` for the entered row.
    """
    # Feature name -> prompt text; dict order is the order the user is asked in
    prompts = {
        'Food Product': "Enter Food Product Name: ",
        'Main Ingredient': "Enter Main Ingredient: ",
        'Sweetener': "Enter Sweetener (or None): ",
        'Fat/Oil': "Enter Fat/Oil (or None): ",
        'Seasoning': "Enter Seasoning (or None): ",
    }

    # Strip surrounding whitespace from each answer: the encoder built earlier in
    # this notebook uses handle_unknown='ignore', so a value such as "Sugar "
    # would otherwise count as an unknown category and be encoded as all zeros
    # without any error.
    input_data = {feature: input(prompt).strip() for feature, prompt in prompts.items()}

    # Encode the single record, then predict; predict returns one label per row
    encoded_data = encode_input(input_data, encoder)
    return model.predict(encoded_data)[0]

if __name__ == "__main__":
    # Relies on two objects created by earlier cells: the fitted logistic
    # regression `logreg` and the fitted OneHotEncoder `encoder`.
    prediction = predict_allergen_status(model=logreg, encoder=encoder)
    print(f"The predicted allergen status is: {prediction}")


The predicted allergen status is: Contains
