In [1]:
import pandas as pd
# Read the allergen dataset from the CSV file in the working directory
data = pd.read_csv(filepath_or_buffer="Allergen_Status_of_Food_Products.csv")

# Preview the first five rows as this cell's output
data.head(n=5)


Unnamed: 0,Food Product,Main Ingredient,Sweetener,Fat/Oil,Seasoning,Allergens,Price ($),Customer rating (Out of 5),Prediction
0,Almond Cookies,Almonds,Sugar,Butter,Flour,"Almonds, Wheat, Dairy",10.15,3.1,Contains
1,Almond Cookies,Almonds,Sugar,Butter,Flour,"Almonds, Wheat, Dairy",6.17,4.5,Contains
2,Chicken Noodle Soup,Chicken broth,,,Salt,"Chicken, Wheat, Celery",19.65,4.1,Contains
3,Chicken Noodle Soup,Chicken broth,,,Salt,"Chicken, Wheat, Celery",17.48,4.7,Contains
4,Cheddar Cheese,Cheese,,,Salt,Dairy,10.83,3.7,Contains


In [2]:
import pandas as pd
import optuna
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

  from .autonotebook import tqdm as notebook_tqdm





In [3]:

# Preprocessing: reload the raw CSV and replace every missing value with an
# empty string so the text columns can be concatenated as plain strings
data = pd.read_csv("Allergen_Status_of_Food_Products.csv").fillna('')

# Bag-of-words features: one document per row, made of the product name
# followed by its ingredient columns, separated by single spaces
ingredient_columns = ['Main Ingredient', 'Sweetener', 'Fat/Oil', 'Seasoning']
product_text = data['Food Product'].str.cat(data[ingredient_columns], sep=" ")
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(product_text)

# Multi-label target: the 'Allergens' cell is a ', '-separated list.
# A row whose allergen cell is empty splits to [''], so '' becomes a class too.
allergen_lists = data['Allergens'].str.split(', ')
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(allergen_lists)

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [None]:

# Baseline network: two ReLU hidden layers, each followed by dropout, and a
# sigmoid output layer with one unit per label column of y_train
model = Sequential([
    Dense(128, input_dim=X_train.shape[1], activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(y_train.shape[1], activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Convert the feature matrices to dense arrays before handing them to Keras
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

# Fit on the training rows; Keras sets aside 20% of them for validation
history = model.fit(
    X_train_dense,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=1000,
)

# Score the fitted network on the held-out test rows
loss, accuracy = model.evaluate(X_test_dense, y_test)
print(f"Test Accuracy: {accuracy*100:.2f}%")

# Per-label probabilities for the test rows, then persist the network
predictions = model.predict(X_test_dense)
model.save('ALE.h5')  # written in HDF5 format as 'ALE.h5'




In [None]:
def create_model(trial):
    """Build and compile a two-hidden-layer network from an Optuna trial.

    Parameters:
    - trial: object exposing suggest_float / suggest_int / suggest_categorical;
      it supplies both dropout rates, the learning rate, both layer widths and
      both hidden activations.

    Returns:
    - A compiled Sequential model (binary cross-entropy loss, Adam optimizer,
      'accuracy' metric) whose input width is X_train.shape[1] and whose
      sigmoid output has y_train.shape[1] units.
    """
    # NOTE(review): 'leaky_relu' as a string activation may not be recognised
    # by every Keras release - confirm against the installed version.
    activation_choices = ['relu', 'sigmoid', 'tanh', 'leaky_relu']

    # The suggest calls are kept in the same order as the parameter names
    # registered with the study: dropouts, learning rate, widths, activations.
    first_dropout = trial.suggest_float('dropout_rate1', 0.1, 0.7)
    second_dropout = trial.suggest_float('dropout_rate2', 0.1, 0.7)
    learning_rate = trial.suggest_float("lr", 1e-5, 1e-2, log=True)
    first_width = trial.suggest_int('units_layer_1', 64, 256)
    second_width = trial.suggest_int('units_layer_2', 32, 128)
    first_activation = trial.suggest_categorical('activation1', activation_choices)
    second_activation = trial.suggest_categorical('activation2', activation_choices)

    # Same layer stack as the baseline network, with tunable sizes and rates
    network = Sequential([
        Dense(first_width, input_dim=X_train.shape[1], activation=first_activation),
        Dropout(first_dropout),
        Dense(second_width, activation=second_activation),
        Dropout(second_dropout),
        Dense(y_train.shape[1], activation='sigmoid'),
    ])

    network.compile(
        loss='binary_crossentropy',
        optimizer=Adam(learning_rate=learning_rate),
        metrics=['accuracy'],
    )
    return network

def objective(trial):
    """Optuna objective: train one candidate model and score it on validation data.

    Parameters:
    - trial: the Optuna trial passed to create_model to pick hyperparameters.

    Returns:
    - The validation accuracy recorded for the final training epoch.

    The score deliberately comes from the validation split carved out of the
    training data, not from X_test / y_test. Selecting hyperparameters on the
    test set would make the test accuracy reported for the final model
    optimistically biased.
    """
    # Create and train the model
    model = create_model(trial)
    X_train_dense = X_train.toarray()  # Keras is given a dense array
    history = model.fit(
        X_train_dense,
        y_train,
        epochs=800,
        batch_size=32,
        validation_split=0.2,
        verbose=0,
    )

    # Keras logs the 'accuracy' metric on the validation split as 'val_accuracy';
    # the last entry belongs to the final epoch.
    return float(history.history['val_accuracy'][-1])
# Run the Optuna search; the study keeps the trial with the highest objective value
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=500)

# Keep the winning trial for the final-training cell and report the outcome
best_trial = study.best_trial
print('Number of finished trials:', len(study.trials))
print('Best trial:', best_trial.params)



In [None]:
# Train the final model with the best hyperparameters found by the study.
# NOTE(review): best_trial is study.best_trial; create_model is presumed to get
# the stored best values back from its suggest_* calls - confirm in Optuna docs.
best_model = create_model(best_trial)

# Densify both splits here so this cell does not depend on an X_test_dense
# left behind by the baseline-training cell (the copy inside objective() is
# local to that function and never reaches the notebook namespace).
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

best_model_history = best_model.fit(X_train_dense, y_train, epochs=1000, batch_size=32, validation_split=0.2)

# Score the tuned model on the held-out test rows
loss, accuracy = best_model.evaluate(X_test_dense, y_test)
print(f"Test Accuracy: {accuracy*100:.2f}%")

# Save the final model
best_model.save('best_model.h5')

### Using the untuned model

In [None]:
# Reload the baseline network from the 'ALE.h5' file
trained_model = load_model(filepath='ALE.h5')

In [None]:
def predict_allergens_with_user_input(model, vectorizer, mlb, threshold=0.5):
    """
    Prompts for a food product's ingredients and predicts the allergens it
    contains, together with the likelihood of each.

    Parameters:
    - model: Trained model; model.predict must return one row of per-class
      probabilities aligned with mlb.classes_.
    - vectorizer: CountVectorizer fitted on the training data.
    - mlb: MultiLabelBinarizer fitted on the training data. Its '' class (if
      present) stands for "no allergens".
    - threshold: Probability a class must exceed to be reported (default is 0.5).

    Returns:
    - The string "Does not contain allergens (Probability: ...)" when the ''
      class is present and its probability exceeds the threshold; otherwise
    - a dictionary mapping every allergen whose probability exceeds the
      threshold to that probability (empty when none does).
    """

    # User input
    food_product = input("Enter Food Product Name: ")
    main_ingredient = input("Enter Main Ingredient: ")
    sweetener = input("Enter Sweetener (or None): ")
    fat_oil = input("Enter Fat/Oil (or None): ")
    seasoning = input("Enter Seasoning (or None): ")

    # Combine the answers into one space-separated document and vectorize it
    combined_input = vectorizer.transform([f"{food_product} {main_ingredient} {sweetener} {fat_oil} {seasoning}"])

    # The model is given a dense array; keep the single prediction row
    pred_probabilities = model.predict(combined_input.toarray())[0]

    # Pair every class label with its predicted probability
    allergen_probabilities = dict(zip(mlb.classes_, pred_probabilities))

    # The empty-string class means "no allergens"; it takes precedence over
    # any individual allergen once it clears the threshold.
    if '' in allergen_probabilities and allergen_probabilities[''] > threshold:
        return f"Does not contain allergens (Probability: {allergen_probabilities['']:.6f})"

    # From here on '' cannot clear the threshold, so a plain filter is enough
    return {allergen: prob for allergen, prob in allergen_probabilities.items() if prob > threshold}
  

# Prompt for a product description and score it with the untuned model
allergen_predictions = predict_allergens_with_user_input(
    model=trained_model,
    vectorizer=vectorizer,
    mlb=mlb,
)
allergen_predictions


### Using the tuned model

In [None]:
# Reload the tuned network from the 'best_model.h5' file
best_trained_model = load_model(filepath='best_model.h5')

In [None]:
# NOTE: this re-defines the function already defined in the untuned-model
# section of this notebook; keep the two copies in sync (or drop this one).
def predict_allergens_with_user_input(model, vectorizer, mlb, threshold=0.5):
    """
    Prompts for a food product's ingredients and predicts the allergens it
    contains, together with the likelihood of each.

    Parameters:
    - model: Trained model; model.predict must return one row of per-class
      probabilities aligned with mlb.classes_.
    - vectorizer: CountVectorizer fitted on the training data.
    - mlb: MultiLabelBinarizer fitted on the training data. Its '' class (if
      present) stands for "no allergens".
    - threshold: Probability a class must exceed to be reported (default is 0.5).

    Returns:
    - The string "Does not contain allergens (Probability: ...)" when the ''
      class is present and its probability exceeds the threshold; otherwise
    - a dictionary mapping every allergen whose probability exceeds the
      threshold to that probability (empty when none does).
    """

    # User input
    food_product = input("Enter Food Product Name: ")
    main_ingredient = input("Enter Main Ingredient: ")
    sweetener = input("Enter Sweetener (or None): ")
    fat_oil = input("Enter Fat/Oil (or None): ")
    seasoning = input("Enter Seasoning (or None): ")

    # Combine the answers into one space-separated document and vectorize it
    combined_input = vectorizer.transform([f"{food_product} {main_ingredient} {sweetener} {fat_oil} {seasoning}"])

    # The model is given a dense array; keep the single prediction row
    pred_probabilities = model.predict(combined_input.toarray())[0]

    # Pair every class label with its predicted probability
    allergen_probabilities = dict(zip(mlb.classes_, pred_probabilities))

    # The empty-string class means "no allergens"; it takes precedence over
    # any individual allergen once it clears the threshold.
    if '' in allergen_probabilities and allergen_probabilities[''] > threshold:
        return f"Does not contain allergens (Probability: {allergen_probabilities['']:.6f})"

    # From here on '' cannot clear the threshold, so a plain filter is enough
    return {allergen: prob for allergen, prob in allergen_probabilities.items() if prob > threshold}
  

# Prompt for a product description and score it with the tuned model
allergen_predictions = predict_allergens_with_user_input(
    model=best_trained_model,
    vectorizer=vectorizer,
    mlb=mlb,
)
allergen_predictions
