In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [None]:
# Load the dataset
file_path = 'food.csv'
data = pd.read_csv(file_path, encoding='ISO-8859-1')

# Display the first few rows of the dataset
data.head()


Unnamed: 0,Food,Serving,Calories
0,Artichoke,1 artichoke (128 g),60
1,Arugula,1 leaf (2 g),1
2,Asparagus,1 spear (12 g),2
3,Aubergine,1 aubergine (458 g),115
4,Beetroot,1 beet (82 g),35


In [3]:
# Define a threshold for "healthy" vs "unhealthy" based on calories (e.g., > 200 is unhealthy)
calorie_threshold = 200

# Create labels based on calories
data['Label'] = np.where(data['Calories'] > calorie_threshold, 'unhealthy', 'healthy')

# Display the updated dataset with labels
data.head()


Unnamed: 0,Food,Serving,Calories,Label
0,Artichoke,1 artichoke (128 g),60,healthy
1,Arugula,1 leaf (2 g),1,healthy
2,Asparagus,1 spear (12 g),2,healthy
3,Aubergine,1 aubergine (458 g),115,healthy
4,Beetroot,1 beet (82 g),35,healthy


In [4]:
# Use the 'Food' column as the text data
texts = data['Food'].values
labels = data['Label'].values

# Preprocess the labels: Convert "healthy" and "unhealthy" into binary values
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)  # 0: healthy, 1: unhealthy


In [5]:
# Tokenize the text data
tokenizer = Tokenizer(num_words=5000)  # Use a max vocabulary size of 5000
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Pad sequences to the same length (for LSTM input)
max_len = 20  # Adjust based on the average length of food descriptions
X = pad_sequences(sequences, maxlen=max_len)


In [6]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

# Display the shapes of the training and test data
print(f"Training data shape: {X_train.shape}, Training labels shape: {y_train.shape}")
print(f"Test data shape: {X_test.shape}, Test labels shape: {y_test.shape}")


Training data shape: (449, 20), Training labels shape: (449,)
Test data shape: (113, 20), Test labels shape: (113,)


In [7]:
# Build the LSTM model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=max_len))  # Embedding layer
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))  # LSTM layer
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])




In [8]:
# Train the model
model.fit(X_train, y_train, epochs=18, batch_size=32, validation_split=0.1)


Epoch 1/18
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 49ms/step - accuracy: 0.5748 - loss: 0.6842 - val_accuracy: 0.6889 - val_loss: 0.6216
Epoch 2/18
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.5807 - loss: 0.6667 - val_accuracy: 0.6889 - val_loss: 0.6163
Epoch 3/18
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.6395 - loss: 0.6265 - val_accuracy: 0.7556 - val_loss: 0.6110
Epoch 4/18
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.7374 - loss: 0.5884 - val_accuracy: 0.7556 - val_loss: 0.5257
Epoch 5/18
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.7700 - loss: 0.5121 - val_accuracy: 0.7556 - val_loss: 0.4890
Epoch 6/18
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.8414 - loss: 0.3643 - val_accuracy: 0.7333 - val_loss: 0.5018
Epoch 7/18
[1m13/13[0m [32m━━━━

<keras.src.callbacks.history.History at 0x2467d386ed0>

In [9]:
# Evaluate the model on the test data
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test accuracy: {accuracy * 100:.2f}%")


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7567 - loss: 0.7812 
Test accuracy: 74.34%


In [10]:
# Function to predict healthiness of food description based on user input
def predict_healthiness(food_description):
    new_seq = tokenizer.texts_to_sequences([food_description])
    new_pad = pad_sequences(new_seq, maxlen=max_len)
    prediction = model.predict(new_pad)
    predicted_label = "unhealthy" if prediction > 0.5 else "healthy"
    return predicted_label


In [11]:
# Get user input for prediction
user_input = input("Enter a food description to check if it's healthy or unhealthy: ")
predicted_label = predict_healthiness(user_input)
print(f"Predicted label: {predicted_label}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 223ms/step
Predicted label: healthy
