<a href="https://colab.research.google.com/github/Adrita2211/ML_Project/blob/ML_Project_branch/integredient-category-detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
####import
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Bidirectional, Dense, Dropout
from tensorflow.keras.utils import to_categorical

In [4]:
# Load the dataset
data = pd.read_csv('/content/sample_data/Ingredients.csv')


In [5]:
#data cleaning
# Combine 'Aliased Ingredient Name' and 'Ingredient Synonyms' for richer text input
data['text'] = data['Aliased Ingredient Name'].astype(str) + ' ' + data['Ingredient Synonyms'].fillna('')

# Clean text: lowercase, remove special characters
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', ' ', text)
    return text.strip()

data['text'] = data['text'].apply(clean_text)

In [6]:
# Tokenize text
max_words = 1000  # Maximum vocabulary size
max_len = 50      # Maximum sequence length
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(data['text'])
sequences = tokenizer.texts_to_sequences(data['text'])
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')


In [7]:
# Encode categories
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(data['Category'])
num_classes = len(label_encoder.classes_)
labels = to_categorical(labels, num_classes=num_classes)

In [8]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)


In [10]:
#textRNN:model defination

def build_text_rnn(vocab_size, embed_size, hidden_size, num_classes, dropout=0.5):
    model = Sequential([
        Embedding(vocab_size, embed_size, input_length=max_len),
        SimpleRNN(hidden_size, return_sequences=False),
        Dropout(dropout),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model


In [11]:
#textBiRNN:model defination
def build_text_birnn(vocab_size, embed_size, hidden_size, num_classes, dropout=0.5):
    model = Sequential([
        Embedding(vocab_size, embed_size, input_length=max_len),
        Bidirectional(SimpleRNN(hidden_size, return_sequences=False)),
        Dropout(dropout),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [12]:
vocab_size = min(max_words, len(tokenizer.word_index) + 1)
embed_size = 100
hidden_size = 128
num_epochs = 10
batch_size = 32

# Build and train TextRNN
print("Training TextRNN...")
model_rnn = build_text_rnn(vocab_size, embed_size, hidden_size, num_classes)
model_rnn.summary()
history_rnn = model_rnn.fit(
    X_train, y_train,
    epochs=num_epochs,
    batch_size=batch_size,
    validation_data=(X_test, y_test),
    verbose=1
)

# Evaluate TextRNN
test_loss_rnn, test_acc_rnn = model_rnn.evaluate(X_test, y_test, verbose=0)
print(f"TextRNN Test Accuracy: {test_acc_rnn*100:.2f}%")

Training TextRNN...




Epoch 1/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 42ms/step - accuracy: 0.0661 - loss: 3.0176 - val_accuracy: 0.1398 - val_loss: 2.8538
Epoch 2/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step - accuracy: 0.1903 - loss: 2.7767 - val_accuracy: 0.1935 - val_loss: 2.7933
Epoch 3/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - accuracy: 0.3398 - loss: 2.4179 - val_accuracy: 0.2742 - val_loss: 2.7388
Epoch 4/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 44ms/step - accuracy: 0.4244 - loss: 2.0708 - val_accuracy: 0.2688 - val_loss: 2.7576
Epoch 5/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - accuracy: 0.5246 - loss: 1.7255 - val_accuracy: 0.3011 - val_loss: 2.7205
Epoch 6/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step - accuracy: 0.6488 - loss: 1.2943 - val_accuracy: 0.2688 - val_loss: 2.8792
Epoch 7/10
[1m24/24[0m [32m━━━━

In [13]:
# Build and train TextBiRNN
print("\nTraining TextBiRNN...")
model_birnn = build_text_birnn(vocab_size, embed_size, hidden_size, num_classes)
model_birnn.summary()
history_birnn = model_birnn.fit(
    X_train, y_train,
    epochs=num_epochs,
    batch_size=batch_size,
    validation_data=(X_test, y_test),
    verbose=1
)

# Evaluate TextBiRNN
test_loss_birnn, test_acc_birnn = model_birnn.evaluate(X_test, y_test, verbose=0)
print(f"TextBiRNN Test Accuracy: {test_acc_birnn*100:.2f}%")



Training TextBiRNN...


Epoch 1/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 63ms/step - accuracy: 0.0791 - loss: 2.9787 - val_accuracy: 0.1344 - val_loss: 2.8039
Epoch 2/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 78ms/step - accuracy: 0.1727 - loss: 2.7716 - val_accuracy: 0.1613 - val_loss: 2.7599
Epoch 3/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 46ms/step - accuracy: 0.2164 - loss: 2.6607 - val_accuracy: 0.1828 - val_loss: 2.6424
Epoch 4/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 44ms/step - accuracy: 0.2646 - loss: 2.4129 - val_accuracy: 0.2258 - val_loss: 2.5807
Epoch 5/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 44ms/step - accuracy: 0.4309 - loss: 2.0237 - val_accuracy: 0.2849 - val_loss: 2.4971
Epoch 6/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 46ms/step - accuracy: 0.5571 - loss: 1.6107 - val_accuracy: 0.2527 - val_loss: 2.8036
Epoch 7/10
[1m24/24[0m [32m━━━━

In [14]:
def predict_category(ingredient, model, tokenizer, label_encoder, max_len=50):
    text = clean_text(ingredient)
    sequence = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(sequence, maxlen=max_len, padding='post', truncating='post')
    prediction = model.predict(padded, verbose=0)
    predicted_class = np.argmax(prediction, axis=1)
    return label_encoder.inverse_transform(predicted_class)[0]

In [16]:
# Example usage
example_ingredients = ["strawberry","vanila"]
print("\nPredictions with TextBiRNN:")
for ingredient in example_ingredients:
    category = predict_category(ingredient, model_birnn, tokenizer, label_encoder)
    print(f"Ingredient: {ingredient}, Predicted Category: {category}")


Predictions with TextBiRNN:
Ingredient: strawberry, Predicted Category: Fruit
Ingredient: vanila, Predicted Category: Meat
