<a href="https://colab.research.google.com/github/Adrita2211/ML_Project/blob/ML_Project_branch/integredient-category-detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Bidirectional, Dense, Dropout
from tensorflow.keras.utils import to_categorical

# --- Data Preprocessing ---

# Load the dataset
data = pd.read_csv('/content/sample_data/Ingredients.csv')

# Combine 'Aliased Ingredient Name' and 'Ingredient Synonyms' for richer text input.
# Missing values are filled *before* the cast to str: astype(str) on its own turns
# NaN into the literal token 'nan', which would then be tokenized as a real word.
# Casting the synonyms column as well keeps the concatenation from failing if the
# column holds non-string values.
data['text'] = (
    data['Aliased Ingredient Name'].fillna('').astype(str)
    + ' '
    + data['Ingredient Synonyms'].fillna('').astype(str)
)

# Clean text: lowercase, remove special characters
def clean_text(text):
    """Normalize a raw string before it is handed to the tokenizer.

    The input is lowercased, every character that is neither a word
    character nor whitespace is replaced by one space, and leading/trailing
    whitespace is trimmed. Internal runs of spaces are not collapsed.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    # Each punctuation character becomes its own space, so "a--b" -> "a  b".
    return re.sub(r'[^\w\s]', ' ', text.lower()).strip()

# Normalize the combined text column in place with clean_text.
data['text'] = data['text'].apply(clean_text)

# Tokenize text
# NOTE: the tokenizer is fitted on the whole dataset, i.e. before the
# train/test split further down.
max_words = 1000  # Maximum vocabulary size
max_len = 50      # Maximum sequence length
# Words that fall outside the kept vocabulary are encoded as the '<OOV>' token.
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(data['text'])
sequences = tokenizer.texts_to_sequences(data['text'])
# Pad and truncate at the end of each sequence so every row has max_len entries.
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

# Encode categories
# First map each category name to an integer id, then expand the ids to
# one-hot rows (required by the categorical_crossentropy loss used below).
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(data['Category'])
num_classes = len(label_encoder.classes_)
labels = to_categorical(labels, num_classes=num_classes)

# Split data
# 80/20 train/test split; random_state pins the shuffle for repeatable runs.
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

# --- Model Definitions ---

def build_text_rnn(vocab_size, embed_size, hidden_size, num_classes, dropout=0.5, seq_len=None):
    """Build and compile a unidirectional SimpleRNN text classifier.

    Args:
        vocab_size: Number of rows in the embedding table; must be larger
            than the highest token id fed to the model.
        embed_size: Dimensionality of each token embedding.
        hidden_size: Number of units in the recurrent layer.
        num_classes: Number of output classes (width of the softmax layer).
        dropout: Dropout rate applied to the final recurrent state.
        seq_len: Length of the padded input sequences. When omitted, the
            module-level ``max_len`` used during preprocessing is used.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, categorical
        cross-entropy loss, accuracy metric).
    """
    # Local import: Input is not among the file's top-level Keras imports.
    from tensorflow.keras import Input

    if seq_len is None:
        seq_len = max_len

    model = Sequential([
        # An explicit Input replaces Embedding's deprecated `input_length`
        # argument and keeps the model built so summary() works before fit().
        Input(shape=(seq_len,)),
        # Token id 0 is the padding value. Masking it stops the RNN from
        # stepping through the trailing zeros of post-padded sequences, so the
        # returned state is the one after the last real token.
        Embedding(vocab_size, embed_size, mask_zero=True),
        SimpleRNN(hidden_size, return_sequences=False),
        Dropout(dropout),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def build_text_birnn(vocab_size, embed_size, hidden_size, num_classes, dropout=0.5, seq_len=None):
    """Build and compile a bidirectional SimpleRNN text classifier.

    Args:
        vocab_size: Number of rows in the embedding table; must be larger
            than the highest token id fed to the model.
        embed_size: Dimensionality of each token embedding.
        hidden_size: Number of units in each direction of the recurrent layer.
        num_classes: Number of output classes (width of the softmax layer).
        dropout: Dropout rate applied to the merged recurrent output.
        seq_len: Length of the padded input sequences. When omitted, the
            module-level ``max_len`` used during preprocessing is used.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, categorical
        cross-entropy loss, accuracy metric).
    """
    # Local import: Input is not among the file's top-level Keras imports.
    from tensorflow.keras import Input

    if seq_len is None:
        seq_len = max_len

    model = Sequential([
        # An explicit Input replaces Embedding's deprecated `input_length`
        # argument and keeps the model built so summary() works before fit().
        Input(shape=(seq_len,)),
        # Token id 0 is the padding value. With the mask in place both the
        # forward and the backward pass skip padded positions instead of
        # treating the zeros as real tokens.
        Embedding(vocab_size, embed_size, mask_zero=True),
        Bidirectional(SimpleRNN(hidden_size, return_sequences=False)),
        Dropout(dropout),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# --- Model Parameters ---

# Embedding input dimension: capped at max_words, or the number of distinct
# words seen plus one (for the padding id 0) when the corpus is smaller.
vocab_size = min(max_words, len(tokenizer.word_index) + 1)
embed_size = 100
hidden_size = 128
num_epochs = 10
batch_size = 32

# Build and train TextRNN
print("Training TextRNN...")
model_rnn = build_text_rnn(vocab_size, embed_size, hidden_size, num_classes)
model_rnn.summary()
# NOTE(review): validation_data is the same split that is passed to
# evaluate() below, so the per-epoch val_* metrics and the reported test
# accuracy are computed on the same rows.
history_rnn = model_rnn.fit(
    X_train, y_train,
    epochs=num_epochs,
    batch_size=batch_size,
    validation_data=(X_test, y_test),
    verbose=1
)

# Evaluate TextRNN
test_loss_rnn, test_acc_rnn = model_rnn.evaluate(X_test, y_test, verbose=0)
print(f"TextRNN Test Accuracy: {test_acc_rnn*100:.2f}%")

# Build and train TextBiRNN
# Same hyper-parameters and data as the unidirectional model above, so the
# two accuracies are directly comparable.
print("\nTraining TextBiRNN...")
model_birnn = build_text_birnn(vocab_size, embed_size, hidden_size, num_classes)
model_birnn.summary()
history_birnn = model_birnn.fit(
    X_train, y_train,
    epochs=num_epochs,
    batch_size=batch_size,
    validation_data=(X_test, y_test),
    verbose=1
)

# Evaluate TextBiRNN
test_loss_birnn, test_acc_birnn = model_birnn.evaluate(X_test, y_test, verbose=0)
print(f"TextBiRNN Test Accuracy: {test_acc_birnn*100:.2f}%")

# --- Inference Function ---

def predict_category(ingredient, model, tokenizer, label_encoder, max_len=50):
    """Predict the category label for a single ingredient string.

    The ingredient is normalized with ``clean_text``, encoded with the given
    tokenizer, padded/truncated at the end to ``max_len`` and passed through
    ``model``. The highest-scoring class index is mapped back to its label.

    Args:
        ingredient: Raw ingredient text.
        model: Trained classifier exposing ``predict``.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        label_encoder: Fitted encoder exposing ``inverse_transform``.
        max_len: Sequence length the model input is padded to.

    Returns:
        The decoded category label for the ingredient.
    """
    # Wrap the cleaned text in a list: the tokenizer works on batches.
    encoded = tokenizer.texts_to_sequences([clean_text(ingredient)])
    model_input = pad_sequences(encoded, maxlen=max_len, padding='post', truncating='post')

    class_scores = model.predict(model_input, verbose=0)
    best_index = np.argmax(class_scores, axis=1)

    # The batch holds one row, so take the first (only) decoded label.
    decoded = label_encoder.inverse_transform(best_index)
    return decoded[0]

# Example usage: classify a few ingredient names with the trained
# bidirectional model and print each prediction.
example_ingredients = ["strawberry","vanila"]
print("\nPredictions with TextBiRNN:")
for ingredient in example_ingredients:
    # Reuses the tokenizer and label encoder fitted during preprocessing.
    category = predict_category(ingredient, model_birnn, tokenizer, label_encoder)
    print(f"Ingredient: {ingredient}, Predicted Category: {category}")

Training TextRNN...




Epoch 1/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 39ms/step - accuracy: 0.1167 - loss: 2.9472 - val_accuracy: 0.1398 - val_loss: 2.8275
Epoch 2/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 26ms/step - accuracy: 0.1231 - loss: 2.8912 - val_accuracy: 0.2043 - val_loss: 2.8691
Epoch 3/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step - accuracy: 0.1570 - loss: 2.8565 - val_accuracy: 0.1882 - val_loss: 2.6971
Epoch 4/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - accuracy: 0.2844 - loss: 2.4464 - val_accuracy: 0.2366 - val_loss: 2.7009
Epoch 5/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step - accuracy: 0.4618 - loss: 1.9493 - val_accuracy: 0.3011 - val_loss: 2.7494
Epoch 6/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - accuracy: 0.5842 - loss: 1.5374 - val_accuracy: 0.2581 - val_loss: 2.7897
Epoch 7/10
[1m24/24[0m [32m━━━━

Epoch 1/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 63ms/step - accuracy: 0.1169 - loss: 2.9800 - val_accuracy: 0.1344 - val_loss: 2.8100
Epoch 2/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 44ms/step - accuracy: 0.1669 - loss: 2.8226 - val_accuracy: 0.1344 - val_loss: 2.7892
Epoch 3/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 45ms/step - accuracy: 0.2423 - loss: 2.6386 - val_accuracy: 0.2957 - val_loss: 2.5819
Epoch 4/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 40ms/step - accuracy: 0.3212 - loss: 2.3433 - val_accuracy: 0.3011 - val_loss: 2.5726
Epoch 5/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 40ms/step - accuracy: 0.4566 - loss: 1.8450 - val_accuracy: 0.3118 - val_loss: 2.5431
Epoch 6/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 68ms/step - accuracy: 0.6419 - loss: 1.3965 - val_accuracy: 0.3011 - val_loss: 2.8255
Epoch 7/10
[1m24/24[0m [32m━━━━