In [7]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint

# Function to create binary labels
def create_labels(data, column, keyword):
    return data[column].apply(lambda x: 1 if keyword.lower() in x.lower() else 0).values

# Load the data
train_data = pd.read_csv('C:/Users/hp/Downloads/NLP Project/NLP_Recipe_train.csv')
test_data = pd.read_csv('C:/Users/hp/Downloads/NLP Project/NLP_Recipe_test.csv')

# Create binary labels based on the presence of "vegetarian" in the description
train_labels = create_labels(train_data, 'description', 'vegetarian')
test_labels = create_labels(test_data, 'description', 'vegetarian')

# Parameters for tokenization and padding
vocab_size = 10000  # Size of the vocabulary
max_length = 200    # Maximum length of each sequence
trunc_type = 'post' # Truncate the sequences from the end
padding_type = 'post' # Pad the sequences at the end
oov_tok = "<OOV>"   # Token for out-of-vocabulary words

# Initialize the tokenizer
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_data['steps'])

# Convert texts to sequences of integers
train_sequences = tokenizer.texts_to_sequences(train_data['steps'])
test_sequences = tokenizer.texts_to_sequences(test_data['steps'])

# Pad the sequences to ensure uniform length
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Define and compile the LSTM model
lstm_model = Sequential([
    Embedding(vocab_size, 64, input_length=max_length),
    LSTM(64),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')  # Sigmoid activation for binary classification
])
lstm_model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

# Define checkpoint callback to save the best model
checkpoint_path = "C:/Users/hp/Downloads/NLP Project/NLP_Project_Group35/baseline_best_model.keras"
checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')

print("Training the LSTM model")
# Train the LSTM model
lstm_model.fit(train_padded, train_labels, epochs=10, validation_data=(test_padded, test_labels), callbacks=[checkpoint])

# Define and compile the GRU model
gru_model = Sequential([
    Embedding(vocab_size, 64, input_length=max_length),
    GRU(64),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')  # Sigmoid activation for binary classification
])
gru_model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

print("Training the GRU model")
# Train the GRU model
gru_model.fit(train_padded, train_labels, epochs=10, validation_data=(test_padded, test_labels), callbacks=[checkpoint])


Training the LSTM model
Epoch 1/10




[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step - accuracy: 0.9742 - loss: 0.1831
Epoch 1: val_accuracy improved from -inf to 0.98333, saving model to C:/Users/hp/Downloads/NLP Project/NLP_Project_Group35/baseline_best_model.keras
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 100ms/step - accuracy: 0.9742 - loss: 0.1828 - val_accuracy: 0.9833 - val_loss: 0.0831
Epoch 2/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step - accuracy: 0.9795 - loss: 0.0975
Epoch 2: val_accuracy did not improve from 0.98333
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 88ms/step - accuracy: 0.9795 - loss: 0.0975 - val_accuracy: 0.9833 - val_loss: 0.0862
Epoch 3/10
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step - accuracy: 0.9784 - loss: 0.0995
Epoch 3: val_accuracy did not improve from 0.98333
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 95ms/step - accuracy: 0

<keras.src.callbacks.history.History at 0x23e876a6850>

In [9]:
from nltk.translate.bleu_score import sentence_bleu
from tensorflow.keras.models import load_model
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the model from the checkpoint
model = load_model("C:/Users/hp/Downloads/NLP Project/NLP_Project_Group35/baseline_best_model.keras")

# Function to generate text based on the model's prediction
def generate_text(model, sequence):
    # Predict the probability of the sequence being in the positive class
    probability = model.predict(sequence)[0][0]
    
    # Generate a text output based on the probability threshold (e.g., 0.5)
    if probability > 0.5:
        return "Vegetarian"
    else:
        return "Non-Vegetarian"

# Load and prepare your test data
test_data = pd.read_csv('C:/Users/hp/Downloads/NLP Project/NLP_Recipe_test.csv')
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(test_data['steps'])  # Assuming 'steps' is the column for text
test_sequences = tokenizer.texts_to_sequences(test_data['steps'])
test_padded = pad_sequences(test_sequences, maxlen=200, padding='post', truncating='post')

# Generate texts for the first 5 test samples
generated_texts = [generate_text(model, test_padded[i].reshape(1, -1)) for i in range(5)]

# Define reference text (not meaningful in this context, used for demonstration)
reference_texts = [["this", "is", "a", "reference", "text"]]

# Calculate BLEU scores for the first 5 generated texts (conceptually incorrect usage of BLEU)
bleu_scores = [sentence_bleu([reference_texts[0]], generated_text.split()) for generated_text in generated_texts]
print("BLEU Scores:", bleu_scores)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 386ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
BLEU Scores: [0, 0, 0, 0, 0]


In [2]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import Dataset, DatasetDict

# Load the data
train_data = pd.read_csv('C:/Users/hp/Downloads/NLP Project/NLP_Recipe_train.csv')
test_data = pd.read_csv('C:/Users/hp/Downloads/NLP Project/NLP_Recipe_test.csv')

# Prepare the dataset to have input text and target text
def format_recipe(item):
    return {
        "input_text": f"Key Ingredients and Details: {item['ner']}\nFull Ingredients and Steps:",
        "target_text": f"{item['ingredients']} {item['steps']}"
    }

train_dataset = Dataset.from_pandas(train_data).map(format_recipe)
test_dataset = Dataset.from_pandas(test_data).map(format_recipe)
datasets = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

# Tokenizer and model selection
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=False,  # Set to False to disable mixed precision training
    fp16_full_eval=False,  # Set to False to disable FP16 half precision evaluation
)


# Define Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["test"],
    tokenizer=tokenizer
)

# Train the model
trainer.train()

# To generate a recipe, define a function that takes key ingredients and details
def generate_recipe(key_ingredients):
    prompt = f"Key Ingredients and Details: {key_ingredients}\nFull Ingredients and Steps:"
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
    outputs = model.generate(inputs["input_ids"], max_length=150)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example usage
print(generate_recipe("chicken, garlic, onion, tomato, basil"))


Map: 100%|██████████| 6118/6118 [00:01<00:00, 5718.51 examples/s]
Map: 100%|██████████| 1080/1080 [00:00<00:00, 5471.58 examples/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  0%|          | 0/4590 [00:00<?, ?it/s]

IndexError: Invalid key: 4191 is out of bounds for size 0

In [5]:
%pip install sentencepiece


Defaulting to user installation because normal site-packages is not writeable
Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp311-cp311-win_amd64.whl (991 kB)
     -------------------------------------- 991.5/991.5 kB 4.8 MB/s eta 0:00:00
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip
