# Imports

In [1]:
import pandas as pd

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from collections import Counter

# Dataset Inspection

In [2]:
# Read the pre-cleaned ingredient table from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer='clean_df.csv')

In [3]:
# Collapse the long table to one row per recipe_id, gathering that
# recipe's 'ingredient' values into a Python list
grouped_df = (
    df.groupby('recipe_id')['ingredient']
    .apply(list)
    .reset_index()
)

In [4]:
# One inner list of ingredient names per recipe
recipes = grouped_df['ingredient'].tolist()

# Vocabulary: every distinct ingredient, in order of first appearance in df
all_ingredients = df['ingredient'].unique().tolist()

# Multi-hot encode each recipe against the fixed vocabulary
# (one column per ingredient, 1 where the recipe contains it)
mlb = MultiLabelBinarizer(classes=all_ingredients)
encoded_recipes = mlb.fit_transform(recipes)

# Wrap the encoded matrix in a DataFrame whose columns are the ingredient names
encoded_df = pd.DataFrame(data=encoded_recipes, columns=mlb.classes_)

# Outline:

In [5]:
# Size of the ingredient vocabulary; the model cell uses it for both the
# input shape and the width of the output layer.
input_dim = len(all_ingredients)  # Number of unique ingredients

In [6]:
# Attempting dropout layers
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Define the model: a fully connected network that maps a multi-hot
# ingredient vector (length input_dim) to one independent probability per
# ingredient (also length input_dim).
model = Sequential()
model.add(Dropout(0.1, input_shape=(input_dim,)))  # Light dropout applied directly to the input vector
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))  # Dropout after each hidden layer
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(input_dim, activation='sigmoid'))  # Sigmoid: each ingredient is scored independently (multi-label)


# Compile the model.
# The metric is spelled out as 'binary_accuracy' on purpose: with a
# multi-unit output the generic 'accuracy' string is resolved by Keras to
# categorical (argmax) accuracy, which is not meaningful for multi-hot
# targets. 'binary_accuracy' thresholds every output unit independently,
# matching the binary_crossentropy loss.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['binary_accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dropout (Dropout)           (None, 389)               0         
                                                                 
 dense (Dense)               (None, 512)               199680    
                                                                 
 dropout_1 (Dropout)         (None, 512)               0         
                                                                 
 dense_1 (Dense)             (None, 256)               131328    
                                                                 
 dropout_2 (Dropout)         (None, 256)               0         
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dropout_3 (Dropout)         (None, 128)               0

Model Training:

In [7]:
# Autoencoder-style setup: the network is trained to reproduce its own
# input, so the multi-hot recipe matrix serves as both features and targets.
X = encoded_df.to_numpy()  # Input features (binary vectors of ingredients)
y = X                      # Targets are the very same vectors

model.fit(x=X, y=y, batch_size=50, epochs=100, validation_split=0.2)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x244da5202b0>

Generating New Recipes:

Start with a seed ingredient or set of ingredients.
Use the trained model to predict additional ingredients.

## Next cell: include each of the top-ranked ingredients at random, based on its predicted probability

In [None]:
import numpy as np
from collections import Counter
import pandas as pd

# Assuming `predicted_probabilities` is already defined and is a 2D array
# and `mlb` is your MultiLabelBinarizer instance

def generate_semi_random_cocktail(predicted_probabilities, mlb, top_n=10, threshold=0.05,
                                  random_scale=0.1, rng=None):
    """Pick a semi-random subset of the highest-probability ingredients.

    Only the first row of ``predicted_probabilities`` is used. Its entries are
    ranked in descending order and the best ``top_n`` are considered in turn.
    A candidate is kept when its probability is strictly greater than
    ``threshold`` and a uniform draw in ``[0, 1)`` multiplied by
    ``random_scale`` is smaller than its probability.

    Parameters
    ----------
    predicted_probabilities : array-like
        1-D vector or 2-D array of per-ingredient probabilities. A 1-D input
        is treated as a single row.
    mlb : object
        Anything exposing ``classes_`` (e.g. a fitted MultiLabelBinarizer),
        indexed in the same order as the probability columns.
    top_n : int, default 10
        Maximum number of top-ranked ingredients to consider. Values larger
        than the number of available probabilities are clamped instead of
        raising ``IndexError``.
    threshold : float, default 0.05
        Candidates with probability not above this value are never selected.
    random_scale : float, default 0.1
        Factor applied to the uniform draw. With the default, every candidate
        whose probability is at least 0.1 is always selected; use 1.0 to
        select each candidate with exactly its own probability.
    rng : numpy.random.Generator, optional
        Source of random draws. When omitted the global ``np.random`` state
        is used, so ``np.random.seed`` still controls reproducibility.

    Returns
    -------
    list
        Selected ingredient names, ordered from most to least probable.
        Empty when there is nothing to select from.
    """
    probabilities = np.asarray(predicted_probabilities)

    # Ensure predicted_probabilities is 2D
    if probabilities.ndim == 1:
        probabilities = probabilities.reshape(1, -1)

    # Nothing to rank: no rows at all, or a row with no columns
    if probabilities.shape[0] == 0 or probabilities.shape[1] == 0:
        return []

    # Only the first prediction row is used, so only that row is sorted
    first_row = probabilities[0]
    ranking = np.argsort(-first_row)  # indices in descending probability order
    ranked_probabilities = first_row[ranking]

    # Map the ranked indices back to ingredient names
    ranked_ingredients = np.asarray(mlb.classes_)[ranking]

    # Never look past the end of the ranking, even if top_n is larger
    n_candidates = max(0, min(int(top_n), ranking.size))

    selected_ingredients = []
    for position in range(n_candidates):
        probability = ranked_probabilities[position]
        if not probability > threshold:
            # Ranking is descending, so no later candidate can pass either
            break
        # Draw only for candidates that cleared the threshold
        draw = rng.random() if rng is not None else np.random.random()
        if draw * random_scale < probability:
            selected_ingredients.append(ranked_ingredients[position])

    return selected_ingredients

# Number of times to generate recipes
num_iterations = 1000

# Example seed ingredients, encoded as a single multi-hot vector
seed_ingredients = ['gin', 'lime juice']
seed_vector = mlb.transform([seed_ingredients])[0]

# List to store generated recipes
generated_recipes = []

for _ in range(num_iterations):
    # Fit the model for one more epoch.
    # NOTE(review): this keeps training the same model on every iteration
    # (1000 extra epochs in total), so predictions drift during the loop.
    # Kept as-is; confirm this is intended rather than predicting once.
    model.fit(X, y, epochs=1, batch_size=389, validation_split=0.2, verbose=0)

    # Predict probabilities for the seed (verbose=0 avoids printing a
    # progress bar on each of the num_iterations calls)
    predicted_probabilities = model.predict(seed_vector.reshape(1, -1), verbose=0)

    # Generate recipe using the semi-random selection method
    generated_recipe = generate_semi_random_cocktail(predicted_probabilities, mlb, top_n=10, threshold=0.000000005)

    # Add the seed ingredients to ensure they are included, drop duplicates,
    # and sort: set iteration order is not guaranteed, so without sorting the
    # same ingredient set could produce different tuples and be counted as
    # several distinct recipes.
    full_recipe = sorted(set(seed_ingredients + generated_recipe))

    # Store the generated recipe as a tuple so it is hashable for counting
    generated_recipes.append(tuple(full_recipe))

# Count the occurrences of each unique recipe
recipe_counts = Counter(generated_recipes)

# Convert to DataFrame.
# NOTE: this rebinds `df`, which an earlier cell loaded from clean_df.csv;
# re-run the loading cell before re-running any cell that needs the raw data.
df = pd.DataFrame(list(recipe_counts.items()), columns=['Recipe', 'Count'])

# Sort by count (optional)
df = df.sort_values(by='Count', ascending=False)



In [None]:
import pandas as pd

# Set the maximum number of rows to display
pd.set_option('display.max_rows', 10)  # Caps displayed DataFrames at 10 rows; longer frames are truncated


In [None]:
# Display the current `df` (the recipe/count table once the generation cell has run)
df

In [None]:

import matplotlib.pyplot as plt

# Histogram of the 'Count' column of df, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df['Count'], bins=10, color='blue', edgecolor='black')
ax.set_title('Histogram of Count')
ax.set_xlabel('Count')
ax.set_ylabel('Frequency')

# Render the figure
plt.show()


In [None]:
import numpy as np

# Relies on two names from earlier cells: `predicted_probabilities`
# (a 2D array) and `mlb` (the MultiLabelBinarizer instance).

# Per-row column order that ranks probabilities from highest to lowest,
# and the probabilities rearranged into that order
sorted_indices = np.argsort(-predicted_probabilities, axis=1)
sorted_probabilities = np.take_along_axis(predicted_probabilities, sorted_indices, axis=1)

# Only the first prediction row is reported
top_ingredients_indices, top_probabilities = sorted_indices[0], sorted_probabilities[0]

# Translate column indices into ingredient names
top_ingredients = mlb.classes_[top_ingredients_indices]

# One line per ingredient, most probable first
for ingredient, probability in zip(top_ingredients, top_probabilities):
    print(f"Ingredient: {ingredient}, Probability: {probability}")
