# Imports

In [1]:
import pandas as pd

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from collections import Counter

# Dataset Inspection

In [2]:
# Load the cleaned data
df = pd.read_csv('clean_df.csv')

In [3]:
# Group by recipe_id and aggregate ingredients into lists
grouped_df = df.groupby('recipe_id')['ingredient'].apply(list).reset_index()

In [4]:
# Extract the list of lists of ingredients
recipes = grouped_df['ingredient'].tolist()

# Get the list of all unique ingredients
all_ingredients = df['ingredient'].unique().tolist()

# Use MultiLabelBinarizer to encode the ingredients
mlb = MultiLabelBinarizer(classes=all_ingredients)
encoded_recipes = mlb.fit_transform(recipes)

# Create a DataFrame for easier manipulation
encoded_df = pd.DataFrame(encoded_recipes, columns=mlb.classes_)

# Outline:

In [5]:
input_dim = len(all_ingredients)  # Number of unique ingredients

In [6]:
# Attemping dropout layers
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Define the model
model = Sequential()
model.add(Dropout(0.1, input_shape=(input_dim,)))  # Dropout layer for the input layer
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))  # Dropout layer for hidden layers
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))  # Dropout layer for hidden layers
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))  # Dropout layer for hidden layers
model.add(Dense(input_dim, activation='sigmoid'))  # Output layer for multi-label classification


# Compile the model
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dropout (Dropout)           (None, 389)               0         
                                                                 
 dense (Dense)               (None, 512)               199680    
                                                                 
 dropout_1 (Dropout)         (None, 512)               0         
                                                                 
 dense_1 (Dense)             (None, 256)               131328    
                                                                 
 dropout_2 (Dropout)         (None, 256)               0         
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dropout_3 (Dropout)         (None, 128)               0

Model Training:

In [7]:
X = encoded_df.values  # Input features (binary vectors of ingredients)
y = encoded_df.values  # Target labels (same as input for multi-label classification)

#model.fit(X, y, epochs=20, batch_size=50, validation_split=0.2)

Generating New Recipes:

Start with a seed ingredient or set of ingredients.
Use the trained model to predict additional ingredients.

## Next chunk of code: Randomly decides to include this ingredient based on its probability

In [8]:
import numpy as np
from collections import Counter
import pandas as pd

# Assuming `predicted_probabilities` is already defined and is a 2D array
# and `mlb` is your MultiLabelBinarizer instance

def generate_semi_random_cocktail(predicted_probabilities, mlb, top_n=10, threshold=0.05):
    # Ensure predicted_probabilities is 2D
    if predicted_probabilities.ndim == 1:
        predicted_probabilities = predicted_probabilities.reshape(1, -1)
    
    # Get the indices that would sort each row of the array in descending order
    sorted_indices = np.argsort(-predicted_probabilities, axis=1)
    
    # Get the sorted probabilities
    sorted_probabilities = np.take_along_axis(predicted_probabilities, sorted_indices, axis=1)

    # For each row, map the sorted indices back to the ingredient names
    # Assuming you want to do this for the first row in the predictions
    top_ingredients_indices = sorted_indices[0]
    top_probabilities = sorted_probabilities[0]

    # Map these indices back to the ingredient names
    top_ingredients = mlb.classes_[top_ingredients_indices]

    # Semi-random selection based on sorted probabilities
    selected_ingredients = []
    for i in range(top_n):
        if top_probabilities[i] > threshold:
            # Randomly decide to include this ingredient based on its probability
            if (np.random.random() * 0.1) < top_probabilities[i]:
                selected_ingredients.append(top_ingredients[i])
    
    return selected_ingredients

# Number of times to generate recipes
num_iterations = 1000

# Example seed ingredients
seed_ingredients = ['gin', 'lime juice']
seed_vector = mlb.transform([seed_ingredients])[0]

# List to store generated recipes
generated_recipes = []

for _ in range(num_iterations):
    # Fit the model for one epoch
    model.fit(X, y, epochs=1, batch_size=389, validation_split=0.2, verbose=0)
    
    # Predict probabilities
    predicted_probabilities = model.predict(seed_vector.reshape(1, -1))
    
    # Generate recipe using the semi-random selection method
    generated_recipe = generate_semi_random_cocktail(predicted_probabilities, mlb, top_n=10, threshold=0.000000005)
    
    # Add the seed ingredients to ensure they are included
    full_recipe = list(set(seed_ingredients + generated_recipe))
    
    # Store the generated recipe as a tuple for easier counting
    generated_recipes.append(tuple(full_recipe))

# Count the occurrences of each unique recipe
recipe_counts = Counter(generated_recipes)

# Convert to DataFrame
df = pd.DataFrame(recipe_counts.items(), columns=['Recipe', 'Count'])

# Sort by count (optional)
df = df.sort_values(by='Count', ascending=False)



In [14]:
df

Unnamed: 0,Recipe,Count
144,"(gin, lime juice)",74
248,"(gin, luxardo maraschino, lime juice)",57
275,"(green chatreuse, gin, luxardo maraschino, lim...",43
276,"(green chatreuse, gin, lime juice)",41
27,"(angostura, lime juice, simple syrup, sweet ve...",38
...,...,...
141,"(lime juice, gin, grenadine, orange curaçao, c...",1
140,"(club soda, lime juice, gin, orange curaçao, c...",1
138,"(lime juice, apricot brandy, lemon twist, gin,...",1
137,"(lime juice, apricot brandy, lemon twist, gin,...",1


In [10]:
import numpy as np

# Assuming `predicted_probabilities` is already defined and is a 2D array
# and `mlb` is your MultiLabelBinarizer instance

# Get the indices that would sort each row of the array in descending order
sorted_indices = np.argsort(-predicted_probabilities, axis=1)

# Get the sorted probabilities
sorted_probabilities = np.take_along_axis(predicted_probabilities, sorted_indices, axis=1)

# For each row, map the sorted indices back to the ingredient names
# Assuming you want to do this for the first row in the predictions
top_ingredients_indices = sorted_indices[0]
top_probabilities = sorted_probabilities[0]

# Map these indices back to the ingredient names
top_ingredients = mlb.classes_[top_ingredients_indices]

# Print the top ingredients and their corresponding probabilities
for ingredient, probability in zip(top_ingredients, top_probabilities):
    print(f"Ingredient: {ingredient}, Probability: {probability}")


Ingredient: gin, Probability: 0.9821605682373047
Ingredient: lime juice, Probability: 0.9812763333320618
Ingredient: simple syrup, Probability: 0.0165865495800972
Ingredient: simple, Probability: 0.014875117689371109
Ingredient: green chatreuse, Probability: 0.014819134026765823
Ingredient: luxardo maraschino, Probability: 0.014735396951436996
Ingredient: cointreau, Probability: 0.008647087030112743
Ingredient: apricot brandy, Probability: 0.007234945893287659
Ingredient: orange curaçao, Probability: 0.0068562570959329605
Ingredient: club soda, Probability: 0.006665622349828482
Ingredient: orange juice, Probability: 0.006034763529896736
Ingredient: green chartreuse, Probability: 0.005578760989010334
Ingredient: grenadine, Probability: 0.005445604212582111
Ingredient: ginger ale, Probability: 0.0031947388779371977
Ingredient: lemon juice, Probability: 0.002903936430811882
Ingredient: sugar, Probability: 0.002682232763618231
Ingredient: orange bitters, Probability: 0.002456306479871273
I