# Imports

In [1]:
import pandas as pd

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from collections import Counter

# Dataset Inspection

In [2]:
# Load the cleaned data
df = pd.read_csv('clean_df.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2892 entries, 0 to 2891
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   quantity         2586 non-null   float64
 1   unit             2524 non-null   object 
 2   ingredient       2890 non-null   object 
 3   ingredient_step  2892 non-null   int64  
 4   recipe_id        2892 non-null   int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 113.1+ KB


In [4]:
# Group by recipe_id and aggregate ingredients into lists
grouped_df = df.groupby('recipe_id')['ingredient'].apply(list).reset_index()

# Display grouped DataFrame
grouped_df

Unnamed: 0,recipe_id,ingredient
0,0,"[dark rum, lime juice, sugar, angostura, perno..."
1,1,"[silver tequila, marie brizard creme de cacao,..."
2,2,"[gin, mari brizard white creme de cacao, lille..."
3,3,"[pernod, sugar, water]"
4,4,"[gold tequila, gold rum, grapefruit juice, pin..."
...,...,...
650,650,"[light rum, galliano, triple sec, lime juice]"
651,651,"[dark rum, light rum, tia maria, orange juice,..."
652,652,"[vodka, galliano, lime juice, pineapple juice]"
653,653,"[yellow chartreuse, pernod, apricot brandy]"


In [5]:
# Extract the list of lists of ingredients
recipes = grouped_df['ingredient'].tolist()

# Get the list of all unique ingredients
all_ingredients = df['ingredient'].unique().tolist()

# Use MultiLabelBinarizer to encode the ingredients
mlb = MultiLabelBinarizer(classes=all_ingredients)
encoded_recipes = mlb.fit_transform(recipes)

# Create a DataFrame for easier manipulation
encoded_df = pd.DataFrame(encoded_recipes, columns=mlb.classes_)

In [6]:
encoded_df.shape

(655, 389)

# Outline:

Model Building:

Our model architecture is:

l0: 389 (amount of unique ingredients)

hidden layers:          relu

L :            sigmoid   , 389

In [7]:
input_dim = len(all_ingredients)  # Number of unique ingredients

In [None]:


model = Sequential()
model.add(Dense(units=128, input_dim=input_dim, activation='relu'))
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=input_dim, activation='sigmoid'))  # Output layer with sigmoid activation

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [8]:
# attempt at at a 2nd model, larger architecture since 389 param

input_dim = len(all_ingredients)  # Number of unique ingredients

model = Sequential()
model.add(Dense(units=512, input_dim=input_dim, activation='relu'))  # First hidden layer
model.add(Dense(units=256, activation='relu'))  # Second hidden layer
model.add(Dense(units=128, activation='relu'))  # Second hidden layer
model.add(Dense(units=input_dim, activation='sigmoid'))  # Output layer

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [9]:
# Print model summary to verify the architecture
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 512)               199680    
                                                                 
 dense_1 (Dense)             (None, 256)               131328    
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 389)               50181     
                                                                 
Total params: 414,085
Trainable params: 414,085
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Attemping dropout layers
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Define the model
model = Sequential()
model.add(Dropout(0.1, input_shape=(input_dim,)))  # Dropout layer for the input layer
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))  # Dropout layer for hidden layers
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))  # Dropout layer for hidden layers
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))  # Dropout layer for hidden layers
model.add(Dense(input_dim, activation='sigmoid'))  # Output layer for multi-label classification


# Compile the model
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

Model Training:

In [10]:
X = encoded_df.values  # Input features (binary vectors of ingredients)
y = encoded_df.values  # Target labels (same as input for multi-label classification)

#model.fit(X, y, epochs=20, batch_size=50, validation_split=0.2)

Generating New Recipes:

Start with a seed ingredient or set of ingredients.
Use the trained model to predict additional ingredients.

In [11]:
model.fit(X, y, epochs=50, batch_size=50, validation_split=0.2)

seed_ingredients = ['vodka']  # Example seed ingredients
seed_vector = mlb.transform([seed_ingredients])[0]

predicted_probabilities = model.predict(seed_vector.reshape(1, -1))
predicted_ingredients = mlb.inverse_transform((predicted_probabilities > 0.00001).astype(int))[0]

generated_recipe = list(set(seed_ingredients + list(predicted_ingredients)))
print("Generated Recipe:", generated_recipe)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Generated Recipe: ['kassatly chtaura orgeat', 'kahlúa', nan, 'vodka', 'rose petals', 'apricot liqueur', 'lillet blanc', 'orange juice', 'baileys irish cream', 'drambuie', 'cherry brandy', 'cognac', 'pastis', 'cucumber spear', 'falernum', 'milk', 'campari', 'hot sauce', 'old tom gin', 'cola', 'chopped green onion', 'pineapple pieces', 'whiskey', 'cubed ripe papaya', 'silver tequila', 'tia maria coffee lique

In [12]:
# Number of times to generate recipes
num_iterations = 100

# Example seed ingredients
seed_ingredients = ['gin', 'lime juice']
seed_vector = mlb.transform([seed_ingredients])[0]

# List to store generated recipes
generated_recipes = []

for _ in range(num_iterations):
    # Fit the model for one epoch
    model.fit(X, y, epochs=5, batch_size=389, validation_split=0.2, verbose=0)
    
    # Predict probabilities
    predicted_probabilities = model.predict(seed_vector.reshape(1, -1))
    
    # Convert probabilities to ingredients
    predicted_ingredients = mlb.inverse_transform((predicted_probabilities > 0.001).astype(int))[0]
    
    # Generate recipe
    generated_recipe = list(set(seed_ingredients + list(predicted_ingredients)))
    
    # Store the generated recipe as a tuple for easier counting
    generated_recipes.append(tuple(generated_recipe))

# Count the occurrences of each unique recipe
recipe_counts = Counter(generated_recipes)

# Convert to DataFrame
df = pd.DataFrame(recipe_counts.items(), columns=['Recipe', 'Count'])

# Sort by count (optional)
df = df.sort_values(by='Count', ascending=False)

print(df)


                                               Recipe  Count
23                                  (lime juice, gin)     70
22                  (lime juice, orange curaçao, gin)      5
18  (lime juice, luxardo maraschino, simple, orang...      3
14  (red pepper sauce, lime juice, luxardo marasch...      2
1   (lime juice, passion fruit liqueur, peach liqu...      1
21  (lime juice, luxardo maraschino, simple, orang...      1
20  (lime juice, orange curaçao, luxardo maraschin...      1
19  (lime juice, luxardo maraschino, orange curaça...      1
17  (red pepper sauce, lime juice, luxardo marasch...      1
16  (red pepper sauce, lime juice, luxardo marasch...      1
15  (red pepper sauce, lime juice, luxardo marasch...      1
13  (red pepper sauce, orange curaçao, lime juice,...      1
0   (calvados, lime juice, passion fruit liqueur, ...      1
11  (red pepper sauce, lime juice, passion fruit l...      1
10  (red pepper sauce, orange curaçao, lime juice,...      1
9   (lime juice, passion

In [13]:
df['Recipe']

23                                    (lime juice, gin)
22                    (lime juice, orange curaçao, gin)
18    (lime juice, luxardo maraschino, simple, orang...
14    (red pepper sauce, lime juice, luxardo marasch...
1     (lime juice, passion fruit liqueur, peach liqu...
21    (lime juice, luxardo maraschino, simple, orang...
20    (lime juice, orange curaçao, luxardo maraschin...
19    (lime juice, luxardo maraschino, orange curaça...
17    (red pepper sauce, lime juice, luxardo marasch...
16    (red pepper sauce, lime juice, luxardo marasch...
15    (red pepper sauce, lime juice, luxardo marasch...
13    (red pepper sauce, orange curaçao, lime juice,...
0     (calvados, lime juice, passion fruit liqueur, ...
11    (red pepper sauce, lime juice, passion fruit l...
10    (red pepper sauce, orange curaçao, lime juice,...
9     (lime juice, passion fruit liqueur, peach liqu...
8     (lime juice, passion fruit liqueur, peach liqu...
7     (lime juice, passion fruit liqueur, peach 

In [14]:
#df['Recipe'][0]

In [15]:
import numpy as np

# Assuming `predicted_probabilities` is already defined and is a 2D array
# and `mlb` is your MultiLabelBinarizer instance

# Get the indices that would sort each row of the array in descending order
sorted_indices = np.argsort(-predicted_probabilities, axis=1)

# Get the sorted probabilities
sorted_probabilities = np.take_along_axis(predicted_probabilities, sorted_indices, axis=1)

# For each row, map the sorted indices back to the ingredient names
# Assuming you want to do this for the first row in the predictions
top_ingredients_indices = sorted_indices[0]
top_probabilities = sorted_probabilities[0]

# Map these indices back to the ingredient names
top_ingredients = mlb.classes_[top_ingredients_indices]

# Print the top ingredients and their corresponding probabilities
for ingredient, probability in zip(top_ingredients, top_probabilities):
    print(f"Ingredient: {ingredient}, Probability: {probability}")


Ingredient: lime juice, Probability: 0.9999992847442627
Ingredient: gin, Probability: 0.9999978542327881
Ingredient: luxardo maraschino, Probability: 0.00014239137817639858
Ingredient: green chatreuse, Probability: 5.950605918769725e-05
Ingredient: orange curaçao, Probability: 3.428636773605831e-05
Ingredient: red pepper sauce, Probability: 1.5466295735677704e-05
Ingredient: simple, Probability: 1.3398287592281122e-05
Ingredient: guava nectar, Probability: 1.023065942717949e-05
Ingredient: galliano, Probability: 8.401830200455151e-06
Ingredient: clam juice, Probability: 8.120511665765662e-06
Ingredient: campari, Probability: 6.73890008329181e-06
Ingredient: peach liqueur, Probability: 5.9380749917181674e-06
Ingredient: ginger beer, Probability: 5.406109721661778e-06
Ingredient: orange juice, Probability: 4.796224402525695e-06
Ingredient: calvados, Probability: 3.872756678902078e-06
Ingredient: cherry brandy, Probability: 2.8033630314894253e-06
Ingredient: pomegranate syrup, Probability

Summary of Steps:
Clean Data Preparation:

Create a DataFrame with binary encoded ingredient vectors for each recipe.
Model Building:

Define a neural network to learn from these vectors.
Model Training:

Train the model on your dataset.
Recipe Generation:

Generate new recipes starting from seed ingredients.
By following these steps, you should be able to create and train a model that can generate new cocktail recipes based on the patterns it learns from your dataset. Let me know if you need more detail on any of these steps!

The architecture described here is a simple feedforward neural network, also known as a Multi-Layer Perceptron (MLP). This architecture is used for multi-label classification, where each ingredient is treated as a binary label that can be present or absent in a recipe.

Architecture Explanation
Input Layer:

The input layer consists of nodes equal to the number of unique ingredients. Each node represents whether a particular ingredient is present (1) or absent (0) in the recipe.
Input Dimension: input_dim = len(all_ingredients).
Hidden Layers:

The network has two hidden layers, each with 128 neurons. These layers use the ReLU (Rectified Linear Unit) activation function, which introduces non-linearity to the model and allows it to learn complex patterns.
First Hidden Layer: Dense(units=128, input_dim=input_dim, activation='relu')
Second Hidden Layer: Dense(units=128, activation='relu')
Output Layer:

The output layer also consists of nodes equal to the number of unique ingredients. Each node represents the probability of the corresponding ingredient being part of the recipe.
The sigmoid activation function is used in the output layer to produce probabilities between 0 and 1 for each ingredient.
Output Layer: Dense(units=input_dim, activation='sigmoid')
Loss Function:

The model uses binary cross-entropy loss, suitable for multi-label classification where each label (ingredient) is a binary decision (present or absent).
Loss Function: binary_crossentropy
Optimizer:

The Adam optimizer is used to minimize the loss function during training. Adam is a popular choice due to its adaptive learning rate and efficiency.
Optimizer: adam
Metrics:

The accuracy metric is used to evaluate the model's performance during training and validation.
Metrics: accuracy
Why This Architecture?
Simplicity: This architecture is straightforward and easy to implement, making it suitable for a dataset of your size (654 recipes and 447 unique ingredients).
Flexibility: The use of dense layers allows the network to learn from the presence and absence of each ingredient, capturing the relationships between different ingredients.
Scalability: Adding more hidden layers or increasing the number of neurons in each layer can improve the model's capacity to learn more complex patterns, if needed.
Summary
This architecture is a feedforward neural network designed for multi-label classification. It takes a binary vector representing the presence or absence of each ingredient as input and outputs a binary vector representing the predicted probabilities of each ingredient being part of the cocktail recipe. This approach is suitable for generating new recipes by learning the common ingredient combinations from the training data.








## Next chunk of code: Randomly decides to include this ingredient based on its probability

In [None]:
import numpy as np
from collections import Counter
import pandas as pd

# Assuming `predicted_probabilities` is already defined and is a 2D array
# and `mlb` is your MultiLabelBinarizer instance

def generate_semi_random_cocktail(predicted_probabilities, mlb, top_n=10, threshold=0.05):
    # Ensure predicted_probabilities is 2D
    if predicted_probabilities.ndim == 1:
        predicted_probabilities = predicted_probabilities.reshape(1, -1)
    
    # Get the indices that would sort each row of the array in descending order
    sorted_indices = np.argsort(-predicted_probabilities, axis=1)
    
    # Get the sorted probabilities
    sorted_probabilities = np.take_along_axis(predicted_probabilities, sorted_indices, axis=1)

    # For each row, map the sorted indices back to the ingredient names
    # Assuming you want to do this for the first row in the predictions
    top_ingredients_indices = sorted_indices[0]
    top_probabilities = sorted_probabilities[0]

    # Map these indices back to the ingredient names
    top_ingredients = mlb.classes_[top_ingredients_indices]

    # Semi-random selection based on sorted probabilities
    selected_ingredients = []
    for i in range(top_n):
        if top_probabilities[i] > threshold:
            # Randomly decide to include this ingredient based on its probability
            if (np.random.random() * 0.1) < top_probabilities[i]:
                selected_ingredients.append(top_ingredients[i])
    
    return selected_ingredients

# Number of times to generate recipes
num_iterations = 100

# Example seed ingredients
seed_ingredients = ['gin', 'lime juice']
seed_vector = mlb.transform([seed_ingredients])[0]

# List to store generated recipes
generated_recipes = []

for _ in range(num_iterations):
    # Fit the model for one epoch
    model.fit(X, y, epochs=1, batch_size=389, validation_split=0.2, verbose=0)
    
    # Predict probabilities
    predicted_probabilities = model.predict(seed_vector.reshape(1, -1))
    
    # Generate recipe using the semi-random selection method
    generated_recipe = generate_semi_random_cocktail(predicted_probabilities, mlb, top_n=10, threshold=0.000000005)
    
    # Add the seed ingredients to ensure they are included
    full_recipe = list(set(seed_ingredients + generated_recipe))
    
    # Store the generated recipe as a tuple for easier counting
    generated_recipes.append(tuple(full_recipe))

# Count the occurrences of each unique recipe
recipe_counts = Counter(generated_recipes)

# Convert to DataFrame
df = pd.DataFrame(recipe_counts.items(), columns=['Recipe', 'Count'])

# Sort by count (optional)
df = df.sort_values(by='Count', ascending=False)

print(df)




In [None]:
df

In [None]:
all_ingredients[1]

In [None]:
print("Predicted Probabilities:", predicted_probabilities)

In [None]:
predicted_probabilities[86]