# Imports

In [1]:
import pandas as pd

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from collections import Counter

# Dataset Inspection

In [2]:
# Load the cleaned data
df = pd.read_csv('clean_df.csv')

Unnamed: 0,quantity,unit,ingredient,ingredient_step,recipe_id
0,45.0,ml,dark rum,0,0
1,22.5,ml,lime juice,1,0
2,15.0,ml,sugar,2,0
3,1.0,dash,angostura,3,0
4,6.0,drop,pernod,4,0
5,240.0,ml,crushed ice,5,0
6,60.0,ml,silver tequila,0,1
7,22.5,ml,marie brizard creme de cacao,1,1
8,22.5,ml,lemon juice,2,1
9,45.0,ml,gin,0,2


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2892 entries, 0 to 2891
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   quantity         2586 non-null   float64
 1   unit             2524 non-null   object 
 2   ingredient       2890 non-null   object 
 3   ingredient_step  2892 non-null   int64  
 4   recipe_id        2892 non-null   int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 113.1+ KB


In [5]:
# Group by recipe_id and aggregate ingredients into lists
grouped_df = df.groupby('recipe_id')['ingredient'].apply(list).reset_index()

# Display grouped DataFrame
grouped_df

Unnamed: 0,recipe_id,ingredient
0,0,"[dark rum, lime juice, sugar, angostura, perno..."
1,1,"[silver tequila, marie brizard creme de cacao,..."
2,2,"[gin, mari brizard white creme de cacao, lille..."
3,3,"[pernod, sugar, water]"
4,4,"[gold tequila, gold rum, grapefruit juice, pin..."
...,...,...
650,650,"[light rum, galliano, triple sec, lime juice]"
651,651,"[dark rum, light rum, tia maria, orange juice,..."
652,652,"[vodka, galliano, lime juice, pineapple juice]"
653,653,"[yellow chartreuse, pernod, apricot brandy]"


In [6]:
# Extract the list of lists of ingredients
recipes = grouped_df['ingredient'].tolist()

# Get the list of all unique ingredients
all_ingredients = df['ingredient'].unique().tolist()

# Use MultiLabelBinarizer to encode the ingredients
mlb = MultiLabelBinarizer(classes=all_ingredients)
encoded_recipes = mlb.fit_transform(recipes)

# Create a DataFrame for easier manipulation
encoded_df = pd.DataFrame(encoded_recipes, columns=mlb.classes_)

In [7]:
encoded_df.shape

(655, 389)

# Outline:

Model Building:

Our model architecture is:

l0: 389 (amount of unique ingredients)

hidden layers:          relu

L :            sigmoid   , 389

In [40]:
input_dim = len(all_ingredients)  # Number of unique ingredients

model = Sequential()
model.add(Dense(units=128, input_dim=input_dim, activation='relu'))
model.add(Dense(units=128, activation='relu'))
model.add(Dense(units=128, activation='relu'))
model.add(Dense(units=input_dim, activation='sigmoid'))  # Output layer with sigmoid activation

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [46]:
# attempt at at a 2nd model, larger architecture since 389 param

input_dim = len(all_ingredients)  # Number of unique ingredients

model = Sequential()
model.add(Dense(units=256, input_dim=input_dim, activation='relu'))  # First hidden layer
model.add(Dense(units=128, activation='relu'))  # Second hidden layer
model.add(Dense(units=64, activation='relu'))  # Second hidden layer
model.add(Dense(units=128, activation='relu'))  # Second hidden layer
model.add(Dense(units=256, activation='relu'))  # Third hidden layer
model.add(Dense(units=input_dim, activation='sigmoid'))  # Output layer

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



In [41]:
# Print model summary to verify the architecture
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_21 (Dense)            (None, 128)               49920     
                                                                 
 dense_22 (Dense)            (None, 128)               16512     
                                                                 
 dense_23 (Dense)            (None, 128)               16512     
                                                                 
 dense_24 (Dense)            (None, 389)               50181     
                                                                 
Total params: 133,125
Trainable params: 133,125
Non-trainable params: 0
_________________________________________________________________


Model Training:

In [17]:
X = encoded_df.values  # Input features (binary vectors of ingredients)
y = encoded_df.values  # Target labels (same as input for multi-label classification)

model.fit(X, y, epochs=20, batch_size=50, validation_split=0.2)



<keras.callbacks.History at 0x26713099bb0>

Generating New Recipes:

Start with a seed ingredient or set of ingredients.
Use the trained model to predict additional ingredients.

In [18]:
model.fit(X, y, epochs=50, batch_size=50, validation_split=0.2)

seed_ingredients = ['vodka']  # Example seed ingredients
seed_vector = mlb.transform([seed_ingredients])[0]

predicted_probabilities = model.predict(seed_vector.reshape(1, -1))
predicted_ingredients = mlb.inverse_transform((predicted_probabilities > 0.00001).astype(int))[0]

generated_recipe = list(set(seed_ingredients + list(predicted_ingredients)))
print("Generated Recipe:", generated_recipe)


Generated Recipe: [nan, 'raw egg white', 'pimento dram', 'grapefruit juice', 'pineapple juice', '100 proof rhum agricole', 'frangelico', 'orange slice', 'orange juice', 'spearmint', 'orange liqueur', 'generic bitters', 'dry vermouth', 'grated nutmeg', 'pastis', 'ruby port', 'southern comfort', 'dubonnet rouge', 'apple cider', 'passion fruit juice', 'silver tequila', 'single-malt scotch', 'pineapple syrup', 'clam juice', 'chambord', 'champagne', 'pineapple pieces', 'fruit', 'marachino', 'campari', 'english cucumber', 'cranberry juice', 'fernet branca', 'passion fruit syrup', 'add 3 oz. club soda', 'chocolate bitters', 'diced ripe avocado', 'apricot liqueur', 'herbsaint', 'tuaca', 'kahlúa', 'strawberry syrup', 'punt e mes', 'velvet falernum', 'brandy', 'mandarine napoléon liqueur', 'apple juice', 'tequila', 'godiva liqueur', 'peach bitters', 'gomme syrup', 'harveys bristol cream sherry', 'spirits', 'peychaud’s bitters', 'sweetened ginger juice', 'pisco', 'maraschino liqueur', 'cucumber s

In [47]:
# Number of times to generate recipes
num_iterations = 100

# Example seed ingredients
seed_ingredients = ['vodka']
seed_vector = mlb.transform([seed_ingredients])[0]

# List to store generated recipes
generated_recipes = []

for _ in range(num_iterations):
    # Fit the model for one epoch
    model.fit(X, y, epochs=5, batch_size=50, validation_split=0.2, verbose=0)
    
    # Predict probabilities
    predicted_probabilities = model.predict(seed_vector.reshape(1, -1))
    
    # Convert probabilities to ingredients
    predicted_ingredients = mlb.inverse_transform((predicted_probabilities > 0.001).astype(int))[0]
    
    # Generate recipe
    generated_recipe = list(set(seed_ingredients + list(predicted_ingredients)))
    
    # Store the generated recipe as a tuple for easier counting
    generated_recipes.append(tuple(generated_recipe))

# Count the occurrences of each unique recipe
recipe_counts = Counter(generated_recipes)

# Convert to DataFrame
df = pd.DataFrame(recipe_counts.items(), columns=['Recipe', 'Count'])

# Sort by count (optional)
df = df.sort_values(by='Count', ascending=False)

print(df)


                                               Recipe  Count
20                                           (vodka,)     45
19                                  (vodka, galliano)     24
18                    (vodka, orange twist, galliano)      7
17        (vodka, frangelico, orange twist, galliano)      4
0   (nan, raw egg white, pimento dram, grapefruit ...      3
16  (dry white wine, orange twist, vodka, frangeli...      2
5   (nan, raw egg white, pimento dram, grapefruit ...      1
6   (nan, grand marnier, butterscotch schnapps, cr...      1
7   (grand marnier, butterscotch schnapps, crème d...      1
8   (grand marnier, butterscotch schnapps, crème d...      1
9   (grand marnier, sloe gin, campari, crème de me...      1
1   (nan, raw egg white, pimento dram, grapefruit ...      1
11  (grand marnier, sloe gin, campari, dry white w...      1
12  (grand marnier, sloe gin, campari, dry white w...      1
13  (grand marnier, raspberries, sloe gin, campari...      1
14  (dry white wine, tri

In [48]:
df['Recipe']

20                                             (vodka,)
19                                    (vodka, galliano)
18                      (vodka, orange twist, galliano)
17          (vodka, frangelico, orange twist, galliano)
0     (nan, raw egg white, pimento dram, grapefruit ...
16    (dry white wine, orange twist, vodka, frangeli...
5     (nan, raw egg white, pimento dram, grapefruit ...
6     (nan, grand marnier, butterscotch schnapps, cr...
7     (grand marnier, butterscotch schnapps, crème d...
8     (grand marnier, butterscotch schnapps, crème d...
9     (grand marnier, sloe gin, campari, crème de me...
1     (nan, raw egg white, pimento dram, grapefruit ...
11    (grand marnier, sloe gin, campari, dry white w...
12    (grand marnier, sloe gin, campari, dry white w...
13    (grand marnier, raspberries, sloe gin, campari...
14    (dry white wine, triple sec, crème de cassis, ...
15    (dry white wine, crème de cassis, orange twist...
4     (nan, raw egg white, pimento dram, grapefr

In [49]:
df['Recipe'][0]

(nan,
 'raw egg white',
 'pimento dram',
 'grapefruit juice',
 'pineapple juice',
 '100 proof rhum agricole',
 'frangelico',
 'orange slice',
 'orange juice',
 'spearmint',
 'orange liqueur',
 'generic bitters',
 'dry vermouth',
 'grated nutmeg',
 'pastis',
 'ruby port',
 'southern comfort',
 'dubonnet rouge',
 'apple cider',
 'passion fruit juice',
 'silver tequila',
 'single-malt scotch',
 'pineapple syrup',
 'clam juice',
 'chambord',
 'champagne',
 'pineapple pieces',
 'fruit',
 'marachino',
 'campari',
 'english cucumber',
 'cranberry juice',
 'fernet branca',
 'passion fruit syrup',
 'add 3 oz. club soda',
 'chocolate bitters',
 'diced ripe avocado',
 'apricot liqueur',
 'herbsaint',
 'tuaca',
 'kahlúa',
 'strawberry syrup',
 'punt e mes',
 'velvet falernum',
 'brandy',
 'mandarine napoléon liqueur',
 'apple juice',
 'tequila',
 'godiva liqueur',
 'peach bitters',
 'gomme syrup',
 'harveys bristol cream sherry',
 'spirits',
 'peychaud’s bitters',
 'sweetened ginger juice',
 'pisc

In [50]:
import numpy as np

# Assuming `predicted_probabilities` is already defined and is a 2D array
# and `mlb` is your MultiLabelBinarizer instance

# Get the indices that would sort each row of the array in descending order
sorted_indices = np.argsort(-predicted_probabilities, axis=1)

# Get the sorted probabilities
sorted_probabilities = np.take_along_axis(predicted_probabilities, sorted_indices, axis=1)

# For each row, map the sorted indices back to the ingredient names
# Assuming you want to do this for the first row in the predictions
top_ingredients_indices = sorted_indices[0]
top_probabilities = sorted_probabilities[0]

# Map these indices back to the ingredient names
top_ingredients = mlb.classes_[top_ingredients_indices]

# Print the top ingredients and their corresponding probabilities
for ingredient, probability in zip(top_ingredients, top_probabilities):
    print(f"Ingredient: {ingredient}, Probability: {probability}")


Ingredient: vodka, Probability: 0.999999463558197
Ingredient: galliano, Probability: 0.0001782289909897372
Ingredient: orange twist, Probability: 1.1576246834010817e-05
Ingredient: frangelico, Probability: 7.087958238116698e-06
Ingredient: raspberries, Probability: 3.9186056710605044e-07
Ingredient: dry white wine, Probability: 1.7923585460266622e-07
Ingredient: tequila, Probability: 1.2438665919489722e-07
Ingredient: kahlúa, Probability: 6.359358906138368e-08
Ingredient: disaronno, Probability: 4.517415419513782e-08
Ingredient: tia maria, Probability: 2.2347350636664487e-08
Ingredient: crème de cassis, Probability: 1.4890775723586103e-08
Ingredient: crème de framboise, Probability: 3.003483017849362e-09
Ingredient: triple sec, Probability: 2.660035303136965e-09
Ingredient: orange slice, Probability: 8.197545420252084e-10
Ingredient: dry sake, Probability: 3.2542160632864636e-10
Ingredient: campari, Probability: 1.4774119427851673e-10
Ingredient: silver tequila, Probability: 7.66075189

Summary of Steps:
Clean Data Preparation:

Create a DataFrame with binary encoded ingredient vectors for each recipe.
Model Building:

Define a neural network to learn from these vectors.
Model Training:

Train the model on your dataset.
Recipe Generation:

Generate new recipes starting from seed ingredients.
By following these steps, you should be able to create and train a model that can generate new cocktail recipes based on the patterns it learns from your dataset. Let me know if you need more detail on any of these steps!

The architecture described here is a simple feedforward neural network, also known as a Multi-Layer Perceptron (MLP). This architecture is used for multi-label classification, where each ingredient is treated as a binary label that can be present or absent in a recipe.

Architecture Explanation
Input Layer:

The input layer consists of nodes equal to the number of unique ingredients. Each node represents whether a particular ingredient is present (1) or absent (0) in the recipe.
Input Dimension: input_dim = len(all_ingredients).
Hidden Layers:

The network has two hidden layers, each with 128 neurons. These layers use the ReLU (Rectified Linear Unit) activation function, which introduces non-linearity to the model and allows it to learn complex patterns.
First Hidden Layer: Dense(units=128, input_dim=input_dim, activation='relu')
Second Hidden Layer: Dense(units=128, activation='relu')
Output Layer:

The output layer also consists of nodes equal to the number of unique ingredients. Each node represents the probability of the corresponding ingredient being part of the recipe.
The sigmoid activation function is used in the output layer to produce probabilities between 0 and 1 for each ingredient.
Output Layer: Dense(units=input_dim, activation='sigmoid')
Loss Function:

The model uses binary cross-entropy loss, suitable for multi-label classification where each label (ingredient) is a binary decision (present or absent).
Loss Function: binary_crossentropy
Optimizer:

The Adam optimizer is used to minimize the loss function during training. Adam is a popular choice due to its adaptive learning rate and efficiency.
Optimizer: adam
Metrics:

The accuracy metric is used to evaluate the model's performance during training and validation.
Metrics: accuracy
Why This Architecture?
Simplicity: This architecture is straightforward and easy to implement, making it suitable for a dataset of your size (654 recipes and 447 unique ingredients).
Flexibility: The use of dense layers allows the network to learn from the presence and absence of each ingredient, capturing the relationships between different ingredients.
Scalability: Adding more hidden layers or increasing the number of neurons in each layer can improve the model's capacity to learn more complex patterns, if needed.
Summary
This architecture is a feedforward neural network designed for multi-label classification. It takes a binary vector representing the presence or absence of each ingredient as input and outputs a binary vector representing the predicted probabilities of each ingredient being part of the cocktail recipe. This approach is suitable for generating new recipes by learning the common ingredient combinations from the training data.






