# Imports

In [1]:
import pandas as pd

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from collections import Counter

# Dataset Inspection

In [2]:
# Load the cleaned data
df = pd.read_csv('clean_df.csv')

In [4]:
# Group by recipe_id and aggregate ingredients into lists
grouped_df = df.groupby('recipe_id')['ingredient'].apply(list).reset_index()

# Display grouped DataFrame
grouped_df

Unnamed: 0,recipe_id,ingredient
0,0,"[dark rum, lime juice, sugar, angostura, perno..."
1,1,"[silver tequila, marie brizard creme de cacao,..."
2,2,"[gin, mari brizard white creme de cacao, lille..."
3,3,"[pernod, sugar, water]"
4,4,"[gold tequila, gold rum, grapefruit juice, pin..."
...,...,...
650,650,"[light rum, galliano, triple sec, lime juice]"
651,651,"[dark rum, light rum, tia maria, orange juice,..."
652,652,"[vodka, galliano, lime juice, pineapple juice]"
653,653,"[yellow chartreuse, pernod, apricot brandy]"


In [5]:
# Extract the list of lists of ingredients
recipes = grouped_df['ingredient'].tolist()

# Get the list of all unique ingredients
all_ingredients = df['ingredient'].unique().tolist()

# Use MultiLabelBinarizer to encode the ingredients
mlb = MultiLabelBinarizer(classes=all_ingredients)
encoded_recipes = mlb.fit_transform(recipes)

# Create a DataFrame for easier manipulation
encoded_df = pd.DataFrame(encoded_recipes, columns=mlb.classes_)

In [6]:
encoded_df.shape

(655, 389)

# Outline:

Model Building:

Our model architecture is:

l0: 389 (amount of unique ingredients)

hidden layers:          relu

L :            sigmoid   , 389

In [7]:
input_dim = len(all_ingredients)  # Number of unique ingredients

In [8]:


model = Sequential()
model.add(Dense(units=128, input_dim=input_dim, activation='relu'))
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=input_dim, activation='sigmoid'))  # Output layer with sigmoid activation

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [9]:
# attempt at at a 2nd model, larger architecture since 389 param

input_dim = len(all_ingredients)  # Number of unique ingredients

model = Sequential()
model.add(Dense(units=256, input_dim=input_dim, activation='relu'))  # First hidden layer
model.add(Dense(units=128, activation='relu'))  # Second hidden layer
model.add(Dense(units=64, activation='relu'))  # Second hidden layer
model.add(Dense(units=128, activation='relu'))  # Second hidden layer
model.add(Dense(units=256, activation='relu'))  # Third hidden layer
model.add(Dense(units=input_dim, activation='sigmoid'))  # Output layer

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [10]:
# Print model summary to verify the architecture
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 256)               99840     
                                                                 
 dense_4 (Dense)             (None, 128)               32896     
                                                                 
 dense_5 (Dense)             (None, 64)                8256      
                                                                 
 dense_6 (Dense)             (None, 128)               8320      
                                                                 
 dense_7 (Dense)             (None, 256)               33024     
                                                                 
 dense_8 (Dense)             (None, 389)               99973     
                                                                 
Total params: 282,309
Trainable params: 282,309
Non-tr

In [11]:
# Attemping dropout layers
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Define the model
model = Sequential()
model.add(Dropout(0.1, input_shape=(input_dim,)))  # Dropout layer for the input layer
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))  # Dropout layer for hidden layers
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))  # Dropout layer for hidden layers
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))  # Dropout layer for hidden layers
model.add(Dense(input_dim, activation='sigmoid'))  # Output layer for multi-label classification


# Compile the model
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dropout (Dropout)           (None, 389)               0         
                                                                 
 dense_9 (Dense)             (None, 512)               199680    
                                                                 
 dropout_1 (Dropout)         (None, 512)               0         
                                                                 
 dense_10 (Dense)            (None, 256)               131328    
                                                                 
 dropout_2 (Dropout)         (None, 256)               0         
                                                                 
 dense_11 (Dense)            (None, 128)               32896     
                                                                 
 dropout_3 (Dropout)         (None, 128)              

Model Training:

In [12]:
X = encoded_df.values  # Input features (binary vectors of ingredients)
y = encoded_df.values  # Target labels (same as input for multi-label classification)

#model.fit(X, y, epochs=20, batch_size=50, validation_split=0.2)

Generating New Recipes:

Start with a seed ingredient or set of ingredients.
Use the trained model to predict additional ingredients.

In [13]:
model.fit(X, y, epochs=50, batch_size=50, validation_split=0.2)

seed_ingredients = ['vodka']  # Example seed ingredients
seed_vector = mlb.transform([seed_ingredients])[0]

predicted_probabilities = model.predict(seed_vector.reshape(1, -1))
predicted_ingredients = mlb.inverse_transform((predicted_probabilities > 0.00001).astype(int))[0]

generated_recipe = list(set(seed_ingredients + list(predicted_ingredients)))
print("Generated Recipe:", generated_recipe)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Generated Recipe: [nan, 'pineapple', 'passion fruit liqueur', 'green cocktail olive', 'crème de banane', 'mari brizard white creme de cacao', 'harveys bristol cream sherry', 'mandarin  juice', 'walnut liqueur', 'diced onion', 'jigger sazerac brandy', 'cubed ripe papaya', 'crème de cassis', 'aperol', 'cachaça', 'diced ripe avocado', 'apple schnapps', 'vanilla extract', 'turbinado syrup', 'sweet-and-sour', '

In [14]:
# Number of times to generate recipes
num_iterations = 100

# Example seed ingredients
seed_ingredients = ['gin', 'lime juice']
seed_vector = mlb.transform([seed_ingredients])[0]

# List to store generated recipes
generated_recipes = []

for _ in range(num_iterations):
    # Fit the model for one epoch
    model.fit(X, y, epochs=5, batch_size=389, validation_split=0.2, verbose=0)
    
    # Predict probabilities
    predicted_probabilities = model.predict(seed_vector.reshape(1, -1))
    
    # Convert probabilities to ingredients
    predicted_ingredients = mlb.inverse_transform((predicted_probabilities > 0.001).astype(int))[0]
    
    # Generate recipe
    generated_recipe = list(set(seed_ingredients + list(predicted_ingredients)))
    
    # Store the generated recipe as a tuple for easier counting
    generated_recipes.append(tuple(generated_recipe))

# Count the occurrences of each unique recipe
recipe_counts = Counter(generated_recipes)

# Convert to DataFrame
df = pd.DataFrame(recipe_counts.items(), columns=['Recipe', 'Count'])

# Sort by count (optional)
df = df.sort_values(by='Count', ascending=False)

print(df)


                                               Recipe  Count
60  (campari, cola, orgeat, mint, lemon twist, ros...      2
0   (nan, cola, passion fruit liqueur, tia maria, ...      1
63  (campari, cola, orgeat, mint, lemon twist, ros...      1
72  (campari, cola, orgeat, mint, lemon twist, ros...      1
71  (campari, cola, orgeat, mint, lemon twist, ros...      1
..                                                ...    ...
30  (nan, cola, passion fruit liqueur, calvados, h...      1
29  (cola, passion fruit liqueur, calvados, heavy ...      1
28  (cola, passion fruit liqueur, calvados, heavy ...      1
27  (cola, passion fruit liqueur, calvados, heavy ...      1
98  (campari, cola, orgeat, mint, lemon twist, ros...      1

[99 rows x 2 columns]


In [15]:
df['Recipe']

60    (campari, cola, orgeat, mint, lemon twist, ros...
0     (nan, cola, passion fruit liqueur, tia maria, ...
63    (campari, cola, orgeat, mint, lemon twist, ros...
72    (campari, cola, orgeat, mint, lemon twist, ros...
71    (campari, cola, orgeat, mint, lemon twist, ros...
                            ...                        
30    (nan, cola, passion fruit liqueur, calvados, h...
29    (cola, passion fruit liqueur, calvados, heavy ...
28    (cola, passion fruit liqueur, calvados, heavy ...
27    (cola, passion fruit liqueur, calvados, heavy ...
98    (campari, cola, orgeat, mint, lemon twist, ros...
Name: Recipe, Length: 99, dtype: object

In [16]:
#df['Recipe'][0]

In [17]:
import numpy as np

# Assuming `predicted_probabilities` is already defined and is a 2D array
# and `mlb` is your MultiLabelBinarizer instance

# Get the indices that would sort each row of the array in descending order
sorted_indices = np.argsort(-predicted_probabilities, axis=1)

# Get the sorted probabilities
sorted_probabilities = np.take_along_axis(predicted_probabilities, sorted_indices, axis=1)

# For each row, map the sorted indices back to the ingredient names
# Assuming you want to do this for the first row in the predictions
top_ingredients_indices = sorted_indices[0]
top_probabilities = sorted_probabilities[0]

# Map these indices back to the ingredient names
top_ingredients = mlb.classes_[top_ingredients_indices]

# Print the top ingredients and their corresponding probabilities
for ingredient, probability in zip(top_ingredients, top_probabilities):
    print(f"Ingredient: {ingredient}, Probability: {probability}")


Ingredient: gin, Probability: 0.9803774952888489
Ingredient: lime juice, Probability: 0.9628426432609558
Ingredient: apricot brandy, Probability: 0.04439103230834007
Ingredient: green chatreuse, Probability: 0.043275777250528336
Ingredient: luxardo maraschino, Probability: 0.03683939576148987
Ingredient: campari, Probability: 0.02772119641304016
Ingredient: cointreau, Probability: 0.027539661154150963
Ingredient: orange curaçao, Probability: 0.01347269769757986
Ingredient: simple, Probability: 0.012985840439796448
Ingredient: grenadine, Probability: 0.011330204084515572
Ingredient: simple syrup, Probability: 0.010332969948649406
Ingredient: mandarine napoléon, Probability: 0.009924150072038174
Ingredient: cherry brandy, Probability: 0.009772823192179203
Ingredient: calvados, Probability: 0.008411979302763939
Ingredient: cola, Probability: 0.0083883386105299
Ingredient: lemon juice, Probability: 0.007963746786117554
Ingredient: crème de cassis, Probability: 0.007759174332022667
Ingredie

Summary of Steps:
Clean Data Preparation:

Create a DataFrame with binary encoded ingredient vectors for each recipe.
Model Building:

Define a neural network to learn from these vectors.
Model Training:

Train the model on your dataset.
Recipe Generation:

Generate new recipes starting from seed ingredients.
By following these steps, you should be able to create and train a model that can generate new cocktail recipes based on the patterns it learns from your dataset. Let me know if you need more detail on any of these steps!

The architecture described here is a simple feedforward neural network, also known as a Multi-Layer Perceptron (MLP). This architecture is used for multi-label classification, where each ingredient is treated as a binary label that can be present or absent in a recipe.

Architecture Explanation
Input Layer:

The input layer consists of nodes equal to the number of unique ingredients. Each node represents whether a particular ingredient is present (1) or absent (0) in the recipe.
Input Dimension: input_dim = len(all_ingredients).
Hidden Layers:

The network has two hidden layers, each with 128 neurons. These layers use the ReLU (Rectified Linear Unit) activation function, which introduces non-linearity to the model and allows it to learn complex patterns.
First Hidden Layer: Dense(units=128, input_dim=input_dim, activation='relu')
Second Hidden Layer: Dense(units=128, activation='relu')
Output Layer:

The output layer also consists of nodes equal to the number of unique ingredients. Each node represents the probability of the corresponding ingredient being part of the recipe.
The sigmoid activation function is used in the output layer to produce probabilities between 0 and 1 for each ingredient.
Output Layer: Dense(units=input_dim, activation='sigmoid')
Loss Function:

The model uses binary cross-entropy loss, suitable for multi-label classification where each label (ingredient) is a binary decision (present or absent).
Loss Function: binary_crossentropy
Optimizer:

The Adam optimizer is used to minimize the loss function during training. Adam is a popular choice due to its adaptive learning rate and efficiency.
Optimizer: adam
Metrics:

The accuracy metric is used to evaluate the model's performance during training and validation.
Metrics: accuracy
Why This Architecture?
Simplicity: This architecture is straightforward and easy to implement, making it suitable for a dataset of your size (654 recipes and 447 unique ingredients).
Flexibility: The use of dense layers allows the network to learn from the presence and absence of each ingredient, capturing the relationships between different ingredients.
Scalability: Adding more hidden layers or increasing the number of neurons in each layer can improve the model's capacity to learn more complex patterns, if needed.
Summary
This architecture is a feedforward neural network designed for multi-label classification. It takes a binary vector representing the presence or absence of each ingredient as input and outputs a binary vector representing the predicted probabilities of each ingredient being part of the cocktail recipe. This approach is suitable for generating new recipes by learning the common ingredient combinations from the training data.








## Next chunk of code: Randomly decides to include this ingredient based on its probability

In [23]:
import numpy as np
from collections import Counter
import pandas as pd

# Assuming `predicted_probabilities` is already defined and is a 2D array
# and `mlb` is your MultiLabelBinarizer instance

def generate_semi_random_cocktail(predicted_probabilities, mlb, top_n=10, threshold=0.05):
    # Ensure predicted_probabilities is 2D
    if predicted_probabilities.ndim == 1:
        predicted_probabilities = predicted_probabilities.reshape(1, -1)
    
    # Get the indices that would sort each row of the array in descending order
    sorted_indices = np.argsort(-predicted_probabilities, axis=1)
    
    # Get the sorted probabilities
    sorted_probabilities = np.take_along_axis(predicted_probabilities, sorted_indices, axis=1)

    # For each row, map the sorted indices back to the ingredient names
    # Assuming you want to do this for the first row in the predictions
    top_ingredients_indices = sorted_indices[0]
    top_probabilities = sorted_probabilities[0]

    # Map these indices back to the ingredient names
    top_ingredients = mlb.classes_[top_ingredients_indices]

    # Semi-random selection based on sorted probabilities
    selected_ingredients = []
    for i in range(top_n):
        if top_probabilities[i] > threshold:
            # Randomly decide to include this ingredient based on its probability
            if (np.random.random() * 0.1) < top_probabilities[i]:
                selected_ingredients.append(top_ingredients[i])
    
    return selected_ingredients

# Number of times to generate recipes
num_iterations = 100

# Example seed ingredients
seed_ingredients = ['gin', 'lime juice']
seed_vector = mlb.transform([seed_ingredients])[0]

# List to store generated recipes
generated_recipes = []

for _ in range(num_iterations):
    # Fit the model for one epoch
    model.fit(X, y, epochs=1, batch_size=389, validation_split=0.2, verbose=0)
    
    # Predict probabilities
    predicted_probabilities = model.predict(seed_vector.reshape(1, -1))
    
    # Generate recipe using the semi-random selection method
    generated_recipe = generate_semi_random_cocktail(predicted_probabilities, mlb, top_n=7, threshold=0.000000005)
    
    # Add the seed ingredients to ensure they are included
    full_recipe = list(set(seed_ingredients + generated_recipe))
    
    # Store the generated recipe as a tuple for easier counting
    generated_recipes.append(tuple(full_recipe))

# Count the occurrences of each unique recipe
recipe_counts = Counter(generated_recipes)

# Convert to DataFrame
df = pd.DataFrame(recipe_counts.items(), columns=['Recipe', 'Count'])

# Sort by count (optional)
df = df.sort_values(by='Count', ascending=False)

print(df)


                                               Recipe  Count
2                                   (gin, lime juice)     16
9               (luxardo maraschino, gin, lime juice)     10
3                   (gin, apricot brandy, lime juice)      9
4   (luxardo maraschino, green chatreuse, apricot ...      7
5   (luxardo maraschino, gin, green chatreuse, lim...      6
7   (luxardo maraschino, gin, apricot brandy, lime...      6
8                  (gin, green chatreuse, lime juice)      5
6      (luxardo maraschino, gin, campari, lime juice)      4
18                          (gin, lime juice, simple)      4
13          (gin, apricot brandy, lime juice, simple)      4
0          (campari, gin, apricot brandy, lime juice)      3
21      (luxardo maraschino, gin, lime juice, simple)      3
17   (luxardo maraschino, gin, cointreau, lime juice)      2
20         (gin, green chatreuse, lime juice, simple)      2
29  (gin, apricot brandy, lime juice, green chatre...      2
11  (campari, luxardo ma

In [27]:
df['Recipe'][4]

('luxardo maraschino',
 'green chatreuse',
 'apricot brandy',
 'lime juice',
 'gin')

In [26]:
# Count the number of ingredients in each recipe
df['Ingredient_Count'] = df['Recipe'].apply(len)

# Find the index of the recipe with the most ingredients
max_ingredients_index = df['Ingredient_Count'].idxmax()

# Retrieve the recipe with the most ingredients
recipe_with_most_ingredients = df.loc[max_ingredients_index, 'Recipe']

print("Recipe with the most ingredients:", recipe_with_most_ingredients)
print("Number of ingredients:", len(recipe_with_most_ingredients))

Recipe with the most ingredients: ('campari', 'luxardo maraschino', 'apricot brandy', 'cointreau', 'lime juice', 'gin')
Number of ingredients: 6


In [20]:
all_ingredients[1]

'lime juice'

In [21]:
print("Predicted Probabilities:", predicted_probabilities)

Predicted Probabilities: [[9.91460518e-04 9.71273124e-01 3.27676302e-03 2.29460839e-03
  1.44090827e-05 1.07792707e-03 1.79949966e-05 4.26653742e-07
  6.11162884e-03 9.87955153e-01 1.58138755e-06 1.17546406e-05
  2.67304076e-05 7.53243803e-04 3.96043579e-05 5.18730097e-03
  2.46925454e-04 1.93719738e-04 2.83818031e-06 2.74178137e-06
  1.10485104e-04 1.05297211e-07 7.03982543e-03 9.95532446e-07
  1.39291970e-07 6.39650752e-05 2.34002434e-03 1.48186082e-04
  2.36829379e-04 1.52515958e-03 2.30439613e-03 4.93203379e-06
  2.73582117e-07 6.08661903e-05 1.08130989e-04 3.11970362e-04
  1.24563917e-03 1.04664896e-05 8.67845782e-04 4.14440554e-04
  2.16534268e-03 2.32616949e-04 2.65600509e-04 3.92695876e-10
  2.63061677e-03 1.35529410e-06 4.01587487e-04 3.63613857e-04
  4.00458723e-02 5.29063074e-03 9.81976773e-05 2.77887779e-07
  4.73193300e-04 6.31715284e-06 1.12902089e-04 2.04976201e-02
  4.38578485e-04 1.19479584e-04 5.66187035e-03 5.06729302e-05
  2.71231997e-08 1.10051496e-06 2.32989296e-0

In [22]:
predicted_probabilities[86]

IndexError: index 86 is out of bounds for axis 0 with size 1