This script is for parsing the [hotaling & co cocktail dataset from Kaggle](https://www.kaggle.com/datasets/shuyangli94/cocktails-hotaling-co) and outputting it into json format for my scripts.

In [1]:
# settings and setup
import copy
import json
import math
import re
import pandas as pd

DATASET_FNAME = 'hotaling_cocktails.csv'
OUTPUT_FNAME = 'hotaling_cocktails.json'
COL_NAME_MAP = {'Cocktail Name': 'name'}

SUBS = {
    'Whiskey': ['Rye Whiskey', 'Bourbon', 'Scotch', 'Jack Daniels'],
    'Vermouth': ['Red Vermouth', 'Green Vermouth', 'Sweet Vermouth', 'Dry Vermouth'],
    'Bitters': ['Orange Bitters', 'Angostura Bitters'],
    'Rum': ['White Rum'],
    'Vodka': ['Vodka Citron'],
    'Cointreau': ['Triple Sec'],
    'Simple Syrup': ['Sugar Syrup', 'House-made Simple Syrup', 'Pure Cane Simple Syrup'],
    'Tea': ['Black Tea', 'Lipton'],
    'Lemon Juice': ['Fresh Lemon Juice', 'Squeezed Lemon'],
    'Lime Juice': ['Fresh Lime Juice', 'Squeezed Lime'],
    'Mint': ['Mint Leaves'],
    'Lemon Lime Soda': ['Sprite', '7-UP', 'Lemon-Lime Soda']
}
PANTRY = ['Ice', 'Water', 'Sugar', 'Salt', 'Black Pepper', 'Coffee', 'Honey', 'Tea']
BRANDS = ['St. George', 'Fee Brothers', 'Luxardo', 'Small Hand Foods', 'Deschutes', 'King\'s', 'Leopold Bros.', 'Monin']
AMTS = ['Sprig', 'Sprig Of', 'Dash', 'House-made', 'Homemade']

# read in data
df = pd.read_csv(DATASET_FNAME)

In [2]:
# adjust column names and then convert to json (jsonl style) format
df.rename(columns=COL_NAME_MAP, inplace=True)
df.columns = df.columns.str.lower()
df.fillna('', inplace=True)
cocktails = df.to_dict(orient='records')

Split ingredients by commas and parse out amounts.

In [3]:
pattern = re.compile(r'[A-Z].*$')
def parse_ingredient(ingr : str, is_garnish : bool = False, capitalize: bool = True) -> dict:
    if '(' in ingr:
        ingr = ingr.split('(')[0].strip()
        
    match = pattern.search(ingr)
    if match is None:
        return None
    out = {'name': match.group(), 'amt': ingr[:match.start()].strip()}
    if is_garnish:
        out['garnish'] = 'true'
    if capitalize:
        out['name'] = ' '.join(w.capitalize() for w in out['name'].split())
    return out

def parse_ingredients(ingrs : str, garnishes : str) -> list:
    ingrs = list(map(lambda s: s.strip().strip('*'), ingrs.split(',')))
    ingrs = list(map(parse_ingredient, ingrs))
    if garnishes and type(garnishes) == str:
        garnishes = list(map(lambda s: s.strip().strip('*'), garnishes.split(',')))
        garnishes = list(map(lambda s: parse_ingredient(s, is_garnish=True), garnishes))
        ingrs.extend(garnishes)

    return [ingr for ingr in ingrs if ingr is not None]


In [4]:
for cocktail in cocktails:
    cocktail['ingredients'] = parse_ingredients(cocktail['ingredients'], cocktail['garnish'])

Adjust format and save out.

In [5]:
final_dataset = {'substitutes': SUBS, 'recipes': cocktails}
with open(OUTPUT_FNAME, 'w') as fp:
    json.dump(final_dataset, fp, indent=2)

Now some basic stats about the dataset.

In [6]:
def is_val_at_key(tab, key, val) -> bool:
    return key in tab and tab[key] == val

def remove_items_in_pantry(recipes, pantry):
    recipes = copy.deepcopy(recipes)
    for recipe in recipes:
        recipe['ingredients'] = list(filter(lambda i: i['name'] not in pantry, recipe['ingredients']))
    return recipes

def remove_garnishes(recipes):
    recipes = copy.deepcopy(recipes)
    for recipe in recipes:
        recipe['ingredients'] = list(filter(lambda i: not is_val_at_key(i, 'garnish', 'true'), recipe['ingredients']))
    return recipes

def remove_substrings(recipes, substrs):
    def _remove_substrs(ingr):
        for substr in substrs:
            ingr['name'] = ingr['name'].replace(substr, '')
        return ingr

    recipes = copy.deepcopy(recipes)
    for recipe in recipes:
        recipe['ingredients'] = list(map(_remove_substrs, recipe['ingredients']))
    return recipes

def make_substitutions(recipes, subs):
    def _get_sub(ingr):
        for key, values in subs.items():
            if ingr['name'] == key or ingr['name'] in values:
                ingr['name'] = key
                return ingr 
        return ingr

    recipes = copy.deepcopy(recipes)
    for recipe in recipes:
        recipe['ingredients'] = list(map(_get_sub, recipe['ingredients']))
    return recipes

In [7]:
num_cocktails = len(cocktails)
unique_ingredients = set(ingr['name'] for c in cocktails for ingr in c['ingredients'])
num_unique_ingredients = len(unique_ingredients)

# filter out garnishes
cocktails_opt_1 = remove_garnishes(cocktails)
unique_ingredients_opt_1 = set(ingr['name'] for c in cocktails_opt_1 for ingr in c['ingredients'])
num_unique_ingredients_opt_1 = len(unique_ingredients_opt_1)

# filter out measurement words and brands
cocktails_opt_2 = remove_substrings(cocktails_opt_1, [*BRANDS, *AMTS])
unique_ingredients_opt_2 = set(ingr['name'] for c in cocktails_opt_2 for ingr in c['ingredients'])
num_unique_ingredients_opt_2 = len(unique_ingredients_opt_2)

# replace some ingredients with common names
cocktails_opt_3 = make_substitutions(cocktails_opt_2, SUBS)
unique_ingredients_opt_3 = set(ingr['name'] for c in cocktails_opt_3 for ingr in c['ingredients'])
num_unique_ingredients_opt_3 = len(unique_ingredients_opt_3)

# remove already owned
cocktails_opt_4 = remove_items_in_pantry(cocktails_opt_3, PANTRY)
unique_ingredients_opt_4 = set(ingr['name'] for c in cocktails_opt_4 for ingr in c['ingredients'])
num_unique_ingredients_opt_4 = len(unique_ingredients_opt_4)


print(f'# Cocktails:          {num_cocktails}')
print(f'# Ingredients:        {num_unique_ingredients}')
print(f'# Ingredients')
print(f'  w/ opt 1:           {num_unique_ingredients_opt_1}')
print(f'# Ingredients')
print(f'  w/ opt 1 & 2:       {num_unique_ingredients_opt_2}')
print(f'# Ingredients')
print(f'  w/ opt 1, 2 & 3:    {num_unique_ingredients_opt_3}')
print(f'# Ingredients')
print(f'  w/ opt 1, 2, 3 & 4: {num_unique_ingredients_opt_4}')

# Cocktails:          687
# Ingredients:        1310
# Ingredients
  w/ opt 1:           1073
# Ingredients
  w/ opt 1 & 2:       1069
# Ingredients
  w/ opt 1, 2 & 3:    1058
# Ingredients
  w/ opt 1, 2, 3 & 4: 1054
