In [1]:
import json
import re

In [2]:
with open("data/recipes/amounts.json", encoding="utf-8") as f:
    amounts = json.load(f)

In [32]:
re1 = re.compile(r"^([\d½¼¾⅛]+(?: [½¼¾⅛])?(?:,\d+)?)(?: ((?:kl\. |gr\. )?\w+(?:, gestr\.|, gehäuft)?)(?:/n|/e|\(n\)|\(e\)|\.)?)?$")
re_singol = re.compile(r"^(kl. |gr. )?(Tasse|mg|ml|Tüte|Kopf|Stück|Spritzer|Teil|Becher|Blatt|Zehe|Blätter|Bund|Dose|[ET]L(,.*)?|cl|dl|Flasche|Fläschchen|Glas|Gläser|Handvoll|Körner|Liter|Msp.|Paar|Paket|Pck.|Pkt.|Port.|Prise.*|Scheibe)(/[en])?$")


In [70]:
replacings = "gestr.=gestrichen,geh.=gehäuft,m.-große=mittelgroße,m.-großer=mittelgroßer,TL=Teelöffel,Eßlöffel=Esslöffel,EL=Esslöffel,Msp.=Messerspitze,Pck.=Packung,Pkt.=Packung,Port.=Portion".split(",")

replacings = {ee.split("=")[0]: ee.split("=")[1] for ee in replacings}

In [71]:
parsed1 = []
for _amount in amounts:
    amount = _amount
    if amount == "":
        amount = "1"
    #amount = amount.replace("m.-große", "mittelgroße")
    if re_singol.match(amount):
        amount = "1 " + amount
    for bef, aft in replacings.items():
        amount = re.sub(r"\b" + re.escape(bef) + r"(\b|$)", aft, amount)
    m = re1.match(amount)
    if m is not None:
        parsed1.append((_amount, *m.groups()))
    else:
        print("UNMATCHED  ", _amount)
        parsed1.append((_amount, None, None))

UNMATCHED   0,2 TL, gestr.
UNMATCHED   0,3 TL, gestr.
UNMATCHED   0,33 EL, gestr.
UNMATCHED   0,33 TL, gestr.
UNMATCHED   1 EL, gestr.
UNMATCHED   1 TL, gestr.
UNMATCHED   1 einige Stiele
UNMATCHED   1 m.-großes
UNMATCHED   1 n. B.
UNMATCHED   1 ½ EL, gestr.
UNMATCHED   1 ½ TL, gestr.
UNMATCHED   10 EL, gestr.
UNMATCHED   10 TL, gestr.
UNMATCHED   10 einige Stiele
UNMATCHED   10 n. B.
UNMATCHED   12 TL, gestr.
UNMATCHED   125 n. B.
UNMATCHED   14 TL, gestr.
UNMATCHED   15 EL, gestr.
UNMATCHED   18 EL, gestr.
UNMATCHED   2 EL, gestr.
UNMATCHED   2 TL, gestr.
UNMATCHED   2 einige Stiele
UNMATCHED   2 m.-großes
UNMATCHED   2 n. B.
UNMATCHED   2 ¼ TL, gestr.
UNMATCHED   2 ½ EL, gestr.
UNMATCHED   2 ½ TL, gestr.
UNMATCHED   2 ¾ TL, gestr.
UNMATCHED   20 EL, gestr.
UNMATCHED   250 n. B.
UNMATCHED   3 EL, gestr.
UNMATCHED   3 TL, gestr.
UNMATCHED   3 m.-großes
UNMATCHED   3 n. B.
UNMATCHED   3 ½ TL, gestr.
UNMATCHED   30 Prozent %
UNMATCHED   30 n. B.
UNMATCHED   35 einige Stiele
UNMATCHED   

In [77]:
len(parsed1), len(amounts)

(2782, 2782)

In [78]:
re_num = re.compile(r"([\d½¼¾⅛]+)(?: ([½¼¾⅛]))?(?:,(\d+))?")

In [79]:
def parse_num(amount, unit):
    if amount is None:
        return (None, None)
    fs = {"½": 1/2, "¼": 1/4, "¾": 3/4, "⅛": 1/8}
    whole, addition, fraction = re_num.match(amount).groups()
    whole = fs[whole] if whole in fs else int(whole)
    addition = fs[addition] if addition is not None else 0
    fraction = float("0." + fraction) if fraction is not None else 0
    total_amount = whole + addition + fraction
    if unit == "kg":
        unit = "g"
        total_amount *= 1000
    if unit == "mg":
        unit = "g"
        total_amount /= 1000
    if unit and (unit.lower() == "l" or unit.lower() == "liter"):
        unit = "ml"
        total_amount *= 1000
    return total_amount, unit

parsed_amounts = {orig: parse_num(n, e) for orig, n, e in parsed1}

In [80]:
parsed_amounts

{'': (1, None),
 '1320 g': (1320, 'g'),
 '8 Beutel': (8, 'Beutel'),
 '300 Pck.': (300, 'Packung'),
 '40 Stück(e)': (40, 'Stück'),
 '½ gr. Dose/n': (0.5, 'gr. Dose'),
 '416 g': (416, 'g'),
 '11 g': (11, 'g'),
 '0,67 Pck.': (0.67, 'Packung'),
 '4 kleine': (4, 'kleine'),
 '6 Spritzer': (6, 'Spritzer'),
 '500 cm': (500, 'cm'),
 '12 EL': (12, 'Esslöffel'),
 '23,1 g': (23.1, 'g'),
 '300 EL': (300, 'Esslöffel'),
 '17 ½ ml': (17.5, 'ml'),
 '200 Liter': (200000, 'ml'),
 '355 g': (355, 'g'),
 '¼ Stängel': (0.25, 'Stängel'),
 '2 ¾ Liter': (2750.0, 'ml'),
 '4 halbe': (4, 'halbe'),
 '¼ großer': (0.25, 'großer'),
 '2 ½ dl': (2.5, 'dl'),
 '8 EL, gestr.': (None, None),
 '¾ Bund': (0.75, 'Bund'),
 '7,2 g': (7.2, 'g'),
 '4 kl. Dose/n': (4, 'kl. Dose'),
 '3 Stängel': (3, 'Stängel'),
 '2 Beutel': (2, 'Beutel'),
 '14 Stück(e)': (14, 'Stück'),
 '56 halbe': (56, 'halbe'),
 '6 m.-große': (6, 'mittelgroße'),
 '0,33 Knolle/n': (0.33, 'Knolle'),
 '0,33': (0.33, None),
 '1,2 TL': (1.2, 'Teelöffel'),
 '0,04 Liter'

In [81]:
with open("data/recipes/parsed_amounts.json", "w") as f:
    json.dump(parsed_amounts, f)