In [1]:
import re
from pprint import pprint
from collections import Counter

In [2]:
# Load the raw item names, one per line of the input file.
with open("20221004_item_list.txt") as infile : 
    # Skip the first line (presumably a header row). The default argument
    # keeps an empty file from raising StopIteration.
    next(infile, None)
    
    # Surrounding whitespace, including the trailing newline, is dropped.
    items = [row.strip() for row in infile]


In [3]:
# Matches zero to four characters at the start of the string followed by a
# space, e.g. the "26 " in "26 Infidel Belgian IPA - Selkirk Abbey" or the
# "Z " in "Z Mountains Walking - ...". Used below to strip that leading code.
# NOTE(review): the quantifier is greedy, so if the name itself starts with
# very short words (e.g. "10 A ...") or there is no leading code at all and
# the first word has four or fewer characters, part of the name is stripped.
prefix_pattern = re.compile(r"^.{0,4} ")
# A hyphen with a space on each side; used as the field separator when an
# item is split into beer / brewery / other pieces.
spaced_hyphen_pattern = re.compile(r" - ")

After exploration, one annoying pattern has emerged. When an item has one hyphen, such as `26 Infidel Belgian IPA - Selkirk Abbey`, the pattern seems to be `beer - brewery`. When an item has two hyphens (three pieces), the brewery often comes first. For instance, `Z Mountains Walking - Sweets - Fruited Berlinerweisse 16oz can` and `Ommegang - Rare Vos Belgian Style Amber Ale  - 6.5% ABV 20 IBU`.

I'm going to try to do a two-pass analysis, going through the "one hyphen" products and pulling out the breweries, then testing to see where the brewery is in the item. For the record, the larger cell that creates `item_translation` is where I started my work and then I added this other stuff.

In [4]:
# First pass: strip the leading code, lowercase, and split every item on the
# spaced hyphen. Items that split into exactly two pieces are treated as
# "beer - brewery", so the second piece is collected as a candidate brewery.
split_items = (
    spaced_hyphen_pattern.split(prefix_pattern.sub("", raw_item).lower())
    for raw_item in items
)

likely_breweries = [parts[1].strip() for parts in split_items if len(parts) == 2]


In [5]:
# Display the 15 most frequent candidate brewery names from the first pass.
Counter(likely_breweries).most_common(15)

[('draught works', 176),
 ('sierra nevada', 162),
 ('new belgium', 157),
 ('firestone walker', 156),
 ('odell', 145),
 ('stone', 142),
 ('deschutes', 136),
 ('big sky', 127),
 ('great burn', 110),
 ('oskar blues', 108),
 ('blacksmith', 105),
 ('kettlehouse', 104),
 ('hopworks', 95),
 ('grand teton', 94),
 ('elysian', 92)]

In [6]:
# Start from every candidate collected in the one-hyphen pass ...
brewery_set = set(likely_breweries)

# ... add a handful of names by hand ...
brewery_set |= {"widmer","breakside brewing","homestead ales","bitburger","stone"}

# ... and remove generic terms that should not count as breweries
# (presumably beer styles that landed in the second slot).
brewery_set -= {"ipa","amber","lager","pilsner","stout"}

In [11]:
item_translation = dict()
# key = original item
# value = [beer, brewery (if present), other_info (if present)]

clean_items = set()

for item in items :
    
    # Strip the short leading code and lowercase, so comparisons against
    # brewery_set (which was built from lowercased names) line up.
    clean_item = prefix_pattern.sub("",item).lower()
    
    # Normalize a spelling variant (presumably a misspelling) in the data.
    clean_item = clean_item.replace("windmere","widmer")
    
    clean_items.add(clean_item)
    pieces = [p.strip() for p in spaced_hyphen_pattern.split(clean_item)]
    
    beer = ""
    brewery = ""
    other_info = ""
    
    if len(pieces) > 1 : 
        # The first two pieces are beer and brewery in some order. A known
        # brewery in the first slot means "brewery - beer"; otherwise assume
        # "beer - brewery".
        if pieces[0] in brewery_set : 
            brewery, beer = pieces[:2]
        else :
            beer, brewery = pieces[:2]
        
        # Everything after the first two pieces is extra detail. This is ""
        # for two pieces and the third piece for three. For four or more
        # pieces the join starts at index 2 so that the second field is not
        # repeated inside other_info.
        other_info = " - ".join(pieces[2:]).strip()
        
    else :
        # No spaced hyphen: look for a single token that is a known brewery.
        # If several tokens match, the last one wins.
        clean_item_tokens = clean_item.split()
        
        for token in clean_item_tokens :
            if token in brewery_set :
                brewery = token
        
        # The beer name is whatever remains once the brewery token is removed.
        beer = " ".join([token for token in clean_item_tokens if token != brewery])
                
    item_translation[item] = [beer, brewery, other_info]

    # Spot check on an item whose name contains embedded double quotes.
    if "Cioke" in item :
        print(item)
        print(item_translation[item])



Quarticello ""Cioke"" Lambrusco - 00 - 2020
['quarticello ""cioke"" lambrusco', '00', '2020']


In [12]:
def quote_wrap(text) : 
    """Return `text` with a double-quote character added at each end.

    Quote characters already inside `text` are left untouched.
    """
    return "".join(['"', text, '"'])
    

# Quote characters inside fields cause problems for the GBQ upload, so the
# parsed pieces are wrapped in quotes when written out. Example item:
# Quarticello ""Cioke"" Lambrusco - 00 - 2020
# which parses to these fields:
# ['quarticello ""cioke"" lambrusco', '00', '2020']

with open("item_lookup.txt",'w') as outfile :
    outfile.write("item\tbeer\tbrewery\tother_info\n")
    
    # One tab-separated row per item. The original item text is written
    # as-is; only the three parsed fields are quote-wrapped.
    for item, fields in item_translation.items() :
        beer, brewery, other_info = map(quote_wrap, fields)
        outfile.write(f"{item}\t{beer}\t{brewery}\t{other_info}\n")



['"quarticello ""cioke"" lambrusco"', '"00"', '"2020"']


In [13]:
# Second copy of the lookup file, written where another project expects it.
# NOTE(review): the destination is an absolute, machine-specific path.

with open("/Users/chandler/dropbox/teaching/repos/ada-python-gbq/item_lookup.txt",'w') as outfile :
    outfile.write("item\tbeer\tbrewery\tother_info\n")
    
    # Same row format as the local item_lookup.txt: raw item, then the three
    # quote-wrapped parsed fields.
    for item, fields in item_translation.items() :
        beer, brewery, other_info = map(quote_wrap, fields)
        outfile.write(f"{item}\t{beer}\t{brewery}\t{other_info}\n")


In [14]:
# Third copy of the lookup file, written into the course week-07 folder.
# NOTE(review): the destination is an absolute, machine-specific path.

with open("/Users/chandler/dropbox/teaching/2022/ada/week-07/item_lookup.txt",'w') as outfile :
    outfile.write("item\tbeer\tbrewery\tother_info\n")
    
    # Same row format as the local item_lookup.txt: raw item, then the three
    # quote-wrapped parsed fields.
    for item, fields in item_translation.items() :
        beer, brewery, other_info = map(quote_wrap, fields)
        outfile.write(f"{item}\t{beer}\t{brewery}\t{other_info}\n")


In [23]:
# Find the first item whose parsed brewery is non-empty but is not a known
# brewery. The loop stops at the first hit, so count ends up as 0 or 1.
count = 0

for idx, (item, fields) in enumerate(item_translation.items()) :
    _, brewery, _ = fields
    
    # Skip items with no brewery or with a recognized one.
    if not brewery or brewery in brewery_set :
        continue
    
    count += 1
    print(f"{idx} and {item_translation[item]}")
    break
    
    
print(f"Count is {count}")

62 and ['widmer omission', 'lager', '']
Count is 1


In [32]:
# Print every item whose parsed brewery is exactly "big sky", to eyeball how
# the beer/brewery split worked out for one brewery.
for item, (beer, brewery, other_info) in item_translation.items() :
    if brewery != "big sky" : 
        continue
    
    print(f"The item is {item}.")
    print(f"Beer: {beer}\tBrewery: {brewery}")
    print("--------------------------------------")
    

The item is 10 Scapegoat - Big Sky.
Beer: scapegoat	Brewery: big sky
--------------------------------------
The item is 13 Biere De Noel - Big Sky.
Beer: biere de noel	Brewery: big sky
--------------------------------------
The item is 21 Ivan the Terrible - Big Sky.
Beer: ivan the terrible	Brewery: big sky
--------------------------------------
The item is 28 BA Power Wagon Wheat Wine - Big Sky.
Beer: ba power wagon wheat wine	Brewery: big sky
--------------------------------------
The item is 12 Big Sky - Glacier Hop Ranch Wet Hop.
Beer: glacier hop ranch wet hop	Brewery: big sky
--------------------------------------
The item is 12 Rye Pale Ale - Big Sky.
Beer: rye pale ale	Brewery: big sky
--------------------------------------
The item is 12 NITRO Moose Drool - Big Sky.
Beer: nitro moose drool	Brewery: big sky
--------------------------------------
The item is 11 Cream Ale - Big Sky.
Beer: cream ale	Brewery: big sky
--------------------------------------
The item is 1 Trout Slayer