In [1]:
import numpy as np
import pandas as pd

# import urllib.request as r

from bs4 import BeautifulSoup

import re


In [2]:
# # Get festival beers webpage
# beer_url = "https://gabsfestival.com/festival-beers/"
# beer_page = r.urlopen(beer_url)
# beer_html = beer_page.read().decode('utf-8')

# http error 403: access denied
# Going to do the stupid thing and ctrl+a the source text:
# 1. Go to https://gabsfestival.com/festival-beers/
# 2. Right click -> view page source
# 3. Ctrl+a, ctrl+c to copy all
# 4. Paste to file named beer.html or some such thing


In [3]:
# Load contents of html file to text
with open("beer_page_2024.html", encoding='utf-8') as beer_page:
    beer_html = beer_page.read()

# Make the beer soup
beer_soup = BeautifulSoup(beer_html)

In [38]:
# Function to organise the title section
def parse_beer_titles(words):
    words_split = re.split('((?<=^\d)\s|(?<=^\d{2})\s|(?<=^\d{3})\s|\s(?=\([A-Za-z]+\))|(?<=\))\s)', re.sub(r'(?<! )\(', ' (', re.sub(r'^#', '', words)))
    if len(words_split) == 3: # Only number and brewery
        beer_num = int(words_split[0])
        beer_brewery = words_split[2]
        beer_state = ''
        beer_name = ''
    else:
        beer_num = int(words_split[0])
        beer_brewery = words_split[2]
        beer_state = re.sub(r'[\(\)]', '', words_split[4]).strip()
        beer_name = ' '.join(words_split[5:]).strip()
    return (beer_num, beer_brewery, beer_state, beer_name)

# Function to organise the description section (if NOT separation between abv/style and description)
def parse_beer_desc1(beer_desc_list):
    beer_list_split = beer_desc_list[0].strip().split('%')
    if len(beer_list_split[0]) > 5: # non standard abv struct
        # Hard code word positions becuase I'm lazy, probably don't do this
        print(beer_list_split)
        # beer_abv = float(beer_list_split[0][-1])
        # beer_style = ' '.join(beer_list_split[1].split()[1:3])
        # beer_desc = '%'.join(beer_list_split)
        beer_abv, beer_style, beer_desc = (np.NaN, np.NaN, ' '.join(beer_list_split))
    else:
        beer_abv = float(beer_list_split[0])
        beer_list_split = re.split(r""""|“|''""", beer_list_split[1])
        if len(beer_list_split) == 1: # No description text
            beer_desc = np.NaN
            beer_style = re.sub(r'\s+', ' ', beer_list_split[0].strip())
        else:
            beer_style = re.sub(r'\s+', ' ', beer_list_split[0].strip())
            beer_desc = re.sub(r'^"|"$', '', ' '.join(beer_list_split[1:]).strip())
    return (beer_abv, beer_style, beer_desc)

# Function to organise the description section (if separation between abv/style and description)
def parse_beer_desc2(beer_desc_list):
    beer_abv = float(beer_desc_list[0].strip().split('%')[0])
    beer_style = re.sub(r'\s+', ' ', beer_desc_list[0].strip().split('%')[1].strip())
    beer_desc = re.sub(r"""^"|^''|^“|"$|''$|”$""", '', re.sub(r'\s+', ' ', beer_desc_list[1].strip()))
    return (beer_abv, beer_style, beer_desc)

# Function to combine other functions to organise all the things
def categorise_beer_parts(beer_card):
    # Beer title
    beer_title = beer_card.find(['h2', 'h3'], {'class':'elementor-flip-box__layer__title'})
    words = ' '.join([x.strip() for x in beer_title.strings]).strip()
    beer_title = parse_beer_titles(words)

    if beer_title[1] == 'TBC':
        return None

    # Beer desc
    beer_description = beer_card.find("div", {'class':'elementor-flip-box__layer__description'})
    if beer_description:
        beer_meta = list(beer_description.strings)
        if len(beer_meta) == 2:
            beer_desc = parse_beer_desc2(beer_meta)
        else:
            beer_desc = parse_beer_desc1(beer_meta)
    else: beer_desc = (5.0, '', '')

    return beer_title + beer_desc

# Function to calculate section number based on beer number
def beer_section(num):
    section = 0
    while num > 0:
        section += 1
        num -= 20
        if section > 6:
            raise ValueError("Section can't be greater than 6")
    return section



In [39]:
# Initialise object to store beers
beers = []

# Loop through all beer cards in the html
for beer_card in beer_soup.find_all("div", {'class':"elementor-flip-box"}):
    beer = categorise_beer_parts(beer_card)
    beers.append(beer)

# Put it all in a data frame and drop duplicates
beers_df = pd.DataFrame(beers, columns=['num', 'brewery', 'state', 'name', 'abv', 'style', 'desc']).drop_duplicates().reset_index(drop=True)
# Drop NA
beers_df = beers_df.dropna(subset='num')

# Calculate section number
beers_df['section'] = [beer_section(x) for x in beers_df.num]

# Inspect results
display(beers_df)
display(beers_df.loc[[21, 102, 116]])

['Spicy, with a characteristic yeast note, fruity and strong']


Unnamed: 0,num,brewery,state,name,abv,style,desc,section
0,1.0,Clifton Hill Brewing,VIC,HONEY GUM (RED IPA),7.0,Red IPA,A toffee- like biscuity aroma gives way to car...,1
1,2.0,Brewmanity Beer Co,VIC,Choc à l'orange,7.0,Imperial Stout,A selection of the finest after dinner choc or...,1
2,3.0,Six String Brewing Co,NSW,Spaghetti Saison,5.0,Saison/Farmhouse Ale,"Basil & Peppercorn Saison Bright and pungent, ...",1
3,4.0,The Brew Baron Beer Co.,QLD,Cocoa Comet,7.0,Porter,A hot chocolate drink for adults only! Brewed ...,1
4,5.0,8 Wired,NZ,Crumbs - Imperial Cookie Stout,10.0,Imperial Stout,Crumbs Imperial Cookie Stout is the rebellious...,1
...,...,...,...,...,...,...,...,...
114,115.0,Moon Dog,VIC,Raspberry Sherbet Bomb,6.5,Specialty Beer,This Sherbet Bomb has been made from a mixed c...,6
115,116.0,monkey shoulder,Scotland,Ginger Monkey,6.0,Cocktail,"Refreshing ginger, vanilla, spiced oak, hints ...",6
116,117.0,Matso's Piña Colada,,,6.0,Cocktail,Tropical bliss in a glass! This 6% ABV piña co...,6
117,118.0,Eddies Cider,VIC,Eddies Crisp Apple,4.5,Traditional Cider,Nice colour and depth. Soft and delicate apple...,6


Unnamed: 0,num,brewery,state,name,abv,style,desc,section
21,22.0,3 Griffins Brewing High Carb White IPA,,,4.2,IPA - White,Training for a marathon or endurance event? Ca...,2
102,103.0,Willie The Boatman,NSW,,6.0,Classic Pilsner Lager,,6
116,117.0,Matso's Piña Colada,,,6.0,Cocktail,Tropical bliss in a glass! This 6% ABV piña co...,6


In [40]:
# Check for missing numbers
beer_nums = list(beers_df['num'])
for i in range(1, 121):
    if i not in beer_nums:
        print(f'No beer #{i}')

No beer #62
No beer #120


In [41]:
# Export to .csv
beers_df.to_csv("../Data/GABS_2024_festival_beers.csv", index=False)

# Performed some manual cleaning directly on output file, including filling in missing data.
