In [28]:
# Preamble
import numpy as np
import pandas as pd
import plotly.express as px
import re
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import normalize
from sklearn.neighbors import NearestNeighbors

In [106]:
# Base directory that the data paths below are built from
main_file_path = "../"

# Read the 2023 and 2024 festival beer spreadsheets into separate frames
path_23 = main_file_path + "Data/GABS_2023_Festival_Beers.csv"
path_24 = main_file_path + "Data/GABS_2024_Festival_Beers.csv"
df_23 = pd.read_csv(path_23)
df_24 = pd.read_csv(path_24)

In [107]:
# Show the raw 2023 column names, then tag every row with its festival year
print(df_23.columns)
df_23.loc[:, 'year'] = '2023'

Index(['Section', 'Number', 'Brewery', 'Beer', 'Style', 'Description',
       'Anticipation', 'Anticipation notes', 'Reality', 'Reality notes'],
      dtype='object')


In [108]:
# Show the raw 2024 column names, then tag every row with its festival year
print(df_24.columns)
df_24.loc[:, 'year'] = '2024'

Index(['num', 'brewery', 'state', 'name', 'abv', 'style', 'desc', 'section'], dtype='object')


In [109]:
# Bring the 2023 column names in line with the 2024 schema
cols_23_to_24 = {
    'Number': 'num',
    'Section': 'section',
    'Brewery': 'brewery',
    'Beer': 'name',
    'Style': 'style',
    'Description': 'desc',
}

# Columns that are dropped from the combined frame
unused_cols = ['abv', 'state', 'Anticipation', 'Anticipation notes',
               'Reality', 'Reality notes']

# Stack both years, discard the unused columns and key each beer by (num, year)
df_23_renamed = df_23.rename(columns=cols_23_to_24)
df = (
    pd.concat([df_23_renamed, df_24])
    .drop(columns=unused_cols)
    .set_index(['num', 'year'])
)

In [110]:
# Quick look at the combined frame: its dimensions, then its leading rows
display(df.shape, df.head())

(237, 5)

Unnamed: 0_level_0,Unnamed: 1_level_0,section,brewery,name,style,desc
num,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1.0,2023,1,Dad & Dave,Boba Beer,Sour,Tart Grapefruit Beliner Weisse infused with Ja...
2.0,2023,1,The Coastal Brewing Company,Homestead Blackberry and Apple Pastry Sour,Specialty,Perfect for an autumn or winter evening bringi...
3.0,2023,1,7th Day Brewery,Suit Up,IPA Black,Choc & raisins mingle with piney resin and cit...
4.0,2023,1,Six String Brewing Co,Whiskey in the jar,Wood Aged Beer,"A barrel aged, American-Style Imperial Stout."
5.0,2023,1,The Marsden Brewhouse,Marsden Neopolitan Porter,Stout Tropical,"Just like the ice cream, Berry - Vanilla - Cho..."


In [111]:
# Build one upper-cased text blob per beer from its name, style and description.
# Each field goes through str() first, so non-string values are stringified too.
text_cols = ['name', 'style', 'desc']
df['words'] = [
    ' '.join(str(field).strip().upper() for field in fields)
    for fields in zip(*(df[col] for col in text_cols))
]

In [112]:
# Clean each beer's text before vectorising: non-word characters become spaces,
# runs of whitespace collapse to one space, then upper-case and trim
tfidf_docs = []
for raw_words in df.words:
    no_punct = re.sub(r'[\W]', ' ', raw_words)
    single_spaced = re.sub(r'[\s]+', ' ', no_punct)
    tfidf_docs.append(single_spaced.upper().strip())

# TF-IDF vectorisation of the cleaned text
X_tfidf = TfidfVectorizer().fit_transform(tfidf_docs)

# Uniqueness = 1 - mean cosine similarity of a beer to every beer (itself included)
mean_cos_sim = cosine_similarity(X_tfidf).mean(axis=1)
beer_uniqueness_cos = pd.Series(1 - mean_cos_sim)

In [153]:
# Project the TF-IDF vectors onto their first two principal components and plot them.
# random_state is pinned because PCA's default svd_solver='auto' may select the
# randomized solver for a matrix of this shape, in which case an unseeded run is
# not guaranteed to reproduce the same coordinates.
pca_arr_all = PCA(n_components=2, random_state=0).fit_transform(X_tfidf.toarray())
print(pca_arr_all.shape)

# Row i of the PCA output corresponds to row i of df, so df's index is reused directly
pca_df = pd.DataFrame(pca_arr_all, index=df.index)
pca_df['Section'] = df['section']
pca_df['Number'] = df.index.get_level_values(0).to_numpy()  # first index level: num
pca_df['Beer'] = df['name']
pca_df['Brewery'] = df['brewery']
pca_df['Style'] = df['style']
pca_df['Year'] = df.index.get_level_values(1).to_numpy()    # second index level: year

# Columns 0 and 1 hold the two principal components
fig = px.scatter(pca_df,
                 x=0,
                 y=1,
                 color="Year",
                 hover_data=["Beer", "Section", "Number", "Style"],
                 title="GABS 2023-24 (tf-idf vectorised, PCA reduction to 2-dim)"
                )
fig.show()

(237, 2)


In [118]:
# Raw term-count vectorisation of the same cleaned text, for comparison with TF-IDF
X_count = CountVectorizer().fit_transform([re.sub(r'[\s]+', ' ', re.sub(r'[\W]', ' ', x)).upper().strip() for x in df.words])

# Project the count vectors onto their first two principal components and plot them.
# random_state is pinned because PCA's default svd_solver='auto' may select the
# randomized solver for a matrix of this shape, in which case an unseeded run is
# not guaranteed to reproduce the same coordinates.
pca_arr_cnt_all = PCA(n_components=2, random_state=0).fit_transform(X_count.toarray())
print(pca_arr_cnt_all.shape)

# Row i of the PCA output corresponds to row i of df, so df's index is reused directly
pca_cnt_df = pd.DataFrame(pca_arr_cnt_all, index=df.index)
pca_cnt_df['Section'] = df['section']
pca_cnt_df['Number'] = df.index.get_level_values(0).to_numpy()  # first index level: num
pca_cnt_df['Beer'] = df['name']
pca_cnt_df['Brewery'] = df['brewery']
pca_cnt_df['Style'] = df['style']
pca_cnt_df['Year'] = df.index.get_level_values(1).to_numpy()    # second index level: year

# Columns 0 and 1 hold the two principal components
fig = px.scatter(pca_cnt_df,
                 x=0,
                 y=1,
                 color="Year",
                 hover_data=["Beer", "Section", "Number", "Style"],
                 title="GABS 2023-24 (Count vectorised, PCA reduction to 2-dim)"
                )
fig.show()

(237, 2)


In [119]:
# Back to the TF-IDF representation: pairwise cosine similarity between all beers
beer_tfidf_cos_sim = cosine_similarity(X_tfidf)

# Zero the diagonal so a beer's similarity to itself does not top its own ranking
beer_tfidf_cos_sim[np.diag_indices_from(beer_tfidf_cos_sim)] = 0

In [120]:
# Show the pairwise TF-IDF cosine-similarity matrix (its diagonal was set to 0 above)
beer_tfidf_cos_sim

array([[0.        , 0.02169606, 0.01263583, ..., 0.01737867, 0.        ,
        0.00538614],
       [0.02169606, 0.        , 0.0063801 , ..., 0.04891608, 0.08699579,
        0.10105386],
       [0.01263583, 0.0063801 , 0.        , ..., 0.02631691, 0.01960832,
        0.01720581],
       ...,
       [0.01737867, 0.04891608, 0.02631691, ..., 0.        , 0.0798091 ,
        0.05958983],
       [0.        , 0.08699579, 0.01960832, ..., 0.0798091 , 0.        ,
        0.25689323],
       [0.00538614, 0.10105386, 0.01720581, ..., 0.05958983, 0.25689323,
        0.        ]])

In [122]:
# Spot-check one beer (by row position) and its nearest neighbours across both years
test_num = 108
# Defined here so the cell runs on a fresh kernel; it was previously only
# assigned in a later cell, which made this cell fail under Restart & Run All.
num_recs = 5

display(df.iloc[test_num])
# argsort is ascending, so the last num_recs positions are the most similar beers
# (least similar of them first, most similar last)
display(df.iloc[np.argsort(beer_tfidf_cos_sim[test_num])[-num_recs:]])

section                                                    6
brewery                                     Hiker Brewing Co
name                                           Hold the Pork
style                                              Specialty
desc       Infused with smoked pineapple and szechuan pep...
words      HOLD THE PORK SPECIALTY INFUSED WITH SMOKED PI...
Name: (109.0, 2023), dtype: object

Unnamed: 0_level_0,Unnamed: 1_level_0,section,brewery,name,style,desc,words
num,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
23.0,2024,2,Precinct Brewing,Tarte Noir,Kettle Sour,Tarte Noir - a black kettle-soured ale full of...,TARTE NOIR KETTLE SOUR TARTE NOIR - A BLACK KE...
67.0,2023,4,Shark Island Brewing Co,Life Can Be Sweet & Sour,Saison,A zesty Saison infused with Cinnamon and Pinea...,LIFE CAN BE SWEET & SOUR SAISON A ZESTY SAISON...
17.0,2024,1,Cavalier Brewing,Smoke Rings,Kettle Sour,Inspired by your childhood christmases spent a...,SMOKE RINGS KETTLE SOUR INSPIRED BY YOUR CHILD...
81.0,2024,5,Morrison Brewery,Smoked Kelp Stout,Smoked Beer (non Rauchbier),"Peat Smoked Kelp Imperial Stout, aged in Islay...",SMOKED KELP STOUT SMOKED BEER (NON RAUCHBIER) ...
15.0,2024,1,Molly Rose Brewing,"Pepper, Pepper, Pepper",American IPA,"Using vacuum distillation, we capture the esse...","PEPPER, PEPPER, PEPPER AMERICAN IPA USING VACU..."


In [123]:
# Row position of the beer with the second-highest similarity to test_num
# (argsort sorts ascending, so index -2 is the runner-up)
np.argsort(beer_tfidf_cos_sim[test_num])[-2]

198

In [149]:
def get_same_year_recs(ind, num_recs, sim=None, frame=None):
    """Return row positions of the beers most similar to one beer, same year only.

    Parameters
    ----------
    ind : int
        Row position (as used with ``.iloc``) of the query beer.
    num_recs : int
        Maximum number of recommendations to return.
    sim : 2-D array, optional
        Pairwise similarity matrix whose row ``i`` matches row ``i`` of
        ``frame``. Defaults to the module-level ``beer_tfidf_cos_sim``.
    frame : pandas.DataFrame, optional
        Beer table whose second index level holds the year. Defaults to the
        module-level ``df``.

    Returns
    -------
    list of int
        Row positions ordered from most to least similar. The query beer is
        never included, and the list is shorter than ``num_recs`` when fewer
        same-year beers exist (previously this case raised IndexError).

    Raises
    ------
    IndexError
        If ``ind`` is outside the rows of ``frame``.
    """
    if sim is None:
        sim = beer_tfidf_cos_sim
    if frame is None:
        frame = df

    # Normalise negative positions and reject out-of-range ones up front
    ind = range(len(frame))[ind]

    years = frame.index.get_level_values(1).to_numpy()
    base_year = years[ind]

    # Candidates from most to least similar
    ranked = np.argsort(sim[ind])[::-1]

    # Keep same-year beers, excluding the query itself: its zeroed diagonal
    # entry can tie with other zero-similarity beers and slip into the results
    keep = (years[ranked] == base_year) & (ranked != ind)

    return ranked[keep][:max(num_recs, 0)].tolist()

def get_beer_ind(num, year, frame=None):
    """Return the row position of the beer with the given number and year.

    Parameters
    ----------
    num : int-like
        Festival beer number; converted with ``int()`` before matching the
        ``num`` index level.
    year : int-like
        Festival year; converted with ``int()`` and matched as a string
        against the ``year`` index level (e.g. ``2024`` matches ``'2024'``).
    frame : pandas.DataFrame, optional
        Beer table indexed by ``num`` and ``year``. Defaults to the
        module-level ``df``.

    Returns
    -------
    int
        Row position usable with ``.iloc``; the first match if several rows
        share the same (num, year).

    Raises
    ------
    IndexError
        If no beer matches the requested number and year.
    """
    if frame is None:
        frame = df

    nums = frame.index.get_level_values('num')
    years = frame.index.get_level_values('year')

    matches = np.flatnonzero((nums == int(num)) & (years == str(int(year))))
    if matches.size == 0:
        raise IndexError(f"no beer with num={int(num)} and year='{int(year)}'")

    return int(matches[0])

In [152]:
# Same spot-check as before, but restricted to beers from the query beer's own year
test_num, num_recs = 108, 5

display(df.iloc[test_num])
same_year_inds = get_same_year_recs(test_num, num_recs)
display(df.iloc[same_year_inds])

section                                                    6
brewery                                     Hiker Brewing Co
name                                           Hold the Pork
style                                              Specialty
desc       Infused with smoked pineapple and szechuan pep...
words      HOLD THE PORK SPECIALTY INFUSED WITH SMOKED PI...
Name: (109.0, 2023), dtype: object

Unnamed: 0_level_0,Unnamed: 1_level_0,section,brewery,name,style,desc,words
num,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
67.0,2023,4,Shark Island Brewing Co,Life Can Be Sweet & Sour,Saison,A zesty Saison infused with Cinnamon and Pinea...,LIFE CAN BE SWEET & SOUR SAISON A ZESTY SAISON...
62.0,2023,4,Valley Hops Brewing,The Nugget,IPA Wild,"Ripe pineapple, peach and citrus with the righ...","THE NUGGET IPA WILD RIPE PINEAPPLE, PEACH AND ..."
1.0,2023,1,Dad & Dave,Boba Beer,Sour,Tart Grapefruit Beliner Weisse infused with Ja...,BOBA BEER SOUR TART GRAPEFRUIT BELINER WEISSE ...
120.0,2023,6,Flying Brick Cider Co,Splicer,Cider,"Passionfruit, Pineapple & Lime.","SPLICER CIDER PASSIONFRUIT, PINEAPPLE & LIME."
118.0,2023,6,Spreyton Cider Co,Sour Pineapple Sour,Cider,Sour drops and Warheads!,SOUR PINEAPPLE SOUR CIDER SOUR DROPS AND WARHE...


In [155]:
# Pick the query beer by festival number and year instead of by row position
beer_num, year, num_recs = 3, 2024, 5

test_num = get_beer_ind(beer_num, year)

display(df.iloc[test_num])
same_year_inds = get_same_year_recs(test_num, num_recs)
display(df.iloc[same_year_inds])

section                                                    1
brewery                                Six String Brewing Co
name                                        Spaghetti Saison
style                                   Saison/Farmhouse Ale
desc       Basil & Peppercorn Saison Bright and pungent, ...
words      SPAGHETTI SAISON SAISON/FARMHOUSE ALE BASIL & ...
Name: (3.0, 2024), dtype: object

Unnamed: 0_level_0,Unnamed: 1_level_0,section,brewery,name,style,desc,words
num,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
50.0,2024,3,Beereratne Brewing,Sparkling Saison Noir,Saison/Farmhouse Ale,We wanted to make a beer that celebrated where...,SPARKLING SAISON NOIR SAISON/FARMHOUSE ALE WE ...
105.0,2024,6,Impi Brewers,Black Mamba - Imperial Black Saison,Saison/Farmhouse Ale,Rich black in colour like the mouth of a black...,BLACK MAMBA - IMPERIAL BLACK SAISON SAISON/FAR...
15.0,2024,1,Molly Rose Brewing,"Pepper, Pepper, Pepper",American IPA,"Using vacuum distillation, we capture the esse...","PEPPER, PEPPER, PEPPER AMERICAN IPA USING VACU..."
112.0,2024,6,Stoic Brewing,Bacon Me Crazy,American IPA,A bold brew where Smoky bacon aroma dances fro...,BACON ME CRAZY AMERICAN IPA A BOLD BREW WHERE ...
101.0,2024,6,Lady Burra Brewhouse,Strawberry Champagne Beer,Specialty Beer,Is it wine or beer? We dare not confine such u...,STRAWBERRY CHAMPAGNE BEER SPECIALTY BEER IS IT...
