In [1]:
# Preamble
import numpy as np
import pandas as pd
import plotly.express as px
import re
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import normalize
from sklearn.neighbors import NearestNeighbors

In [2]:
CountVectorizer
# Load in beer spreadsheet
main_file_path = "../"

df = pd.read_csv(main_file_path + "Data/GABS_2024_Festival_Beers.csv").set_index("num")

In [3]:
# Inspect data
df.head()

Unnamed: 0_level_0,brewery,state,name,abv,style,desc,section
num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1.0,Clifton Hill Brewing,VIC,HONEY GUM (RED IPA),7.0,Red IPA,A toffee- like biscuity aroma gives way to car...,1
2.0,Brewmanity Beer Co,VIC,Choc à l'orange,7.0,Imperial Stout,A selection of the finest after dinner choc or...,1
3.0,Six String Brewing Co,NSW,Spaghetti Saison,5.0,Saison/Farmhouse Ale,"Basil & Peppercorn Saison Bright and pungent, ...",1
4.0,The Brew Baron Beer Co.,QLD,Cocoa Comet,7.0,Porter,A hot chocolate drink for adults only! Brewed ...,1
5.0,8 Wired,NZ,Crumbs - Imperial Cookie Stout,10.0,Imperial Stout,Crumbs Imperial Cookie Stout is the rebellious...,1


In [4]:
# Calculate std dev for each section
std_df = df.groupby("Section")["Anticipation"].agg(np.var)#.reset_index()
display(std_df)

KeyError: 'Section'

In [5]:
# Best paddles from each section
# display(df.groupby("Section")["Anticipation"].nlargest(5, keep="all").sort_index())

# Create a column combining all text descriptor fields
df['words'] = [str(name).strip().upper() + ' '
               + str(style).strip().upper() + ' '
               + str(description).strip().upper()
               for name, style, description
               in zip(df['name'], df['style'], df['desc'])]

# TF-IDF vectorisation on description to find most unique beers?
X = TfidfVectorizer().fit_transform([re.sub(r'[\s]+', ' ', re.sub(r'[\W]', ' ', x)).upper().strip() for x in df.words])
# Average of TF-IDF?
beer_uniqueness_mean = pd.Series(np.array(np.mean(X, axis=1)).flatten())
# Cosine similarity? (and then average)
beer_uniqueness_cos = pd.Series(1 - np.mean(cosine_similarity(X), axis=1))
# Euclidean distances? (and then average)
beer_uniqueness_euc = pd.Series(np.mean(euclidean_distances(X), axis=1))

# Choose one
df['beer_uniqueness'] = beer_uniqueness_cos



In [7]:
X.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [6]:
euclidean_distances(X)

array([[0.        , 1.40979791, 1.39061823, ..., 1.37109772, 1.38991739,
        1.40254719],
       [1.40979791, 0.        , 1.40920329, ..., 1.38928395, 1.41030093,
        1.40222247],
       [1.39061823, 1.40920329, 0.        , ..., 1.35620476, 1.37760619,
        1.38837476],
       ...,
       [1.37109772, 1.38928395, 1.35620476, ..., 0.        , 1.36226433,
        1.37966043],
       [1.38991739, 1.41030093, 1.37760619, ..., 1.36226433, 0.        ,
        1.20377467],
       [1.40254719, 1.40222247, 1.38837476, ..., 1.37966043, 1.20377467,
        0.        ]])

In [7]:
# Interesting, I might maximise utility by getting a second paddle
# from a section before the first from another.

# Not ideal, but a while loop I guess?
all_beers = set(df.index)
paddle_index = 1
paddle_list = dict()
while len(all_beers) > 0:
    
    # Subset beer list
    temp_df = df[df.index.isin(list(all_beers))]
    
    # Find the section with the best paddle
    sec = temp_df.sort_values("beer_uniqueness", ascending=False) \
            .groupby("Section")["Anticipation"].nlargest(5).reset_index() \
            .groupby("Section") \
            .agg("sum") \
            .join(pd.DataFrame(std_df).rename(columns={"Anticipation":"std"}), on="Section") \
            .sort_values(["Anticipation", "std"], ascending=False) \
            .index[0]
    
    # Get that paddle
    paddle = list(temp_df[temp_df.Section==sec].sort_values(["Anticipation", "beer_uniqueness"], ascending=[False, False]).head(5).reset_index().Number)
    
    # Store that paddle
    paddle_list[str(paddle_index)] = (f'Section {sec}', paddle)
    
    # Increment paddle index
    paddle_index += 1
    
    # Update remaining beers
    all_beers = all_beers - set(paddle)

# Display results
for key, value in paddle_list.items():
    print(f'{key}: ', value)

KeyError: 'Section'

In [8]:
# Display Results
rank = []
section = []
beer1 = []
beer2 = []
beer3 = []
beer4 = []
beer5 = []
for key, value in paddle_list.items():
    rank.append(key)
    section.append(value[0])
    beer1.append(value[1][0])
    beer2.append(value[1][1])
    beer3.append(value[1][2])
    beer4.append(value[1][3])
    beer5.append(value[1][4])
    
god_tier_paddle_tiers_df = pd.DataFrame({"Rank":rank,
                                         "Section":section,
                                         "Beer 1":beer1,
                                         "Beer 2":beer2,
                                         "Beer 3":beer3,
                                         "Beer 4":beer4,
                                         "Beer 5":beer5})

display(god_tier_paddle_tiers_df)

Unnamed: 0,Rank,Section,Beer 1,Beer 2,Beer 3,Beer 4,Beer 5


In [9]:
fig = px.box(df,
             x="Section",
             y="Anticipation")
fig.show()

ValueError: Value of 'x' is not the name of a column in 'data_frame'. Expected one of ['brewery', 'state', 'name', 'abv', 'style', 'desc', 'section', 'words', 'beer_uniqueness'] but received: Section

In [10]:
df.groupby("Section")["Anticipation"].agg(np.std).reset_index()

KeyError: 'Section'

In [11]:
fig = px.bar(std_df.reset_index(),
             x="Section", y="Anticipation",
             labels={"Anticipation":"Anticipation Std Dev"},
             title="Std Dev of Section Scores")
fig.show()

NameError: name 'std_df' is not defined

In [12]:
fig = px.bar(df[df.index!=45].groupby("Section")["Anticipation"].agg(np.var).reset_index(),
             x="Section", y="Anticipation",
             labels={"Anticipation":"Anticipation Std Dev"},
             title="Std Dev of Section Scores (one bad beer removed)")
fig.show()

KeyError: 'Section'

In [18]:
# Plot some clusters maybe?
pca_arr_all = PCA(n_components=2).fit_transform(np.asarray(X.todense()))
print(pca_arr_all.shape)


pca_df = pd.DataFrame(pca_arr_all).reset_index(drop=True).set_index(df.index)
pca_df['Section'] = df['section']
pca_df['Number'] = df.index
pca_df['Beer'] = df['name']
pca_df['Brewery'] = df['brewery']
pca_df['Style'] = df['style']
#pca_df['Anticipation'] = df.Anticipation

fig = px.scatter(pca_df,
                 x=0,
                 y=1,
                 color="Style",
                 hover_data=["Beer", "Section", "Number"],
                 title="GABS 2024"
                )
fig.show()

(117, 2)


In [14]:
print(X.shape)

(117, 1651)


In [15]:
# t-SNE?
pca_arr = PCA(n_components=50).fit_transform(np.asarray(X.todense()))
print(pca_arr.shape)
tsne_arr = TSNE().fit_transform(pca_arr)
print(tsne_arr.shape)

tsne_df = pd.DataFrame(tsne_arr).reset_index(drop=True).set_index(df.index)
tsne_df['Section'] = df['section']
tsne_df['Number'] = df.index
tsne_df['Beer'] = df['name']
tsne_df['Brewery'] = df['brewery']
tsne_df['Style'] = df['style']


(117, 50)
(117, 2)


In [16]:
display(tsne_df)

Unnamed: 0_level_0,0,1,Section,Number,Beer,Brewery,Style
num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1.0,-10.033188,5.971817,1,1.0,HONEY GUM (RED IPA),Clifton Hill Brewing,Red IPA
2.0,-4.282264,4.314384,1,2.0,Choc à l'orange,Brewmanity Beer Co,Imperial Stout
3.0,9.859373,1.793159,1,3.0,Spaghetti Saison,Six String Brewing Co,Saison/Farmhouse Ale
4.0,9.136356,6.028160,1,4.0,Cocoa Comet,The Brew Baron Beer Co.,Porter
5.0,-0.571690,-0.350629,1,5.0,Crumbs - Imperial Cookie Stout,8 Wired,Imperial Stout
...,...,...,...,...,...,...,...
115.0,5.739968,9.788649,6,115.0,Raspberry Sherbet Bomb,Moon Dog,Specialty Beer
116.0,-6.187871,9.104838,6,116.0,Ginger Monkey,monkey shoulder,Cocktail
117.0,5.993478,6.120999,6,117.0,,Matso's Piña Colada,Cocktail
118.0,-4.178394,9.022579,6,118.0,Eddies Crisp Apple,Eddies Cider,Traditional Cider


In [17]:
fig = px.scatter(tsne_df,
                 x=0,
                 y=1,
                 color="Style",
                 hover_data=["Beer", "Section", "Number"]
                )
fig.show()

In [222]:
tsne_df.to_csv(main_file_path + "Data/GABS_embedding.csv")

In [36]:
# Create distance matrix

column_name_dict = {}
i = 0
for j in df.index:
    column_name_dict[i] = j
    i += 1

distance_df = pd.DataFrame(normalize(euclidean_distances(X))).rename(columns=column_name_dict).reset_index(drop=True).set_index(df.index)

distance_df['Section'] = df.Section
distance_df['Number'] = df.index
distance_df['Beer'] = df.Beer
distance_df['Brewery'] = df.Brewery
distance_df['Style'] = df.Style
distance_df['Anticipation'] = df.Anticipation

distance_df.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,117,118,119,120,Section,Number,Beer,Brewery,Style,Anticipation
Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.09218,0.092075,0.090823,0.093074,0.090693,0.093074,0.093074,0.093074,0.092131,...,0.093074,0.088538,0.092216,0.093074,1,1,Boba Beer,Dad & Dave,Sour,5
2,0.092026,0.0,0.092493,0.092919,0.092919,0.091068,0.092497,0.090799,0.092919,0.092919,...,0.08953,0.088794,0.090335,0.092919,1,2,Homestead Blackberry and Apple Pastry Sour,The Coastal Brewing Company,Specialty,6
3,0.092244,0.092819,0.0,0.093245,0.093245,0.093245,0.092735,0.090391,0.093245,0.092355,...,0.089564,0.092537,0.091989,0.093245,1,3,Suit Up,7th Day Brewery,IPA Black,7
4,0.09099,0.093245,0.093245,0.0,0.090094,0.093245,0.093245,0.085901,0.091853,0.093245,...,0.093245,0.093245,0.093245,0.093245,1,4,Whiskey in the jar,Six String Brewing Co,Wood Aged Beer,8
5,0.093532,0.093532,0.093532,0.090371,0.0,0.093532,0.093532,0.091766,0.092032,0.093532,...,0.093532,0.093532,0.093532,0.093532,1,5,Marsden Neopolitan Porter,The Marsden Brewhouse,Stout Tropical,10


In [37]:
distance_df.to_csv(main_file_path + "Data/GABS_distances.csv")

In [50]:
nn = NearestNeighbors(n_neighbors=10, metric='precomputed')
nn.fit(euclidean_distances(X))

distances, indexes = nn.kneighbors()


In [56]:
print(len(list(indexes[0])))

print(list(range(1, 10)))

10
[1, 2, 3, 4, 5, 6, 7, 8, 9]


In [59]:
sim_df = pd.DataFrame(columns=['Beer', 'Sim_Rank', 'Sim_Beer'])

for beer in df.index:
    sim_list = list(indexes[beer-1]+1)
    beer_list = [beer for _ in sim_list]
    sim_rank = list(range(1, len(sim_list)+1))
    temp_df = pd.DataFrame({'Beer':beer_list, 'Sim_Rank':sim_rank, 'Sim_Beer':sim_list})
    sim_df = pd.concat([sim_df, temp_df])

display(sim_df)
sim_df.to_csv(main_file_path + "Data/GABS_sim_neighbours.csv")



Unnamed: 0,Beer,Sim_Rank,Sim_Beer
0,1,1,58
1,1,2,102
2,1,3,67
3,1,4,44
4,1,5,109
...,...,...,...
5,120,6,91
6,120,7,114
7,120,8,32
8,120,9,116
