In [19]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [20]:
# Load the beer-review table from the CSV hosted on GitHub.
url = "https://raw.githubusercontent.com/AHMerrill/unstructured-data-2/main/reviews_final.csv"
reviews = pd.read_csv(url)

# Show the first rows, then the (rows, columns) shape, as two rich outputs.
display(reviews.head(), reviews.shape)

Unnamed: 0,beer,brewery,style,style_id,average_user_rating,username,user_rating,delta_from_average,look,smell,taste,feel,overall,date,review_text,brewery_id,beer_id,page_start
0,Caffè Americano,Cigar City Brewing,American Imperial Stout,157,4.46,MadMadMike,4.53,0.07,4.25,4.25,4.75,4.5,4.5,"Jul 29, 2025","In bottle, on tap, at the brewery - anywhere t...",17981,98020,0
1,Caffè Americano,Cigar City Brewing,American Imperial Stout,157,4.46,Rug,4.06,-0.4,4.0,4.25,4.0,4.0,4.0,"Jul 01, 2022",Unknown vintage\n\nSome more BIF heat from the...,17981,98020,0
2,Caffè Americano,Cigar City Brewing,American Imperial Stout,157,4.46,BFCarr,4.43,-0.03,4.25,4.25,4.5,4.5,4.5,"Apr 02, 2021",Pours dark brown with a thin tan head. Aroma c...,17981,98020,0
3,Caffè Americano,Cigar City Brewing,American Imperial Stout,157,4.46,Dfeinman1,4.23,-0.23,4.0,4.75,4.0,4.0,4.25,"Mar 02, 2021",Such a tasty beer. Perfect mouthfeel and carbo...,17981,98020,0
4,Caffè Americano,Cigar City Brewing,American Imperial Stout,157,4.46,Radome,4.54,0.08,4.75,4.5,4.5,4.75,4.5,"Jan 02, 2021",Poured from a bomber bottle into a Duvel glass...,17981,98020,0


(17863, 18)

## We used TF-IDF to get the top words. TF-IDF is still a bag-of-words (BoW) representation, in the sense that it ignores word order; it just down-weights words that appear in almost every review. We used TF-IDF only to pick out candidate attributes. Later we switch to `CountVectorizer`.

In [21]:
# Use scikit-learn’s TfidfVectorizer
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words="english",                   # scikit-learn's built-in English stop list
    token_pattern=r"(?u)\b[a-zA-Z]{3,}\b",  # ASCII-letter words of length >= 3
    max_df=0.8,                             # drop terms found in more than 80% of reviews
    min_df=5,                               # drop terms found in fewer than 5 reviews
    max_features=10000,                     # cap on vocabulary size
)

# A missing review is NaN, and a bare .astype(str) would turn it into the
# literal text "nan" -- a three-letter "word" that the token pattern accepts.
# Blank missing reviews first so they contribute no tokens; this matches
# normalize_review, which also maps non-string reviews to "".
X_tfidf = vectorizer.fit_transform(reviews["review_text"].fillna("").astype(str))

print("Shape:", X_tfidf.shape)

Shape: (17863, 7011)


In [30]:
# Mean TF-IDF weight of each vocabulary term, taken over all reviews.
vocab = vectorizer.get_feature_names_out()
avg_tfidf = np.asarray(X_tfidf.mean(axis=0)).ravel()  # flatten the 1 x n_terms result to 1-D

# Positions of the 500 highest-scoring terms, best first, paired with their scores.
top_idx = np.argsort(avg_tfidf)[-500:][::-1]
top_words = list(zip(vocab[top_idx], avg_tfidf[top_idx]))

## Then we pasted the top 500 words into ChatGPT and asked it to pick out a list of attributes

In [23]:
# ==================================================
# Master list of attribute keywords mined from corpus
# ==================================================
# Attribute keywords grouped by sensory family. The family labels are for
# readers only; the flattened ATTRIBUTES list below is what the code exposes.
_ATTRIBUTE_GROUPS = {
    "Hop-forward / bitter": (
        "hoppy", "hops", "bitter", "bitterness", "dank",
        "pine", "piney", "resin", "resinous", "floral",
    ),
    "Citrus & fruit": (
        "citrus", "grapefruit", "orange", "lemon", "tangerine", "mango", "pineapple",
        "peach", "tropical", "juicy", "citrusy", "apricot", "papaya", "guava",
    ),
    "Roast / cocoa / coffee": (
        "roasted", "roasty", "roast", "coffee", "espresso", "chocolate", "cocoa", "mocha",
    ),
    "Caramel / dessert-like": (
        "vanilla", "caramel", "toffee", "maple", "molasses", "fudge", "dessert", "candied",
    ),
    "Sour / tart / funk": (
        "sour", "tart", "tartness", "acidity", "acidic", "funky", "barnyard", "lambic",
    ),
    "Body / mouthfeel": (
        "smooth", "creamy", "silky", "velvety", "chewy", "oily", "sticky", "viscous", "dense",
    ),
    "Crisp / refreshing": ("crisp", "dry", "refreshing", "clean", "bright"),
    "Spice / herbal": ("spicy", "pepper", "cinnamon", "herbal"),
    "Strength / heat": ("strong", "boozy", "warming", "warmth"),
    "Appearance": ("hazy", "cloudy", "clear"),
}

# Flatten in declaration order: groups first, then words within each group.
ATTRIBUTES = [word for words in _ATTRIBUTE_GROUPS.values() for word in words]


We then collapsed some of our attributes onto a single canonical form (e.g. "hops" → "hoppy") so that user input is normalized consistently.

In [25]:
# =====================================
# Canonical attribute mapping dictionary
# =====================================

# Canonical attribute -> every surface form that should collapse onto it.
# Entries are listed in the order the surface forms should appear as keys
# of ATTR_MAP.
_CANONICAL_FORMS = {
    # Hop-forward / bitter
    "hoppy": ("hoppy", "hops"),
    "bitter": ("bitter", "bitterness"),
    "dank": ("dank",),
    "pine": ("pine", "piney"),
    "resin": ("resin", "resinous"),
    "floral": ("floral",),

    # Citrus & fruit
    "citrus": ("citrus", "citrusy"),
    "grapefruit": ("grapefruit",),
    "orange": ("orange",),
    "lemon": ("lemon",),
    "tangerine": ("tangerine",),
    "mango": ("mango",),
    "pineapple": ("pineapple",),
    "peach": ("peach",),
    "tropical": ("tropical",),
    "juicy": ("juicy",),
    "apricot": ("apricot",),
    "papaya": ("papaya",),
    "guava": ("guava",),

    # Roast / cocoa / coffee
    "roasty": ("roast", "roasted", "roasty"),
    "coffee": ("coffee", "espresso"),
    "chocolate": ("chocolate", "cocoa", "mocha"),

    # Caramel / dessert-like
    "vanilla": ("vanilla",),
    "caramel": ("caramel",),
    "toffee": ("toffee",),
    "maple": ("maple",),
    "molasses": ("molasses",),
    "fudge": ("fudge",),
    "dessert": ("dessert",),
    "candied": ("candied",),

    # Sour / tart / funk
    "sour": ("sour", "tart", "tartness", "acidity", "acidic"),
    "funky": ("funky", "barnyard"),
    "lambic": ("lambic",),

    # Body / mouthfeel
    "smooth": ("smooth",),
    "creamy": ("creamy",),
    "silky": ("silky",),
    "velvety": ("velvety",),
    "chewy": ("chewy",),
    "oily": ("oily",),
    "sticky": ("sticky",),
    "viscous": ("viscous",),
    "dense": ("dense",),

    # Crisp / refreshing
    "crisp": ("crisp",),
    "dry": ("dry",),
    "refreshing": ("refreshing",),
    "clean": ("clean",),
    "bright": ("bright",),

    # Spice / herbal
    "spicy": ("spicy",),
    "pepper": ("pepper",),
    "cinnamon": ("cinnamon",),
    "herbal": ("herbal",),

    # Strength / heat
    "strong": ("strong",),
    "boozy": ("boozy",),
    "warm": ("warming", "warmth"),

    # Appearance
    "hazy": ("hazy",),
    "cloudy": ("cloudy",),
    "clear": ("clear",),
}

# Invert the table: each surface form becomes a key pointing at its canonical form.
ATTR_MAP = {
    surface: canonical
    for canonical, surfaces in _CANONICAL_FORMS.items()
    for surface in surfaces
}


In [26]:
# ==============================
# Customer preferences (edit me)
# ==============================
def normalize_attribute(token: str) -> str:
    """Map a raw attribute word to its canonical form.

    The token is lowercased and looked up in ``ATTR_MAP``. A word with no
    entry is returned unchanged apart from the lowercasing.
    """
    lowered = token.lower()
    return ATTR_MAP.get(lowered, lowered)

# Example for customer input: mixed-case preferences, normalized in one pass
CUSTOMER_ATTRIBUTES = list(map(normalize_attribute, ["Hops", "CRISP", "citrusy"]))
print(CUSTOMER_ATTRIBUTES)
# expected output: ['hoppy', 'crisp', 'citrus']

# The same helper can be applied to review text (tokenize, then normalize), e.g.:
#   review_tokens = review_text.split()
#   normalized_tokens = [normalize_attribute(t) for t in review_tokens]


['hoppy', 'crisp', 'citrus']


## We also normalized the `review_text` field the same way, so that the words in the reviews match the normalized user input

In [28]:
import re

def normalize_attribute(token: str) -> str:
    """Return the canonical attribute for ``token``, or the lowercased token itself.

    NOTE(review): this repeats the definition in the customer-preferences
    cell; whichever cell ran last wins, so keep the two identical.
    """
    key = token.lower()
    if key in ATTR_MAP:
        return ATTR_MAP[key]
    return key

def normalize_review(text: str) -> str:
    """
    1. Lowercase
    2. Tokenize on spaces (you can get fancier later if needed)
    3. Replace tokens using ATTR_MAP if present
    4. Rejoin into normalized review text
    """
    if not isinstance(text, str):
        return ""
    tokens = re.findall(r"[a-zA-Z]+", text.lower())  # keep only words
    norm_tokens = [normalize_attribute(t) for t in tokens]
    return " ".join(norm_tokens)

# Add a normalized copy of each review; this column exists only to feed the vectorizer.
reviews["review_norm"] = reviews["review_text"].map(normalize_review)

# Spot-check the first five reviews, raw vs. normalized (first 120 characters of each).
before_after = reviews[["review_text", "review_norm"]]
for i in range(5):
    raw_text, norm_text = before_after.iloc[i]
    print("RAW:", raw_text[:120])
    print("NORM:", norm_text[:120])
    print("---")


RAW: In bottle, on tap, at the brewery - anywhere this classic gem shows up, got to get a pull. High quality stuff right here
NORM: in bottle on tap at the brewery anywhere this classic gem shows up got to get a pull high quality stuff right here the c
---
RAW: Unknown vintage

Some more BIF heat from the legendary @Eziel! I’m starting to work my way up to the more acclaimed bott
NORM: unknown vintage some more bif heat from the legendary eziel i m starting to work my way up to the more acclaimed bottles
---
RAW: Pours dark brown with a thin tan head. Aroma certainly coffee and some booze notes. Taste is mostly booze (whiskey/bourb
NORM: pours dark brown with a thin tan head aroma certainly coffee and some booze notes taste is mostly booze whiskey bourbon 
---
RAW: Such a tasty beer. Perfect mouthfeel and carbonation.
NORM: such a tasty beer perfect mouthfeel and carbonation
---
RAW: Poured from a bomber bottle into a Duvel glass.

L - Deep brown color like motor oil, with no light all

## For the recommender, we switched to `CountVectorizer`. That way each review is represented as a vector of word counts, and those vectors are averaged at the beer level. Each beer's averaged count vector is then compared to the user's attribute vector with cosine similarity.

### Note: we'll use VADER for sentiment analysis. It ships with NLTK (the `vader_lexicon` resource has to be downloaded once) and was built from social-media and review-style text, so it should do OK with slang and intensifiers like "super-hoppy".


In [29]:
# List the frame's columns; review_norm is the column added by the normalization step.
reviews.columns

Index(['beer', 'brewery', 'style', 'style_id', 'average_user_rating',
       'username', 'user_rating', 'delta_from_average', 'look', 'smell',
       'taste', 'feel', 'overall', 'date', 'review_text', 'brewery_id',
       'beer_id', 'page_start', 'review_norm'],
      dtype='object')