In [1]:
# importing relevant libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
coffee_df = pd.read_csv("simplified_coffee.csv")

In [3]:
coffee_df.head()

Unnamed: 0,name,roaster,roast,loc_country,origin,100g_USD,rating,review_date,review
0,Ethiopia Shakiso Mormora,Revel Coffee,Medium-Light,United States,Ethiopia,4.7,92,November 2017,"Crisply sweet, cocoa-toned. Lemon blossom, roa..."
1,Ethiopia Suke Quto,Roast House,Medium-Light,United States,Ethiopia,4.19,92,November 2017,"Delicate, sweetly spice-toned. Pink peppercorn..."
2,Ethiopia Gedeb Halo Beriti,Big Creek Coffee Roasters,Medium,United States,Ethiopia,4.85,94,November 2017,"Deeply sweet, subtly pungent. Honey, pear, tan..."
3,Ethiopia Kayon Mountain,Red Rooster Coffee Roaster,Light,United States,Ethiopia,5.14,93,November 2017,"Delicate, richly and sweetly tart. Dried hibis..."
4,Ethiopia Gelgelu Natural Organic,Willoughby's Coffee & Tea,Medium-Light,United States,Ethiopia,3.97,93,November 2017,"High-toned, floral. Dried apricot, magnolia, a..."


In [4]:
coffee_df = coffee_df.drop(["roaster", "roast", "loc_country", "origin", "100g_USD", "rating", "review_date"], axis=1)

In [5]:
coffee_df

Unnamed: 0,name,review
0,Ethiopia Shakiso Mormora,"Crisply sweet, cocoa-toned. Lemon blossom, roa..."
1,Ethiopia Suke Quto,"Delicate, sweetly spice-toned. Pink peppercorn..."
2,Ethiopia Gedeb Halo Beriti,"Deeply sweet, subtly pungent. Honey, pear, tan..."
3,Ethiopia Kayon Mountain,"Delicate, richly and sweetly tart. Dried hibis..."
4,Ethiopia Gelgelu Natural Organic,"High-toned, floral. Dried apricot, magnolia, a..."
...,...,...
1241,Finca Patzibir,"Crisply sweet, nut-toned. Almond brittle, pie ..."
1242,Proyecto Aurora,"Chocolaty, floral-framed. Cocoa nib, honeysuck..."
1243,Finca El Potrero,"High-toned, enticingly sweet. Black cherry, na..."
1244,Chacayá Santiago Atitlán,"Vibrantly sweet, subtly nuanced. Apricot, dar..."


In [6]:
# define the hypernym groups
hypernymsy = {
    'Fruity': ['apricot', 'cherry', 'date', 'plum', 'peach', 'tamarind', 'bergamot', 'grape', 'pear', 'persimmon', 'fruity', 'raisin', 'watermelon', 'plump', 'fig', 'tomato'],
    'Tropical': ['mango', 'pomegranate', 'guava', 'pineapple', 'lychee', 'coconut', 'tropical', 'passionfruit'],
    'Citrusy': ['lemon','citrusy', 'grapefruit', 'tangerine', 'citrus', 'orange', 'lime', 'amber', 'pomelo'],
    'Woody': ['cedar', 'oak', 'fir', 'wood', 'plumeria', 'elm', 'sandalwood', 'mushroom', 'earth'],
    'Berry': ['currant', 'raspberry', 'berry', 'blueberry', 'wine', 'mulberry', 'winy', 'winey', 'strawberry', 'goji', 'grappa'],
    'Sweet': ['tart', 'syrupy', 'syrup', 'honey', 'molasses', 'sugar', 'rum', 'jam', 'nutella', 'brandy', 'pie', 'maple', 'agave'],
    'Candy': ['caramel', 'nougat','toffee', 'candy', 'candied', 'butterscotch', 'fudge'],
    'Nutty': ['almond', 'hazelnut', 'nut', 'cashew', 'pistachio', 'macadamia', 'walnut', 'nutty'],
    'Creamy': ['butter', 'milk', 'creamy', 'yogurt'],
    'Herbal': ['narcissus', 'freesia', 'honeysuckle' 'thyme', 'marjoram', 'wisteria', 'musk', 'tobacco', 'herbaceous', 'herb', 'spearmint', 'cardamom', 'lemongrass', 'balm'],
    'Chocolaty': ['chocolate', 'cocoa', 'chocolaty', 'cacao'],
    'Floral': ['floral', 'flower', 'jasmine', 'magnolia', 'verbena', 'lilac', 'gardenia', 'lavender', 'blossom', 'lily', 'florals', 'rose', 'rhododendron', 'violet', 'hibiscus'],
    'Texture': ['juicy', 'smooth', 'crisp', 'crisply', 'delicate', 'silky', 'viscous', 'dry', 'scorched'],
    'Character': ['satiny','saturated', 'aromatic', 'complex', 'smoky'],
    'Acidity / Spicy': ['acidity', 'bittersweet', 'hop', 'zesty', 'spice', 'peppercorn', 'frankincense', 'spicy', 'cinnamon', 'clove', 'salted', 'tangy', 'ginger'],
}

In [7]:
# create a dictionary to store the match counts per key
match_counts = {}

# loop over the keys and values in the dictionary
for key, value in hypernymsy.items():
    # count the number of matches for the current key
    counts = []
    for row in coffee_df["review"]:
        # step 1
        my_column_words = row.lower()
        match_count = sum(1 for word in value if word in my_column_words)
        counts.append(match_count)
    # store the match counts for the current key in the dictionary
    match_counts[key] = counts

# add the match counts as new columns to the data frame
for key, value in match_counts.items():
    coffee_df[f"{key}_count"] = value
    
# add new column with key that has the most matches
max_match_keys = []
for i in range(len(coffee_df)):
    row_counts = [coffee_df[f"{key}_count"][i] for key in hypernymsy.keys()]
    max_match_key = max(hypernymsy.keys(), key=lambda x: row_counts[list(hypernymsy.keys()).index(x)])
    max_match_keys.append(max_match_key)
    
coffee_df["max_match_key"] = max_match_keys

coffee_df = pd.DataFrame(coffee_df)

In [8]:
# checking to see how many unique values are in the "max_match_key"
coffee_df["max_match_key"].value_counts()

Sweet              444
Texture            233
Fruity             114
Berry               71
Woody               69
Citrusy             62
Floral              57
Nutty               52
Acidity / Spicy     51
Chocolaty           29
Herbal              23
Tropical            18
Character           11
Creamy              10
Candy                2
Name: max_match_key, dtype: int64

In [11]:
# create a dictionary to store the match counts per key
# match_counts = {}

# loop over the keys and values in the dictionary
# for key, value in hypernymsy.items():
    # count the number of matches for the current key
    # counts = []
    # for row in coffee_df["review"]:
        # step 1
        # my_column_words = row.lower().split()
        # match_count = sum(1 for word in value if word in my_column_words)
        # counts.append(match_count)
    # store the match counts for the current key in the dictionary
    # match_counts[key] = counts

# add the match counts as new columns to the data frame
# for key, value in match_counts.items():
    # coffee_df[f"{key}_count"] = value
    
# add new column with key that has the most matches
# max_match_keys = []
# for i in range(len(coffee_df)):
    # row_counts = [coffee_df[f"{key}_count"][i] for key in hypernymsy.keys()]
    # max_match_key = max(hypernymsy.keys(), key=lambda x: row_counts[list(hypernymsy.keys()).index(x)])
    # max_match_keys.append(max_match_key)
    
# coffee_df["max_match_key"] = max_match_keys

# coffee_df = pd.DataFrame(coffee_df)