In [1]:
import pandas as pd
from pandarallel import pandarallel

# Load the training CSV; later cells read its "description" and "region_variety" columns.
df = pd.read_csv("../data/wine_cleaned_train.csv")

In [2]:
# Row count per "region_variety" label, as a plain dict (label -> count).
# It is written out as the "support" field of the exported JSON further down.
support = df["region_variety"].value_counts().to_dict()

In [3]:
import spacy

# spaCy pipeline used by get_descriptors for tokens, POS tags, lemmas and noun chunks.
nlp = spacy.load("en_core_web_sm")

In [4]:
# use a list of stopwords for getting descriptors
# (copied into a new set so the additions below do not mutate spaCy's shared
# `nlp.Defaults.stop_words` in place)
stopwords = set(nlp.Defaults.stop_words)

# add more stopwords that I found through examples
stopwords |= {
    "aroma",
    "aromas",
    "flavor",
    "flavors",
    "note",
    "notes",
    "food",
    "touch",
    "wine",
    "it's",
}


def get_descriptors(string):
    """
    Extract wine descriptors from a free-text description using spaCy.

    The text is lower-cased and parsed with the module-level `nlp` pipeline.
    Two kinds of descriptors are collected, in this order:

    1. Every noun chunk in which no token is a stopword (according to the
       module-level `stopwords` set), punctuation, or a pronoun.
    2. Every adjective token whose text does not already occur as a word in
       one of the kept noun chunks.

    Parameters
    ----------
    string : str
        The description to analyse. Anything that is not a string (e.g. a
        missing value read as NaN) yields an empty list.

    Returns
    -------
    list of str
        Kept noun chunks followed by the stand-alone adjectives. The list may
        be empty and may contain duplicates.
    """
    if not isinstance(string, str):
        # missing descriptions have no descriptors
        return []

    # make all descriptors lower-case, then let spaCy tokenize the text;
    # the tokens carry the `pos_` / `lemma_` attributes used below
    spacy_tokens = nlp(string.lower())

    descriptors = []  # list of descriptors to return
    for chunk in spacy_tokens.noun_chunks:
        # the conditions must hold for each token in the noun chunk
        if all(
            str(token) not in stopwords  # no token can be a stopword
            and not token.is_punct  # no token can be punctuation
            # no token can be a pronoun: spaCy 2 marks pronouns with the lemma
            # "-PRON-", spaCy 3 no longer does, so the POS tag is checked too
            and "-PRON-" not in token.lemma_
            and token.pos_ != "PRON"
            for token in chunk
        ):
            descriptors.append(str(chunk))  # then this noun chunk can be returned

    # stand-alone adjectives that were not part of a kept noun chunk are wanted too;
    # collect the words already covered into a set for cheap membership tests
    already_in_noun_chunks = {
        word for kept_chunk in descriptors for word in kept_chunk.split()
    }
    # NOTE(review): adjectives are not checked against `stopwords` (unchanged
    # from the original behaviour) - confirm that this is intended.
    for token in spacy_tokens:
        if token.pos_ == "ADJ" and str(token) not in already_in_noun_chunks:
            descriptors.append(str(token))

    return descriptors

In [5]:
# Run get_descriptors over every description with pandarallel, showing a progress bar;
# the resulting list of descriptors per row is stored in a new "keywords" column.
pandarallel.initialize(progress_bar=True)
df["keywords"] = df["description"].parallel_apply(get_descriptors)

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=10556), Label(value='0 / 10556')))…

In [6]:
# Keep only the two columns used from here on. Note that this rebinds `df`,
# so the remaining columns of the loaded CSV are no longer reachable through it.
df = df[["keywords", "region_variety"]]

In [7]:
# Merge the keyword lists of all rows that share a region_variety into one list.
# The string alias "sum" selects pandas' own group-wise sum; passing the Python
# builtin `sum` is deprecated in recent pandas versions, and once it is used
# directly it would fail on list values because it starts adding from 0.
df = df.groupby("region_variety").agg("sum")

In [8]:
# Turn the "region_variety" group index back into an ordinary column.
df = df.reset_index()

In [17]:
from collections import Counter
import numpy as np

## Calculating TF(t,d) = log(1 + count(t | d))
# One Counter per row (document = one region_variety), mapping keyword -> log-scaled count.
df["tf"] = df["keywords"].map(
    lambda kws: Counter(
        {word: np.log(count + 1) for word, count in Counter(kws).items()}
    )
)

## Calculating IDF(d, t) = log(N/|{d in D: t in d}|)
# N is the number of documents, taken from the frame instead of a hard-coded value.
n_docs = len(df)

# Document frequency: in how many documents each keyword occurs. It is computed
# in a single pass instead of rescanning every document for every keyword.
doc_freq = Counter()
for tf_of_doc in df["tf"]:
    doc_freq.update(tf_of_doc.keys())

# One dict per row, mapping each keyword of that document to its IDF.
df["idf"] = [
    {word: np.log(n_docs / doc_freq[word]) for word in tf_of_doc}
    for tf_of_doc in df["tf"]
]

In [24]:
df

Unnamed: 0,region_variety,keywords,tf,idf
0,Argentina-Mendoza Province:Bonarda,"[bonarda, likable plum, expectations, argentin...","{'bonarda': 1.9459101490553132, 'likable plum'...","{'bonarda': 4.983606621708336, 'likable plum':..."
1,Argentina-Mendoza Province:Bordeaux-style Red ...,"[revancha, wines, esteemed winemaker roberto d...","{'revancha': 1.0986122886681098, 'wines': 1.09...","{'revancha': 6.369900982828227, 'wines': 1.735..."
2,Argentina-Mendoza Province:Cabernet Franc,"[cab franc, cassis, wild berries, pastry, vani...","{'cab franc': 1.791759469228055, 'cassis': 2.0...","{'cab franc': 2.9687036011660717, 'cassis': 1...."
3,Argentina-Mendoza Province:Cabernet Sauvignon,"[clove, broad shoulders, firm tannins, campfir...","{'clove': 2.1972245773362196, 'broad shoulders...","{'clove': 1.0765961581037347, 'broad shoulders..."
4,Argentina-Mendoza Province:Chardonnay,"[terms, woody chardonnay, vanilla, oak, basic,...","{'terms': 1.3862943611198906, 'woody chardonna...","{'terms': 1.574110437231486, 'woody chardonnay..."
...,...,...,...,...
579,US-Washington:Tempranillo,"[white pepper, toffee, rustic tannins, full, b...","{'white pepper': 0.6931471805599453, 'toffee':...","{'white pepper': 1.182515176987472, 'toffee': ..."
580,US-Washington:Viognier,"[scents, toasted grain, accents, citrus blosso...","{'scents': 1.3862943611198906, 'toasted grain'...","{'scents': 1.0765961581037347, 'toasted grain'..."
581,US-Washington:White Blend,"[riesling, chenin blanc, pinot gris, muscat, s...","{'riesling': 2.3978952727983707, 'chenin blanc...","{'riesling': 2.6087008671346648, 'chenin blanc..."
582,US-Washington:Zinfandel,"[good balance, tart red fruits, suggestions, d...","{'good balance': 0.6931471805599453, 'tart red...","{'good balance': 2.026095560974543, 'tart red ..."


In [12]:
# TODO: sort keywords by tfidf score

In [13]:
from tqdm import tqdm

# For every region_variety, keep its ten most frequent keywords together with
# their raw counts, as two parallel lists.
keywords = {}
for label, words in tqdm(zip(df["region_variety"], df["keywords"])):
    top_words = []
    top_counts = []
    for word, count in Counter(words).most_common(10):
        top_words.append(word)
        top_counts.append(count)
    keywords[label] = {"keywords": top_words, "counts": top_counts}

584it [00:00, 5870.74it/s]


In [14]:
import json

# Read the labels; iterating over the loaded object must yield label strings
# that are keys of both `keywords` and `support`.
with open("../models/idx_to_label.json") as f:
    idx_to_label = json.load(f)

# One record per label, in the order in which the labels are iterated.
idx_to_keywords = [
    dict(
        label=label,
        keywords=keywords[label]["keywords"],
        counts=keywords[label]["counts"],
        support=support[label],
    )
    for label in idx_to_label
]

# Write the records as a single JSON document followed by a newline.
with open("../models/idx_to_keywords.json", "w") as f:
    f.write(json.dumps(idx_to_keywords) + "\n")

In [15]:
df

Unnamed: 0,region_variety,keywords,tfidf
0,Argentina-Mendoza Province:Bonarda,"[bonarda, likable plum, expectations, argentin...","{'bonarda': 1.9459101490553132, 'likable plum'..."
1,Argentina-Mendoza Province:Bordeaux-style Red ...,"[revancha, wines, esteemed winemaker roberto d...","{'revancha': 1.0986122886681098, 'wines': 1.09..."
2,Argentina-Mendoza Province:Cabernet Franc,"[cab franc, cassis, wild berries, pastry, vani...","{'cab franc': 1.791759469228055, 'cassis': 2.0..."
3,Argentina-Mendoza Province:Cabernet Sauvignon,"[clove, broad shoulders, firm tannins, campfir...","{'clove': 2.1972245773362196, 'broad shoulders..."
4,Argentina-Mendoza Province:Chardonnay,"[terms, woody chardonnay, vanilla, oak, basic,...","{'terms': 1.3862943611198906, 'woody chardonna..."
...,...,...,...
579,US-Washington:Tempranillo,"[white pepper, toffee, rustic tannins, full, b...","{'white pepper': 0.6931471805599453, 'toffee':..."
580,US-Washington:Viognier,"[scents, toasted grain, accents, citrus blosso...","{'scents': 1.3862943611198906, 'toasted grain'..."
581,US-Washington:White Blend,"[riesling, chenin blanc, pinot gris, muscat, s...","{'riesling': 2.3978952727983707, 'chenin blanc..."
582,US-Washington:Zinfandel,"[good balance, tart red fruits, suggestions, d...","{'good balance': 0.6931471805599453, 'tart red..."
