In [1]:
import pandas as pd
from pandarallel import pandarallel

# Location of the cleaned CSV for each dataset split.
split_paths = {
    "train": "../data/wine_cleaned_train.csv",
    "test": "../data/wine_cleaned_test.csv",
    "val": "../data/wine_cleaned_val.csv",
}

# One DataFrame per split, keyed by split name (train, test, val in that order).
data = {name: pd.read_csv(path) for name, path in split_paths.items()}

In [2]:
# Number of training rows for each region_variety label.
df = data["train"]
label_counts = df["region_variety"].value_counts()
support = label_counts.to_dict()

In [3]:
import spacy

# spaCy pipeline used below for tokenization, POS tags, lemmas and noun chunks.
SPACY_MODEL = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL)

In [4]:
# use a list of stopwords for getting descriptors
# (copied, so that adding the domain-specific words below does not mutate the
# default set that spaCy shares through `nlp.Defaults`)
stopwords = set(nlp.Defaults.stop_words)

# add more stopwords that I found through examples
stopwords |= {
    "aroma",
    "aromas",
    "flavor",
    "flavors",
    "note",
    "notes",
    "food",
    "touch",
    "wine",
    "it's",
}


def get_descriptors(string):
    """
    Extract descriptors from the text describing a wine, using spaCy.

    The text is lower-cased and parsed with the module-level ``nlp`` pipeline.
    Two kinds of descriptors are collected, in this order:

    1. Noun chunks in which no token is a stopword (per the module-level
       ``stopwords`` set), a punctuation mark, or a pronoun.
    2. Stand-alone adjectives: tokens tagged ``ADJ`` whose text does not
       already appear as a word in one of the kept noun chunks.

    Parameters
    ----------
    string : str
        Description of a wine.

    Returns
    -------
    list of str
        The kept noun chunks followed by the stand-alone adjectives.
    """

    # make all descriptors lower-case, then let spaCy tokenize and tag the text
    doc = nlp(string.lower())

    descriptors = [
        str(chunk)
        for chunk in doc.noun_chunks
        # the conditions below must hold for every token in the noun chunk
        if all(
            str(token) not in stopwords  # no token can be a stopword
            and not token.is_punct  # no token can be punctuation
            # no token can be a pronoun: spaCy 2.x lemmatizes pronouns to
            # "-PRON-", spaCy 3.x dropped that placeholder, so the POS tag is
            # checked as well
            and "-PRON-" not in token.lemma_
            and token.pos_ != "PRON"
            for token in chunk
        )
    ]

    # there are still stand-alone adjectives which weren't used to describe any
    # nouns and so were not returned above; I want them too

    # every word the kept noun chunks already contain (a set, so each lookup
    # below is constant-time)
    words_in_chunks = {
        word for chunk_text in descriptors for word in chunk_text.split()
    }
    descriptors.extend(
        str(token)
        for token in doc
        if token.pos_ == "ADJ" and str(token) not in words_in_chunks
    )

    return descriptors

In [5]:
# Set up pandarallel (with a progress bar) before using `parallel_apply`.
pandarallel.initialize(progress_bar=True)
for split, df in data.items():
    # The new column is written onto the frame stored in `data`,
    # so later cells can read it back from there.
    df["keywords"] = df["description"].parallel_apply(get_descriptors)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=13195), Label(value='0 / 13195')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=734), Label(value='0 / 734'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=733), Label(value='0 / 733'))), HB…

In [6]:
# Write one CSV per split with the keywords flattened to a space-separated string.
for split, df in data.items():
    # Work on a copy so the list-valued column in `data` stays untouched.
    df = df[["keywords", "region_variety"]].copy(deep=True)
    df["keywords"] = df["keywords"].map(" ".join)
    df.to_csv(f"../data/wine_keywords_{split}.csv", index=False)

In [7]:
# Keep only the keyword lists and their label from the training split.
df = data["train"].loc[:, ["keywords", "region_variety"]]

In [8]:
# Summing the list-valued `keywords` column concatenates the lists, giving one
# combined keyword list per region_variety.
# The string alias "sum" is used instead of the builtin `sum`: pandas has
# deprecated passing the builtin to `agg`, and once the builtin is called
# directly it would start from 0 and fail on lists.
df = df.groupby("region_variety").agg("sum")

In [9]:
# Turn the region_variety group index back into a regular column.
df = df.reset_index(drop=False)

In [10]:
# Preview the first rows of the per-label keyword lists.
df.head(n=5)

Unnamed: 0,region_variety,keywords
0,Argentina-Mendoza Province:Bonarda,"[bonarda, likable plum, expectations, argentin..."
1,Argentina-Mendoza Province:Bordeaux-style Red ...,"[revancha, wines, esteemed winemaker roberto d..."
2,Argentina-Mendoza Province:Cabernet Franc,"[cab franc, cassis, wild berries, pastry, vani..."
3,Argentina-Mendoza Province:Cabernet Sauvignon,"[clove, broad shoulders, firm tannins, campfir..."
4,Argentina-Mendoza Province:Chardonnay,"[terms, woody chardonnay, vanilla, oak, basic,..."


In [11]:
from collections import Counter
import numpy as np

## Calculating TF(t,d) = log(1 + count(t | d))
# Count each keyword per region_variety, then log-scale the raw counts.
df["tf"] = df["keywords"].map(
    lambda terms: Counter(
        {term: np.log(n + 1) for term, n in Counter(terms).items()}
    )
)

In [12]:
## Calculating IDF(d, t) = log(N/|{d in D: t in d}|)
# Document frequency: in how many rows (one row per region_variety) each word
# occurs. The keys of a row's `tf` counter are unique, so each row adds at most
# one to a word's count.
doc_freq = Counter()
for counter_object in df.tf:
    doc_freq.update(counter_object.keys())

# Vocabulary in first-seen order (kept under its original name and shape).
all_words = dict.fromkeys(doc_freq, 1)

# N is taken from the frame itself instead of being hard-coded, so the result
# stays correct if the number of region_variety labels changes.
n_docs = len(df)
idf = {word: np.log(n_docs / denominator) for word, denominator in doc_freq.items()}

In [13]:
# Preview the frame again, now including the `tf` column.
df.head(n=5)

Unnamed: 0,region_variety,keywords,tf
0,Argentina-Mendoza Province:Bonarda,"[bonarda, likable plum, expectations, argentin...","{'bonarda': 1.9459101490553132, 'likable plum'..."
1,Argentina-Mendoza Province:Bordeaux-style Red ...,"[revancha, wines, esteemed winemaker roberto d...","{'revancha': 1.0986122886681098, 'wines': 1.09..."
2,Argentina-Mendoza Province:Cabernet Franc,"[cab franc, cassis, wild berries, pastry, vani...","{'cab franc': 1.791759469228055, 'cassis': 2.0..."
3,Argentina-Mendoza Province:Cabernet Sauvignon,"[clove, broad shoulders, firm tannins, campfir...","{'clove': 2.1972245773362196, 'broad shoulders..."
4,Argentina-Mendoza Province:Chardonnay,"[terms, woody chardonnay, vanilla, oak, basic,...","{'terms': 1.3862943611198906, 'woody chardonna..."


In [14]:
# Calculate TF-IDF and find the top 10 keywords for each region_variety
from tqdm import tqdm


# For every region_variety: weight each term's TF by its IDF and keep the
# ten highest-scoring terms together with their scores.
keywords = {}
for label, term_freqs in tqdm(zip(df["region_variety"], df["tf"])):
    scores = Counter(
        {term: weight * idf[term] for term, weight in term_freqs.items()}
    )
    ranked = scores.most_common(10)
    keywords[label] = {
        "keywords": [term for term, _score in ranked],
        "tfidf": [score for _term, score in ranked],
    }

584it [00:00, 712.34it/s]


In [15]:
import json

# Labels, read from the saved JSON file.
with open("../models/idx_to_label.json", "r") as label_file:
    idx_to_label = json.load(label_file)

# One record per label, in the same order as `idx_to_label`: its top keywords,
# their TF-IDF scores and the number of training rows with that label.
idx_to_keywords = []
for label in idx_to_label:
    top_terms = keywords[label]
    idx_to_keywords.append(
        {
            "label": label,
            "keywords": top_terms["keywords"],
            "tfidf": top_terms["tfidf"],
            "support": support[label],
        }
    )

with open("../models/idx_to_keywords.json", "w") as out_file:
    json.dump(idx_to_keywords, out_file)
    out_file.write("\n")  # end the file with a trailing newline