In [1]:
import pandas as pd
from pandarallel import pandarallel

# Load the wine data from ../data/wine_cleaned_train.csv. Later cells read its
# `description` (free text) and `region_variety` (label) columns.
df = pd.read_csv("../data/wine_cleaned_train.csv")

In [3]:
# Map each `region_variety` label to the number of rows carrying it in the
# loaded data. This must be computed before `df` is aggregated further down;
# it is written out later as each label's "support".
support = df["region_variety"].value_counts().to_dict()

In [4]:
import spacy

# Load the `en_core_web_sm` spaCy pipeline. `get_descriptors` relies on it for
# tokenization, part-of-speech tags, lemmas and noun chunks.
nlp = spacy.load("en_core_web_sm")

In [5]:
# Start from a *copy* of spaCy's built-in English stopwords. The copy matters:
# `nlp.Defaults.stop_words` is a set owned by the spaCy language defaults, and
# extending it in place with `|=` would silently change spaCy's own stop-word
# list for everything else running in this kernel.
stopwords = set(nlp.Defaults.stop_words)

# Domain-specific additions found by inspecting examples: generic tasting-note
# words that say nothing about a particular wine.
# NOTE(review): spaCy's tokenizer may split "it's" into two tokens, in which
# case that entry never matches a single token -- confirm whether it is needed.
stopwords |= {
    "aroma",
    "aromas",
    "flavor",
    "flavors",
    "note",
    "notes",
    "food",
    "touch",
    "wine",
    "it's",
}


def get_descriptors(string):
    """
    Extract wine descriptors from a free-text description using spaCy.

    The text is lower-cased and parsed with the module-level `nlp` pipeline.
    Two kinds of descriptors are collected, in this order:

    1. Noun chunks in which *every* token is not in the module-level
       `stopwords` set, is not punctuation, and is not a pronoun.
    2. Adjectives (`pos_ == "ADJ"`) whose text is not already one of the
       whitespace-separated words of an accepted noun chunk. This picks up
       stand-alone adjectives and adjectives from rejected noun chunks.

    Parameters
    ----------
    string : str
        The wine description.

    Returns
    -------
    list of str
        Lower-cased descriptors. Duplicates are kept so that callers can
        count occurrences.
    """
    doc = nlp(string.lower())

    descriptors = []
    for chunk in doc.noun_chunks:
        chunk_is_clean = all(
            str(token) not in stopwords  # no stopwords
            and not token.is_punct  # no punctuation
            # No pronouns. The POS tag is the version-independent check; the
            # "-PRON-" lemma is kept as well because spaCy v2 pipelines mark
            # personal pronouns that way (spaCy v3 no longer emits it).
            and token.pos_ != "PRON"
            and "-PRON-" not in token.lemma_
            for token in chunk
        )
        if chunk_is_clean:
            descriptors.append(str(chunk))

    # Words already covered by an accepted noun chunk. This is a set for O(1)
    # lookups, built once so adjectives appended below never affect the test.
    words_in_chunks = {
        word for chunk_text in descriptors for word in chunk_text.split()
    }
    for token in doc:
        if token.pos_ == "ADJ" and str(token) not in words_in_chunks:
            descriptors.append(str(token))

    return descriptors

In [6]:
# Initialise pandarallel (with a progress bar), then run `get_descriptors` over
# every description in parallel. The result is stored in a new `keywords`
# column holding one list of descriptors per row.
pandarallel.initialize(progress_bar=True)
df["keywords"] = df["description"].parallel_apply(get_descriptors)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=13195), Label(value='0 / 13195')))…

In [7]:
# Keep only the two columns needed for the per-label aggregation below.
# NOTE: this rebinds `df`; afterwards it no longer has a `description` column,
# so the earlier cells cannot be re-run without reloading the CSV first.
df = df[["keywords", "region_variety"]]

In [8]:
# Concatenate the per-row keyword lists into a single list per region_variety.
# Use the string alias "sum" rather than Python's builtin `sum`: pandas
# currently swaps the builtin for its own groupby sum (with a FutureWarning in
# recent versions), but once the builtin is called directly it would start
# from the integer 0 and fail with `0 + list`.
df = df.groupby("region_variety").agg("sum")

In [9]:
# Turn the `region_variety` group index produced by the groupby back into a
# regular column so the next cell can read it alongside `keywords`.
df = df.reset_index()

In [10]:
from collections import Counter
from tqdm import tqdm


# For every region_variety label, keep its 10 most frequent descriptors and
# their counts as two parallel lists (same order, most frequent first).
keywords = {}

# Iterate the two columns directly instead of `iterrows()` (which builds a
# Series per row), and tell tqdm the length so the bar can show real progress.
label_and_words = zip(df["region_variety"], df["keywords"])
for region_variety, words in tqdm(label_and_words, total=len(df)):
    top10 = Counter(words).most_common(10)
    keywords[region_variety] = {
        "keywords": [word for word, _ in top10],
        "counts": [count for _, count in top10],
    }

584it [00:00, 3306.00it/s]


In [11]:
import json

# Read the labels in model-index order.
# NOTE(review): the loop below iterates `idx_to_label` directly, so this
# presumably is a JSON list of labels (iterating a dict would yield its keys) --
# confirm against the file.
with open("../models/idx_to_label.json", "r") as f:
    idx_to_label = json.load(f)

# One record per label, in the same order as `idx_to_label`: the label itself,
# its top keywords with their counts, and its support.
idx_to_keywords = [
    dict(
        label=label,
        keywords=keywords[label]["keywords"],
        counts=keywords[label]["counts"],
        support=support[label],
    )
    for label in idx_to_label
]

# Serialize the records as a single JSON line terminated by a newline.
with open("../models/idx_to_keywords.json", "w") as f:
    f.write(json.dumps(idx_to_keywords) + "\n")