In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load keywords and labels

data = {
    "train": pd.concat(
        [
            pd.read_csv("../data/wine_keywords_train.csv"),
            pd.read_csv("../data/wine_keywords_val.csv"),
        ]
    ),
    "test": pd.read_csv("../data/wine_keywords_test.csv"),
}

In [3]:
data["train"].head()

Unnamed: 0,keywords,region_variety
0,core adequate acidity moderate extraction medi...,France-Languedoc-Roussillon:Cabernet Sauvignon
1,complexity varietal character black plum light...,US-California:Merlot
2,rhubarb cranberry fruit red apple light simple...,US-Oregon:Pinot Noir
3,impressive fullness ripeness black cherry leat...,"Italy-Veneto:Corvina, Rondinella, Molinara"
4,dusty tones mineral saffron pollen concentrate...,Germany-Mosel:Riesling


In [4]:
data_slim = (
    data["train"]
    .groupby("region_variety")
    .apply(lambda x: x.sample(20, random_state=42, replace=True))
    .reset_index(drop=True)
)

In [5]:
data_slim.describe()

Unnamed: 0,keywords,region_variety
count,11680,11680
unique,9084,584
top,grape bucelas dry vinho extra weight lemon sma...,Argentina-Mendoza Province:Bonarda
freq,7,20


In [6]:
# Count vectorizer
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(data_slim["keywords"])

train_vectors = vectorizer.transform(data_slim["keywords"])
test_vectors = vectorizer.transform(data["test"]["keywords"])

In [7]:
print("Vectorizer #features:", len(vectorizer.get_feature_names()))
print("Vectorizer features:", vectorizer.get_feature_names()[500:600])

Vectorizer #features: 6081
Vectorizer features: ['bell', 'bellangelo', 'belzbrunnen', 'benito', 'benjamin', 'berenguer', 'beresan', 'bergamot', 'bergerac', 'bernard', 'berried', 'berries', 'berry', 'berryish', 'berrylicious', 'bertani', 'best', 'better', 'betz', 'beverage', 'bianca', 'bianco', 'biancolella', 'bical', 'bieler', 'bienenberg', 'big', 'bigger', 'biggest', 'bigtime', 'bilberry', 'bill', 'billards', 'billing', 'billo', 'bing', 'biodynamic', 'biodynamically', 'birch', 'bird', 'birds', 'biscotti', 'biscuit', 'biscuits', 'biscuity', 'bisquertt', 'bistro', 'bite', 'bites', 'biting', 'bitner', 'bits', 'bitter', 'bitterness', 'bitters', 'bittersweet', 'black', 'blackberries', 'blackberrry', 'blackberry', 'blackcurrant', 'blackened', 'blackness', 'blacktop', 'blanc', 'blanca', 'blanched', 'blanco', 'blancs', 'bland', 'blangé', 'blasting', 'blatant', 'blaufränkisch', 'blaye', 'blazing', 'blend', 'blended', 'blending', 'blends', 'bleue', 'blind', 'bliss', 'blockbuster', 'blockier', '



In [8]:
random_state = 42

from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import accuracy_score, top_k_accuracy_score
from tqdm import tqdm

# Create a custom function to evaluate the list of models:


def top_5_acc(estimator, X, y):
    try:
        y_pred_proba = estimator.predict_proba(X)
    except AttributeError:
        y_pred_proba = estimator.decision_function(X)
    return top_k_accuracy_score(y, y_pred_proba, k=5)


def eval_model(model, X, y, n_jobs=-1):
    """Evaluate a list of models using cross-validation.

    Args:
        models (dict): A dictionary of models to evaluate.
        X (array-like): Training data.
        y (array-like): Training labels.

    Returns:
        scores (list): Dictionary of scores (another dict) for each model.
    """
    # for name, model in tqdm(models.items()):
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    cv_scores = cross_validate(
        model,
        X,
        y,
        cv=cv,
        scoring=top_5_acc,
        n_jobs=n_jobs,
        verbose=1,
        return_train_score=True,
    )

    print(
        f"[Train] Top-5 prediction mean accuracy: {cv_scores['train_score'].mean():.3f} (+/- {cv_scores['train_score'].std() * 2:.3f})"
    )
    print(
        f"[Test] Top-5 prediction mean accuracy: {cv_scores['test_score'].mean():.3f} (+/- {cv_scores['test_score'].std() * 2:.3f})"
    )
    return cv_scores

In [9]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import Perceptron
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression


# Create a dictionary to collect model metrics
scores = dict()

In [10]:
# evaluate naïve bayes
nb = MultinomialNB()
scores["nb"] = eval_model(nb, train_vectors, data_slim["region_variety"])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


[Train] Top-5 prediction mean accuracy: 0.972 (+/- 0.002)
[Test] Top-5 prediction mean accuracy: 0.502 (+/- 0.014)


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    2.1s remaining:    3.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.2s finished


In [11]:
# evaluate perceptron
perceptron = Perceptron(random_state=random_state, early_stopping=True)
scores["perceptron"] = eval_model(
    perceptron, train_vectors, data_slim["region_variety"]
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    2.8s remaining:    4.2s


[Train] Top-5 prediction mean accuracy: 0.936 (+/- 0.002)
[Test] Top-5 prediction mean accuracy: 0.415 (+/- 0.015)


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.0s finished


In [12]:
# evaluate linear SVM
svm = LinearSVC(random_state=random_state)
scores["svm"] = eval_model(svm, train_vectors, data_slim["region_variety"])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    3.6s remaining:    5.4s


[Train] Top-5 prediction mean accuracy: 1.000 (+/- 0.000)
[Test] Top-5 prediction mean accuracy: 0.475 (+/- 0.018)


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.8s finished


In [13]:
# evaluate decision tree
tree = DecisionTreeClassifier(random_state=random_state, max_depth=20)
scores["tree"] = eval_model(tree, train_vectors, data_slim["region_variety"])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.4s remaining:    0.6s


[Train] Top-5 prediction mean accuracy: 0.034 (+/- 0.009)
[Test] Top-5 prediction mean accuracy: 0.022 (+/- 0.006)


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.6s finished


In [14]:
# evaluate logistic regression
logreg = LogisticRegression(random_state=random_state, max_iter=80, solver="saga")
scores["logreg"] = eval_model(logreg, train_vectors, data_slim["region_variety"])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   16.7s remaining:   25.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   16.9s finished


[Train] Top-5 prediction mean accuracy: 0.998 (+/- 0.000)
[Test] Top-5 prediction mean accuracy: 0.509 (+/- 0.013)


## Some thoughts on stacking strategies:

1. To align with our BERT model, we intentionally selected the top 5 prediction accuracy as our evaluation metrics in training. This is a little tricky. The `make_scorer()` function takes two parameters `needs_proba` and `needs_threshold` which are both `False` by default. But for specific models, we need to turn the parameters on to calculate the similar 
2. We discarded the ensemble models in stacking, since the training time of ensemble models in our case is comparatively long due to the size and feature number of the dataset.

We managed to show the efforts, and would focus more on the report.

Strategies:

1. Level 1 model (meta learner) has to be as simple as possible. A linear model should suffice.
2. Level 0 models should be sequentially decreasing in terms of model performances. An analogue would be stepwise linear model selection from a null model. Specifically, we add the model with the best performance, then the second best to see if it improves, etc.

In [15]:
from sklearn.ensemble import StackingClassifier

level0 = list()
level0.append(("logreg", logreg))  # candidate 1
# level0.append(("svm", svm)) # candidate 2
level0.append(("perceptron", perceptron))  # candidate 3
# level0.append(("nb", nb)) # candidate 4
# level0.append(("tree", tree)) # candidate 5

level1 = LogisticRegression(
    random_state=random_state, max_iter=60
)  # meta learner uses candidate 1

stacked_131 = StackingClassifier(
    estimators=level0, final_estimator=level1, cv=5, n_jobs=-1, passthrough=False
)

# evaluate stacked model

scores["stacked_131"] = eval_model(
    stacked_131, train_vectors, data_slim["region_variety"]
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  3.2min remaining:  4.8min
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or

[Train] Top-5 prediction mean accuracy: 0.710 (+/- 0.037)
[Test] Top-5 prediction mean accuracy: 0.332 (+/- 0.023)


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.3min finished


In [16]:
from sklearn.ensemble import StackingClassifier

level0 = list()
level0.append(("logreg", logreg))  # candidate 1
level0.append(("svm", svm))  # candidate 2
# level0.append(("perceptron", perceptron)) # candidate 3
# level0.append(("nb", nb)) # candidate 4
# level0.append(("tree", tree)) # candidate 5

level1 = LogisticRegression(
    random_state=random_state, max_iter=60
)  # meta learner uses candidate 1

stacked_121 = StackingClassifier(
    estimators=level0, final_estimator=level1, cv=5, n_jobs=-1, passthrough=False
)

# evaluate stacked model

scores["stacked_121"] = eval_model(
    stacked_121, train_vectors, data_slim["region_variety"]
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  3.8min remaining:  5.7min
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or

[Train] Top-5 prediction mean accuracy: 0.978 (+/- 0.005)
[Test] Top-5 prediction mean accuracy: 0.476 (+/- 0.015)


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.9min finished


In [17]:
import dill

model_summary = {
    "models": [nb, perceptron, svm, tree, stacked_131, stacked_121],
    "scores": scores,
}

with open("../models/model_summary.pkl", "wb") as f:
    dill.dump(model_summary, f)