In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the keyword/label CSVs. The train and validation files are stacked into
# a single training frame; rows with any missing value are dropped from both
# the training and the test split.
train_frames = [
    pd.read_csv("../data/wine_keywords_train.csv"),
    pd.read_csv("../data/wine_keywords_val.csv"),
]
test_frame = pd.read_csv("../data/wine_keywords_test.csv")

data = {
    "train": pd.concat(train_frames).dropna(),
    "test": test_frame.dropna(),
}

In [3]:
# Preview the first training rows to check which columns were loaded.
data["train"].head()

Unnamed: 0,keywords,region_variety
0,core adequate acidity moderate extraction medi...,France-Languedoc-Roussillon:Cabernet Sauvignon
1,complexity varietal character black plum light...,US-California:Merlot
2,rhubarb cranberry fruit red apple light simple...,US-Oregon:Pinot Noir
3,impressive fullness ripeness black cherry leat...,"Italy-Veneto:Corvina, Rondinella, Molinara"
4,dusty tones mineral saffron pollen concentrate...,Germany-Mosel:Riesling


In [4]:
# Count vectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the keyword strings. fit_transform() learns the
# vocabulary from the training split and builds its count matrix in a single
# pass, instead of tokenizing the training corpus once in fit() and again in
# transform(). The test split is only transformed, so it never contributes to
# the vocabulary.
vectorizer = CountVectorizer()
train_vectors = vectorizer.fit_transform(data["train"]["keywords"])
test_vectors = vectorizer.transform(data["test"]["keywords"])

In [5]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when the installed version provides it and
# fall back to the old accessor otherwise.
_get_names = (
    getattr(vectorizer, "get_feature_names_out", None)
    or vectorizer.get_feature_names
)
# Fetch the vocabulary once; list() keeps the printed repr a plain Python list
# even when the accessor returns a NumPy array.
feature_names = list(_get_names())

print("Vectorizer #features:", len(feature_names))
print("Vectorizer features:", feature_names[500:600])

Vectorizer #features: 18261
Vectorizer features: ['agoston', 'agreeability', 'agreeable', 'agressive', 'agricoltori', 'agriculture', 'agrinatura', 'agro', 'agua', 'aguia', 'agustin', 'ahi', 'aid', 'aidil', 'aids', 'aiken', 'aims', 'aiolo', 'air', 'airborne', 'aires', 'airfield', 'airiness', 'airing', 'airs', 'airtime', 'airy', 'airén', 'aix', 'aka', 'akin', 'al', 'alabaster', 'alain', 'alamos', 'alan', 'alana', 'alance', 'alarid', 'alarming', 'alaska', 'alastro', 'alayt', 'alazan', 'alba', 'alban', 'albana', 'albanello', 'albar', 'albarino', 'albariño', 'albarossa', 'albe', 'albeggio', 'albera', 'alberdi', 'albert', 'alberta', 'alberto', 'albola', 'alcamo', 'alcantara', 'alchemist', 'alchemy', 'alcholic', 'alcineo', 'alcohol', 'alcoholic', 'alconte', 'aldegheri', 'alder', 'alderbrook', 'ale', 'aleatico', 'alejandro', 'alella', 'alene', 'alentejano', 'alentejo', 'aleramico', 'alert', 'alessandro', 'alessano', 'alessio', 'alex', 'alexander', 'alexandra', 'alexandre', 'alexandria', 'alexa



In [6]:
# Single seed reused by the CV splitter and by every seeded estimator below.
random_state = 42

from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import accuracy_score, top_k_accuracy_score
from tqdm import tqdm

# Custom top-5 accuracy scorer and a cross-validation helper for evaluating one model at a time:


def top_5_acc(estimator, X, y):
    """Top-5 accuracy scorer with the ``scorer(estimator, X, y)`` signature.

    Args:
        estimator: A fitted classifier exposing ``predict_proba`` or
            ``decision_function``.
        X (array-like): Samples to score.
        y (array-like): True labels for ``X``.

    Returns:
        float: Fraction of samples whose true label is among the five
        highest-scoring classes.

    Raises:
        AttributeError: If the estimator has neither ``predict_proba`` nor
            ``decision_function``.
    """
    # Prefer probabilities; margin-based models (e.g. Perceptron, LinearSVC)
    # only expose decision_function. Looking the method up first, rather than
    # wrapping the call in try/except, keeps an AttributeError raised *inside*
    # predict_proba from being silently swallowed.
    score_fn = getattr(estimator, "predict_proba", None)
    if score_fn is None:
        score_fn = estimator.decision_function
    y_score = score_fn(X)

    # Score columns follow estimator.classes_. Passing them explicitly keeps
    # the metric valid when a CV fold's y is missing some classes; without
    # `labels`, top_k_accuracy_score infers the classes from y alone and
    # raises on the column-count mismatch.
    labels = getattr(estimator, "classes_", None)
    return top_k_accuracy_score(y, y_score, k=5, labels=labels)


def eval_model(model, X, y, n_jobs=-1, n_splits=5):
    """Cross-validate a single model and report its top-5 accuracy.

    The model is scored with ``top_5_acc`` on shuffled, stratified folds that
    are seeded with the notebook-level ``random_state``.

    Args:
        model: The estimator to evaluate.
        X (array-like): Training data.
        y (array-like): Training labels.
        n_jobs (int): Number of parallel jobs passed to ``cross_validate``.
        n_splits (int): Number of stratified folds. Defaults to 5.

    Returns:
        dict: The result of ``cross_validate``; it includes the per-fold
        ``"train_score"`` and ``"test_score"`` arrays summarized in the
        printed report.
    """
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    cv_scores = cross_validate(
        model,
        X,
        y,
        cv=cv,
        scoring=top_5_acc,
        n_jobs=n_jobs,
        verbose=1,
        return_train_score=True,
    )

    # Report mean accuracy with a +/- band of two standard deviations across
    # folds, first for the training folds and then for the held-out folds.
    for label, key in (("Train", "train_score"), ("Test", "test_score")):
        fold_scores = cv_scores[key]
        print(
            f"[{label}] Top-5 prediction mean accuracy: {fold_scores.mean():.3f} "
            f"(+/- {fold_scores.std() * 2:.3f})"
        )
    return cv_scores

In [7]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import Perceptron
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier


# Per-model cross-validation results, keyed by a short model name.
scores = {}

In [8]:
# Cross-validate multinomial naive Bayes on the keyword counts.
nb = MultinomialNB()
scores["nb"] = eval_model(
    model=nb,
    X=train_vectors,
    y=data["train"]["region_variety"],
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   12.0s remaining:   18.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   12.2s finished


[Train] Top-5 prediction mean accuracy: 0.598 (+/- 0.002)
[Test] Top-5 prediction mean accuracy: 0.551 (+/- 0.002)


In [9]:
# Cross-validate a seeded perceptron with early stopping enabled.
perceptron = Perceptron(early_stopping=True, random_state=random_state)
scores["perceptron"] = eval_model(
    model=perceptron,
    X=train_vectors,
    y=data["train"]["region_variety"],
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   44.6s remaining:  1.1min


[Train] Top-5 prediction mean accuracy: 0.774 (+/- 0.011)
[Test] Top-5 prediction mean accuracy: 0.487 (+/- 0.016)


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   45.2s finished


In [16]:
# Cross-validate a seeded linear SVM.
# NOTE(review): the saved output shows this cell was interrupted, so
# scores["svm"] may never have been recorded - re-run before relying on it.
svm = LinearSVC(random_state=random_state)
scores["svm"] = eval_model(
    model=svm,
    X=train_vectors,
    y=data["train"]["region_variety"],
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   36.0s remaining:   53.9s


KeyboardInterrupt: 

In [11]:
# Cross-validate a seeded decision tree capped at depth 20.
tree = DecisionTreeClassifier(max_depth=20, random_state=random_state)
scores["tree"] = eval_model(
    model=tree,
    X=train_vectors,
    y=data["train"]["region_variety"],
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    7.6s remaining:   11.4s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.7s finished


[Train] Top-5 prediction mean accuracy: 0.410 (+/- 0.005)
[Test] Top-5 prediction mean accuracy: 0.334 (+/- 0.005)


In [12]:
# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.ensemble import GradientBoostingClassifier
# from xgboost import XGBClassifier

In [13]:
# # evaluate adaboost
# ada = AdaBoostClassifier(random_state=random_state, n_estimators=20)
# scores["ada"] = eval_model(ada, train_vectors, data["train"]["region_variety"])

In [14]:
# # evaluate gradient boosting
# gb = GradientBoostingClassifier(random_state=random_state, n_estimators=10, max_depth=5)
# scores["gb"] = eval_model(gb, train_vectors, data["train"]["region_variety"])

In [15]:
# # evaluate XGBoost
# xgb = XGBClassifier(random_state=random_state, n_estimators=10, max_depth=10, n_jobs=-1)
# scores["xgb"] = eval_model(xgb, train_vectors, data["train"]["region_variety"])

## Some thoughts on stacking strategies:

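1. To align with our BERT model, we deliberately chose top-5 prediction accuracy as the evaluation metric during training. This is a little tricky: scikit-learn's `make_scorer()` takes two parameters, `needs_proba` and `needs_threshold`, which are both `False` by default, but a top-5 metric needs per-class scores rather than hard predictions, so one of them has to be turned on depending on whether the model exposes `predict_proba` or only `decision_function`. We therefore use a custom scorer, `top_5_acc`, that tries `predict_proba` first and falls back to `decision_function`, so the same metric is computed for every model.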
2. We left the ensemble models (AdaBoost, gradient boosting, XGBoost) out of the stack, because their training time is comparatively long given the number of samples and features in our dataset.

We have demonstrated the approach here and will focus our remaining effort on the report.

In [19]:
from sklearn.ensemble import StackingClassifier

# Base (level-0) learners: the four models defined in the cells above.
level0 = [
    ("nb", nb),
    ("perceptron", perceptron),
    ("svm", svm),
    ("tree", tree),
    # ("ada", ada),  # the accuracy is too low, plus the parameter tuning is quite time-consuming in this stage
    # ("gb", gb),
    # ("xgb", xgb),
]

# Meta (level-1) learner: a seeded linear SVM. passthrough=False means it is
# trained on the base learners' outputs only, not on the original features.
level1 = LinearSVC(random_state=random_state)
stacked = StackingClassifier(
    estimators=level0,
    final_estimator=level1,
    cv=5,
    n_jobs=-1,
    passthrough=False,
)
# Fitting is currently disabled:
# stacked.fit(train_vectors, data["train"]["region_variety"])

In [22]:
# evaluate the stacked model

In [23]:
import dill

# Bundle the estimator objects with their CV scores and serialize them with dill.
# NOTE(review): cross_validate works on clones and stacked.fit is commented
# out, so these estimators appear to be saved unfitted - confirm that is intended.
saved_models = [nb, perceptron, svm, tree, stacked]
model_summary = dict(models=saved_models, scores=scores)

with open("../models/model_summary.pkl", "wb") as out_file:
    dill.dump(model_summary, out_file)

In [16]:
# import json
# from json import JSONEncoder

# # Encode np.array to JSON

# class NumpyArrayEncoder(JSONEncoder):
#     def default(self, obj):
#         if isinstance(obj, np.ndarray):
#             return obj.tolist()
#         return JSONEncoder.default(self, obj)


# with open("../models/scores.json", "w") as f:
#     json.dump(scores, f, cls=NumpyArrayEncoder)