In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the keyword/label table of every split into a dict keyed by split name

data = {}

for split in ("train", "val", "test"):
    csv_path = f"../data/wine_keywords_{split}.csv"
    # Rows containing any missing value are discarded right after loading
    df = pd.read_csv(csv_path).dropna()
    data[split] = df

In [3]:
# Preview the first rows of the training split (rendered as the cell output)
data["train"].head()

Unnamed: 0,keywords,region_variety
0,core adequate acidity moderate extraction medi...,France-Languedoc-Roussillon:Cabernet Sauvignon
1,complexity varietal character black plum light...,US-California:Merlot
2,rhubarb cranberry fruit red apple light simple...,US-Oregon:Pinot Noir
3,impressive fullness ripeness black cherry leat...,"Italy-Veneto:Corvina, Rondinella, Molinara"
4,dusty tones mineral saffron pollen concentrate...,Germany-Mosel:Riesling


In [4]:
# Bag-of-words features: the vocabulary is learned from the training keywords
# only, then every split is encoded with that same fitted vectorizer.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer().fit(data["train"]["keywords"])

train_vectors, val_vectors, test_vectors = (
    vectorizer.transform(data[split_name]["keywords"])
    for split_name in ("train", "val", "test")
)

In [5]:
# Inspect the learned vocabulary: its size and an arbitrary slice of terms.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back to the legacy
# method only on older releases that do not provide it.
if hasattr(vectorizer, "get_feature_names_out"):
    # get_feature_names_out() returns an ndarray; convert it to a plain list
    # so the printed slice keeps the same list formatting as before.
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

print("Vectorizer #features:", len(feature_names))
print("Vectorizer features:", feature_names[500:600])

Vectorizer #features: 17856
Vectorizer features: ['agustin', 'ahi', 'aid', 'aidil', 'aids', 'aiken', 'aims', 'aiolo', 'air', 'airborne', 'aires', 'airfield', 'airiness', 'airing', 'airs', 'airtime', 'airy', 'airén', 'aix', 'aka', 'akin', 'al', 'alabaster', 'alain', 'alamos', 'alan', 'alana', 'alance', 'alarid', 'alarming', 'alaska', 'alastro', 'alayt', 'alazan', 'alba', 'alban', 'albana', 'albanello', 'albar', 'albarino', 'albariño', 'albarossa', 'albe', 'albeggio', 'albera', 'alberdi', 'albert', 'alberta', 'alberto', 'albola', 'alcamo', 'alcantara', 'alchemist', 'alchemy', 'alcholic', 'alcineo', 'alcohol', 'alcoholic', 'alconte', 'aldegheri', 'alder', 'alderbrook', 'ale', 'aleatico', 'alejandro', 'alene', 'alentejano', 'alentejo', 'aleramico', 'alert', 'alessandro', 'alessano', 'alessio', 'alex', 'alexander', 'alexandra', 'alexandre', 'alexandria', 'alexandrine', 'alexia', 'alexis', 'alfalfa', 'alfonso', 'alfred', 'alfredo', 'alfresco', 'alfrocheiro', 'aliança', 'alicante', 'alice', '

In [6]:
# Baseline: multinomial naive Bayes fitted on the training count vectors
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB().fit(train_vectors, data["train"]["region_variety"])

# Evaluate the model
from sklearn.metrics import accuracy_score


def evaluate_model(model, vectors, labels):
    """Score ``model`` on ``vectors`` against the reference ``labels``.

    Args:
        model: Fitted estimator exposing ``predict(vectors)``.
        vectors: Feature matrix handed to ``model.predict``.
        labels: Reference labels the predictions are compared with.

    Returns:
        Whatever ``accuracy_score(labels, predictions)`` returns.
    """
    return accuracy_score(labels, model.predict(vectors))


# Report accuracy on the training split, then on the validation split
nb_train_accuracy = evaluate_model(clf, train_vectors, data["train"]["region_variety"])
print("Naive Bayes Train accuracy:", nb_train_accuracy)

nb_val_accuracy = evaluate_model(clf, val_vectors, data["val"]["region_variety"])
print("Naive Bayes Val accuracy:", nb_val_accuracy)

Naive Bayes Train accuracy: 0.3661986509019251
Naive Bayes Val accuracy: 0.32162346521145974


In [7]:
# Linear perceptron fitted on the training count vectors, using all CPU cores
from sklearn.linear_model import Perceptron

clf = Perceptron(n_jobs=-1).fit(train_vectors, data["train"]["region_variety"])

# Report accuracy on the training split, then on the validation split
perceptron_train_accuracy = evaluate_model(
    clf, train_vectors, data["train"]["region_variety"]
)
print("Perceptron Train accuracy:", perceptron_train_accuracy)

perceptron_val_accuracy = evaluate_model(
    clf, val_vectors, data["val"]["region_variety"]
)
print("Perceptron Val accuracy:", perceptron_val_accuracy)

Perceptron Train accuracy: 0.6240336516598454
Perceptron Val accuracy: 0.3211118690313779


In [8]:
# Train a LinearSVC on the training count vectors.
# LinearSVC's dual coordinate-descent solver draws on a pseudo-random generator,
# so without a fixed seed the fitted model (and the accuracies reported for it)
# can differ between runs. Pin random_state to make the result reproducible.
from sklearn.svm import LinearSVC

clf = LinearSVC(random_state=0)
clf.fit(train_vectors, data["train"]["region_variety"])

# Report accuracy on the training split, then on the validation split
svc_train_accuracy = evaluate_model(clf, train_vectors, data["train"]["region_variety"])
print("LinearSVC Train accuracy:", svc_train_accuracy)

svc_val_accuracy = evaluate_model(clf, val_vectors, data["val"]["region_variety"])
print("LinearSVC Val accuracy:", svc_val_accuracy)

LinearSVC Train accuracy: 0.8405714718811581
LinearSVC Val accuracy: 0.4201909959072306
