# Generative Model for Wine Tasting Notes

Wine tasting notes are often viewed as an invaluable part of the wine purchasing decision. Wine tasting notes by professionals, such as Wine Spectator and Wine Advocate, are often printed and used by commercial retailers as an immutable guide to wines sold.

However, objectivity in these notes can often be difficult due to expression of only 1/3 of nose receptor phenotypes in humans, leading to vast differences in tasting experience. Furthermore, tasting events often contain 100s, or 1000s, or wines, which can cause fatigue in even the most experienced of tasters.

Finally, tasting is often confounded by the lack of a common language, and indeed, a common cultural background against which to compare aromas and smells.

The WSET has produced a <a href='https://www.wsetglobal.com/media/3119/wset_l3_wines_sat_en_jun-2016.pdf'>tasting rubric</a>, in an attempt to create a common wine vocabulary, quantifying not just aroma but body and taste, as well.

In 2002, A.C. Noble created a wine wheel during her time at UC DAVIS. This wine wheel was revolutionary, not because it created a common language, but because it came with instructions to create a reproducible standard for each aromatic note from common supermarket ingredients.

<img src="img/Davis-Wine-Aroma-Wheel1.jpg" alt="wine_wheel" width="300"/>

My project will examine 1000s of tastings notes created for top 100 rated wines from Wine Spectator to determine if there is a common wine vocabulary and whether a deep-learning model can be trained to generate appropriate looking wine-tasting notes based on wine type, with reasonable accuracy.

In [11]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import time
import io
import os
import unicodedata

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from tensorflow.data.experimental import AUTOTUNE

In [12]:
print(tf.__version__)

2.3.0


In [13]:
path_name = 'wine_df_cleaned.csv'

In [14]:
wine_df = pd.read_csv(path_name)

In [15]:
wine_df.shape

(49064, 5)

In [16]:
# wine_df = pd.read_csv(path_name)
# wine_notes = [note for note in wine_df['wine_notes'][:10000]]
# tokenizer = Tokenizer()
# tokenizer.fit_on_texts(wine_notes)
# tensor = tokenizer.texts_to_sequences(wine_notes)
# input_sequences = []
# for line in tensor:
#     for i in range(1, len(line)):
#         n_gram_sequence = line[:i+1]
#         input_sequences.append(n_gram_sequence)
# xs, ys = input_sequences[::-1], input_sequences[-1]
# xs = tf.keras.preprocessing.sequence.pad_sequences(xs, maxlen=74, padding='pre')
# xs

In [17]:
# Sequence Generator

class Wine_Generator(tf.keras.utils.Sequence):
    def __init__(self, path, num_examples, batch_size):
        self.wine_tokenizer = None
        self.batch_size = batch_size
        self.num_examples = num_examples
        self.data = pd.read_csv(path)
    
    def __len__(self) :
        return (np.ceil(len(wine_df[:self.num_examples]) / self.batch_size)).astype(np.int)
    
    def create_dataset(self):
        # path : path to wine_notes.txt file
        # num_examples : Limit the total number of training example for faster training (set num_examples = len(lines) to use full data)
        wine_notes = [self.preprocess_sentence(note) for note in self.data['wine_notes'][:self.num_examples]]   
        return wine_notes
    
    def preprocess_sentence(self, w):
        w = re.sub(r'(importer: )[\s\S]+', "", w)
        w = re.sub(r'(tel. )(\(\d+\))\s\d+\-\d+[,;]?', "", w)
        w = re.sub(r'(www.)\w+.\w+', "",w)
        # w = '<start> ' + w + ' <end>'
        return w
    
    def tokenize(self):
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(self.data)
        return tokenizer
    
    def get_tokenizer(self):
        self.data = self.create_dataset()
        self.wine_tokenizer = self.tokenize()
        return self.wine_tokenizer
    
    def __getitem__(self, index):
        wine_notes = self.wine_tokenizer.texts_to_sequences(self.data)
        batch = wine_notes[index * self.batch_size : (index+1) * self.batch_size]
        
        input_sequences = []
        for line in batch:
            for i in range(1, len(line)):
                n_gram_sequence = line[:i+1]
                input_sequences.append(n_gram_sequence)
        sentence, labels = input_sequences[::-1], input_sequences[-1]
        
        padded_sentence = tf.keras.preprocessing.sequence.pad_sequences(sentence, maxlen=75, padding='pre')

        return np.array(padded_sentence), np.array(labels)

In [18]:
# tf.data

class WinenoteDataset:
    def __init__(self):
        self.wine_tokenizer = None

    def preprocess_sentence(self, w):
        w = re.sub(r'(importer: )[\s\S]+', "", w)
        w = re.sub(r'(tel. )(\(\d+\))\s\d+\-\d+[,;]?', "", w)
        w = re.sub(r'(www.)\w+.\w+', "",w)
        # w = '<start> ' + w + ' <end>'
        return w

    def create_dataset(self, path, num_examples):
        # path : path to wine_notes.txt file
        # num_examples : Limit the total number of training example for faster training (set num_examples = len(lines) to use full data)
        wine_df = pd.read_csv(path)
        wine_notes = [note for note in wine_df['wine_notes'][:num_examples]]
        return wine_notes

    def tokenize(self, notes):
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(notes)
        tensor = tokenizer.texts_to_sequences(notes)
        padded_tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, maxlen=75, padding='pre')

        return padded_tensor, tokenizer

    def load_dataset(self, path, num_examples=None):
        wine_notes = self.create_dataset(path, num_examples)
        wine_notes, self.wine_tokenizer = self.tokenize(wine_notes)

        return wine_notes, self.wine_tokenizer

    def call(self, num_examples, BATCH_SIZE):
        tensor, self.wine_tokenizer = self.load_dataset(path_name, num_examples)
        wine_dataset = tf.data.Dataset.from_tensor_slices(tensor)
        wine_dataset = wine_dataset.batch(BATCH_SIZE).prefetch(AUTOTUNE)
        
        return wine_dataset, self.wine_tokenizer

In [14]:
# seq gen
batch_size = 512
# Limiting the training examples for faster training
num_examples = 10000
path = 'wine_df_cleaned.csv'

wine_seq_generator = Wine_Generator(path, num_examples, batch_size)
wine_tokenizer = wine_seq_generator.get_tokenizer()

In [15]:
# compile model first

model.fit(
    wine_seq_generator,
    steps_per_epoch = (num_examples // batch_size),
    epochs = 5,
    verbose = 1,
#     callbacks = [earlystop, checkpoints] 
    )

Epoch 1/5


InvalidArgumentError:  indices[14869,23] = 7901 is not in [0, 3260)
	 [[node sequential/embedding/embedding_lookup (defined at \AppData\Local\Temp/ipykernel_25408/3408629741.py:7) ]] [Op:__inference_train_function_10516]

Errors may have originated from an input operation.
Input Source operations connected to node sequential/embedding/embedding_lookup:
 sequential/embedding/embedding_lookup/6431 (defined at \anaconda3\envs\tensorflow\lib\contextlib.py:112)

Function call stack:
train_function


In [19]:
# td.data
batch_size = 256
# Limiting the training examples for faster training
num_examples = 2000

dataset_creator = WinenoteDataset()
wine_dataset, wine_tokenizer = dataset_creator.call(num_examples, batch_size)

In [20]:
example_input_batch = next(iter(wine_dataset))
example_input_batch

<tf.Tensor: shape=(256, 75), dtype=int32, numpy=
array([[   0,    0,    0, ..., 4082,   83,  235],
       [   0,    0,    0, ..., 4083,   83,   65],
       [   0,    0,    0, ..., 1868,   83,  235],
       ...,
       [   0,    0,    0, ...,  847, 1168, 1169],
       [   0,    0,    0, ...,  383,  237,  414],
       [   0,    0,    0, ...,  182,  237,  414]])>

In [21]:
n_epochs = 5
max_len = 75         # length of output sequence 
total_words = len(wine_tokenizer.index_word)+1

num_train_steps = (num_examples // batch_size + 1) * n_epochs    # len(tokenized_dataset['input_ids']) // batch_size * n_epochs
lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5,
    end_learning_rate=0.,
    decay_steps=num_train_steps
    )

model = Sequential()
model.add(Embedding(total_words, 250, input_length=max_len-1))
model.add(Bidirectional(LSTM(150, return_sequences = True)))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(total_words, activation='softmax'))
adam = Adam(learning_rate=0.01)
model.compile(loss='sparse_categorical_crossentropy', optimizer=adam, metrics=['sparse_categorical_accuracy'])

checkpoint_path = "/model_1/checkpoints/model_1.cpkt"

earlystop = EarlyStopping(monitor='sparse_categorical_accuracy', min_delta=0, patience=5, verbose=0, mode='auto')
checkpoints = ModelCheckpoint(
    filepath=checkpoint_path, monitor='sparse_categorical_accuracy', verbose=1, save_best_only=False,
    save_weights_only=False, mode='auto', save_freq='epoch',
)

print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 74, 250)           1784750   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 74, 300)           481200    
_________________________________________________________________
bidirectional_3 (Bidirection (None, 300)               541200    
_________________________________________________________________
dense_1 (Dense)              (None, 7139)              2148839   
Total params: 4,955,989
Trainable params: 4,955,989
Non-trainable params: 0
_________________________________________________________________
None


In [22]:
# input_sequences = []
# for wine_notes in wine_dataset.take(steps_per_epoch):
#     for line in wine_notes.numpy():
#         for i in range(1, len(line)):
#             if line[i] != 0:
#                 n_gram_sequence = line[:i+1]
#                 input_sequences.append(n_gram_sequence)
# input_sequences = tf.keras.preprocessing.sequence.pad_sequences(input_sequences, maxlen=75, padding='pre')
# sentences, labels = input_sequences[:,:-1], input_sequences[:,-1]
# print(sentences.shape, labels.shape)

In [None]:
steps_per_epoch = num_examples//batch_size

for epoch in range(n_epochs):
    for wine_notes in wine_dataset.take(steps_per_epoch):
        input_sequences = []
        for line in wine_notes.numpy():
            for i in range(1, len(line)):
                if line[i] != 0:
                    n_gram_sequence = line[:i+1]
                    input_sequences.append(n_gram_sequence)
        input_sequences = tf.keras.preprocessing.sequence.pad_sequences(input_sequences, maxlen=75, padding='pre')
        sentences, labels = input_sequences[:,:-1], input_sequences[:,-1]
        model.fit(
            sentences, 
            labels, 
            verbose=1, 
            callbacks = [earlystop, checkpoints], 
            batch_size = batch_size
        )

Epoch 00001: saving model to /model_1/checkpoints\model_1.cpkt
INFO:tensorflow:Assets written to: /model_1/checkpoints\model_1.cpkt\assets
Epoch 00001: saving model to /model_1/checkpoints\model_1.cpkt
INFO:tensorflow:Assets written to: /model_1/checkpoints\model_1.cpkt\assets
Epoch 00001: saving model to /model_1/checkpoints\model_1.cpkt
INFO:tensorflow:Assets written to: /model_1/checkpoints\model_1.cpkt\assets
Epoch 00001: saving model to /model_1/checkpoints\model_1.cpkt
INFO:tensorflow:Assets written to: /model_1/checkpoints\model_1.cpkt\assets
Epoch 00001: saving model to /model_1/checkpoints\model_1.cpkt
INFO:tensorflow:Assets written to: /model_1/checkpoints\model_1.cpkt\assets
Epoch 00001: saving model to /model_1/checkpoints\model_1.cpkt
INFO:tensorflow:Assets written to: /model_1/checkpoints\model_1.cpkt\assets
Epoch 00001: saving model to /model_1/checkpoints\model_1.cpkt
INFO:tensorflow:Assets written to: /model_1/checkpoints\model_1.cpkt\assets
Epoch 00001: saving model t

In [18]:
def plot_graphs(model, string):
    plt.plot(model.history[string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.show()

In [21]:
plot_graphs(model, 'accuracy')
plot_graphs(model, 'loss')

TypeError: 'NoneType' object is not subscriptable

In [None]:
def tasting_note(seed_text, length):
    last_word = " "
    for _ in range(length):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict_classes(token_list, verbose=0)
        output_word = ""         
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        if output_word == last_word:
            break
        seed_text += " " + output_word
        last_word = output_word
    print(seed_text)

In [None]:
tasting_note('Aromatic and beautiful', 50)

In [None]:
tasting_note('Lush', 50)

In [None]:
tasting_note('Red Wine', 50)

In [None]:
tasting_note('Wine', 50)

In [None]:
tasting_note('Keyboard', 50)

In [None]:
tasting_note('Bukit Pasoh', 50)

In [None]:
model.save('model_1A.h5')

In [None]:
model_2 = tf.keras.models.load_model('model_1.h5')

In [None]:
model_2.summary()