# Model 3 - Extended Model, Large Dataset

## Model Overview

Model 3 has the same architecture as Model 2: an extension of the basic model (Model 1) with three bidirectional LSTM layers, each followed by dropout and layer normalization.

Unlike Model 2, which is trained only on the Wine Spectator wine notes, this model is trained on the entire dataset.

## Imports

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import time
import io
import os
import unicodedata

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout, LayerNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from tensorflow.data.experimental import AUTOTUNE

## Loading Data

In [3]:
# Colab-only step: mount Google Drive at /content/drive so the CSV files
# referenced by the loading cells below are readable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Location of the cleaned wine-notes CSV on the mounted Drive. This module-level
# name is also read by WinenoteDataset.call when it builds the training dataset.
path_name = '/content/drive/MyDrive/DSI24/wine_df_cleaned.csv'

In [5]:
# Read both CSVs from the mounted Drive.
test_table = pd.read_csv('/content/drive/MyDrive/DSI24/test_table.csv')
wine_df = pd.read_csv(path_name)

# Every tasting note as a plain Python list, in file order.
wine_notes = wine_df['wine_notes'].tolist()

In [6]:
# path_name = 'wine_df_cleaned.csv'

In [7]:
# wine_df = pd.read_csv(path_name)

## Preprocessing Data

In [9]:
# tf.data

class WinenoteDataset:
    """Turn a wine-notes CSV into a batched ``tf.data.Dataset`` of padded token ids.

    The pipeline is: read the ``wine_notes`` column, append an end marker to
    every note, fit a Keras ``Tokenizer`` on the notes, pad every sequence to a
    fixed length, and wrap the result in a batched, prefetched dataset.

    Args:
        path: Optional CSV path used by :meth:`call`. When ``None`` (default),
            :meth:`call` falls back to the module-level ``path_name``.
        max_len: Length every tokenized note is padded (or truncated) to.
            Defaults to 75.
    """

    def __init__(self, path=None, max_len=75):
        # Fitted tokenizer; populated by load_dataset() / call().
        self.wine_tokenizer = None
        self.path = path
        self.max_len = max_len

    def preprocess_sentence(self, w):
        """Return ``w`` with the ``' <end>'`` marker appended."""
        return w + ' <end>'

    def create_dataset(self, path, num_examples):
        """Read notes from a CSV and append the end marker to each.

        Args:
            path: Path to a CSV file that has a ``wine_notes`` column.
            num_examples: Maximum number of notes to keep, counted from the top
                of the file. ``None`` keeps every note.

        Returns:
            A list of preprocessed note strings.
        """
        wine_df = pd.read_csv(path)
        selected_notes = wine_df['wine_notes'][:num_examples]
        return [self.preprocess_sentence(note) for note in selected_notes]

    def tokenize(self, notes):
        """Fit a new ``Tokenizer`` on ``notes`` and convert them to padded id sequences.

        Returns:
            ``(padded_tensor, tokenizer)`` where ``padded_tensor`` has one row per
            note with ``self.max_len`` columns, zero-padded at the front.
        """
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(notes)
        tensor = tokenizer.texts_to_sequences(notes)
        padded_tensor = tf.keras.preprocessing.sequence.pad_sequences(
            tensor, maxlen=self.max_len, padding='pre'
        )

        return padded_tensor, tokenizer

    def load_dataset(self, path, num_examples=None):
        """Load, preprocess and tokenize the notes at ``path``.

        Stores the fitted tokenizer on ``self.wine_tokenizer`` and returns
        ``(padded_notes, tokenizer)``.
        """
        wine_notes = self.create_dataset(path, num_examples)
        wine_notes, self.wine_tokenizer = self.tokenize(wine_notes)

        return wine_notes, self.wine_tokenizer

    def call(self, num_examples, batch_size):
        """Build the batched training dataset.

        Args:
            num_examples: Maximum number of notes to load (``None`` for all).
            batch_size: Number of notes per batch; a final partial batch is
                dropped (``drop_remainder=True``).

        Returns:
            ``(wine_dataset, tokenizer)``: the batched, prefetched dataset of
            int64 token ids and the fitted tokenizer.
        """
        # Fall back to the notebook-level path when none was given to __init__.
        source_path = self.path if self.path is not None else path_name
        tensor, self.wine_tokenizer = self.load_dataset(source_path, num_examples)
        tensor = tf.convert_to_tensor(tensor, dtype=tf.int64)
        wine_dataset = tf.data.Dataset.from_tensor_slices(tensor)
        wine_dataset = wine_dataset.batch(
            batch_size, drop_remainder=True, num_parallel_calls=AUTOTUNE
        ).prefetch(AUTOTUNE)

        return wine_dataset, self.wine_tokenizer

In [10]:
# Full-data run: use every row of wine_df.
# For a quick experiment, swap in smaller values, e.g.
#   batch_size = 512
#   num_examples = 3000
batch_size = 4096
num_examples = len(wine_df)

# Build the batched token dataset and keep the fitted tokenizer for generation.
dataset_creator = WinenoteDataset()
wine_dataset, wine_tokenizer = dataset_creator.call(
    num_examples=num_examples,
    batch_size=batch_size,
)

## Model Creation

In [11]:
# Number of distinct tokens in the fitted tokenizer's vocabulary.
total_words = len(wine_tokenizer.word_index)
# Padded length of each note; matches the maxlen of 75 used when the notes are
# padded in WinenoteDataset.tokenize. The model consumes max_sequence_len - 1
# tokens and predicts the last one.
max_sequence_len = 75

In [12]:
# Checkpoint location for this notebook's model. The previous value pointed at a
# Model 1 directory under the filesystem root and misspelled the extension.
checkpoint_path = "model_3/checkpoints/model_3.ckpt"

# Both callbacks watch the training accuracy reported by model.compile's
# 'sparse_categorical_accuracy' metric.
# NOTE(review): the training cell calls model.fit without a `callbacks`
# argument, so neither callback is active unless it is passed in explicitly.
earlystop = EarlyStopping(
    monitor='sparse_categorical_accuracy',
    min_delta=0,
    patience=5,
    verbose=0,
    mode='auto',
)
checkpoints = ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='sparse_categorical_accuracy',
    verbose=1,
    save_best_only=False,
    save_weights_only=False,
    mode='auto',
    save_freq='epoch',
)

In [13]:
n_epochs = 10

# The tokenizer assigns ids 1..total_words and id 0 is the padding value, so
# both the embedding table and the softmax layer need total_words + 1 slots.
# With only total_words output units, the highest token id would be an
# out-of-range label for sparse categorical cross-entropy.
vocab_size = total_words + 1

model = Sequential()
# Inputs are the first max_sequence_len - 1 tokens of each padded n-gram.
model.add(Embedding(vocab_size, 2500, input_length=max_sequence_len-1))

# Three bidirectional LSTM blocks, each followed by dropout and layer
# normalization. The first two return full sequences so the next LSTM can
# consume them; the last returns a single vector per example.
model.add(Bidirectional(LSTM(150, return_sequences = True)))
model.add(Dropout(0.4))
model.add(LayerNormalization(epsilon=1e-6))

model.add(Bidirectional(LSTM(150, return_sequences = True)))
model.add(Dropout(0.4))
model.add(LayerNormalization(epsilon=1e-6))

model.add(Bidirectional(LSTM(150)))
model.add(Dropout(0.4))
model.add(LayerNormalization(epsilon=1e-6))

# One probability per token id (including the padding id 0).
model.add(Dense(vocab_size, activation='softmax'))
adam = Adam(learning_rate=0.01)
model.compile(loss='sparse_categorical_crossentropy', optimizer=adam, metrics=['sparse_categorical_accuracy'])

# summary() prints the table itself; wrapping it in print() adds a stray "None".
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 74, 2500)          62955000  
_________________________________________________________________
bidirectional (Bidirectional (None, 74, 300)           3181200   
_________________________________________________________________
dropout (Dropout)            (None, 74, 300)           0         
_________________________________________________________________
layer_normalization (LayerNo (None, 74, 300)           600       
_________________________________________________________________
bidirectional_1 (Bidirection (None, 74, 300)           541200    
_________________________________________________________________
dropout_1 (Dropout)          (None, 74, 300)           0         
_________________________________________________________________
layer_normalization_1 (Layer (None, 74, 300)           6

In [14]:
# Upper bound on batches per epoch. The dataset drops its final partial batch,
# so take() simply stops early if fewer batches are available.
steps_per_epoch = (num_examples//batch_size) + 1


def _ngram_training_pairs(batch, max_len):
    """Expand a batch of padded notes into next-word training pairs.

    For every non-zero token at position ``i >= 1`` of a note, the prefix
    ``note[:i + 1]`` becomes one example: after front-padding to ``max_len``,
    all but the last token form the input and the last token is the label.

    Args:
        batch: 2-D array of token ids, one padded note per row.
        max_len: Length each prefix is padded to.

    Returns:
        ``(sentences, labels)`` arrays, or ``None`` when the batch yields no
        prefixes (every considered position is padding).
    """
    prefixes = [
        line[:i + 1]
        for line in batch
        for i in range(1, len(line))
        if line[i] != 0
    ]
    if not prefixes:
        return None
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        prefixes, maxlen=max_len, padding='pre'
    )
    return padded[:, :-1], padded[:, -1]


for epoch in range(n_epochs):
    start = time.time()
    print(f'Epoch {epoch + 1} starting...')
    # A dedicated loop name avoids overwriting the module-level `wine_notes` list.
    for note_batch in wine_dataset.take(steps_per_epoch):
        pairs = _ngram_training_pairs(note_batch.numpy(), max_sequence_len)
        if pairs is None:
            continue
        sentences, labels = pairs
        # One pass over this batch's n-grams (Keras default of a single epoch).
        model.fit(
            sentences,
            labels,
            verbose=1,
        )
    print(f'Time taken for 1 epoch: {time.time() - start:.2f} secs\n')

Epoch 1 starting...
Time taken for 1 epoch: 5279.82 secs

Epoch 2 starting...
Time taken for 1 epoch: 5251.89 secs

Epoch 3 starting...
Time taken for 1 epoch: 5260.47 secs

Epoch 4 starting...
Time taken for 1 epoch: 5216.30 secs

Epoch 5 starting...
Time taken for 1 epoch: 5210.48 secs

Epoch 6 starting...
Time taken for 1 epoch: 5210.60 secs

Epoch 7 starting...
Time taken for 1 epoch: 5221.32 secs

Epoch 8 starting...
Time taken for 1 epoch: 5252.40 secs

Epoch 9 starting...
Time taken for 1 epoch: 5250.26 secs

Epoch 10 starting...
Time taken for 1 epoch: 5270.19 secs



## Text Generation

### Text Generation Functions

In [19]:
# Look up the token id the tokenizer assigns to the end marker appended during
# preprocessing (the recorded output below shows 6).
# NOTE(review): presumably the tokenizer's default filters strip '<' and '>',
# so the marker is stored as the word 'end' - confirm against the Tokenizer docs.
wine_tokenizer.texts_to_sequences(['<end>'])

[[6]]

In [21]:
def tasting_note(seed_text, length):
    """Greedily extend ``seed_text`` with model predictions and print the result.

    At each step the current text is tokenized, front-padded to
    ``max_sequence_len - 1`` tokens and fed to ``model``; the most probable
    token id is converted back to a word and appended. Generation stops after
    ``length`` words, when the end-marker token is predicted, or when the
    predicted id has no word (for example the padding id 0).

    Reads the notebook-level ``wine_tokenizer``, ``model`` and
    ``max_sequence_len``.

    Args:
        seed_text: Starting text for the note.
        length: Maximum number of words to append.

    Returns:
        None. The generated text is printed.
    """
    # Resolve the end-marker id from the tokenizer instead of hard-coding it.
    end_ids = wine_tokenizer.texts_to_sequences(['<end>'])[0]
    end_token_id = end_ids[0] if end_ids else None

    for _ in range(length):
        token_list = wine_tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        probabilities = model.predict(token_list, verbose=0)
        predicted_id = int(np.argmax(probabilities, axis=-1)[0])

        if predicted_id == end_token_id:
            break

        # Direct reverse lookup instead of scanning the whole vocabulary.
        output_word = wine_tokenizer.index_word.get(predicted_id)
        if output_word is None:
            # No word for this id; appending nothing would repeat the same
            # prediction on every remaining step.
            break

        seed_text += " " + output_word
    print(seed_text)

### Text Generation

In [22]:
# Generate up to 75 words starting from the seed 'Aromatic'.
tasting_note('Aromatic', 75)

Aromatic aromas and dark and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry


In [23]:
# Generate up to 50 words starting from the seed 'Lush'.
tasting_note('Lush', 50)

Lush aromas and dark berry and dark and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark


In [24]:
# Two-word seed: generate up to 50 words starting from 'Red Wine'.
tasting_note('Red Wine', 50)

Red Wine aromas and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and


In [25]:
# Generate up to 50 words starting from the seed 'Wine'.
tasting_note('Wine', 50)

Wine and dark berry aromas and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and


In [26]:
# Seed unrelated to wine, to see how generation behaves on it.
# NOTE(review): presumably 'Keyboard' is not in the tokenizer vocabulary - confirm.
tasting_note('Keyboard', 50)

Keyboard bright ruby and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry


In [27]:
# Another seed unrelated to wine (two words), generating up to 50 words.
# NOTE(review): presumably neither word is in the tokenizer vocabulary - confirm.
tasting_note('Bukit Pasoh', 50)

Bukit Pasoh bright ruby and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry and dark berry


In [28]:
# Save the trained model to 'model_3.h5' (path relative to the current
# working directory).
model.save('model_3.h5')

In [29]:
# Reload the file written by the save cell to check that it round-trips.
# Despite the name, `model_2` holds the reloaded Model 3.
# NOTE(review): the recorded run shows an OSError for this cell - confirm that
# model_3.h5 was fully written and is readable before relying on it.
model_2 = tf.keras.models.load_model('model_3.h5')

OSError: ignored

In [None]:
# Show the architecture of the reloaded model; requires the load cell above to
# have completed so that `model_2` exists.
model_2.summary()