# Training Notebook 
---

Now that we've done preprocessing, we'd like to confirm that we haven't negatively impacted model performance. Recall that the overall goal of rewriting the preprocessing and training as we've done is to replicate the behavior of the existing model as closely as possible while simplifying productionization and operationalization.

The `preprocessing_fn` and the trainer module are the main things we have to worry about when writing the TFX pipeline; most of everything else (e.g. schema generation, passing data between components) is handled by the framework. The code in this notebook will be pulled out to create the trainer module.

In [None]:
import json
import tempfile
import pprint
import os
import tensorflow as tf
import tfx
import tensorflow_transform as tft
import tensorflow_transform.beam as tft_beam
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import schema_utils
from sklearn.model_selection import train_test_split
import apache_beam as beam

import pandas as pd
import numpy as np

from google.cloud import storage
import pickle

#### **Dataset Creation**

In [None]:
# GCS directory passed to `tft.TFTransformOutput` below.
TRANSFORM_DIR = 'gs://ml-sandbox-tagging-tfx-experiments/preprocessing_notebook'
# File-name prefixes (inside TRANSFORM_DIR) of the transformed train / test
# TFRecord files read by `_input_fn`.
TRANSFORMED_TRAIN = 'train_transformed'
TRANSFORMED_TEST = 'test_transformed'
# Names under which the tag and token vocabularies are looked up in the
# transform output.
LABEL_FILE_NAME = 'tags'
VOCAB_FILE_NAME = 'vocab'

tf_transform_output = tft.TFTransformOutput(TRANSFORM_DIR)

# Use the name constants for both the size and the file lookup so the two can
# never drift apart.
NUM_TAGS = tf_transform_output.vocabulary_size_by_name(LABEL_FILE_NAME)
tag_file = tf_transform_output.vocabulary_file_by_name(LABEL_FILE_NAME)

VOCAB_SIZE = tf_transform_output.vocabulary_size_by_name(VOCAB_FILE_NAME)
vocab_file = tf_transform_output.vocabulary_file_by_name(VOCAB_FILE_NAME)
# Read every token verbatim as a string: with pandas' defaults, tokens such as
# "null", "nan" or "NA" are parsed as NaN and numeric-looking tokens may be
# converted to numbers, so their embedding lookups would silently miss.
vocab_df = pd.read_csv(
    vocab_file,
    header=None,
    dtype=str,
    keep_default_na=False,
)

# Length of the model's input sequence (passed to `AutoTaggingModel` as
# `max_string_length`); presumably the padded token length produced by
# preprocessing -- TODO confirm against the preprocessing notebook.
MAX_STRING_LENGTH = 277

def create_tag_lookup_table():
    """Build the tag-string -> integer-id lookup table from ``tag_file``.

    Every whole line of the file is a key and its line number is the
    corresponding value. One additional out-of-vocabulary bucket is reserved
    for keys that do not appear in the file.

    Returns:
        A ``tf.lookup.StaticVocabularyTable`` mapping ``tf.string`` keys to
        ``tf.int64`` ids.
    """
    file_index = tf.lookup.TextFileIndex
    initializer = tf.lookup.TextFileInitializer(
        tag_file,
        key_dtype=tf.string,
        key_index=file_index.WHOLE_LINE,
        value_dtype=tf.int64,
        value_index=file_index.LINE_NUMBER,
        delimiter=None,
    )
    return tf.lookup.StaticVocabularyTable(initializer, num_oov_buckets=1)

# Module-level lookup table; `label_transform` reads it as a global.
table = create_tag_lookup_table()

In [None]:
def label_transform(x, y):
    """Convert the sparse tag labels of a batch into a dense indicator tensor.

    Args:
        x: The features; returned unchanged.
        y: The labels as accepted by ``table.lookup`` and
            ``tf.sparse.to_indicator`` (presumably a sparse tensor of tag
            strings -- confirm against the transformed feature spec).

    Returns:
        A ``(x, labels)`` tuple where ``labels`` is an ``int32`` indicator
        tensor whose last dimension has ``NUM_TAGS + 1`` entries.
    """
    tag_ids = table.lookup(y)
    # One extra slot is needed for out-of-vocabulary tags in the eval dataset.
    indicator = tf.sparse.to_indicator(tag_ids, vocab_size=NUM_TAGS + 1)
    return x, tf.cast(indicator, tf.int32)

def _input_fn(file_pattern, tf_transform_output, batch_size=64, shuffle=True,
              epochs=None, label_key='series_ep_tags'):
    """Generates features and label for tuning/training.

    Reads the TFRecord files matching ``file_pattern``, parses them with the
    transformed feature spec, splits off ``label_key`` as the label and runs
    the label through ``label_transform``.

    Args:
        file_pattern: input tfrecord file pattern.
        tf_transform_output: A TFTransformOutput.
        batch_size: representing the number of consecutive elements of
          returned dataset to combine in a single batch
        shuffle: whether the records are shuffled; forwarded to
          ``make_batched_features_dataset``.
        epochs: forwarded as ``num_epochs`` to
          ``make_batched_features_dataset``; ``None`` sets no epoch limit.
        label_key: name of the transformed feature to use as the label.
          Defaults to ``'series_ep_tags'``, the key this notebook always used.
    Returns:
        A dataset that contains (features, labels) tuples where features
        is a dictionary of Tensors, and labels is the dense indicator Tensor
        produced by ``label_transform``.
    """
    # Copy so the spec handed to the dataset reader is independent of the
    # object returned by the transform output.
    transformed_feature_spec = (
        tf_transform_output.transformed_feature_spec().copy()
    )

    dataset = tf.data.experimental.make_batched_features_dataset(
        file_pattern=file_pattern,
        batch_size=batch_size,
        features=transformed_feature_spec,
        reader=tf.data.TFRecordDataset,
        shuffle=shuffle,
        label_key=label_key,
        num_epochs=epochs,
    )
    return dataset.map(label_transform)

In [None]:
# Example creation of the datasets.

# Training data: uses the `_input_fn` defaults for `shuffle` and `epochs`.
train_dataset = _input_fn(
    file_pattern=os.path.join(TRANSFORM_DIR, f'{TRANSFORMED_TRAIN}*'),
    tf_transform_output=tf_transform_output,
    batch_size=64,
)

# Evaluation data: a single, unshuffled pass over the test files.
eval_dataset = _input_fn(
    file_pattern=os.path.join(TRANSFORM_DIR, f'{TRANSFORMED_TEST}*'),
    tf_transform_output=tf_transform_output,
    batch_size=64,
    epochs=1,
    shuffle=False,
)

#### **Training**

In [None]:
from tensorflow.keras import callbacks, layers
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Precision, Recall

In [None]:
# GCS URI of the pickled embedding index that `AutoTaggingModel` downloads and
# unpickles (presumably a token -> GloVe vector mapping -- TODO confirm).
FILE_NAME = 'gs://ml-sandbox-101-tagging/data/processed/training_data/glove_data/glove_embedding_index.pkl'

**NOTE:** These custom metrics were copied from the previous training notebooks to allow a fair comparison with the old model. In the future, we will likely just use the default Keras metrics.

In [None]:
from tensorflow.keras import backend as K


def recall_score(y_true, y_pred):
    """Batch-wise recall for multi-label classification.

    Recall measures how many of the relevant items were selected. The value
    is computed over the given batch only, so it is a batch-wise average
    rather than a dataset-wide figure.

    Args:
        y_true: Ground-truth label tensor.
        y_pred: Predicted score tensor.

    Returns:
        A scalar tensor: true positives divided by possible positives.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    relevant = K.round(K.clip(y_true, 0, 1))
    # Epsilon keeps the ratio defined when the batch has no positive labels.
    return K.sum(hits) / (K.sum(relevant) + K.epsilon())


def precision_score(y_true, y_pred):
    """Batch-wise precision for multi-label classification.

    Precision measures how many of the selected items are relevant. The value
    is computed over the given batch only, so it is a batch-wise average
    rather than a dataset-wide figure.

    Args:
        y_true: Ground-truth label tensor.
        y_pred: Predicted score tensor.

    Returns:
        A scalar tensor: true positives divided by predicted positives.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    selected = K.round(K.clip(y_pred, 0, 1))
    # Epsilon keeps the ratio defined when nothing was predicted positive.
    return K.sum(hits) / (K.sum(selected) + K.epsilon())


def f1(y_true, y_pred):
    """Batch-wise F1 score for multi-label classification.

    Combines the module-level ``precision_score`` and ``recall_score`` as
    their harmonic mean. Like those metrics, the value is computed over the
    given batch only.

    Args:
        y_true: Ground-truth label tensor.
        y_pred: Predicted score tensor.

    Returns:
        A scalar tensor: ``2 * precision * recall / (precision + recall)``.
    """
    # Reuse the module-level metrics instead of shadowing them with nested,
    # byte-identical copies.
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    # Epsilon keeps the ratio defined when precision and recall are both zero.
    return 2 * ((precision * recall) / (precision + recall + K.epsilon()))


In [None]:
class AutoTaggingModel:
    """Builder for the multi-channel convolutional auto-tagging model.

    On construction, the pickled embedding index is downloaded from GCS and
    used to fill ``embedding_matrix``. ``get_model`` then returns a compiled
    Keras model: an embedding layer, three parallel n-gram convolution
    channels (windows of 3, 4 and 5 tokens), dropout, a dense hidden layer and
    a sigmoid output layer.

    Note: the rows of the embedding matrix are filled from the module-level
    ``vocab_df`` vocabulary.
    """

    def __init__(
        self,
        embedding_file: str,
        embedding_dim: int,
        train_embedding: bool,
        output_size: int,
        vocab_size: int,
        max_string_length: int,
    ):
        """Store the hyperparameters and load the embedding matrix.

        Args:
            embedding_file: ``gs://bucket/path`` URI of the pickled embedding
                index.
            embedding_dim: Size of each embedding vector.
            train_embedding: Whether the embedding layer is trainable.
            output_size: Number of units of the sigmoid output layer.
            vocab_size: Number of rows of the embedding matrix.
            max_string_length: Length of the model's input sequence.
        """
        self.__embedding_file = embedding_file
        self.__embedding_dim = embedding_dim
        self.__vocab_size = vocab_size
        self.__train_embedding = train_embedding
        self.__output_size = output_size
        self.__max_string_length = max_string_length

        self.__initialize_embedding_matrix()

    @staticmethod
    def _split_gcs_uri(uri: str) -> tuple:
        """Split a ``gs://bucket/path/to/blob`` URI into (bucket, blob name).

        Raises:
            ValueError: If ``uri`` is not of the form ``gs://<bucket>/<blob>``.
        """
        scheme = 'gs://'
        if not uri.startswith(scheme):
            raise ValueError(f'Expected a gs:// URI, got: {uri!r}')
        bucket_name, _, blob_name = uri[len(scheme):].partition('/')
        if not bucket_name or not blob_name:
            raise ValueError(
                f'Expected a URI of the form gs://<bucket>/<blob>, got: {uri!r}'
            )
        return bucket_name, blob_name

    def __initialize_embedding_matrix(self):
        """Download the embedding index and populate ``self.embedding_matrix``.

        Row ``i`` of the matrix receives the vector of the ``i``-th token of
        ``vocab_df``; tokens missing from the index keep an all-zero row.
        """
        # Honor the constructor argument rather than the global FILE_NAME.
        bucket_name, blob_name = self._split_gcs_uri(self.__embedding_file)
        blob = storage.Client().bucket(bucket_name).blob(blob_name)

        # NOTE(security): unpickling executes arbitrary code contained in the
        # payload -- only point `embedding_file` at a trusted bucket.
        embedding_index = pickle.loads(blob.download_as_string())

        self.embedding_matrix = np.zeros(
            (self.__vocab_size, self.__embedding_dim)
        )

        for i, word in enumerate(vocab_df.values):
            # Each row of `vocab_df.values` is a one-element array: the token.
            embedding_vector = embedding_index.get(word[0])
            if embedding_vector is not None:
                self.embedding_matrix[i] = embedding_vector

    def embedding_layer(self):
        """Return an Embedding layer initialized with ``embedding_matrix``."""
        return layers.Embedding(
            input_dim=self.__vocab_size,
            output_dim=self.__embedding_dim,
            weights=[self.embedding_matrix],
            input_length=self.__max_string_length,
            trainable=self.__train_embedding,
        )

    def n_grams_channel(self, inputs, n_words_filter: int):
        """Build one convolution channel over windows of ``n_words_filter`` tokens.

        Args:
            inputs: The reshaped embedding tensor produced in ``define_model``.
            n_words_filter: Number of consecutive tokens each filter spans.

        Returns:
            The flattened, max-pooled output of the channel.
        """
        # Each of the 256 filters spans the full embedding width, so it slides
        # along the token axis only.
        channel = layers.Conv2D(
            256,
            kernel_size=(n_words_filter, self.__embedding_dim),
            activation="relu",
        )(inputs)
        # Pool over the whole remaining token axis: one value per filter.
        channel_mp = layers.MaxPool2D(pool_size=(channel.shape[1], 1))(channel)
        channel_final = layers.Flatten()(channel_mp)
        return channel_final

    def define_model(self):
        """Assemble the (uncompiled) Keras model.

        Returns:
            A Keras ``Model`` whose input is named ``'features'`` and whose
            output has ``output_size`` sigmoid units.
        """
        inputs = layers.Input(shape=(self.__max_string_length,), name='features')
        embedding = self.embedding_layer()(inputs)
        # Add a trailing channel axis so the embeddings can be fed to Conv2D.
        channel_inputs = layers.Reshape(
            target_shape=(self.__max_string_length, self.__embedding_dim, 1)
        )(embedding)
        channel1_final = self.n_grams_channel(channel_inputs, 3)
        channel2_final = self.n_grams_channel(channel_inputs, 4)
        channel3_final = self.n_grams_channel(channel_inputs, 5)
        channels_final = layers.Concatenate()(
            [channel1_final, channel2_final, channel3_final]
        )
        channels_final = layers.Dropout(rate=0.4)(channels_final)
        channels_final = layers.Dense(2000, "relu")(channels_final)
        predictions = layers.Dense(self.__output_size, "sigmoid")(channels_final)
        model = Model(inputs=inputs, outputs=predictions)

        return model

    def get_model(self):
        """Build and compile the model under a ``MirroredStrategy`` scope.

        The model is compiled with Adam (learning rate 0.0005), binary
        cross-entropy loss and the custom ``precision_score``,
        ``recall_score`` and ``f1`` metrics.

        Returns:
            The compiled Keras model.
        """
        strategy = tf.distribute.MirroredStrategy()
        with strategy.scope():
            model = self.define_model()

            # Custom metrics are used for comparability with the old model.
            metrics = [precision_score, recall_score, f1]
            model.compile(
                optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
                loss=BinaryCrossentropy(),
                metrics=metrics,
            )
        return model

In [None]:
# `output_size` matches the `NUM_TAGS + 1` width of the labels produced by
# `label_transform`. The extra row in `vocab_size` is presumably reserved for
# out-of-vocabulary tokens -- TODO confirm against the preprocessing.
model = AutoTaggingModel(
    embedding_file=FILE_NAME,
    embedding_dim=300,
    train_embedding=True,
    output_size=NUM_TAGS + 1,
    vocab_size=VOCAB_SIZE + 1,
    max_string_length=MAX_STRING_LENGTH,
).get_model()

Previous training of model after 8 epochs:

loss: 0.0031 - recall_score: 0.7846 - precision_score: 0.9444 - f1: 0.8570 - val_loss: 0.0099 - val_recall_score: 0.4392 - val_precision_score: 0.7245 - val_f1: 0.5467

In [None]:
# Stop once `val_loss` has failed to improve by at least 1e-4 for 4 epochs,
# and roll the model back to its best weights.
early_stopping_callback = callbacks.EarlyStopping(
    monitor='val_loss',
    mode='auto',
    min_delta=0.0001,
    patience=4,
    restore_best_weights=True,
    verbose=0,
)

# Scale the learning rate by 0.1 after 2 epochs without a `val_loss`
# improvement of at least 1e-4.
reduce_lr = callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    mode='auto',
    min_delta=0.0001,
    patience=2,
    factor=0.1,
    verbose=0,
)



In [None]:
# `train_dataset` was created without an `epochs` limit, so `steps_per_epoch`
# is what defines the length of one training epoch here.
history = model.fit(
    train_dataset,
    epochs=40,
    steps_per_epoch=1400,
    validation_data=eval_dataset,
    callbacks=[early_stopping_callback, reduce_lr],
)

Training looks quite good! We are able to reach similar levels on each of the metrics as we did with the previous model (see `Glove_embedding.ipynb` notebook). 

In [None]:
# Save the trained model locally, without a custom serving signature.
model.save('models/1/')

In [None]:
# Persist the per-epoch values recorded by `model.fit` next to the saved model.
pd.DataFrame(history.history).to_csv('models/training_statistics.csv')

### Saving Model with TF Transform for Serving

In [None]:
def _get_serve_tf_examples_fn(model, tf_transform_output,
                              label_key='series_ep_tags'):
    """Returns a function that parses a serialized tf.Example.

    The returned function parses serialized examples with the raw feature
    spec (minus the label), applies the TF Transform layer and then the model.

    Args:
        model: The trained Keras model. The transform layer is attached to it
            as ``model.tft_layer``.
        tf_transform_output: A TFTransformOutput.
        label_key: Name of the raw label feature to drop from the parsing
            spec. Defaults to ``'series_ep_tags'``, the key this notebook
            always used.

    Returns:
        A ``tf.function`` taking a batch of serialized tf.Examples and
        returning ``{"outputs": <model predictions>}``.
    """

    # Stored on the model (rather than in a local) -- presumably so that the
    # transform layer is tracked and exported together with the model.
    model.tft_layer = tf_transform_output.transform_features_layer()

    @tf.function
    def serve_tf_examples_fn(serialized_tf_examples):
        """Returns the output to be used in the serving signature."""
        feature_spec = tf_transform_output.raw_feature_spec()
        # The label is not part of a serving request. Tolerate a raw spec
        # that does not contain it: there is then simply nothing to drop.
        feature_spec.pop(label_key, None)

        parsed_features = tf.io.parse_example(
            serialized_tf_examples, feature_spec
        )

        transformed_features = model.tft_layer(parsed_features)

        outputs = model(transformed_features)
        return {"outputs": outputs}

    return serve_tf_examples_fn

In [None]:
# Trace the serving function for a 1-D batch of serialized examples and export
# it as the model's default serving signature.
serving_fn = _get_serve_tf_examples_fn(model, tf_transform_output)
examples_spec = tf.TensorSpec(shape=[None], dtype=tf.string, name="examples")
signatures = {
    "serving_default": serving_fn.get_concrete_function(examples_spec),
}
model.save('models/2', save_format="tf", signatures=signatures)

In [None]:
# Copy the local `models/` directory (both saved models and the training
# statistics CSV) to the experiments bucket.
!gsutil -m cp -r models/ gs://ml-sandbox-tagging-tfx-experiments/

In [None]:
# Remove the local copy of `models/`.
!rm -rf models/

In [None]:
%%bash
# Print the `serving_default` signature of the model saved under models/2.
saved_model_cli show \
    --dir gs://ml-sandbox-tagging-tfx-experiments/models/2 \
    --tag_set serve \
    --signature_def serving_default

There seems to be a GPU-related issue that prevents calling the model with `saved_model_cli` from within the same GPU VM used to train the model. When run from another notebook, this works fine.

In [None]:
%%bash
# Call the `serving_default` signature with one hand-written serialized
# example, passed to the `examples` input as a Python bytes literal.
saved_model_cli run \
    --dir gs://ml-sandbox-tagging-tfx-experiments/models/2 \
    --tag_set serve \
    --signature_def serving_default \
    --input_exprs 'examples=[b"\n*\n(\n\x08features\x12\x1c\n\x1a\n\x18klassifiziere mich bitte"]'