# Train covfefe-flow

## Google Colaboratory (optional)

With [Google Colaboratory](https://colab.research.google.com), Google's free cloud service for AI developers, we can train our machine learning (ML) models for free on an NVIDIA Tesla K80 GPU provided by Google.

(source: [https://medium.com/deep-learning-turkey/google-colab-free-gpu-tutorial-e113627b9f5d](https://medium.com/deep-learning-turkey/google-colab-free-gpu-tutorial-e113627b9f5d))

### Folder structure within Google Drive
```
|-- Google Drive
    |-- covfefe-flow
        |-- train-covfefe-flow.ipynb
        |-- data
            |-- tweets.txt
```

### Enable GPU
In the `train-covfefe-flow.ipynb` notebook, click `Runtime` > `Change runtime type`, then set `Runtime type` to `Python 3` and `Hardware accelerator` to `GPU`.

In [None]:
# Install the google-drive-ocamlfuse and fuse packages (from the alessandro-strada PPA)
# so that Google Drive can be mounted as a local folder.
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse
# Authenticate the Colab user and fetch the application-default credentials.
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
# First headless run: only the output line containing "URL" is shown (see the grep).
# NOTE(review): the client secret is passed on the command line — confirm this is acceptable.
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
# Read the verification code without echoing it, then pipe it into a second headless run.
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

In [None]:
import os

# Mount Google Drive at ./drive via google-drive-ocamlfuse.
!mkdir -p drive
!google-drive-ocamlfuse drive
!pip install -q pandas seaborn keras

# All relative paths used below (data/, models/) are resolved from the project folder.
os.chdir("drive/covfefe-flow")

##### Google Colaboratory info

In [None]:
import tensorflow as tf
from tensorflow.python.client import device_lib

# Is GPU working?
tf.test.gpu_device_name()

# Which GPU is used?
device_lib.list_local_devices()

# CPU info
!cat /proc/cpuinfo

# RAM info
!cat /proc/meminfo

## Train covfefe-flow

### Settings

In [None]:
# Length (in characters) of the input window fed to the model.
maxlen = 35
# Marker appended to every cleaned tweet; generation stops when it is sampled.
END_OF_TWEET = '\n'
# Upper bound on the length of a generated tweet, seed included.
TWEET_MAX_LEN = 280

### Preparations

In [None]:
import string


def get_vocabulary_and_dictionaries():
    """Build the character vocabulary and its id lookup tables.

    Returns:
        A tuple ``(vocabulary, char_to_id, id_to_char, vocabulary_size)``:
        the sorted list of characters, the char -> id mapping (ids start at 1
        and the empty string is mapped to 0), the inverse id -> char mapping,
        and the number of entries in ``char_to_id``.
    """
    excluded_whitespace = ('\t', '\r', '\x0b', '\x0c')
    extra_chars = ['✅', '🏆', '📈', '📉', '🎥', '💰', '📸', '…']
    printable_chars = [char for char in string.printable if char not in excluded_whitespace]
    vocabulary = sorted(printable_chars + extra_chars)

    # Id 0 is reserved for the empty string; real characters are numbered from 1.
    char_to_id = {char: index for index, char in enumerate(vocabulary, start=1)}
    char_to_id[''] = 0
    id_to_char = {index: char for char, index in char_to_id.items()}
    return vocabulary, char_to_id, id_to_char, len(char_to_id)


# Build the lookup tables once; the following cells use them as notebook-level globals.
vocabulary, char_to_id, id_to_char, vocabulary_size = get_vocabulary_and_dictionaries()
print('Vocabulary size: ', vocabulary_size)

### Data loading

Load tweets from [data/tweets.txt](./data/tweets.txt).

In [None]:
import pandas as pd
import csv


def read_file(file_name):
    """Load a CSV file into a DataFrame with quote handling disabled.

    Args:
        file_name: Path (or file-like object) handed to ``pandas.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``. Because of ``csv.QUOTE_NONE``, quote
        characters are kept as ordinary text.
    """
    frame = pd.read_csv(filepath_or_buffer=file_name, quoting=csv.QUOTE_NONE)
    return frame

In [None]:
# Read the raw tweets into a DataFrame; later cells access them through its 'text' column.
print('Load tweets...')
original_tweets = read_file('data/tweets.txt')
print(len(original_tweets), ' tweets loaded.')

# Preview the first rows (shown as the cell output).
original_tweets.head(5)

In [None]:
# Count every character that occurs in the raw tweets but is not part of the
# vocabulary, then list those characters from most to least frequent.
missing_chars = {}
for pd_original_tweet in original_tweets.itertuples():
    original_tweet = pd_original_tweet.text
    for char in original_tweet:
        if char in vocabulary:
            continue
        missing_chars[char] = missing_chars.get(char, 0) + 1

by_frequency = sorted(missing_chars.items(), key=lambda entry: entry[1], reverse=True)
for missing_char, frequency in by_frequency:
    print(missing_char, ':', frequency)

### Data cleaning & data preparation
- HTML decoding (particularly `&amp;`, `&lt;` and `&gt;`)
- unify characters such as `'`, `"` and `-`
- limit characters to vocabulary

In [None]:
import warnings

# suppress "looks like a URL" UserWarnings in Beautiful Soup
# NOTE(review): nothing visible in this notebook imports bs4 — confirm the filter is still needed.
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')


def clean_tweet(original_tweet, vocabulary):
    """Normalize a raw tweet and restrict it to the model's vocabulary.

    Steps: strip surrounding whitespace, unify typographic quotes and dashes
    to their ASCII forms, decode the HTML entities ``&gt;``, ``&lt;`` and
    ``&amp;``, append ``END_OF_TWEET`` and finally drop every character that
    is not in ``vocabulary``.

    Args:
        original_tweet: The raw tweet text.
        vocabulary: Iterable of the characters that may appear in the result.

    Returns:
        The cleaned tweet as a string (ending with ``END_OF_TWEET`` if that
        character is part of ``vocabulary``).
    """
    unify_punctuation = str.maketrans({
        '“': '"',
        '”': '"',
        '’': '\'',
        '‘': '\'',
        '—': '-',
        '–': '-',
    })
    tweet = original_tweet.strip().translate(unify_punctuation)

    # The entity is '&amp;' including the semicolon (replacing only '&amp'
    # left a stray ';' behind). It is decoded last so that text such as
    # '&amp;lt;' is decoded exactly once, to '&lt;'.
    for entity, char in (('&gt;', '>'), ('&lt;', '<'), ('&amp;', '&')):
        tweet = tweet.replace(entity, char)

    tweet += END_OF_TWEET

    # A set makes the per-character membership test O(1).
    allowed_chars = set(vocabulary)
    return ''.join(char for char in tweet if char in allowed_chars)

In [None]:
# Apply clean_tweet to every cell of the DataFrame (element-wise via applymap).
print('Cleaning tweets...')
cleaned_tweets = original_tweets.applymap(lambda tweet: clean_tweet(tweet, vocabulary))
# `tweets` is the name used by the training cells further down.
tweets = cleaned_tweets
print('Done!')

# Preview the first cleaned rows (shown as the cell output).
tweets.head(5)

### Data exploration

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of the lengths of all cleaned tweets; the 'text' column is replaced
# by its character count and renamed to 'length'.
cleaned_tweets_lengths = cleaned_tweets.applymap(len).rename(index=str, columns={'text': 'length'})
plt.figure(figsize=(18, 6))
sns.distplot(cleaned_tweets_lengths, kde=False, bins=100, axlabel='Tweet length')
plt.show()

# Cumulative histogram restricted to tweets shorter than 50 characters.
short_cleaned_tweets_lengths = cleaned_tweets_lengths[cleaned_tweets_lengths.length < 50]
plt.figure(figsize=(18, 6))
sns.distplot(short_cleaned_tweets_lengths, kde=False, hist_kws={'cumulative': True}, axlabel='Tweet length')
plt.show()

### Build model

- [LSTM](https://keras.io/layers/recurrent/#lstm): Long short-term memory
- Regularization: to counteract overfitting
- Dropout: to counteract overfitting

In [None]:
from __future__ import print_function
from keras.models import Sequential
from keras.layers import LSTM, Dense, Activation
from keras.optimizers import RMSprop
from keras.regularizers import l2  # , l1, l1_l2
import numpy as np


# Stride (in characters) between consecutive full-length training windows.
step_size = 3

# Number of units in each LSTM layer.
lstm_size = 96
# Starting learning rate for RMSprop; the ReduceLROnPlateau callback may lower it during training.
initial_learning_rate = 0.003
# Regularizer factory and penalty used on the last LSTM layer and the Dense layer.
regularizer = l2
regularizer_penalty = 0.01
# dropout = 0.2


# Build the LSTM model
# Architecture: three stacked LSTM layers followed by a Dense layer with one
# output per vocabulary entry and a softmax activation.
print('Build LSTM model...')
model = Sequential()
# First layer: takes one-hot input of shape (maxlen, vocabulary_size) and, with
# return_sequences=True, passes the full output sequence on to the next LSTM.
model.add(LSTM(lstm_size,
               input_shape=(maxlen, vocabulary_size),
               # kernel_regularizer=regularizer(regularizer_penalty),
               # recurrent_regularizer=regularizer(regularizer_penalty),
               # bias_regularizer=regularizer(regularizer_penalty),
               # activity_regularizer=regularizer(regularizer_penalty),
               # dropout=dropout,
               # recurrent_dropout=dropout,
               return_sequences=True
               )
          )
# Second layer: also returns the full sequence for the last LSTM.
model.add(LSTM(lstm_size,
               # kernel_regularizer=regularizer(regularizer_penalty),
               # recurrent_regularizer=regularizer(regularizer_penalty),
               # bias_regularizer=regularizer(regularizer_penalty),
               # activity_regularizer=regularizer(regularizer_penalty),
               # dropout=dropout,
               # recurrent_dropout=dropout,
               return_sequences=True
               )
          )
# Third layer: return_sequences is left at its default, so only the last
# output is passed to the Dense layer. Only this LSTM has kernel regularization enabled.
model.add(LSTM(lstm_size,
               kernel_regularizer=regularizer(regularizer_penalty),
               # recurrent_regularizer=regularizer(regularizer_penalty),
               # bias_regularizer=regularizer(regularizer_penalty),
               # activity_regularizer=regularizer(regularizer_penalty),
               # dropout=dropout,
               # recurrent_dropout=dropout
               )
          )
# Output layer: one unit per vocabulary entry, turned into a probability
# distribution over the next character by the softmax below.
model.add(Dense(vocabulary_size,
                kernel_regularizer=regularizer(regularizer_penalty),
                bias_regularizer=regularizer(regularizer_penalty)
                )
          )
model.add(Activation('softmax'))

# RMSprop is recommended for RNNs
optimizer = RMSprop(lr=initial_learning_rate)
# categorical_crossentropy matches the one-hot encoded targets built in the training cell.
model.compile(loss='categorical_crossentropy', optimizer=optimizer)


print('Done!')

### Train model

[ReduceLROnPlateau](https://keras.io/callbacks/#reducelronplateau) is a convenient way to reduce the learning rate automatically when the validation loss stops improving.

In [None]:
# Configuration
# Folder (relative to the working directory) that receives saved models.
MODELS_FOLDER = 'models'
MODEL_NAME = 'covfefe-flow'
# Resolves to 'models/covfefe-flow_checkpoints/' — note the trailing slash.
model_checkpoint_folder = '{models_folder}/{model_name}_checkpoints/'.format(
    models_folder=MODELS_FOLDER,
    model_name=MODEL_NAME
)

In [None]:
import random
import sys
import os

from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, LambdaCallback, TensorBoard


# Number of samples per gradient update (also passed to the TensorBoard callback).
batch_size = 128
# Number of passes over the training data.
num_epochs = 60
# Fraction of the samples held out by model.fit for validation.
validation_split = 0.05


# Build (input text, next character) training pairs from every tweet.
sentences = []
next_chars = []
for pd_tweet in tweets.itertuples():
    tweet = pd_tweet.text
    tweet_len = len(tweet)

    # Prefixes shorter than 'maxlen': each one is paired with the character that follows it.
    for prefix_len in range(1, min(maxlen, tweet_len)):
        sentences.append(tweet[:prefix_len])
        next_chars.append(tweet[prefix_len])

    # Windows of exactly 'maxlen' characters, taken every 'step_size' characters
    # (only produced for tweets longer than 'maxlen').
    for window_start in range(0, tweet_len - maxlen, step_size):
        window_end = window_start + maxlen
        sentences.append(tweet[window_start:window_end])
        next_chars.append(tweet[window_end])
print('#Sentences:', len(sentences))

print('Vectorization...')
# One-hot encoding: x[i, t, c] is set when sentence i has the character with
# id c at position t (positions past the end of a short sentence stay all
# zero); y[i, c] marks the id of the character that follows sentence i.
# The builtin `bool` is used as dtype because the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
x = np.zeros((len(sentences), maxlen, vocabulary_size), dtype=bool)
y = np.zeros((len(sentences), vocabulary_size), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_to_id[char]] = 1
    y[i, char_to_id[next_chars[i]]] = 1


# helper function to sample an index from a probability array
def sample(input_predictions, temperature=1.0):
    """Draw one index at random from a temperature-adjusted distribution.

    Each probability is raised to the power ``1 / temperature`` (computed as
    ``exp(log(p) / temperature)``) and the result is renormalized, so values
    below 1.0 sharpen the distribution and values above 1.0 flatten it.

    Args:
        input_predictions: Array-like of probabilities.
        temperature: Divisor applied to the log-probabilities.

    Returns:
        The index selected by a single multinomial draw.
    """
    probs = np.asarray(input_predictions).astype('float64')
    reweighted = np.exp(np.log(probs) / temperature)
    distribution = reweighted / np.sum(reweighted)
    one_hot_draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(one_hot_draw)


def on_epoch_end(epoch, logs):
    """Print sample tweets generated by the current model after an epoch.

    A random cleaned tweet provides the seed text. For each diversity
    (sampling temperature) value, characters are sampled from the model one
    at a time until ``END_OF_TWEET`` is sampled or the generated text reaches
    ``TWEET_MAX_LEN`` characters. Everything is written to stdout.

    Args:
        epoch: Index of the finished epoch; only used in the printed header.
        logs: Metrics passed by the Keras callback machinery; unused.
    """
    print()
    print('----- Generating tweet after Epoch: %d' % epoch)

    # `tweets` is a DataFrame: the text has to be read from the 'text' column
    # by position. Plain `tweets[index]` would be a column lookup and fail.
    random_tweet_index = random.randint(0, len(tweets) - 1)
    random_tweet = tweets['text'].iloc[random_tweet_index]
    if len(random_tweet) > maxlen:
        start_index = random.randint(0, len(random_tweet) - maxlen - 1)
        sentence_seed_len = maxlen
    else:
        start_index = 0
        # max(...) keeps the range valid for a tweet that is one character long.
        sentence_seed_len = random.randint(1, max(1, len(random_tweet) - 1))

    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)

        generated = ''
        sentence = random_tweet[start_index: start_index + sentence_seed_len]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)

        for _ in range(TWEET_MAX_LEN - sentence_seed_len):
            # One-hot encode the current context; unused trailing positions stay zero.
            x_pred = np.zeros((1, maxlen, vocabulary_size))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_to_id[char]] = 1.0

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = id_to_char[next_index]

            if next_char == END_OF_TWEET:
                break

            generated += next_char
            # Let a short seed grow until it fills the window and only then
            # slide it, mirroring how the short training prefixes were built.
            sentence = (sentence + next_char)[-maxlen:]

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()


# Callbacks
# Multiply the learning rate by 0.2 once the monitored loss has not improved for one epoch.
reduce_lr_callback = ReduceLROnPlateau(factor=0.2, patience=1, verbose=1)
model_checkpoint_path = model_checkpoint_folder + 'model.{epoch:02d}-{val_loss:.2f}.hdf5'
# makedirs (unlike mkdir) also creates the missing parent folder MODELS_FOLDER,
# and exist_ok avoids the separate existence check.
os.makedirs(model_checkpoint_folder, exist_ok=True)
# Write a checkpoint only when the monitored loss improves.
model_checkpoint_callback = ModelCheckpoint(model_checkpoint_path, verbose=1, save_best_only=True)
# Print generated sample tweets after every epoch.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
tensorboard_callback = TensorBoard(
    write_grads=True,
    batch_size=batch_size,
)


# Train!
# The last `validation_split` fraction of (x, y) is held out for validation;
# its loss (val_loss) is part of the checkpoint file name.
model.fit(x, y,
          batch_size=batch_size,
          epochs=num_epochs,
          callbacks=[reduce_lr_callback, model_checkpoint_callback, print_callback, tensorboard_callback],
          validation_split=validation_split)

print('Training done!')

#### 📉 TensorBoard

[TensorBoard](https://github.com/tensorflow/tensorboard) is used for visualizing the learning progress.
It needs to be started within the Docker container:

```bash
docker exec -it covfefeflow_train_1 bash
```

And in the container:
```bash
tensorboard --logdir "logs"
```

#### Save model for re-import

In [None]:
# Save the complete model to 'models/model.h5'.
# NOTE(review): presumably requires MODELS_FOLDER to exist already — confirm.
model.save('{models_folder}/model.h5'.format(models_folder=MODELS_FOLDER))

#### Load model from checkpoint

In [None]:
from keras.models import load_model

# Replace the placeholder with the file name of a checkpoint inside model_checkpoint_folder.
model_checkpoint_name = '<MODEL_CHECKPOINT_NAME_HERE>'
# model_checkpoint_folder already ends with '/', so the resulting path contains a double slash.
model = load_model('{model_checkpoint_folder}/{model_checkpoint_name}'.format(
    model_checkpoint_folder=model_checkpoint_folder,
    model_checkpoint_name=model_checkpoint_name
))
print('Loaded model "{model_name}".'.format(model_name=model.name))

#### Save model for TensorFlow Serving

In [None]:
import os
import shutil

from tensorflow.python.saved_model import builder as saved_model_builder
from tensorflow.python.saved_model.signature_def_utils_impl import predict_signature_def
from keras import backend as K
from tensorflow.python.saved_model import tag_constants, signature_constants

# Export the model as a SavedModel under <MODELS_FOLDER>/<MODEL_NAME>/<version>.
export_base_path = MODELS_FOLDER
model_folder = MODEL_NAME
model_version = 1

export_path = os.path.join(export_base_path, model_folder, str(model_version))
# remove model folder if it already exists
if os.path.exists(export_path) and os.path.isdir(export_path):
    shutil.rmtree(export_path)
builder = saved_model_builder.SavedModelBuilder(export_path)

# Prediction signature: the model's input tensor is exposed as "inputs",
# its output tensor as "outputs".
signature = predict_signature_def(inputs={"inputs": model.input},
                                  outputs={"outputs": model.output})

print('Input:', model.input)
print('Output:', model.output)

# NOTE(review): using the Keras session as a context manager presumably closes it
# on exit, which would make `model` unusable afterwards in this runtime — confirm.
with K.get_session() as sess:
    # Register the graph and variables under the SERVING tag with the
    # signature above as the default serving signature, then write to disk.
    builder.add_meta_graph_and_variables(sess=sess,
                                         tags=[tag_constants.SERVING],
                                         signature_def_map={
                                            signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: signature
                                         })
    builder.save()

print('Model saved!')