## 0. Imports

In [None]:
from matplotlib import pyplot as plt
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.backend import clear_session
from keras.utils.data_utils import get_file
from keras.callbacks import ModelCheckpoint, CSVLogger
import numpy as np
import random
import sys
import time
import io
import json
from sklearn.model_selection import train_test_split
from keras_tqdm import TQDMNotebookCallback
from keras.backend import clear_session

In [None]:
import plotly.plotly as py
import plotly.tools as tools
import plotly.figure_factory as ff
import plotly.graph_objs as go

In [None]:
import pandas as pd
import numpy as np
import itertools
import seaborn as sns
import os
import pickle
import unicodedata
from tqdm import tqdm_notebook, tqdm

## 1. Config

### 1.1. Model and training

In [None]:
# Architecture hyper-parameters; they are also combined into the experiment name.
layers = 2  # number of stacked LSTM layers
dropout = 0.7  # handed to every LSTM layer as recurrent_dropout
n_hidden = 512  # units per LSTM layer

In [None]:
# Training-loop settings.
BATCH_SIZE = 128  # samples per gradient update
epochs = 50  # epoch index at which training stops
# Presumably the epoch index to resume from (0 = start from scratch) — confirm it is wired into model.fit.
starting_epoch = 0

### 1.2. Plotly

In [None]:
# Register the Plotly account used for plotting; 'USERNAME' and 'APIKEY' are
# placeholders to be replaced locally (never commit real credentials).
tools.set_credentials_file(username='USERNAME', api_key='APIKEY')

### 1.3. Experiment path

In [None]:
# One "<units>.<dropout>" segment per layer, dot-joined (e.g. "512.0.7.512.0.7").
experiment_name = ".".join(
    "{}.{}".format(n_hidden, dropout) for _ in range(layers)
)

In [None]:
# Directory that receives this experiment's loss log and weight checkpoints.
path = os.path.join(r'training_files',experiment_name)

In [None]:
# Create the experiment directory. exist_ok=True makes this a no-op when the
# directory is already there and avoids the check-then-create race.
os.makedirs(path, exist_ok=True)

## 2. Data

### 2.1. Read

In [None]:
# Load the pickled movies table.
# NOTE(review): unpickling can execute arbitrary code — only read files from a trusted source.
df_movies = pd.read_pickle("df_movies_cluster.pickle")

In [None]:
# Subtitle text of every row whose cluster_name is "christmas".
christmas_transcripts = df_movies.loc[
    df_movies["cluster_name"] == "christmas", "subtitles_text"
].values

In [None]:
# Lower-case every transcript and glue them together, separated by one space,
# into a single corpus string.
all_text = ' '.join(transcript.lower() for transcript in christmas_transcripts)

### 2.2. Encode

In [None]:
# Vocabulary: each distinct character of the corpus, in sorted order.
chars = sorted(set(all_text))
print('total chars:', len(chars))
# Lookup tables in both directions.
char_indices = {symbol: position for position, symbol in enumerate(chars)}
indices_char = dict(enumerate(chars))

In [None]:
# Slice the corpus into overlapping windows of `maxlen` characters, one window
# every `step` characters; the target of each window is the character right after it.
maxlen = 40
step = 5
window_starts = range(0, len(all_text) - maxlen, step)
sentences = [all_text[start:start + maxlen] for start in window_starts]
next_chars = [all_text[start + maxlen] for start in window_starts]
print('nb sequences:', len(sentences))

In [None]:
# Replace the lookup tables built above with previously pickled ones.
# NOTE(review): `chars` still comes from the current corpus and sizes the one-hot
# arrays via len(chars); confirm the pickled mappings cover the same characters.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open("char_indices.pickle",'rb') as file:
    char_indices = pickle.load(file)
    
with open("indices_char.pickle",'rb') as file:
    indices_char = pickle.load( file)

In [None]:
# One-hot encode the windows: x[i, t, c] flags character c at position t of
# window i, and y[i, c] flags the character that follows window i.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
# tqdm wraps the list itself (not the enumerate object) so it knows the total.
for i, sentence in enumerate(tqdm(sentences)):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

### 2.3. Train-test split

In [None]:
# Train on 20% of the samples and validate on another 20%; the fixed seed keeps
# the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.2,
    test_size=0.2,
    random_state=11,
)

## 3. Model

In [None]:
# Reset Keras' global state so the model below is built in a clean session.
clear_session()

### 3.1. Define model

In [None]:
# `layers` stacked LSTMs followed by a softmax over the vocabulary.
# Every LSTM except the last returns its full output sequence so that the next
# LSTM can consume it; the last one returns only its final output.
model = Sequential()
for is_last in [False] * (layers - 1) + [True]:
    model.add(LSTM(
        n_hidden,
        input_shape=(maxlen, len(chars)),
        recurrent_dropout=dropout,
        return_sequences=not is_last,
    ))
model.add(Dense(len(chars), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# Print the layer-by-layer overview of the compiled model.
model.summary()

### 3.2. Callbacks

#### 3.2.1. JSON logging callback

In [None]:
def epoch_end(epoch, logs):
    """Append one JSON line with the epoch's metrics to the experiment's loss log.

    The line is written to ``loss.logg.json`` inside the module-level ``path``
    directory and contains the epoch index, ``loss``, ``val_loss`` and the
    current ``time.time()`` timestamp.

    Args:
        epoch: Index of the epoch that just finished.
        logs: Mapping of metric name to value. ``loss`` is required;
            ``val_loss`` is optional and stored as ``null`` when missing
            (e.g. when training runs without validation data).

    Raises:
        KeyError: If ``logs`` has no ``"loss"`` entry.
    """
    def as_float(value):
        # NumPy scalars such as float32 are not JSON serializable; None stays None.
        return None if value is None else float(value)

    record = {
        "epoch": epoch,
        "loss": as_float(logs["loss"]),
        "val_loss": as_float(logs.get("val_loss")),
        "time": time.time(),
    }
    # Append mode keeps the records of earlier epochs (and earlier runs).
    with open(os.path.join(path, 'loss.logg.json'), mode='a') as file:
        file.write(json.dumps(record) + "\n")

In [None]:
# Hook the JSON logger into training: epoch_end is registered as the
# on_epoch_end handler.
json_logging_callback = LambdaCallback(on_epoch_end=epoch_end)

#### 3.2.2. Model weights callback

In [None]:
# Unconditional snapshot every 5 epochs (save_best_only is off).
filepath = os.path.join(path, "periodic_weights.{epoch:02d}-{val_loss:.2f}.hdf5")
checkpoint_5 = ModelCheckpoint(
    filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=False,
    period=5,
    verbose=1,
)

# Snapshot only when the monitored validation loss reaches a new minimum.
filepath = os.path.join(path, "best_weights.{epoch:02d}-{val_loss:.2f}.hdf5")
checkpoint_best = ModelCheckpoint(
    filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

### 3.3. Training

In [None]:
# Train on the training split and validate on the held-out split after each epoch.
# initial_epoch is taken from the `starting_epoch` setting instead of a
# hard-coded literal, so resuming a run only requires editing the config cell.
model.fit(
    x_train, y_train,
    batch_size=BATCH_SIZE,
    epochs=epochs,
    initial_epoch=starting_epoch,
    verbose=False,  # built-in console output off; the tqdm callback reports progress
    validation_data=(x_test, y_test),
    callbacks=[
        json_logging_callback,
        checkpoint_best,
        checkpoint_5,
        TQDMNotebookCallback(leave_inner=True, leave_outer=True),
    ],
)

## 4. Analyze logs

In [None]:
def get_unified_loss(path):
    """Collect the JSON-lines loss records stored in an experiment directory.

    Args:
        path: Directory holding one or more ``*.json`` files, each with one
            JSON object per line. Other files are ignored.

    Returns:
        All records of all files as a list of dicts, sorted by ``"epoch"``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``"epoch"`` entry.
    """
    records = []
    # Sorting the file names makes the relative order of records that share an
    # epoch independent of the file system's listing order.
    for name in sorted(os.listdir(path)):
        if not name.endswith(".json"):
            continue
        with open(os.path.join(path, name), "r") as file:
            for line in file:
                line = line.strip()
                if line:  # tolerate blank lines, e.g. a trailing newline
                    records.append(json.loads(line))

    return sorted(records, key=lambda record: record["epoch"])

In [None]:
def get_traces(analysis, experiments):
    """Build the Plotly traces and layout comparing the losses of several experiments.

    Args:
        analysis: Name of the setting being varied; appended to the figure title.
        experiments: Mapping of legend label to experiment directory name
            under ``training_files``.

    Returns:
        A ``(traces, layout)`` tuple: two ``go.Scatter`` traces per experiment
        (training loss first, then validation loss) and a ``go.Layout``.

    Side effects:
        Prints, per experiment, the mean time difference between consecutive
        log records.
    """
    axis_font = dict(size=18, color='#7f7f7f')
    # (record key, legend suffix) for the two curves drawn per experiment.
    series = (("loss", " training"), ("val_loss", " test"))

    traces = []
    for label, directory in experiments.items():
        history = get_unified_loss(os.path.join('training_files', directory))

        stamps = [record["time"] for record in history]
        gaps = [later - earlier for earlier, later in zip(stamps, stamps[1:])]
        print(np.mean(gaps))

        for metric, suffix in series:
            traces.append(go.Scatter(
                # The x axis is the record's position in the sorted log.
                x=list(range(len(history))),
                y=[record[metric] for record in history],
                name=label + suffix,
            ))

    layout = go.Layout(
        title='Effect of changing ' + analysis,
        xaxis=dict(title='Epoch', titlefont=dict(axis_font)),
        yaxis=dict(title='Loss', titlefont=dict(axis_font)),
    )

    return traces, layout

### 4.1. Dropout rate

In [None]:
# Runs compared in this section: legend label -> experiment directory name
# under training_files/. Only the dropout segment of the name differs.
analysis = "dropout"
experiments = {
    "0 dropout":'512.0.0.512.0.0', 
    "0.35 dropout": '512.0.35.512.0.35',
    "0.7 dropout": '512.0.7.512.0.7'
}

In [None]:
# Build the comparison figure for the current `analysis` and display it with py.iplot.
traces, layout = get_traces(analysis, experiments)
fig = go.Figure(data=traces, layout=layout)
py.iplot(fig, filename='td_medium_nlp_'+  analysis)

### 4.2. Number of layers

In [None]:
# Runs compared in this section: legend label -> experiment directory name
# under training_files/. The names differ in how many "512.0.7" segments they repeat.
analysis = "layers"
experiments = {
    "1 layer":'512.0.7', 
    "2 layers": '512.0.7.512.0.7',
    "3 layers": '512.0.7.512.0.7.512.0.7'
}

In [None]:
# Build the comparison figure for the current `analysis` and display it with py.iplot.
traces, layout = get_traces(analysis, experiments)
fig = go.Figure(data=traces, layout=layout)
py.iplot(fig, filename='td_medium_nlp_'+  analysis)

### 4.3. Hidden units

In [None]:
# Runs compared in this section: legend label -> experiment directory name
# under training_files/. Only the hidden-unit segment of the name differs.
analysis = "hidden units"
experiments = {
    "128 units":'128.0.7.128.0.7', 
    "256 units": '256.0.7.256.0.7',
    "512 units": '512.0.7.512.0.7'
}

In [None]:
# Build the comparison figure for the current `analysis` and display it with py.iplot.
traces, layout = get_traces(analysis, experiments)
fig = go.Figure(data=traces, layout=layout)
py.iplot(fig, filename='td_medium_nlp_'+  analysis)

### 4.4. Full training

In [None]:
# Runs compared in this section: legend label -> experiment directory name
# under training_files/. The ".full" directory holds the run labelled as
# trained on 80% of the data.
analysis = "data"
experiments = {
    "20% training data":'512.0.7.512.0.7', 
    "80% training data": '512.0.7.512.0.7.full'
}

In [None]:
# Build the comparison figure for the current `analysis` and display it with py.iplot.
traces, layout = get_traces(analysis, experiments)
fig = go.Figure(data=traces, layout=layout)
py.iplot(fig, filename='td_medium_nlp_'+  analysis)

## 5. Text generation

### 5.1. Load best model

In [None]:
# Point `path` at the weights file of the experiment used for text generation.
# NOTE(review): the best-weights checkpoint in this notebook writes files named
# "best_weights.{epoch:02d}-{val_loss:.2f}.hdf5"; confirm that a plain
# "best_weights.hdf5" exists (e.g. copied or renamed by hand).
# NOTE(review): this rebinds the global `path`, which epoch_end and the
# checkpoint cells treat as a directory — re-run the config cells before training again.
experiment_name = "512.0.7.512.0.7"
path = os.path.join(r'training_files',experiment_name, 'best_weights.hdf5')

In [None]:
# Load the stored weights into the current `model`; its architecture must match the saved one.
model.load_weights(path)

### 5.2. Generate with diversity

In [None]:
def sample(preds, temperature=1.0):
    """Sample an index from a probability array after temperature scaling.

    The log-probabilities are divided by ``temperature``, re-normalized with a
    softmax, and a single index is drawn from the resulting distribution.

    Args:
        preds: 1-D array-like of probabilities.
        temperature: Divisor applied to the log-probabilities. ``0`` selects
            the most probable index deterministically instead of dividing by zero.

    Returns:
        The sampled index, as returned by ``np.argmax``.
    """
    preds = np.asarray(preds).astype('float64')
    if temperature == 0:
        # Limit of the scaling as temperature -> 0: always the arg-max.
        return np.argmax(preds)

    # log(0) is -inf and correctly ends up with probability 0; only the
    # warning is silenced.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature

    # Shifting by the maximum does not change the softmax mathematically, but
    # keeps exp() from underflowing to all zeros (and thus NaN probabilities)
    # at low temperatures.
    exp_preds = np.exp(logits - np.max(logits))
    probas = exp_preds / np.sum(exp_preds)

    draw = np.random.multinomial(1, probas, 1)
    return np.argmax(draw)

In [None]:
# Value passed to sample() as its temperature during generation.
diversity = 0.5

In [None]:
# Seed text for generation, cut to the model's input window length (`maxlen`)
# rather than a hard-coded 40 so it stays in sync with the encoding settings.
start_seed = "are you okay, didn't you hear the jingle bells"
start_seed = start_seed[:maxlen]
print(start_seed)

In [None]:
# Generate 1000 characters: one-hot encode the current window, predict the
# next-character distribution, sample from it with `diversity`, then slide the
# window forward by one character.
sentence = start_seed
generated = start_seed

for _ in tqdm_notebook(range(1000)):
    x_pred = np.zeros((1, maxlen, len(chars)))
    for position, symbol in enumerate(sentence):
        x_pred[0, position, char_indices[symbol]] = 1.

    next_index = sample(model.predict(x_pred, verbose=0)[0], diversity)
    next_char = indices_char[next_index]

    generated = generated + next_char
    # Drop the oldest character and append the new one.
    sentence = sentence[1:] + next_char

print(generated)