## Dynamic Isometry As a Consequence of Weight Orthogonality for Faster and Better Convergence

**Experimental part**

Ester Hlav

Columbia University


Department of Mathematics

May 2019


### Load packages

In [None]:
import keras
import numpy as np
import numpy
from keras.models import Sequential
from keras.layers import LSTM, SimpleRNN, Dense, CuDNNLSTM, CuDNNGRU
from keras.optimizers import Adam
from keras.utils import np_utils
from keras.initializers import Orthogonal
from keras.callbacks import Callback
from keras import backend as K
from numpy.linalg import svd
import matplotlib.pyplot as plt
import matplotlib.transforms as transforms
import pickle
import flair
import torch
from flair.data import Sentence
from flair.embeddings import WordEmbeddings
import tqdm

# Environment check: calls a private helper (leading underscore) of the Keras
# TensorFlow backend; presumably it returns the GPU devices visible to the
# backend -- TODO confirm for the installed Keras version.
K.tensorflow_backend._get_available_gpus()

### Read data and process it

##### 1. For Sentiment Analysis task

In [None]:
def read_file_embed(file, SEQ_LENGTH, embeddings='glove'):
    """Embed every non-blank line of a text file as a fixed-length sequence.

    Each line is wrapped in a flair ``Sentence``, embedded token by token and
    then truncated or zero-padded along the token axis to exactly SEQ_LENGTH
    rows.

    Args:
        file: path of the text file, one sentence per line.
        SEQ_LENGTH: number of token positions kept per sentence.
        embeddings: 'glove' or 'word2vec' (the latter loads the flair
            WordEmbeddings named 'en').

    Returns:
        Array of shape (n_sentences, SEQ_LENGTH, embedding_dim).

    Raises:
        ValueError: if `embeddings` is not a supported name, or if the file
            contains no usable sentence.
    """
    # define embedding
    if embeddings == 'word2vec':
        emb = WordEmbeddings('en')
    elif embeddings == 'glove':
        emb = WordEmbeddings('glove')
    else:
        raise ValueError(
            "Unknown embeddings '{}': expected 'glove' or 'word2vec'".format(embeddings))

    padded = []
    rows = []
    with open(file, 'r') as f:
        # Iterating over the file already yields each line once. Calling
        # f.readline() inside this loop as well would consume the following
        # line and silently drop every other sentence.
        for line in tqdm.tqdm_notebook(f):
            if not line.strip():
                continue

            sent = Sentence(line)
            emb.embed(sent)
            sentvec = np.array([tok.embedding.numpy() for tok in sent])
            if sentvec.ndim != 2:
                # no token was produced, so there is nothing to pad or stack
                continue

            # pad or cut to requested SEQ_LENGTH
            n_tokens = sentvec.shape[0]
            padded.append(SEQ_LENGTH - n_tokens)
            if n_tokens > SEQ_LENGTH:
                sentvec = sentvec[:SEQ_LENGTH]
            elif n_tokens < SEQ_LENGTH:
                npad = ((0, SEQ_LENGTH - n_tokens), (0, 0))
                sentvec = np.pad(sentvec, pad_width=npad, mode='constant', constant_values=0)

            # collect now and stack once at the end: stacking inside the loop
            # copies the whole array on every sentence
            rows.append(sentvec)

    if not rows:
        raise ValueError("No sentence could be read from '{}'".format(file))

    print ('Mean/std of padding: {}/{}'.format(np.mean(padded), np.std(padded)))
    return np.stack(rows)

def get_sent_analysis(SEQ_LENGTH, embeddings='glove', percentValTest=[0.1, 0.1]):
    """Build shuffled train/validation/test splits for the polarity task.

    The sentences of 'rt-polarity.pos' (label 1) and 'rt-polarity.neg'
    (label 0) are embedded with read_file_embed, shuffled together and cut
    into three consecutive slices.

    Args:
        SEQ_LENGTH: sequence length forwarded to read_file_embed.
        embeddings: embedding name forwarded to read_file_embed.
        percentValTest: held-out fractions; their sum is removed from the
            training slice and the last entry is the size of the test slice.

    Returns:
        Tuple (X_train, Y_train, X_val, Y_val, X_test, Y_test).
    """
    # embed both classes
    positives = read_file_embed('rt-polarity.pos', SEQ_LENGTH, embeddings)
    negatives = read_file_embed('rt-polarity.neg', SEQ_LENGTH, embeddings)

    # positives come first, so the first block of labels is 1
    X = np.vstack((positives, negatives))
    n_samples = X.shape[0]
    Y = np.zeros(n_samples)
    Y[:positives.shape[0]] = 1

    # one random permutation applied to inputs and labels alike
    order = np.random.choice(n_samples, size=n_samples, replace=False)
    X, Y = X[order], Y[order]

    # slice boundaries: [0, val_start) train, [val_start, test_start) val, rest test
    val_start = int(n_samples*(1-sum(percentValTest)))
    test_start = int(n_samples*(1-percentValTest[-1]))

    train_part = slice(None, val_start)
    val_part = slice(val_start, test_start)
    test_part = slice(test_start, None)
    return (X[train_part], Y[train_part],
            X[val_part], Y[val_part],
            X[test_part], Y[test_part])

In [None]:
# Build the sentiment-analysis splits with sequences of 50 tokens; 'word2vec'
# selects the flair WordEmbeddings named 'en' inside read_file_embed.
X_train, Y_train, X_val, Y_val, X_test, Y_test = get_sent_analysis(50, embeddings='word2vec')

In [None]:
# Sanity check: show the shape of every split.
for split in (X_train, Y_train, X_val, Y_val, X_test, Y_test):
    print (split.shape)

##### 2. For Sequential MNIST

In [None]:
from keras.datasets import mnist

def get_data_seqMNIST(VAL_PERCENT, normalization_input=True, bypixel=False):
    """Load MNIST and return train/validation/test arrays for a sequence model.

    Args:
        VAL_PERCENT: fraction of the training set, taken from its end, that
            becomes the validation set.
        normalization_input: when true, inputs are cast to float32 and
            divided by 255.
        bypixel: when true, every image is reshaped to a sequence of 28*28
            steps with one feature each; otherwise the arrays keep the shape
            returned by mnist.load_data().

    Returns:
        Tuple (X_train, Y_train, X_val, Y_val, X_test, Y_test); the labels
        are one-hot encoded over 10 classes.
    """
    (X_train, y_train), (X_test, y_test) = mnist.load_data()

    if bypixel:
        # one pixel per time step
        X_train, X_test = [images.reshape(images.shape[0], 28*28, 1)
                           for images in (X_train, X_test)]

    # the tail of the training set becomes the validation set
    cut = int(X_train.shape[0]*(1-VAL_PERCENT))
    X_val, y_val = X_train[cut:], y_train[cut:]
    X_train, y_train = X_train[:cut], y_train[:cut]

    if normalization_input:
        X_train, X_val, X_test = [images.astype("float32") / 255
                                  for images in (X_train, X_val, X_test)]

    Y_train, Y_val, Y_test = [np_utils.to_categorical(labels, 10)
                              for labels in (y_train, y_val, y_test)]

    print ("Shapes of X_train, X_val and X_test:")
    print (X_train.shape, X_val.shape, X_test.shape)
    print ("Shapes of Y_train, Y_val and Y_test:")
    print (Y_train.shape, Y_val.shape, Y_test.shape)

    return (X_train, Y_train, X_val, Y_val, X_test, Y_test)

# Keep the last 20% of the MNIST training set for validation.
VAL_PERCENT = 0.2
# NOTE(review): this rebinds the same global names as the sentiment-analysis
# cell, and train() reads X_train/Y_train/X_val/Y_val from the globals --
# run only the data cell that matches TASK before training.
X_train, Y_train, X_val, Y_val, X_test, Y_test = get_data_seqMNIST(VAL_PERCENT, bypixel=False)

### Define plotting function to visualize results

In [None]:
def plot_metric(result, names, config):
    """Plot several metric curves of a single training run on one figure.

    Args:
        result: mapping from metric name to its per-epoch values.
        names: non-empty sequence of metric names to draw.
        config: configuration label shown in the title.
    """
    curves = {metric: result[metric] for metric in names}

    # the x-axis length follows the first requested metric
    epochs = np.arange(len(curves[names[0]]))

    plt.figure(figsize=(12,10))
    for metric in names:
        plt.plot(epochs, curves[metric], label=metric)
    plt.xlabel('epochs')
    # the y-axis carries the name of the last metric drawn
    plt.ylabel(names[-1])
    joined = " and ".join(names)
    plt.title("Evolution of {} during training of config {}".format(joined, config))
    plt.legend()
    plt.show()
    
def plot_metrics(results, names):
    """Compare metrics across configurations, one subplot per metric.

    Args:
        results: non-empty mapping from configuration name to a mapping of
            metric name to per-epoch values.
        names: non-empty sequence of metric names; each gets its own subplot
            with one curve per configuration.
    """
    colors = ['red', 'blue', 'green', 'orange', 'yellow', 'pink', 'purple', 'black', 'cyan']
    configs = list(results.keys())
    values = {config: {name: results[config][name] for name in names}
              for config in configs}

    # the x-axis length follows the first metric of the first configuration
    epochs = np.arange(len(values[configs[0]][names[0]]))

    # squeeze=False always yields a 2-D array of axes; with the default, a
    # single metric returns a bare Axes object that cannot be indexed.
    fig, axes = plt.subplots(len(names), figsize=(12,10*len(names)), squeeze=False)
    for ax, name in zip(axes[:, 0], names):
        for j, config in enumerate(configs):
            # cycle through the palette so that more configurations than
            # colours do not raise an IndexError
            c = colors[j % len(colors)]
            ax.plot(epochs, values[config][name], label=name+' '+config, color=c)
        ax.set_xlabel('epochs')
        ax.set_ylabel(name)
        ax.set_title("Evolution of {} during training".format(name))
        ax.legend()
    plt.show()
    
def plot_sv(result, config):
    """Plot mean, spread, max and min of the singular values over training.

    Args:
        result: mapping whose 'sv' entry is a sequence of singular-value
            arrays, one per recorded step.
        config: configuration label shown in the title.
    """
    snapshots = result['sv']
    means, stds, maxs, mins = [np.array([stat(sv) for sv in snapshots])
                               for stat in (np.mean, np.std, np.max, np.min)]
    # dashed reference levels: final mean first, then initial mean
    reference_levels = (means[-1], means[0])
    epochs = np.arange(means.shape[0])

    fig, ax = plt.subplots(figsize=(12,10))
    for level in reference_levels:
        ax.axhline(y=level, linestyle='--', color="black", alpha=0.3)
        # x in the tick-label transform, y in data coordinates
        trans = transforms.blended_transform_factory(
            ax.get_yticklabels()[0].get_transform(), ax.transData)
        ax.text(0, level, "{:.3f}".format(level), color="black",
                transform=trans, ha="right")

    plt.plot(epochs, means, label='mean', color='blue')
    lower, upper = means-stds, means+stds
    plt.fill_between(epochs, lower, upper, color='orange')
    plt.plot(epochs, maxs, label='max', color='red')
    plt.plot(epochs, mins, label='min', color='green')
    plt.xlabel('epochs')
    plt.ylabel('singular values')
    plt.title('Evolution of statistics of singular values during training of config {}'.format(config))
    plt.legend(loc='upper left')
    plt.show()
    
def plot_svs(results):
    """Overlay the mean +/- std of the singular values of several configurations.

    Args:
        results: non-empty mapping from configuration name to a mapping
            whose 'sv' entry is a sequence of singular-value arrays, one per
            recorded step.
    """
    colors = ['red', 'blue', 'green', 'orange', 'yellow', 'pink', 'purple', 'black', 'cyan']
    configs = list(results.keys())
    svss = {}
    for config in configs:
        svs = results[config]['sv']
        svss[config] = {
            'means': np.array([np.mean(sv) for sv in svs]),
            'stds': np.array([np.std(sv) for sv in svs]),
        }
    # the x-axis length follows the first configuration
    epochs = np.arange(svss[configs[0]]['means'].shape[0])

    fig, ax = plt.subplots(figsize=(12,10))
    for i, config in enumerate(configs):
        # cycle through the palette so that more configurations than colours
        # do not raise an IndexError
        c = colors[i % len(colors)]
        means, stds = svss[config]['means'], svss[config]['stds']
        ax.plot(epochs, means, label='mean {}'.format(config), color=c)
        ax.fill_between(epochs, means - stds, means + stds, alpha=0.5, color=c)
    ax.set_xlabel('epochs')
    ax.set_ylabel('singular values')
    ax.set_title('Evolution of statistics of singular values during training for different configs')
    ax.legend(loc='upper left')
    plt.show()

### Define the gain-adjusted soft orthogonal regularizer, a callback to track singular values, and the model building/training pipeline

In [None]:
# Global settings read by train().
NDIM = 128  # number of units of the recurrent layer
BATCH_SIZE = 128  # batch size passed to model.fit
EPOCHS = 50  # number of epochs passed to model.fit
TASK = 'sentiment'  # passed to train() as `data`; the other supported value is 'seqMNIST'

def orth_reg_gain(W, gain, reg_orth):
    """Gain-adjusted soft orthogonality penalty for a weight matrix.

    Computes ``reg_orth * mean((W^T W / gain**2 - I)**2)``, which is zero
    exactly when the columns of W are mutually orthogonal with norm `gain`.

    Args:
        W: 2-D backend tensor or variable with a static second dimension.
        gain: target norm of the columns of W.
        reg_orth: weight of the penalty in the loss.

    Returns:
        Scalar backend tensor holding the penalty.

    NOTE(review): for a non-square W with more columns than rows (presumably
    the case for an LSTM recurrent kernel -- TODO confirm) W^T W cannot reach
    the identity, so the penalty has a strictly positive floor.
    """
    gram = K.dot(K.transpose(W), W) / (gain**2)
    # Compare against the identity matrix. Subtracting the scalar 1 would
    # broadcast over every entry and push the off-diagonal inner products
    # towards 1 instead of 0, i.e. away from orthogonality.
    identity = K.eye(K.int_shape(W)[1], dtype=K.dtype(W))
    return reg_orth * K.mean(K.square(gram - identity))

class SingularValuesCallback(Callback):
    """Record the singular values of one layer's weights during training.

    A snapshot is appended to `sv` when training begins and again at the end
    of every epoch, so one call to fit adds (number of epochs + 1) entries.

    Args:
        layer_name: name of the model layer to monitor.
    """
    def __init__(self, layer_name):
        # initialise the attributes managed by the Keras base class
        super(SingularValuesCallback, self).__init__()
        self.layer_name = layer_name
        self.sv = []

    def _singular_values(self):
        # Singular values of the second weight array of the monitored layer;
        # for the recurrent layers built in this file that is presumably the
        # recurrent kernel -- TODO confirm.
        return svd(self.layer.get_weights()[1], compute_uv=False)

    def on_train_begin(self, logs=None):
        # resolve the layer by name once the model is attached, then record
        # the state before any update
        layer_dict = {layer.name: layer for layer in self.model.layers}
        self.layer = layer_dict[self.layer_name]
        self.sv.append(self._singular_values())

    def on_epoch_end(self, epoch, logs=None):
        # compute singular values and save
        self.sv.append(self._singular_values())

class History(Callback):
    """Callback that accumulates the logs of every epoch into `history`.

    Re-implementation of the Keras `History` callback that also keeps
    references to the training and validation data. The data is stored so
    that the loss could be evaluated before the first epoch (as is done for
    the singular values), but that evaluation is not implemented here:
    `history` only contains the values logged at the end of each epoch.

    Args:
        train_data: training inputs and targets, stored as given.
        val_data: validation inputs and targets, stored as given.
    """
    def __init__(self, train_data, val_data):
        # initialise the attributes managed by the Keras base class
        super(History, self).__init__()
        self.train_data = train_data
        self.val_data = val_data
        self.epoch = []
        self.history = {}

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        self.epoch.append(epoch)
        # one list per logged quantity, extended by one value per epoch
        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)

def train(config, data='seqMNIST'):
    """Build, compile and fit a single-layer recurrent classifier.

    Reads the module-level X_train, Y_train, X_val, Y_val, NDIM, BATCH_SIZE
    and EPOCHS.

    Args:
        config: mapping providing 'recurrent_initializer' and
            'recurrent_regularizer' for the recurrent layer.
        data: 'seqMNIST' (SimpleRNN followed by a 10-way softmax) or
            'sentiment' (CuDNNLSTM followed by a single sigmoid unit).

    Returns:
        Tuple (model, results): the fitted model and the per-epoch logs
        collected by the History callback, extended with the singular values
        recorded by SingularValuesCallback under the key 'sv'.

    Raises:
        ValueError: if `data` is not one of the supported tasks.
    """
    if data == 'seqMNIST':
        rnn_cls, n_out, activation, loss = SimpleRNN, 10, 'softmax', 'categorical_crossentropy'
    elif data == 'sentiment':
        rnn_cls, n_out, activation, loss = CuDNNLSTM, 1, 'sigmoid', 'binary_crossentropy'
    else:
        # fail early: otherwise an empty, uncompiled model reaches fit()
        raise ValueError(
            "Unknown data '{}': expected 'seqMNIST' or 'sentiment'".format(data))

    model = Sequential()
    # the layer name is what SingularValuesCallback looks up below
    model.add(rnn_cls(NDIM, return_sequences=False,
                      input_shape=(X_train.shape[1], X_train.shape[2]),
                      recurrent_initializer=config['recurrent_initializer'],
                      recurrent_regularizer=config['recurrent_regularizer'],
                      name='rnn_layer'))
    model.add(Dense(n_out, activation=activation))

    # Compile the model
    model.compile(optimizer=Adam(lr=0.0001), loss=loss, metrics=['accuracy'])

    svc = SingularValuesCallback('rnn_layer')
    history = History([X_train, Y_train], [X_val, Y_val])
    model.fit(X_train, Y_train, batch_size=BATCH_SIZE,
              epochs=EPOCHS, validation_data=(X_val, Y_val), callbacks=[history, svc])

    results = history.history
    results['sv'] = svc.sv

    return (model, results)

### Define different configurations

In [None]:
# Experiment set 1: orthogonal initialisation with gain 1 and three strengths
# of the soft orthogonal regulariser, against two unregularised baselines.
# train() reads 'recurrent_initializer' and 'recurrent_regularizer' from each
# entry; 'GAIN_ORTH' only records the gain of the initialiser.
configs = {
    # orthogonal init, regulariser weight 0.01
    'Orthogonal_Reg_0.01': {
        'GAIN_ORTH': 1,
        'recurrent_initializer': Orthogonal(gain=1.0),
        'recurrent_regularizer': lambda x: orth_reg_gain(x, 1.0, 0.01)
    }, 
    # orthogonal init, regulariser weight 0.1
    'Orthogonal_Reg_0.1': {
        'GAIN_ORTH': 1,
        'recurrent_initializer': Orthogonal(gain=1.0),
        'recurrent_regularizer': lambda x: orth_reg_gain(x, 1.0, 0.1)
    }, 
    # orthogonal init, regulariser weight 0.5
    'Orthogonal_Reg_0.5': {
        'GAIN_ORTH': 1,
        'recurrent_initializer': Orthogonal(gain=1.0),
        'recurrent_regularizer': lambda x: orth_reg_gain(x, 1.0, 0.5)
    }, 
    # baseline: orthogonal init without regulariser
    'Orthogonal_NoReg': {
        'GAIN_ORTH': 1,
        'recurrent_initializer': Orthogonal(gain=1.0),
        'recurrent_regularizer': None
    },
    # baseline: Glorot-uniform init without regulariser
    'NoOrthogonal': {
        'GAIN_ORTH': 1,
        'recurrent_initializer': 'glorot_uniform',
        'recurrent_regularizer': None
    }
    
}

In [None]:
GAIN = 1.025

# Experiment set 2: orthogonal initialisation with gain GAIN. The '_adj_'
# entries regularise towards that same gain, the others towards gain 1.
# train() reads 'recurrent_initializer' and 'recurrent_regularizer' from each
# entry; 'GAIN_ORTH' only records the gain of the initialiser.
#
# The gain is bound as a default argument of each lambda: a lambda that reads
# the global GAIN looks it up when the model is built, by which time another
# cell may have rebound GAIN to a different value.
configs2 = {
    'Orthogonal_Gain{}_adj_Reg1'.format(GAIN): {
        'GAIN_ORTH': GAIN,
        'recurrent_initializer': Orthogonal(gain=GAIN),
        'recurrent_regularizer': lambda x, gain=GAIN: orth_reg_gain(x, gain, 1)
    },
    'Orthogonal_Gain{}_adj_Reg0.5'.format(GAIN): {
        'GAIN_ORTH': GAIN,
        'recurrent_initializer': Orthogonal(gain=GAIN),
        'recurrent_regularizer': lambda x, gain=GAIN: orth_reg_gain(x, gain, 0.5)
    },
    'Orthogonal_Gain{}_Reg1'.format(GAIN): {
        'GAIN_ORTH': GAIN,
        'recurrent_initializer': Orthogonal(gain=GAIN),
        'recurrent_regularizer': lambda x: orth_reg_gain(x, 1.00, 1)
    },
    'Orthogonal_Gain{}_Reg0.5'.format(GAIN): {
        'GAIN_ORTH': GAIN,
        'recurrent_initializer': Orthogonal(gain=GAIN),
        'recurrent_regularizer': lambda x: orth_reg_gain(x, 1.00, 0.5)
    },
    # baseline: orthogonal init with gain 1, no regulariser
    'Orthogonal_NoReg': {
        'GAIN_ORTH': 1,
        'recurrent_initializer': Orthogonal(gain=1.0),
        'recurrent_regularizer': None
    },
    # baseline: Glorot init, no regulariser.
    # NOTE(review): this set uses 'glorot_normal' while the other config sets
    # use 'glorot_uniform' -- confirm this is intended.
    'NoOrthogonal': {
        'GAIN_ORTH': 1,
        'recurrent_initializer': 'glorot_normal',
        'recurrent_regularizer': None
    }
}

In [None]:
GAIN = 1.25

# Experiment set 3: orthogonal initialisation with gain GAIN. The '_adj_'
# entries regularise towards that same gain, the others towards gain 1.
# train() reads 'recurrent_initializer' and 'recurrent_regularizer' from each
# entry; 'GAIN_ORTH' only records the gain of the initialiser.
#
# The gain is bound as a default argument of each lambda: a lambda that reads
# the global GAIN looks it up when the model is built, by which time GAIN may
# have been rebound to a different value.
configs3 = {
    'Orthogonal_Gain{}_adj_Reg1'.format(GAIN): {
        'GAIN_ORTH': GAIN,
        'recurrent_initializer': Orthogonal(gain=GAIN),
        'recurrent_regularizer': lambda x, gain=GAIN: orth_reg_gain(x, gain, 1)
    },
    'Orthogonal_Gain{}_adj_Reg0.5'.format(GAIN): {
        'GAIN_ORTH': GAIN,
        'recurrent_initializer': Orthogonal(gain=GAIN),
        'recurrent_regularizer': lambda x, gain=GAIN: orth_reg_gain(x, gain, 0.5)
    },
    'Orthogonal_Gain{}_Reg1'.format(GAIN): {
        'GAIN_ORTH': GAIN,
        'recurrent_initializer': Orthogonal(gain=GAIN),
        'recurrent_regularizer': lambda x: orth_reg_gain(x, 1.00, 1)
    },
    'Orthogonal_Gain{}_Reg0.5'.format(GAIN): {
        'GAIN_ORTH': GAIN,
        'recurrent_initializer': Orthogonal(gain=GAIN),
        'recurrent_regularizer': lambda x: orth_reg_gain(x, 1.00, 0.5)
    },
    # orthogonal init with gain GAIN, no regulariser
    'Orthogonal_Gain{}_NoReg'.format(GAIN): {
        'GAIN_ORTH': GAIN,
        'recurrent_initializer': Orthogonal(gain=GAIN),
        'recurrent_regularizer': None
    },
    # baseline: orthogonal init with gain 1, no regulariser
    'Orthogonal_NoReg': {
        'GAIN_ORTH': 1,
        'recurrent_initializer': Orthogonal(gain=1.0),
        'recurrent_regularizer': None
    },
    # baseline: Glorot-uniform init, no regulariser
    'NoOrthogonal': {
        'GAIN_ORTH': 1,
        'recurrent_initializer': 'glorot_uniform',
        'recurrent_regularizer': None
    }
}

### Run the training for the configurations picked

In [None]:
# Train one model per configuration of the selected set and keep the logs of
# each run under the configuration name.
cfgs = configs3
results = {}
for config, settings in cfgs.items():
    separator = "###"*20
    print (separator)
    print ("Starting training of config {}".format(config))
    print (separator)
    # only the model of the last configuration stays bound after the loop
    model, result = train(settings, TASK)
    results[config] = result

### Plot results

In [None]:
# Singular-value statistics of every configuration on one figure.
plot_svs(results)
# Training and validation loss, one subplot per metric.
plot_metrics(results, ['loss', 'val_loss'])
# Training and validation accuracy.
# NOTE(review): assumes the installed Keras logs accuracy under the keys
# 'acc'/'val_acc' -- some versions use 'accuracy'/'val_accuracy'; confirm.
plot_metrics(results, ['acc', 'val_acc'])