# Deep Learning

***Name***: *Aviv Nutovitz*


### Import Packages

In [1]:
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Activation, Embedding, LSTM, GRU, TimeDistributed
from keras.callbacks import ModelCheckpoint, EarlyStopping, Callback
from keras.utils.np_utils import to_categorical
from keras.optimizers import Adam, SGD
import matplotlib.pyplot as plt
import os, time
from google.colab import files
import collections
import tensorflow as tf
import numpy as np
import math
import pandas as pd

# when True, training uploads the model and metric log to Google Drive and
# the evaluation cells load the model back from Drive instead of the checkpoint
SAVE_RESULTS_TO_DRIVE = True
# id of the Drive folder used for uploads and lookups
# NOTE(review): 'XXX' looks like a placeholder - set the real folder id before running
FOLDER_ID = 'XXX'

Using TensorFlow backend.


In [None]:
# first upload the data files from your local file system
# (load_data() below reads ptb.train.txt, ptb.valid.txt and ptb.test.txt
# from the working directory):
uploadedFiles = files.upload()

### Model Architecture

In [0]:
#class Model:
class Model:
  """
  Factory for the two-layer recurrent language models trained in this notebook.

  NOTE: this class reuses the name of the `Model` imported from keras.models
  at the top of the file and therefore shadows it.

  model_type_dict - single-entry dictionary; the key selects the network and
  the value carries the dropout rate (only read when the key contains 'D'):
  1. {"L":"none"} (LSTM based network without dropout)
  2. {"LD":"0.5"} (LSTM based network with dropout)
  3. {"G":"none"} (GRU based network without dropout)
  4. {"GD":"0.5"} (GRU based network with dropout)
  """
  @staticmethod
  def build(model_type_dict, vocabulary, hidden_size=200, num_steps=20):
    """
    Build and compile the network described by model_type_dict.

    vocabulary  - number of distinct word ids (embedding input size and
                  width of the softmax output)
    hidden_size - embedding dimension and number of units per recurrent layer
    num_steps   - length of the input sequences
    Returns the compiled Sequential model.
    """
    # only the first entry of the dictionary is consulted
    kind, rate = next(iter(model_type_dict.items()))
    # any key containing 'L' selects LSTM, everything else falls back to GRU
    recurrent_layer = LSTM if 'L' in kind else GRU
    use_dropout = 'D' in kind

    net = Sequential()
    net.add(Embedding(vocabulary, hidden_size, input_length=num_steps))

    # two stacked recurrent layers, each optionally followed by dropout
    for _ in range(2):
      net.add(recurrent_layer(hidden_size, return_sequences=True))
      if use_dropout:
        net.add(Dropout(float(rate)))

    # per-time-step distribution over the vocabulary
    net.add(TimeDistributed(Dense(vocabulary)))
    net.add(Activation('softmax'))

    net.compile(loss='categorical_crossentropy',
                optimizer=Adam(0.0005, clipnorm=10),
                metrics=['categorical_accuracy'])
    return net
    
def build_model(model_type, vocabulary):
  """Return a compiled network for model_type, using Model.build's default hidden size and sequence length."""
  return Model.build(model_type, vocabulary)


### Data Preprocessing

In [0]:
class BatchGenerator:
    """
    Endless generator of (input, target) batches for next-word prediction.

    data       - sequence of integer word ids
    num_steps  - number of words per training sequence
    batch_size - number of sequences per batch
    vocabulary - number of distinct word ids (width of the one-hot targets)
    skip_step  - how far the read position advances after each sequence
    """

    def __init__(self, data, num_steps, batch_size, vocabulary, skip_step=5):
        self.data = data
        self.num_steps = num_steps
        self.batch_size = batch_size
        self.vocabulary = vocabulary
        # this will track the progress of the batches sequentially through the
        # data set - once the data reaches the end of the data set it will reset
        # back to zero
        self.current_idx = 0
        # skip_step is the number of words which will be skipped before the next
        # sequence is read from the data set
        self.skip_step = skip_step

    def generate(self):
        """
        Yield (x, y) forever.

        x has shape (batch_size, num_steps) and holds word ids; y has shape
        (batch_size, num_steps, vocabulary) and holds the one-hot encoding of
        the same window shifted one word ahead.

        Raises ValueError when data is too short to provide one window plus
        its shifted target.
        """
        if len(self.data) <= self.num_steps:
            raise ValueError(
                "data must contain more than num_steps ({}) items, got {}".format(
                    self.num_steps, len(self.data)))

        positions = np.arange(self.num_steps)
        while True:
            # allocate fresh arrays for every batch: a consumer that holds on
            # to a yielded batch (e.g. a prefetch queue) must not see it being
            # overwritten while the next batch is assembled
            x = np.zeros((self.batch_size, self.num_steps))
            y = np.zeros((self.batch_size, self.num_steps, self.vocabulary))
            for i in range(self.batch_size):
                if self.current_idx + self.num_steps >= len(self.data):
                    # reset the index back to the start of the data set
                    self.current_idx = 0
                start = self.current_idx
                stop = start + self.num_steps
                x[i, :] = self.data[start:stop]
                # targets are the inputs shifted by one word, one-hot encoded
                targets = np.asarray(self.data[start + 1:stop + 1], dtype=int)
                y[i, positions, targets] = 1.0
                self.current_idx += self.skip_step
            yield x, y

In [0]:
# helper functions
def read_words(filename):
    """Return the whitespace-separated tokens of `filename`, after replacing every newline with the literal "<eos>"."""
    with tf.gfile.GFile(filename, "r") as handle:
        text = handle.read()
    return text.replace("\n", "<eos>").split()

def build_vocab(filename):
    """Map every distinct token of `filename` to an integer id.

    Ids follow descending frequency; tokens with equal counts are ordered
    alphabetically.
    """
    frequencies = collections.Counter(read_words(filename))
    # most frequent first, then lexicographic order among equal counts
    ranked = sorted(frequencies.items(), key=lambda pair: (-pair[1], pair[0]))

    words, _ = zip(*ranked)
    return {word: idx for idx, word in enumerate(words)}

def file_to_word_ids(filename, word_to_id):
    """Convert the tokens of `filename` to their ids; tokens missing from `word_to_id` are dropped."""
    ids = []
    for token in read_words(filename):
        if token in word_to_id:
            ids.append(word_to_id[token])
    return ids
  
def load_data():
  """
  Load the three PTB text files from the working directory.

  The vocabulary is built from the training file only and then used to encode
  all three files as lists of integer ids.
  Returns (train_data, valid_data, test_data, vocabulary, reversed_dictionary)
  where vocabulary is the number of distinct tokens and reversed_dictionary
  maps an id back to its token.
  """
  train_path, valid_path, test_path = 'ptb.train.txt', 'ptb.valid.txt', 'ptb.test.txt'

  word_to_id = build_vocab(train_path)
  train_data, valid_data, test_data = [
      file_to_word_ids(path, word_to_id)
      for path in (train_path, valid_path, test_path)
  ]

  vocabulary = len(word_to_id)
  reversed_dictionary = {idx: word for word, idx in word_to_id.items()}

  return train_data, valid_data, test_data, vocabulary, reversed_dictionary

### Drive REST API

In [0]:
def set_drive_api():
  # connect to drive
  !pip install -U -q PyDrive
  from pydrive.auth import GoogleAuth
  from pydrive.drive import GoogleDrive
  from google.colab import auth
  from oauth2client.client import GoogleCredentials
  from google.colab import files

  # 1. Authenticate and create the PyDrive client.
  auth.authenticate_user()
  gauth = GoogleAuth()
  gauth.credentials = GoogleCredentials.get_application_default()
  drive = GoogleDrive(gauth)

  # PyDrive reference:
  # https://gsuitedevs.github.io/PyDrive/docs/build/html/index.html

### Save Model & Results to Google Drive

In [0]:
def save_model_to_drive(net_type, model):
  """Save `model` as model_<type key>.h5 on the local machine and upload that file to Drive."""
  type_key = next(iter(net_type))
  h5_name = "model_{}.h5".format(type_key)
  # the file has to exist locally before it can be uploaded
  model.save(h5_name)
  save(h5_name)
  print("Saved full model {} to disk".format(type_key))

def save_resluts_to_drive_as_txt(net_type, val_categorical_accuracy_list, val_loss_list):
  """
  Write the per-epoch metrics to model_results_<type key>.txt and upload it.

  net_type                      - single-entry type dictionary; its key names the file
  val_categorical_accuracy_list - validation accuracy per epoch
  val_loss_list                 - validation loss per epoch

  The train metrics are read from the module-level lists t_loss_list,
  t_acc_list and t_epochs_list. The file is space separated with no header
  row, one line per epoch, columns in the order:
  acc, epochs, loss, val_acc, val_loss.
  """
  net_type_to_save = next(iter(net_type))
  model_results = "model_results_{}.txt".format(net_type_to_save)
  # the file has no header row, so the column order is the only schema:
  # pin it explicitly instead of relying on how pandas orders dict keys
  column_order = ['acc', 'epochs', 'loss', 'val_acc', 'val_loss']
  results_df = pd.DataFrame(
      {'acc': t_acc_list,
       'epochs': t_epochs_list,
       'loss': t_loss_list,
       'val_acc': val_categorical_accuracy_list,
       'val_loss': val_loss_list},
      columns=column_order)
  # write results locally
  results_df.to_csv(model_results, header=False, index=False, sep=' ')
  save(model_results)
  
def save(file_name):
  """Upload the local file `file_name` into the Drive folder FOLDER_ID using the module-level `drive` client."""
  parent_link = {"kind": "drive#fileLink", "id": FOLDER_ID}
  metadata = {'title': file_name, "parents": [parent_link]}
  # create the remote entry, attach the local content and push it
  remote_file = drive.CreateFile(metadata)
  remote_file.SetContentFile(file_name)
  remote_file.Upload()
  print('Uploaded file with ID {}'.format(remote_file.get('id')))

### Load Saved Results

In [0]:
def load_model_from_drive(net_type):
  """Look up model_<type key>.h5 on Drive and return the Keras model loaded from the local file of that name."""
  type_key = next(iter(net_type))
  h5_name = 'model_{}.h5'.format(type_key)
  # download from drive
  dowanload_file_drive_by_file_name(h5_name)
  return keras.models.load_model(h5_name)
  
def load_resluts_from_drive_as_txt(net_type):
  """
  Load the metric log model_results_<type key>.txt into a DataFrame.

  Returns a DataFrame with one row per epoch and the columns
  acc, epochs, loss, val_acc, val_loss (the names plot_graphs reads).
  """
  net_type_to_save = next(iter(net_type))
  model_file_name = 'model_results_{}.txt'.format(net_type_to_save)
  # download from drive
  dowanload_file_drive_by_file_name(model_file_name)
  # build the data frame
  data = pd.read_csv(model_file_name, sep=" ", header=None)
  # the file carries no header row, so the labels are assigned by position
  # NOTE(review): assumes the file was written with its columns in exactly
  # this order - confirm against save_resluts_to_drive_as_txt
  data.columns = ["acc", "epochs", "loss", "val_acc", "val_loss"]
  return data

def dowanload_file_drive_by_file_name(file_name):
  """
  Download the file titled `file_name` from the Drive folder FOLDER_ID.

  The content is written to a local file with the same name. Uses the
  module-level `drive` client. When several files share the title, the first
  one returned by the listing is used. When no file matches, a notice is
  printed and nothing is downloaded.
  """
  query = "'" + FOLDER_ID + "' in parents and trashed=false"
  for remote_file in drive.ListFile({'q': query}).GetList():
    if remote_file['title'] == file_name:
      print('title: %s, id: %s' % (remote_file['title'], remote_file['id']))
      downloaded = drive.CreateFile({'id': remote_file['id']})
      # actually fetch the content; creating the handle alone downloads nothing
      downloaded.GetContentFile(file_name)
      return
  print('No file titled {} found in the Drive folder'.format(file_name))

### Train Model and Plot Graphs:
- Train model
- Evaluate using test set
- Saving the model to be loaded for testing purposes

In [0]:
# per-epoch metrics measured by TestCallback; the training cells reset these
# lists before every run
t_loss_list, t_acc_list, t_epochs_list = [], [], []

class TestCallback(Callback):
    """
    Keras callback that evaluates the model on `t_data` after every epoch.

    The loss, accuracy and epoch index are appended to the module-level lists
    t_loss_list, t_acc_list and t_epochs_list. The module-level names
    num_steps, batch_size and vocabulary must be defined before training.
    """

    def __init__(self, t_data):
        super().__init__()
        self.t_data = t_data

    def on_epoch_end(self, epoch, logs=None):
        print("\nEvaluating loss on train at the end of epoch " + str(int(epoch) + 1) + "...")
        t1 = time.time()
        steps_per_epoch = len(self.t_data)//(batch_size*num_steps)
        # the step count above assumes consecutive windows, so the generator
        # has to advance by num_steps as well; with the default skip_step the
        # evaluation would only cover the beginning of the data
        data_generator = BatchGenerator(self.t_data, num_steps, batch_size, vocabulary,
                                        skip_step=num_steps)
        loss, acc = self.model.evaluate_generator(data_generator.generate(), steps=steps_per_epoch)
        t_loss_list.append(loss)
        t_acc_list.append(acc)
        t_epochs_list.append(epoch)
        print('Loss: {}, acc: {}'.format(loss, acc))
        t2 = time.time()
        print("Total evaluation time: %0.2fs\n" % (t2 - t1))


# train model
def train_type(net_type, train_data, valid_data, num_steps, batch_size, vocabulary, epochs=20):
  """
  Train one network variant and optionally store the outcome on Drive.

  net_type    - single-entry type dictionary (see Model)
  train_data  - training word ids
  valid_data  - validation word ids
  num_steps   - words per sequence
  batch_size  - sequences per batch
  vocabulary  - number of distinct word ids
  epochs      - number of training epochs

  When SAVE_RESULTS_TO_DRIVE is set, the trained model and the metric log are
  uploaded to Drive. The printed training time includes that upload.
  Returns (checkpoint_path, history).
  """
  np.random.seed(2018)
  tf.set_random_seed(2018)

  # build & compile the model for the training; pass num_steps through so the
  # network input length always matches the generated batches
  model = Model.build(net_type, vocabulary, num_steps=num_steps)

  checkpoint_path = "training_{}/cp.ckpt".format(next(iter(net_type)))
  checkpoint_dir = os.path.dirname(checkpoint_path)
  # create the directory if it does not exist (no error when it already does)
  os.makedirs(checkpoint_dir, exist_ok=True)

  # keep only the checkpoint with the lowest training loss
  cp_callback = ModelCheckpoint(checkpoint_path, monitor='loss', verbose=1, save_best_only=True, mode='min')
  t_callback = TestCallback(train_data)

  train_data_generator = BatchGenerator(train_data, num_steps, batch_size, vocabulary, skip_step=num_steps)
  valid_data_generator = BatchGenerator(valid_data, num_steps, batch_size, vocabulary, skip_step=num_steps)

  # Measure training time
  t3 = time.time()

  history = model.fit_generator(train_data_generator.generate(),
                                steps_per_epoch=len(train_data)//(batch_size*num_steps),
                                epochs=epochs,
                                validation_data=valid_data_generator.generate(),
                                validation_steps=len(valid_data)//(batch_size*num_steps),
                                callbacks=[cp_callback, t_callback])

  # history results
  val_categorical_accuracy_list = history.history['val_categorical_accuracy']
  val_loss_list = history.history['val_loss']

  # save results to drive
  if SAVE_RESULTS_TO_DRIVE:
    set_drive_api()
    save_model_to_drive(net_type, model)
    # argument order follows the signature: accuracy first, then loss
    save_resluts_to_drive_as_txt(net_type, val_categorical_accuracy_list, val_loss_list)

  t4 = time.time()
  print("Total training time: %0.2fs" % (t4 - t3))

  ## return the saved model path and history
  return checkpoint_path, history

In [0]:
# plot graphs
def plot_graphs(net_type):
  """
  Plot train/validation accuracy and perplexity for one network variant.

  The metrics come from the results file fetched by
  load_resluts_from_drive_as_txt; perplexity is exp(loss). Two figures are
  shown: categorical accuracy first, then perplexity.
  """
  # get dataframe from drive
  results = load_resluts_from_drive_as_txt(net_type)
  train_loss = results["loss"]
  train_acc = results["acc"]
  train_epochs = results["epochs"]
  valid_acc = results["val_acc"]
  valid_loss = results["val_loss"]

  valid_perplexity = [math.exp(value) for value in valid_loss]
  train_perplexity = [math.exp(value) for value in train_loss]

  type_key = next(iter(net_type))
  # validation values are plotted against 0..n-1
  valid_epochs = np.arange(0, len(valid_perplexity), 1)

  figures = [
      ('model categorical accuracy for {}', 'categorical accuracy', train_acc, valid_acc),
      ('model perplexity for {}', 'perplexity', train_perplexity, valid_perplexity),
  ]
  for title_template, y_label, train_values, valid_values in figures:
    plt.plot(train_epochs, train_values)
    plt.plot(valid_epochs, valid_values)
    plt.title(title_template.format(type_key))
    plt.ylabel(y_label)
    plt.xlabel('epoch')
    plt.legend(['train', 'valid'], loc='upper left')
    plt.show()

### Load model and check test final results

In [0]:
def final_perplexity_per_model_type(net_type, data, vocabulary, num_steps, batch_size, checkpoint_path, is_test=True):
  """
  Evaluate a trained network on `data` and print its perplexity.

  net_type        - single-entry type dictionary (see Model)
  data            - word ids to evaluate on
  vocabulary      - number of distinct word ids
  num_steps       - words per sequence
  batch_size      - sequences per batch
  checkpoint_path - weights file, only used when SAVE_RESULTS_TO_DRIVE is False
  is_test         - selects the wording of the closing message

  Returns the perplexity, exp(loss).
  """
  if SAVE_RESULTS_TO_DRIVE:
    new_model = load_model_from_drive(net_type)
  else:
    # rebuild the architecture with the same sequence length as the batches,
    # then restore the trained weights
    new_model = Model.build(net_type, vocabulary, num_steps=num_steps)
    new_model.load_weights(checkpoint_path)

  # the step count below assumes consecutive windows, so the generator has to
  # advance by num_steps; with the default skip_step only the beginning of
  # the data would be evaluated
  data_generator = BatchGenerator(data, num_steps, batch_size, vocabulary, skip_step=num_steps)

  loss, acc = new_model.evaluate_generator(data_generator.generate(), steps=len(data)//(batch_size*num_steps))
  perplexity = math.exp(loss)
  print("Restored model, perplexity: {:5.2f}".format(perplexity))
  if is_test:
    print("------------------------ Done test stage: {} ------------------------".format(net_type))
  else:
    print("------------------------ Done validate stage: {} ------------------------".format(net_type))
  return perplexity

### Run Different Types

In [0]:
# load the three data sets as word-id lists, plus the vocabulary size and the
# id -> word lookup
train_data, valid_data, test_data, vocabulary, reversed_dictionary = load_data()

# sequence length and batch size shared by every run below
num_steps, batch_size = 20, 20

# the four network variants: LSTM / GRU, each without and with dropout
type_1, type_2 = {"L":"none"}, {"LD":"0.5"}
type_3, type_4 = {"G":"none"}, {"GD":"0.5"}

# checkpoint paths, filled in by the training cells
checkpoint_path_type_1 = checkpoint_path_type_2 = None
checkpoint_path_type_3 = checkpoint_path_type_4 = None


#### LSTM without Dropouts 

In [None]:
# start from empty train-metric lists for this run
t_loss_list, t_acc_list, t_epochs_list = [], [], []
checkpoint_path_type_1, history_1 = train_type(
    net_type=type_1, train_data=train_data, valid_data=valid_data,
    num_steps=num_steps, batch_size=batch_size, vocabulary=vocabulary, epochs=25)
print("Model saved into: {}".format(checkpoint_path_type_1))

In [0]:
# plot accuracy and perplexity curves for the LSTM without dropout
plot_graphs(net_type=type_1)

In [None]:
# perplexity on the validation set
final_perplexity_per_model_type(
    net_type=type_1, data=valid_data, vocabulary=vocabulary,
    num_steps=num_steps, batch_size=batch_size,
    checkpoint_path=checkpoint_path_type_1, is_test=False)

In [None]:
# perplexity on the test set
final_perplexity_per_model_type(
    net_type=type_1, data=test_data, vocabulary=vocabulary,
    num_steps=num_steps, batch_size=batch_size,
    checkpoint_path=checkpoint_path_type_1)

#### LSTM with Dropouts 

In [0]:
# start from empty train-metric lists for this run
t_loss_list, t_acc_list, t_epochs_list = [], [], []
checkpoint_path_type_2, history_2 = train_type(
    net_type=type_2, train_data=train_data, valid_data=valid_data,
    num_steps=num_steps, batch_size=batch_size, vocabulary=vocabulary, epochs=25)
print("Model saved into: {}".format(checkpoint_path_type_2))

In [0]:
# plot accuracy and perplexity curves for the LSTM with dropout
# (plot_graphs takes only the network type; it loads the metrics itself)
plot_graphs(type_2)

In [0]:
# perplexity on the validation set
final_perplexity_per_model_type(
    net_type=type_2, data=valid_data, vocabulary=vocabulary,
    num_steps=num_steps, batch_size=batch_size,
    checkpoint_path=checkpoint_path_type_2, is_test=False)

In [0]:
# perplexity on the test set
final_perplexity_per_model_type(
    net_type=type_2, data=test_data, vocabulary=vocabulary,
    num_steps=num_steps, batch_size=batch_size,
    checkpoint_path=checkpoint_path_type_2)

#### GRU without Dropouts 

In [0]:
# start from empty train-metric lists for this run
t_loss_list, t_acc_list, t_epochs_list = [], [], []
# train_type takes no test set: passing one positionally would shift every
# following argument by one place
checkpoint_path_type_3, history_3 = train_type(type_3, train_data, valid_data, num_steps, batch_size, vocabulary)
print("Model saved into: {}".format(checkpoint_path_type_3))

In [0]:
# plot accuracy and perplexity curves for the GRU without dropout
# (plot_graphs takes only the network type; it loads the metrics itself)
plot_graphs(type_3)

In [0]:
# perplexity on the validation set
final_perplexity_per_model_type(
    net_type=type_3, data=valid_data, vocabulary=vocabulary,
    num_steps=num_steps, batch_size=batch_size,
    checkpoint_path=checkpoint_path_type_3, is_test=False)

In [0]:
# perplexity on the test set
final_perplexity_per_model_type(
    net_type=type_3, data=test_data, vocabulary=vocabulary,
    num_steps=num_steps, batch_size=batch_size,
    checkpoint_path=checkpoint_path_type_3)

#### GRU with Dropouts 

In [0]:
# start from empty train-metric lists for this run
t_loss_list, t_acc_list, t_epochs_list = [], [], []
# train_type takes no test set: passing one positionally would shift every
# following argument by one place
checkpoint_path_type_4, history_4 = train_type(type_4, train_data, valid_data, num_steps, batch_size, vocabulary, epochs=40)
print("Model saved into: {}".format(checkpoint_path_type_4))

In [0]:
# plot accuracy and perplexity curves for the GRU with dropout
# (plot_graphs takes only the network type; it loads the metrics itself)
plot_graphs(type_4)

In [0]:
# perplexity on the validation set
final_perplexity_per_model_type(
    net_type=type_4, data=valid_data, vocabulary=vocabulary,
    num_steps=num_steps, batch_size=batch_size,
    checkpoint_path=checkpoint_path_type_4, is_test=False)

In [0]:
# perplexity on the test set
final_perplexity_per_model_type(
    net_type=type_4, data=test_data, vocabulary=vocabulary,
    num_steps=num_steps, batch_size=batch_size,
    checkpoint_path=checkpoint_path_type_4)