# Unsupervised training of the encoder layers

## Initial setup
### Imports

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from datetime import datetime
from models import Autoencoder, EncoderStack
from tensorflow.keras.layers.experimental.preprocessing import Normalization
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import normalize
import dataset
import neptune
import os

### Set experiment configuration

In [2]:
# Central experiment configuration; the whole mapping is passed to Neptune as
# the experiment parameters and read by the cells below.
config = dict(
    # Hidden-layer sizes; one autoencoder is trained per entry, in order.
    N_NODES=[2000, 1000, 500],
    DROPOUT=[0.1],
    # Forwarded to Autoencoder.fit as batch_size / num_epochs.
    BATCH_SIZE=15,
    EPOCHS=30,
    # Fraction of samples held out by ShuffleSplit (test_size).
    TEST_RATIO=0.30,
    # Storage location handed to dataset.load_gs_data.
    DATA_BUCKET="sdae-geo",
    DATA_OBJECT="GEO_data_batch_corr_final.csv",
    # NOTE(review): the leading space in this file name looks accidental —
    # confirm against the actual object name before relying on it.
    DATA_LABELS=" GBM_class.csv",
    # Forwarded to Autoencoder.fit as verbose.
    VERBOSITY=2,
    # Base directory for logs; the per-experiment directory is created under it.
    LOG_DIR="./log_dir",
    # Forwarded to Autoencoder.fit as patience.
    PATIENCE=3,
)


## Initialize Neptune and Tensorboard logging

In [3]:
os.environ['NEPTUNE_API_TOKEN']="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vdWkubmVwdHVuZS5haSIsImFwaV91cmwiOiJodHRwczovL3VpLm5lcHR1bmUuYWkiLCJhcGlfa2V5IjoiYjNiYmZhYjEtNzc3ZS00Y2NhLWI5NTgtYWU0MmQyMWJhM2I0In0="
os.environ['NEPTUNE_PROJECT']="jgeof/sdae"
os.environ['NEPTUNE_NOTEBOOK_ID']="ecd86e96-4da7-44e3-9a17-43da2dfcae35"
os.environ['NEPTUNE_NOTEBOOK_PATH']="constrained-SDAE/sdae.ipynb"
!neptune tensorboard ./log_dir --project ${NEPTUNE_PROJECT}
%load_ext tensorboard

Loading ./log_dir/20200805-234845/log/validation/events.out.tfevents.1596671334.tensorflow-2-2-20200804-235820.7235.31463.v2...
20200805-234845/log/validation/events.out.tfevents.1596671334.tensorflow-2-2-20200804-235820.7235.31463.v2 is already synced
Loading ./log_dir/20200805-234845/log/train/events.out.tfevents.1596671325.tensorflow-2-2-20200804-235820.7235.27435.v2...
20200805-234845/log/train/events.out.tfevents.1596671325.tensorflow-2-2-20200804-235820.7235.27435.v2 is already synced
Loading ./log_dir/20200805-234845/log/train/events.out.tfevents.1596671326.tensorflow-2-2-20200804-235820.profile-empty...
20200805-234845/log/train/events.out.tfevents.1596671326.tensorflow-2-2-20200804-235820.profile-empty is already synced
Loading ./log_dir/20200805-234845/log/train/plugins/profile/2020_08_05_23_48_46/tensorflow-2-2-20200804-235820.overview_page.pb...
20200805-234845/log/train/plugins/profile/2020_08_05_23_48_46/tensorflow-2-2-20200804-235820.overview_page.pb is already synced
Lo

### Init new Neptune experiment

In [None]:
# Connect to the Neptune project named in the environment and open a new
# experiment that records the current configuration as its parameters.
neptune_project = os.environ['NEPTUNE_PROJECT']
neptune.init(neptune_project, api_token=os.environ['NEPTUNE_API_TOKEN'])
experiment = neptune.create_experiment(name='configuration', params=config)

# Per-experiment directory: <LOG_DIR>/<experiment id>. It is added after the
# experiment is created, so it is not part of the logged parameters.
config['EXP_DIR'] = os.path.join(config["LOG_DIR"], experiment.id)

NVMLError: NVML Shared Library Not Found - GPU usage metrics may not be reported.


### Start tensorboard server

Start Tensorboard by running the following command in a terminal:

```bash
tensorboard --logdir ./log_dir --bind_all
```

**Tensorboard cannot serve over HTTPS; use the external HTTP URL: http://34.77.45.86:6006/**

# Load and preprocess the data

## Load data from Google Storage

In [None]:
dataframe = dataset.load_gs_data(config['DATA_BUCKET'], config['DATA_OBJECT'], config['EXP_DIR'])

## Normalize data feature-wise

In [None]:
# Scale the raw matrix with scikit-learn's `normalize`. The arguments below are
# its documented defaults, spelled out so the behaviour is visible: every row
# (sample) is rescaled to unit L2 norm.
# NOTE(review): axis=1 normalizes per sample, not per feature — confirm this is
# the intended behaviour, since the section heading says feature-wise.
raw_values = dataframe.values
data = normalize(raw_values, norm='l2', axis=1)

## Split into training and testing sets

In [None]:
# Draw a single shuffled train/test split; the fixed random_state makes the
# selected indices reproducible between runs.
rs = ShuffleSplit(n_splits=1, test_size=config['TEST_RATIO'], random_state=0)
split_itterator = rs.split(data)
i_train, i_test = next(split_itterator)

# Persist the index arrays and attach them to the Neptune experiment so the
# exact split can be recovered later.
train_path = os.path.join(config['LOG_DIR'], "train_indices.npy")
test_path = os.path.join(config['LOG_DIR'], "test_indices.npy")
np.save(train_path, i_train)
np.save(test_path, i_test)
neptune.log_artifact(train_path)
neptune.log_artifact(test_path)

x_train, x_test = data[i_train], data[i_test]

# TEST_RATIO is the held-out share, so the training share is its complement.
train_percent = int(round((1 - config['TEST_RATIO']) * 100))
print("{}% of samples for training: {} training, {} testing.".format(train_percent, len(i_train), len(i_test)))

# Train encoder layers

In [None]:
# Train one autoencoder per entry of config["N_NODES"]. Each layer is fitted on
# the encoded output of the previous layer (the first one on the raw split),
# and every trained Autoencoder is collected in `encoder_models`.
encoder_models = []

x_train_out, x_test_out = x_train, x_test
for idx, num_hidden in enumerate(config["N_NODES"]):
    info = "Training layer {} with {} hidden nodes..\n".format(idx, num_hidden)
    # The legacy neptune-client log_text takes the log name first and the text
    # second; calling it with the text alone fails on the missing argument.
    neptune.log_text("training_info", info)
    print(info)
    # Input width is the feature count of the current representation.
    encoder = Autoencoder(x_train_out.shape[1], num_hidden, config['EXP_DIR'])
    
    # NOTE(review): fit is indexed below as [0] = training loss and
    # [1] = testing loss — confirm against models.Autoencoder.fit.
    recon_mse = encoder.fit(x_train_out, x_test_out, batch_size=config["BATCH_SIZE"], 
        num_epochs=config["EPOCHS"], verbose=config["VERBOSITY"], patience=config["PATIENCE"])
    
    # Encode both splits so the next layer trains on this layer's output.
    x_train_out = encoder.encoder_model.predict(x_train_out)
    x_test_out = encoder.encoder_model.predict(x_test_out)
    print("\nTraining loss: ", recon_mse[0])
    print("\nTesting loss: ", recon_mse[1])
    encoder_models.append(encoder)