#### Ideas
- Train a very large CNN-dense network on a big computer:
    - Use window step size of 1
    - Problem: sequences are so long that the model is more likely to overfit than to learn useful things
    - Solution: do random masking on data as sort of regularization
    - 1D CNNs with smaller stride, followed by just dense should be a decent architecture
    - If it's not too difficult, use a CNN for dimensionality reduction followed by a transformer block
- Split a sequence into chunks and do manual feature engineering:
    - Pro: Solves the overfitting problem with long sequences
    - Pro: trains faster
    - Con: Removes one of the main benefits of NNs (automatic feature engineering)
    - Con: requires clever and careful feature engineering
    - Con: might be more computationally heavy if feat eng is done on the fly
- Try transformer/self-attention architecture
- Try the feature engineering approach for validation
    - Maybe try continuous prediction? 

### Imports

In [1]:
import sys
# Put the parent directory on the import path (once) so the local `utils`
# package imported later in this notebook can be resolved.
if '../' not in sys.path:
    sys.path.append('../')

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader

import importlib

In [2]:
# Row index used to split `train_data`: rows up to and including it form the
# validation set, rows after it form the training set.
# Presumably the row at which the second earthquake occurs — TODO confirm.
second_earthquake = 50085877
# Number of acoustic samples used when probing the engineered-feature shape;
# presumably the length of one test segment — TODO confirm.
test_len = 150000

### Read in and preprocess data

In [3]:
# Data locations, relative to the notebook's working directory.
data_dir = '../data/'
# Sub-directory holding precomputed artefacts (e.g. the training-data description CSV).
preprocessed_dir = data_dir + 'preprocessed/'

In [4]:
# Load the full training table as a float32 NumPy array (`.values` drops the
# column labels; columns are addressed by position from here on).
train_data = pd.read_csv(
    data_dir + 'train.csv',
    dtype={'acoustic_data': np.float32, 'time_to_failure': np.float32},
).values
test_dir = data_dir + 'test/'

# Summary statistics of the training data, looked up by label
# ('mean', 'std', 'max', 'min') when scaling.
# `pd.Series.from_csv` is deprecated and was removed in pandas 1.0; the
# equivalent is `read_csv` with no header row and the first column as index,
# then taking the single remaining column as a Series.
train_desc = pd.read_csv(
    preprocessed_dir + 'training_data_description.csv',
    header=None,
    index_col=0,
).iloc[:, 0]

  infer_datetime_format=infer_datetime_format)


In [5]:
## Drop some of the training data for memory efficiency
data_frac = 1.0
# Only slice-and-copy when data is actually being dropped. With data_frac >= 1.0
# the old unconditional `.copy()` duplicated the entire array, which works
# against the stated goal. When truncating, the copy is kept so the truncated
# array no longer references the full buffer.
if data_frac < 1.0:
    n_keep = int(data_frac * len(train_data))
    train_data = train_data[:n_keep].copy()

In [6]:
def scale(acoustic_data, standard=True, stats=None):
    """Normalise acoustic readings using summary statistics.

    Parameters
    ----------
    acoustic_data : scalar or array-like supporting NumPy-style arithmetic
        Raw acoustic values.
    standard : bool, default True
        If True, z-score the data: ``(x - mean) / std``.
        If False, centre on the mean and divide by half the observed range:
        ``2 * (x - mean) / (max - min)``. For inputs between ``min`` and
        ``max`` the result lies in an interval of width 2; that interval is
        exactly [-1, 1] only when the mean is the midpoint of min and max.
    stats : mapping or pandas.Series, optional
        Object indexable by ``'mean'``, ``'std'``, ``'max'`` and ``'min'``.
        Defaults to the notebook-level ``train_desc``, which preserves the
        original call signature ``scale(x)`` / ``scale(x, standard=False)``.

    Returns
    -------
    The scaled values (same shape as ``acoustic_data``).
    """
    if stats is None:
        # Resolved at call time so the function keeps working when
        # `train_desc` is reloaded in the notebook.
        stats = train_desc

    centred = acoustic_data - stats['mean']
    if standard:
        return centred / stats['std']
    return 2 * centred / (stats['max'] - stats['min'])

In [7]:
# Scale column 0 (the acoustic signal) in place using the default z-score
# branch of `scale`.
# NOTE: this cell is not idempotent — re-running it without first reloading
# `train_data` scales the already-scaled values a second time.
train_data[:, 0] = scale(train_data[:, 0])

### Hyperparameters

In [81]:
# Import the local dataset helpers and force a reload so that edits to the
# module are picked up without restarting the kernel.
import utils.dataset
importlib.reload(utils.dataset)
# NOTE(review): the star import hides where names such as `engineer_features`
# come from (presumably utils.dataset) — consider importing them explicitly.
from utils.dataset import *

# Probe the engineered-feature shape on the first `test_len` acoustic samples.
# The shape is unpacked into two values, so the result is expected to be 2-D.
seq_len, n_features = engineer_features(train_data[0:test_len, 0]).shape

In [82]:
## Model config
chunk_size = 1000
config = {
    'data_dir': data_dir,
    'use_cuda': torch.cuda.is_available(),
    'seq_len': seq_len,
    'n_features': n_features,

    ## Training parameters
    'batch_size': 32,
    'lr': 0.001,
    'num_epochs': 20,
    'clip': 0.1,  # Gradient clipping
    'eval_step': 0.1,  # how often to evaluate, per epoch. E.g., 0.5 -> 2 times per epoch
    'patience': 10,  # patience (in nr of evals) for early stopping. If None, will not use early stopping
    'revert_after_training': True,  # If true, reverts model parameters after training to best found during early stopping

    ## Model hyperparameters
    'model_choice': 1,
    'optim_choice': 0,
    'hidden_size': 48,
    'dropout': 0.2,
    'dense_size': 10,
    'bidirectional': False,
    'num_layers': 1,
}

# Run on the GPU when one is available, otherwise fall back to the CPU.
device_name = "cuda" if config['use_cuda'] else "cpu"
device = torch.device(device_name)
print("Using {}.".format(device))

Using cuda.


### Create dataset

In [83]:
# valid_frac = 0.2
# n_train_data = int(len(train_data) * (1-valid_frac))

# Hold out every row up to and including `second_earthquake` for validation
# and train on the rows after it.
split_idx = second_earthquake + 1
X_train = train_data[split_idx:]
X_valid = train_data[:split_idx]

# Training windows start at every row (step 1), with masking disabled.
train_dataset = EarthquakeDatasetTrain(X_train, window_step=1, mask_prob=0.0)
# Validation windows are stepped by `test_len` instead of repeating the
# literal 150000, so the step follows the constant if it is ever changed.
valid_dataset = EarthquakeDatasetTrain(X_valid, window_step=test_len)

train_loader = RandomLoader(train_dataset,
                            batch_size=config['batch_size'],
                            num_epoch_steps=1000)

# Fixed order for validation so successive evaluations are comparable.
valid_loader = DataLoader(valid_dataset,
                          batch_size=50,
                          shuffle=False)

print("{:,} train examples.".format(len(train_dataset)))
print("{:,} valid examples.".format(len(valid_dataset)))

578,909,602 train examples.
332 valid examples.


### Create model

In [84]:
# Import the local model definitions and force a reload so that edits to the
# module are picked up without restarting the kernel.
import utils.models
importlib.reload(utils.models)
# NOTE(review): star imports hide the origin of names; consider explicit imports.
from utils.models import *

# Same import/reload pattern for the training wrapper.
import utils.model_wrapper
importlib.reload(utils.model_wrapper)
from utils.model_wrapper import *
# Build the model wrapper from the notebook-level `config`
# (`ModelWrapper` presumably comes from utils.model_wrapper — confirm).
model = ModelWrapper(config=config)

In [85]:
# `get_summary` returns a per-parameter table and the total parameter count.
summary, n_params = model.get_summary()
print("{:,} total parameters".format(n_params))
# Bare last expression so the notebook renders the table.
summary

9,429 total parameters


Unnamed: 0,name,# params
0,rnn.weight_ih_l0,1728
1,rnn.weight_hh_l0,6912
2,rnn.bias_ih_l0,144
3,rnn.bias_hh_l0,144
4,dense.weight,480
5,dense.bias,10
6,classifier.weight,10
7,classifier.bias,1


### Train Model

In [128]:
## DEBUG: push a single training batch through the network as a smoke test
# Fetch the first batch directly instead of a `for ... break` loop.
batch = next(iter(train_loader))

# Call the module itself rather than `.forward(...)`: invoking the instance
# runs any registered hooks, which a direct `forward` call skips.
# (Assumes `model.net` is a torch.nn.Module — confirm in ModelWrapper.)
output = model.net(batch['features'].to(device))

In [86]:
## Preparatory training run.
## NOTE: relative to the base `config`, this run uses a LOWER learning rate
## (0.0005 vs 0.001) and passes clip=None instead of 0.1 (presumably disabling
## gradient clipping — confirm in ModelWrapper). The previous header
## ("higher learning rate and lower gradient clipping") did not match the values.
config_changes = dict(
    num_epochs = 20,
    eval_step = 0.2,
    patience = 20,
    revert_after_training = True,
    clip = None,
    lr = 0.0005,
)
model.update_config(config_changes)

# The return value of `train` is discarded; progress is reported via verbose=2.
_ = model.train(train_loader, valid_loader, verbose=2)
print("Preperatory training finished!")

---------- EPOCH 1/20 ----------

New best!
Step: 200/1000
Total steps: 200
Training Loss (smooth): 4.231
Validation Loss: 3.124
Maximum GPU consumption so far: 0.031 [GB]

New best!
Step: 400/1000
Total steps: 400
Training Loss (smooth): 3.488
Validation Loss: 3.081
Maximum GPU consumption so far: 0.031 [GB]

New best!
Step: 600/1000
Total steps: 600
Training Loss (smooth): 3.326
Validation Loss: 3.079
Maximum GPU consumption so far: 0.031 [GB]

Step: 800/1000
Total steps: 800
Training Loss (smooth): 3.354
Validation Loss: 3.087
Maximum GPU consumption so far: 0.031 [GB]

Step: 1000/1000
Total steps: 1000
Training Loss (smooth): 3.331
Validation Loss: 3.094
Maximum GPU consumption so far: 0.031 [GB]

---------- EPOCH 2/20 ----------

Step: 200/1000
Total steps: 1200
Training Loss (smooth): 3.262
Validation Loss: 3.085
Maximum GPU consumption so far: 0.031 [GB]

Step: 400/1000
Total steps: 1400
Training Loss (smooth): 3.250
Validation Loss: 3.090
Maximum GPU consumption so far: 0.031 [

Step: 1000/1000
Total steps: 12000
Training Loss (smooth): 3.105
Validation Loss: 3.057
Maximum GPU consumption so far: 0.031 [GB]

---------- EPOCH 13/20 ----------

Best validation loss: 3.0559898374310457
At step: 8001
Preperatory training finished!


In [None]:
## Full training
# Patience, clip and lr are read back from the notebook-level `config`.
# NOTE(review): if `model.update_config` mutated that same dict during the
# preparatory run, these lookups return the preparatory values
# (patience=20, clip=None, lr=0.0005) rather than the original ones — verify.
# NOTE(review): `eval_step` is not set here, so whatever value the model
# currently holds is kept — confirm that is intended.
config_changes = dict(
    num_epochs = 100,
    patience = config['patience'],
    revert_after_training = True,
    clip = config['clip'],
    lr = config['lr'],
)
model.update_config(config_changes)

# The return value of `train` is discarded.
_ = model.train(train_loader, valid_loader)

In [48]:
#model.save_state('../checkpoints/', 'model0')

In [65]:
#model = ModelWrapper(pretrained_path='../checkpoints/model0')

In [16]:
# Predictions for the validation windows.
# NOTE(review): here the result is bound to a single name, whereas the test-set
# cell unpacks `preds, ids` from the same method — presumably the return shape
# depends on the loader/dataset; confirm in ModelWrapper.
preds = model.predict(valid_loader)

In [17]:
# Mean of the validation predictions, for comparison with the target mean.
preds.mean()

5.2348948

In [18]:
# Spread of the validation predictions.
# NOTE(review): the recorded value (~0.03) is tiny next to the recorded std of
# the validation targets (~3.52), i.e. the predictions are nearly constant.
preds.std()

0.031452578

In [25]:
# Mean of column 1 of the training split — presumably `time_to_failure`
# (column 0 is treated as the acoustic signal); confirm the column order.
X_train[:, 1].mean()

5.7194834

In [23]:
# Mean of column 1 of the validation split — presumably `time_to_failure`;
# confirm the column order.
X_valid[:, 1].mean()

5.2020006

In [24]:
# Standard deviation of column 1 of the validation split — presumably
# `time_to_failure`; confirm the column order.
X_valid[:, 1].std()

3.5223672

### Predict

In [13]:
# Build the test dataset from the test directory and iterate over it in a
# fixed order, in batches of 100, using 4 worker processes.
test_dataset = EarthquakeDatasetTest(test_dir)
test_loader = DataLoader(dataset=test_dataset, batch_size=100, shuffle=False, num_workers=4)

In [14]:
# Predict on the test set; the result is unpacked into predictions and ids.
# NOTE(review): the validation cell binds the result of the same method to a
# single name — confirm when `predict` returns a pair.
preds, ids = model.predict(test_loader)

In [15]:
# Assemble the submission table: one row per id with its predicted value.
submission_columns = {'seg_id': ids, 'time_to_failure': preds}
submission = pd.DataFrame(submission_columns)

In [16]:
# Write the submission without the row index. `index` is documented as a bool;
# `index=False` states the intent explicitly instead of relying on `None`
# being falsy.
submission.to_csv('../submission.csv', index=False)