#### Ideas
- Train a very large CNN-dense network on big computer:
    - Use window step size of 1
    - Problem: sequences are so long that the model is more likely to overfit than to learn useful things
    - Solution: do random masking on data as sort of regularization
    - 1D CNNs with smaller stride, followed by just dense should be a decent architecture
    - If it's not too difficult, do CNN for dim-reduction followed by transformer block
- Split a sequence into chunks and do manual feature engineering:
    - Pro: Solves the overfitting problem with long sequences
    - Pro: trains faster
    - Con: Removes one of the main benefits of NNs (automatic feature engineering)
    - Con: requires clever and careful feature engineering
    - Con: might be more computationally heavy if feat eng is done on the fly
- Try transformer/self-attention architecture
- Try the feature engineering approach for validation
    - Maybe try continuous prediction? 

### Imports

In [1]:
import sys

## Put the parent directory on the import path exactly once
## (the `utils` imports further down presumably rely on it).
if '../' not in sys.path:
    sys.path.append('../')

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
import dask

import importlib

### Read in and preprocess data

In [2]:
## Root data folder (relative path); train.csv and the test/ folder are read from here.
data_dir = '../data/'
## Sub-folder of data_dir; the training-data description CSV is read from here.
preprocessed_dir = data_dir + 'preprocessed/'

In [3]:
## Read both training columns as float32 and keep only the underlying NumPy array.
column_dtypes = {'acoustic_data': np.float32, 'time_to_failure': np.float32}
train_data = pd.read_csv(data_dir + 'train.csv', dtype=column_dtypes).values

## Folder that is handed to EarthquakeDatasetTest in the prediction section.
test_dir = data_dir + 'test/'

In [4]:
## Optionally keep only the leading `data_frac` share of the rows to save memory.
## The slice is copied, so train_data no longer shares memory with the array
## produced by the CSV load.
data_frac = 1.0
n_rows_kept = int(data_frac * len(train_data))
train_data = train_data[:n_rows_kept].copy()

In [5]:
## Load the summary statistics of the training signal as a Series indexed by
## statistic name (scale() reads 'mean', 'std', 'max' and 'min' from it).
## pd.Series.from_csv is deprecated and no longer exists in pandas >= 1.0, so the
## file is read with pd.read_csv using the same layout from_csv assumed:
## no header row, first column is the index, first remaining column holds the values.
train_desc = pd.read_csv(
    preprocessed_dir + 'training_data_description.csv',
    header=None,
    index_col=0,
).iloc[:, 0]
## from_csv returned an unnamed Series with an unnamed index; keep that.
train_desc.index.name = train_desc.name = None

  infer_datetime_format=infer_datetime_format)


In [6]:
def scale(acoustic_data, standard=True, stats=None):
    """Center and rescale an acoustic signal using precomputed statistics.

    Parameters
    ----------
    acoustic_data : array-like or scalar
        Values to rescale; anything supporting arithmetic with scalars works.
    standard : bool, default True
        If True, return the z-score ``(x - mean) / std``.
        If False, return ``2 * (x - mean) / (max - min)``. The output then spans
        a width of 2 over the observed min..max range; it is only confined to
        [-1, 1] when the mean sits at the midpoint of min and max.
    stats : mapping, optional
        Object indexable by 'mean', 'std', 'max' and 'min'. Defaults to the
        notebook-level ``train_desc`` so existing calls keep working; passing it
        explicitly removes the dependency on notebook state.

    Returns
    -------
    The rescaled values (a new object; the input is not modified).
    """
    if stats is None:
        ## Backward-compatible default: statistics loaded earlier in the notebook.
        stats = train_desc

    centered = acoustic_data - stats['mean']
    if standard:
        return centered / stats['std']
    return 2 * centered / (stats['max'] - stats['min'])

In [7]:
## Standardise column 0 in place with scale()'s default z-score mode
## (column 0 is presumably acoustic_data — confirm against the CSV column order).
## NOTE: the raw values are overwritten, so re-running this cell rescales
## data that has already been scaled.
train_data[:, 0] = scale(train_data[:, 0])

### Hyperparameters

In [12]:
## Model config: one dict holding everything that is passed to ModelWrapper.
config = {
    'data_dir': data_dir,
    'use_cuda': torch.cuda.is_available(),
    'seq_len': 150000,

    ## Training parameters
    'batch_size': 16,
    'lr': 0.001,
    'num_epochs': 20,
    'clip': 0.1,  # Gradient clipping
    'eval_step': 0.1,  # evaluation frequency per epoch, e.g. 0.5 -> 2 evaluations per epoch
    'patience': 10,  # early-stopping patience in number of evaluations; None disables early stopping
    'revert_after_training': True,  # if True, restore the best parameters found during early stopping

    ## Model hyperparameters
    'model_choice': 0,
    'optim_choice': 0,
    'n_filters': [64, 128, 128, 256, 256, 512],
    'kernel_size': [3000, 50, 10, 5, 5, 5],
    'conv_stride': [50, 10, 5, 3, 2, 2],
    'dense_size': [1000],
    'dropout': 0.3,
}

## Use the GPU when CUDA is available, otherwise the CPU.
device_name = "cuda" if config['use_cuda'] else "cpu"
device = torch.device(device_name)
print("Using {}.".format(device))

Using cuda.


### Create dataset

In [31]:
## Re-import the local dataset module so that edits to it are picked up
## without restarting the kernel.
import utils.dataset
importlib.reload(utils.dataset)
## NOTE(review): the star import hides where names come from; the cells below use
## EarthquakeDatasetTrain and EarthquakeDatasetTest, presumably defined in this module.
from utils.dataset import *

In [34]:
## Chronological split: the trailing `valid_frac` share of the rows is held out.
valid_frac = 0.2
n_train_data = int(len(train_data) * (1-valid_frac))

## Stop the training rows one sequence length before the validation rows start.
## The length comes from config['seq_len'] instead of a repeated literal, so the
## gap and the validation step stay in sync with the configured sequence length.
X_train = train_data[:n_train_data - config['seq_len']]
X_valid = train_data[n_train_data:]

## Training windows start every 1000 rows and use random masking (mask_prob);
## validation windows start one full sequence length apart.
train_dataset = EarthquakeDatasetTrain(X_train, window_step=1000, mask_prob=0.2)
valid_dataset = EarthquakeDatasetTrain(X_valid, window_step=config['seq_len'])

train_loader = DataLoader(train_dataset, 
                          batch_size=config['batch_size'], 
                          shuffle=True, 
                          num_workers=4)

## Validation order is kept fixed (no shuffling).
valid_loader = DataLoader(valid_dataset, 
                         batch_size=100, 
                         shuffle=False, 
                         num_workers=4)

print("{:,} train examples.".format(len(train_dataset)))
print("{:,} valid examples.".format(len(valid_dataset)))

503,016 train examples.
837 valid examples.


### Create model

In [35]:
## Re-import the local model modules so that edits to them are picked up
## without restarting the kernel.
import utils.models
importlib.reload(utils.models)
from utils.models import *

import utils.model_wrapper
importlib.reload(utils.model_wrapper)
from utils.model_wrapper import *
## Build the wrapper from the hyperparameter dict defined in the Hyperparameters section.
model = ModelWrapper(config=config)

Intermediate sizes:
150000 3001
3001 301
301 61
61 21
21 11
11 6


In [36]:
## get_summary() returns a per-parameter table and the total parameter count.
summary, n_params = model.get_summary()
print("{:,} total parameters".format(n_params))
## Bare last expression so the notebook renders the table.
summary

4,992,353 total parameters


Unnamed: 0,name,# params
0,convs.0.0.weight,192000
1,convs.0.0.bias,64
2,convs.0.1.weight,64
3,convs.0.1.bias,64
4,convs.1.0.weight,409600
5,convs.1.0.bias,128
6,convs.1.1.weight,128
7,convs.1.1.bias,128
8,convs.2.0.weight,163840
9,convs.2.0.bias,128


### Train Model

In [None]:
## DEBUG
## Quick smoke-test run: a single epoch, an eval_step of 0.001 and a patience of 2.
config_changes = {
    'num_epochs': 1,
    'eval_step': 0.001,
    'patience': 2,
    'revert_after_training': True,
    'clip': 0.5,
    'lr': 0.01,
}
model.update_config(config_changes)

## The return value of train() is not needed here.
_ = model.train(train_loader, valid_loader, verbose=2)

In [37]:
## Preparatory training run.
## NOTE(review): lr (0.001) and clip (0.1) below are the same values as in the base
## config, so this run differs from it only in patience; adjust if a warm-up with
## a different learning rate / clipping was intended.
config_changes = {
    'num_epochs': 20,
    'eval_step': 0.1,
    'patience': 30,
    'revert_after_training': True,
    'clip': 0.1,
    'lr': 0.001,
}
model.update_config(config_changes)

## The return value of train() is not needed here.
_ = model.train(train_loader, valid_loader, verbose=2)
print("Preperatory training finished!")

---------- EPOCH 1/20 ----------

New best!
Step: 3143/31439
Total steps: 3143
Training Loss (smooth): 2.980
Validation Loss: 3.253
Maximum GPU consumption so far: 0.439 [GB]

Step: 6286/31439
Total steps: 6286
Training Loss (smooth): 2.985
Validation Loss: 3.268
Maximum GPU consumption so far: 0.439 [GB]

Step: 9429/31439
Total steps: 9429
Training Loss (smooth): 2.973
Validation Loss: 3.279
Maximum GPU consumption so far: 0.439 [GB]

New best!
Step: 12572/31439
Total steps: 12572
Training Loss (smooth): 3.044
Validation Loss: 3.243
Maximum GPU consumption so far: 0.439 [GB]

New best!
Step: 15715/31439
Total steps: 15715
Training Loss (smooth): 2.933
Validation Loss: 3.242
Maximum GPU consumption so far: 0.439 [GB]

New best!
Step: 18858/31439
Total steps: 18858
Training Loss (smooth): 2.975
Validation Loss: 3.228
Maximum GPU consumption so far: 0.439 [GB]

New best!
Step: 22001/31439
Total steps: 22001
Training Loss (smooth): 2.933
Validation Loss: 3.213
Maximum GPU consumption so f

KeyboardInterrupt: 

In [None]:
## Full training: long run that takes patience, clip and lr from the notebook config.
## NOTE(review): if ModelWrapper keeps a reference to this same `config` dict and
## update_config() mutates it, these lookups return the values set by earlier
## training cells rather than the original ones — confirm.
config_changes = {
    'num_epochs': 100,
    'patience': config['patience'],
    'revert_after_training': True,
    'clip': config['clip'],
    'lr': config['lr'],
}
model.update_config(config_changes)

## The return value of train() is not needed here.
_ = model.train(train_loader, valid_loader)

In [48]:
#model.save_state('../checkpoints/', 'model0')

In [65]:
#model = ModelWrapper(pretrained_path='../checkpoints/model0')

In [None]:
## Predict on the validation loader.
## NOTE(review): the test-set cell unpacks two values (preds, ids) from
## model.predict; confirm that this call returns a single array, otherwise the
## preds.mean() / preds.std() cells below will fail.
preds = model.predict(valid_loader)

In [None]:
## Mean of the predictions, for a quick sanity check.
preds.mean()

In [None]:
## Spread of the predictions, for a quick sanity check.
preds.std()

In [None]:
## Mean of the second column over all training rows, for comparison with preds.mean().
## X_train is 2-D (rows x columns), so X_train[1] would pick a single row instead of
## a column. Column 1 is presumably time_to_failure — confirm against the CSV column order.
X_train[:, 1].mean()

In [None]:
## Spread of the second column over all training rows, for comparison with preds.std().
## X_train is 2-D (rows x columns), so X_train[1] would pick a single row instead of
## a column. Column 1 is presumably time_to_failure — confirm against the CSV column order.
X_train[:, 1].std()

### Predict

In [13]:
## Wrap the test folder in a dataset and iterate over it in a fixed order.
test_dataset = EarthquakeDatasetTest(test_dir)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=100,
    shuffle=False,
    num_workers=4,
)

In [14]:
## Predict on the test loader; the call is unpacked into predictions and ids,
## which become the submission's time_to_failure and seg_id columns.
preds, ids = model.predict(test_loader)

In [15]:
## Two-column submission table: segment id first, predicted time to failure second.
submission = pd.DataFrame(dict(seg_id=ids, time_to_failure=preds))

In [16]:
## Write the submission without the row index. `index` is documented as a bool;
## the previous `index=None` only worked because None is falsy.
submission.to_csv('../submission.csv', index=False)