#### Ideas
- Train a very large CNN-dense network on big computer:
    - Use window step size of 1
    - Problem: sequences are so long that the model is more likely to overfit than to learn useful things
    - Solution: do random masking on the data as a sort of regularization
    - 1D CNNs with smaller stride, followed by just dense should be a decent architecture
    - If it's not too difficult, do CNN for dim-reduction followed by transformer block
- Split a sequence into chunks and do manual feature engineering:
    - Pro: Solves the overfitting problem with long sequences
    - Pro: trains faster
    - Con: Removes one of the main benefits of NNs (automatic feature engineering)
    - Con: requires clever and careful feature engineering
    - Con: might be more computationally heavy if feat eng is done on the fly

### Imports

In [1]:
import sys
if not '../' in sys.path:
    sys.path.append('../')

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
import dask

import importlib

### Read in and preprocess data

In [2]:
## Root data folder, and the subfolder that holds preprocessed files
data_dir = '../data/'
preprocessed_dir = ''.join((data_dir, 'preprocessed/'))

In [3]:
## Read the training CSV with both named columns as float32 and keep only the
## underlying ndarray (not the DataFrame)
train_data = pd.read_csv(
    data_dir + 'train.csv',
    dtype={
        'acoustic_data': np.float32,
        'time_to_failure': np.float32,
    },
).values
## Test inputs live in a subfolder of the preprocessed directory
test_dir = preprocessed_dir + 'test/'

In [5]:
## Keep only the leading fraction of the rows to save memory.
## The copy detaches the kept rows from the full-size array.
data_frac = 0.5
n_rows_kept = int(data_frac * len(train_data))
train_data = train_data[:n_rows_kept].copy()

In [6]:
## Load the training-data description as a Series indexed by statistic name
## (the 'mean', 'max' and 'min' entries are read by `scale`).
## `pd.Series.from_csv` is deprecated and was removed in pandas 1.0;
## `read_csv` with `header=None, index_col=0` reproduces its defaults, and
## taking the single remaining column yields the Series. The old default
## `parse_dates=True` is dropped because the index holds statistic names.
train_desc = pd.read_csv(
    preprocessed_dir + 'training_data_description.csv',
    header=None,
    index_col=0,
).iloc[:, 0]

  infer_datetime_format=infer_datetime_format)


In [7]:
def scale(acoustic_data, desc=None):
    """Center a signal on the training mean and scale it by half the training range.

    Computes ``2 * (x - mean) / (max - min)``. The result covers an interval of
    width 2 for inputs between ``min`` and ``max``; that interval is exactly
    [-1, 1] only when the mean is the midpoint of min and max, otherwise it is
    shifted (it always stays within [-2, 2]).

    Parameters
    ----------
    acoustic_data : scalar or array-like supporting arithmetic
        Raw signal values.
    desc : mapping with 'mean', 'max' and 'min' entries, optional
        Statistics used for scaling. Defaults to the notebook-level
        ``train_desc`` so existing calls keep working.

    Returns
    -------
    Scaled values, same shape as ``acoustic_data``.
    """
    if desc is None:
        # Fall back to the statistics loaded earlier in the notebook.
        desc = train_desc
    half_range_scaled = 2 * (acoustic_data - desc['mean']) / (desc['max'] - desc['min'])
    return half_range_scaled

In [8]:
## Scale the acoustic signal, i.e. column 0 of the 2-D array produced by
## `read_csv(...).values`. Indexing with `train_data[0]` selects the first
## *row* (one sample together with its target) and leaves the rest of the
## signal unscaled.
## Assumes acoustic_data is the first CSV column -- TODO confirm.
## NOTE: re-running this cell scales the already-scaled column again.
train_data[:, 0] = scale(train_data[:, 0])

### Hyperparameters

In [13]:
## Model config
config = {
    'data_dir': data_dir,
    'use_cuda': torch.cuda.is_available(),
    'seq_len': 150000,

    ## Training parameters
    'batch_size': 16,
    'lr': 0.001,
    'num_epochs': 20,
    'clip': 0.1,  # Gradient clipping
    'eval_step': 0.5,  # how often to evaluate, per epoch. E.g., 0.5 -> 2 times per epoch
    'patience': 10,  # patience (in nr of evals) for early stopping. If None, will not use early stopping
    'revert_after_training': True,  # If true, reverts model parameters after training to best found during early stopping

    ## Model hyperparameters
    'model_choice': 0,
    'optim_choice': 0,
    'n_filters': [16, 32],
    'kernel_size': [1000, 50],
    'conv_stride': [50, 10],
    'dense_size': 1000,
    'dropout': 0.3,
}

## Use the GPU when CUDA is available, otherwise fall back to the CPU
if config['use_cuda']:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using {}.".format(device))

Using cuda.


### Create dataset

In [29]:
## Reload the project dataset module so that edits to it are picked up without
## restarting the kernel, then refresh the star-imported names in the notebook
## namespace. The order (import -> reload -> star import) matters.
## NOTE(review): the star import hides which names come from this module;
## EarthquakeDatasetFull / EarthquakeDatasetTest are presumably defined there.
import utils.dataset
importlib.reload(utils.dataset)
from utils.dataset import *

In [30]:
## Hold out the trailing fraction of the rows for validation
valid_frac = 0.2
n_train_data = int(len(train_data) * (1-valid_frac))

## Leave a gap of one sequence length between the training rows and the
## validation rows. The length comes from `config['seq_len']` rather than a
## repeated literal 150000, so the split follows the configured value.
X_train = train_data[:n_train_data - config['seq_len']]
X_valid = train_data[n_train_data:]

## Training windows advance 1000 samples at a time; validation windows advance
## a full sequence length (presumably giving non-overlapping windows -- confirm
## against EarthquakeDatasetFull).
train_dataset = EarthquakeDatasetFull(X_train, window_step=1000)
valid_dataset = EarthquakeDatasetFull(X_valid, window_step=config['seq_len'])

train_loader = DataLoader(train_dataset,
                          batch_size=config['batch_size'],
                          shuffle=True,
                          num_workers=4)

## Validation needs no shuffling and uses a larger fixed batch size
valid_loader = DataLoader(valid_dataset,
                          batch_size=100,
                          shuffle=False,
                          num_workers=4)

print("{:,} train examples.".format(len(train_dataset)))
print("{:,} valid examples.".format(len(valid_dataset)))

251,358 train examples.
418 valid examples.


### Create model

In [27]:
## Reload the project model modules so that edits to them are picked up without
## restarting the kernel; each star import then refreshes the notebook names.
## The order (import -> reload -> star import) matters for each module.
import utils.models
importlib.reload(utils.models)
from utils.models import *

import utils.model_wrapper
importlib.reload(utils.model_wrapper)
from utils.model_wrapper import *
## Build the wrapper from the hyperparameter dictionary defined earlier.
## ModelWrapper presumably comes from utils.model_wrapper -- verify.
model = ModelWrapper(config=config)

Intermediate sizes:
150000 3001
3001 301


In [17]:
## Fetch the per-parameter table and the total parameter count, report the
## total, and leave the table as the cell's displayed value
summary, n_params = model.get_summary()
total_line = "{:,} total parameters".format(n_params)
print(total_line)
summary

9,675,745 total parameters


Unnamed: 0,name,# params
0,convs.0.0.weight,16000
1,convs.0.0.bias,16
2,convs.0.1.weight,16
3,convs.0.1.bias,16
4,convs.1.0.weight,25600
5,convs.1.0.bias,32
6,convs.1.1.weight,32
7,convs.1.1.bias,32
8,dense.0.weight,9632000
9,dense.0.bias,1000


### Train Model

In [None]:
## DEBUG: one epoch with very frequent evaluation (eval_step is "evaluations
## per epoch" expressed as a fraction) and a short early-stopping patience
config_changes = {
    'num_epochs': 1,
    'eval_step': 0.001,
    'patience': 2,
    'revert_after_training': True,
    'clip': 0.5,
    'lr': 0.01,
}
model.update_config(config_changes)

## The return value of train() is not used here
_ = model.train(train_loader, valid_loader, verbose=2)

In [31]:
## Preparatory training with a higher learning rate (0.01 vs 0.001) and a
## higher clip value (0.5 vs 0.1) than the defaults in `config`
## (presumably a higher clip value means less aggressive clipping -- confirm
## against ModelWrapper).
config_changes = dict(
    num_epochs = 10,
    eval_step = 0.1,
    patience = 50,
    revert_after_training = True,
    clip = 0.5,
    lr = 0.01,
)
model.update_config(config_changes)

_ = model.train(train_loader, valid_loader, verbose=2)
## Spelling of the status message corrected ("Preperatory" -> "Preparatory")
print("Preparatory training finished!")

---------- EPOCH 1/10 ----------

New best!
Step: 1571/15710
Total steps: 1571
Training Loss (smooth): 3.102
Validation Loss: 3.805
Maximum GPU consumption so far: 0.483 [GB]

New best!
Step: 3142/15710
Total steps: 3142
Training Loss (smooth): 3.180
Validation Loss: 3.564
Maximum GPU consumption so far: 0.483 [GB]

Step: 4713/15710
Total steps: 4713
Training Loss (smooth): 3.079
Validation Loss: 3.699
Maximum GPU consumption so far: 0.483 [GB]



KeyboardInterrupt: 

In [103]:
## Full training: long run using the patience, clip and learning rate read
## from `config`
## NOTE(review): `eval_step` is not overridden here, so it presumably keeps
## whatever value the model last received -- confirm this is intended.
config_changes = {
    'num_epochs': 100,
    'patience': config['patience'],
    'revert_after_training': True,
    'clip': config['clip'],
    'lr': config['lr'],
}
model.update_config(config_changes)

## The return value of train() is not used here
_ = model.train(train_loader, valid_loader)

---------- EPOCH 1/100 ----------

Step: 78/1571
Total steps: 78
Training Loss (smooth): 3.531
Validation Loss: 3.565
Maximum GPU consumption so far: 0.698 [GB]

Step: 156/1571
Total steps: 156
Training Loss (smooth): 3.274
Validation Loss: 3.665
Maximum GPU consumption so far: 0.698 [GB]

Step: 234/1571
Total steps: 234
Training Loss (smooth): 3.209
Validation Loss: 3.703
Maximum GPU consumption so far: 0.698 [GB]

Step: 312/1571
Total steps: 312
Training Loss (smooth): 3.194
Validation Loss: 3.723
Maximum GPU consumption so far: 0.698 [GB]

Step: 390/1571
Total steps: 390
Training Loss (smooth): 3.113
Validation Loss: 3.722
Maximum GPU consumption so far: 0.698 [GB]

Step: 468/1571
Total steps: 468
Training Loss (smooth): 3.075
Validation Loss: 3.690
Maximum GPU consumption so far: 0.698 [GB]

Step: 546/1571
Total steps: 546
Training Loss (smooth): 3.120
Validation Loss: 3.748
Maximum GPU consumption so far: 0.698 [GB]

Step: 624/1571
Total steps: 624
Training Loss (smooth): 3.124
Va

In [None]:
# model.save_state('checkpoints', 'model0')

In [None]:
# model.load_state('checkpoints/model0')

### Predict

In [120]:
## Build the test dataset from the preprocessed test directory and wrap it in
## a loader that keeps the original order (no shuffling)
test_dataset = EarthquakeDatasetTest(test_dir)
test_loader = DataLoader(
    test_dataset,
    batch_size=100,
    shuffle=False,
    num_workers=4,
)

In [None]:
## Run the model over the test loader. The two returned values are used below
## as the 'time_to_failure' and 'seg_id' columns of the submission.
preds, ids = model.predict(test_loader)

In [132]:
## Assemble the submission table: one row per segment id with its prediction
submission_columns = {'seg_id': ids, 'time_to_failure': preds}
submission = pd.DataFrame(submission_columns)

In [134]:
## Write the submission without the DataFrame index. `index=False` is the
## documented boolean form of this flag; it replaces the previous `index=None`.
submission.to_csv('../submission.csv', index=False)