# Huge Tensor Bug

## Requirements
I'm running this on the standard 
[RAPIDS docker containers](https://hub.docker.com/r/rapidsai/rapidsai) and also
need the following `pip` dependencies installed:

In [1]:
!pip install torch==1.0.1 pytorch-ignite==0.1.2



## CODE

### Imports

In [2]:
from collections import defaultdict, OrderedDict
import datetime as dt
import glob
import os
import re
import subprocess
import tempfile
import time
#import dask
#from dask.delayed import delayed
#from dask.distributed import as_completed, Client, wait
#from dask_cuda import LocalCUDACluster
from ignite.engine import create_supervised_evaluator, create_supervised_trainer, Events
from ignite.handlers import EarlyStopping as IgniteEarlyStopping
from ignite.metrics import Loss, Metric
#import numpy as np
#import pyarrow.parquet as pq
from sklearn.metrics import auc, precision_recall_curve
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as torch_optim
from torch.utils import data as torch_data

# CUDF_VERSION = tuple(map(int, cudf.__version__.split(".")[:3]))
# assert CUDF_VERSION >= (0, 6, 0), "cudf version must be at least 0.6.0! Found {}!".format(CUDF_VERSION)

In [3]:
#from torch.utils.dlpack import from_dlpack, to_dlpack

In [4]:
#from librmm_cffi import librmm_config as rmm_cfg
#rmm_cfg.use_pool_allocator = True

In [5]:
#import cudf
#cudf.__version__

'0.7.0.dev0+1505.g7cb7142'

In [6]:
#import pyarrow
#pyarrow.__version__

'0.12.1'

In [7]:
import pdb

In [8]:
%load_ext autoreload

In [9]:
%autoreload 2

## Configuration

#### ETL - Discretization

In [11]:
# Number of quantiles used when computing histograms of continuous features.
max_quantiles = 20
# Hashed feature ids fall in the half-open range [0, num_features).
num_features = 1 << 22

#### Training - Model

In [12]:
# Width of each embedding vector produced by the model's EmbeddingBag.
embedding_size = 32
# One entry per hidden layer; four layers of 600 units each.
hidden_dims = [600] * 4

# Device string handed to the ignite trainer in run_training.
device = 'cuda'
# Set to a dropout probability in [0, 1] to enable dropout; None disables it.
dropout = None
# Non-linearity shared by the model's hidden layers.
activation = nn.ReLU()

#### Training - Optimization

In [13]:
# Passed to load_torch_dataset as its num_samples argument.
epoch_size = 10 ** 7

train_batch_size = 2048
# Validation batches are twice the size of training batches.
validation_batch_size = 2 * train_batch_size

# Log every 100 * 2048 examples regardless of the training batch size.
log_interval = (100 * 2048) // train_batch_size

learning_rate = 0.01
# NOTE(review): patience and lr_multiplier are not referenced by the training code
# visible in this notebook — presumably early-stopping / LR-decay settings; confirm.
patience = 4
lr_multiplier = 0.5
# Increase this for a more realistic training run.
max_epochs = 3

## PyTorch DNN Model

In [14]:
def _make_hidden_layer(in_dim, out_dim, activation, dropout=None):
    if dropout:
        return nn.Sequential(nn.Linear(in_dim, out_dim), activation, nn.Dropout(p=dropout))
    return nn.Sequential(nn.Linear(in_dim, out_dim), activation)


class MortgageNetwork(nn.Module):
    """Mortgage Delinquency DNN.

    An ``EmbeddingBag`` pools the embeddings of each example's feature indices; the
    pooled vector is passed through ``activation``, then through one hidden block per
    entry of ``hidden_dims``, and finally through a ``Linear(..., 1)`` layer that yields
    one raw (un-activated) output per example.

    Args:
        num_features: number of rows in the embedding table (size of the index space).
        embedding_size: width of each embedding vector.
        hidden_dims: sequence of hidden layer widths; may be empty, in which case the
            output layer is applied directly to the activated embedding.
        use_cuda: place the module on CUDA when it is available, otherwise on CPU.
        activation: module applied after the embedding and inside every hidden block.
        dropout: dropout probability for the hidden blocks; falsy disables dropout.
        embedding_bag_mode: pooling mode forwarded to ``EmbeddingBag``.
    """

    def __init__(
        self,
        num_features,
        embedding_size,
        hidden_dims,
        use_cuda=True,
        activation=nn.ReLU(),
        dropout=None,
        embedding_bag_mode='mean'
    ):
        super(MortgageNetwork, self).__init__()
        self.input_size = num_features
        self.embedding_size = embedding_size
        if use_cuda and torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")
        self.activation = activation
        self.dropout = dropout

        self.embedding = nn.modules.EmbeddingBag(self.input_size, self.embedding_size,
                                                 mode=embedding_bag_mode)

        # Always build the stack as a ModuleList ending in a 1-unit output layer.
        # Previously an empty ``hidden_dims`` produced a plain Python list with no
        # output layer, so the network emitted ``embedding_size`` values per example.
        dims = [self.embedding_size] + list(hidden_dims)
        layers = [
            _make_hidden_layer(dims[i], dims[i + 1], self.activation, self.dropout)
            for i in range(len(dims) - 1)
        ]
        layers.append(nn.Linear(dims[-1], 1))
        self.hidden_layers = nn.ModuleList(layers)

        self.to(self.device)

    def forward(self, x):
        """Forward pass.

        Args:
            x: LongTensor of feature indices in a form ``EmbeddingBag`` accepts without
                offsets, i.e. 2-D ``(batch, indices_per_example)``.

        Returns:
            Tensor of shape ``(batch,)`` with one raw output per example.
        """
        out = self.embedding(x)
        out = self.activation(out)
        for layer in self.hidden_layers:
            out = layer(out)
        # Drop only the trailing unit dimension; a bare squeeze() would also collapse
        # the batch dimension for a batch of one and break the loss's shape check.
        return out.squeeze(-1)

## Training 

In [15]:
def run_training(model, batch_dataload=False, num_workers=0, use_cuDF=False):
    """Train ``model`` with an ignite supervised trainer and print throughput logs.

    Reads notebook-level configuration: ``out_dir``, ``epoch_size``,
    ``train_batch_size``, ``validation_batch_size``, ``learning_rate``, ``device``,
    ``log_interval`` and ``max_epochs``.

    Args:
        model: module to optimise with Adam and binary cross-entropy on logits.
        batch_dataload: if True, load a pre-batched training dataset and wrap it in
            ``batch_dataloader.BatchDataLoader``; otherwise build train, validation
            and test ``torch.utils.data.DataLoader`` objects.
        num_workers: worker count for the ``DataLoader`` path. The batch path always
            uses ``num_workers=0``.
        use_cuDF: forwarded to ``load_torch_dataset`` on the batch path only.
    """
    # Data
    if batch_dataload:
        train_dataset = load_torch_dataset(os.path.join(out_dir, "train"), epoch_size,
                                           batch_size=train_batch_size, use_cuDF=use_cuDF,
                                           num_files=1)
        # Only the training loader is built here; validation/test loaders are not
        # needed because this function only runs the trainer.
        train_loader = batch_dataloader.BatchDataLoader(train_dataset,
                                                        num_workers=0, shuffle=True)
    else:
        # ``load_torch_dataset`` as defined in this notebook has no ``shuffle_files``
        # parameter; passing it raised TypeError, so it is no longer forwarded.
        train_dataset = load_torch_dataset(os.path.join(out_dir, "train"), epoch_size)
        validation_dataset = load_torch_dataset(os.path.join(out_dir, "validation"))
        test_dataset = load_torch_dataset(os.path.join(out_dir, "test"))

        train_loader = torch_data.DataLoader(train_dataset,
                                             batch_size=train_batch_size,
                                             num_workers=num_workers)
        # NOTE(review): the validation and test loaders are built but never consumed
        # below; kept so the datasets are still instantiated as before.
        validation_loader = torch_data.DataLoader(validation_dataset,
                                                  batch_size=validation_batch_size,
                                                  num_workers=num_workers)
        test_loader = torch_data.DataLoader(test_dataset,
                                            batch_size=validation_batch_size,
                                            num_workers=num_workers)

    # Optimizer
    optimizer = torch_optim.Adam(model.parameters(), lr=learning_rate)

    # Loss function: the model emits raw scores, so use the logits variant of BCE.
    loss_fn = lambda pred, target: F.binary_cross_entropy_with_logits(pred, target)

    trainer = create_supervised_trainer(model=model, optimizer=optimizer, loss_fn=loss_fn, device=device)

    # Events
    @trainer.on(Events.EPOCH_STARTED)
    def timer(engine):
        # Stamp the epoch start so per-epoch throughput can be computed while logging.
        engine.state.epoch_start = time.time()

    num_epoch_batches = len(train_loader)
    examples_per_epoch = num_epoch_batches * train_batch_size

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        # engine.state.iteration counts across epochs; convert to a 1-based index
        # within the current epoch.
        epoch_iteration = (engine.state.iteration - 1) % num_epoch_batches + 1
        # log_interval floors to 0 for very large batch sizes; clamp it so the modulo
        # cannot raise ZeroDivisionError.
        if epoch_iteration % max(1, log_interval) == 0:
            epoch_time_elapsed = time.time() - engine.state.epoch_start
            examples = engine.state.iteration * train_batch_size
            # Examples seen in this epoch only, divided by time spent in this epoch.
            epoch_examples_per_second = (examples - (engine.state.epoch - 1) * examples_per_epoch) / epoch_time_elapsed
            print(
                "Epoch[{}] Iteration[{}/{}] Loss: {:.5f} Example/s: {:.3f} (Total examples: {})".format(
                    engine.state.epoch, epoch_iteration, num_epoch_batches, engine.state.output,
                    epoch_examples_per_second, examples))

    trainer.run(train_loader, max_epochs=max_epochs)

## New Version of the Dataloader using dlpack

In [16]:
import batch_dataset, batch_dataloader

In [17]:
# This is the dataset that I discovered the issue with
def load_torch_dataset(root_path, num_samples=None, num_files=1, batch_size=1, use_cuDF=False):
    """Return a ``batch_dataset.ParquetBatchDataset`` rooted at ``root_path``.

    ``batch_size``, ``num_files`` and ``use_cuDF`` are forwarded unchanged and
    ``file_offset`` is fixed at 4. ``num_samples`` is accepted for call
    compatibility but is not used.
    """
    dataset_options = dict(
        batch_size=batch_size,
        num_files=num_files,
        use_cuDF=use_cuDF,
        file_offset=4,
    )
    return batch_dataset.ParquetBatchDataset(root_path, **dataset_options)

In [21]:
# Create a dataset of random tensors that fit the model (45-wide LongTensor features, 1-wide float tensor target)
def load_torch_dataset(root_path, num_samples=None, num_files=1, batch_size=1, use_cuDF=False):
    """Return a ``batch_dataset.RandomLongBatchDataset`` of 15000000 samples.

    This redefinition replaces the Parquet-backed loader defined earlier in the
    notebook. Only ``batch_size`` is forwarded; ``root_path``, ``num_samples``,
    ``num_files`` and ``use_cuDF`` are accepted for call compatibility and ignored.
    The sample count is fixed at 15000000 and ``cpu_mem`` is fixed at False.
    """
    return batch_dataset.RandomLongBatchDataset(
        num_samples=15000000,
        batch_size=batch_size,
        cpu_mem=False,
    )

In [23]:
# Root directory that run_training joins with "train" / "validation" / "test" to form
# dataset paths; left empty here.
out_dir=''

In [24]:
# Rebind to None before constructing — presumably so a model left over from a previous
# run of this cell can be released before the new one is allocated (TODO confirm).
model = None
# Build the network from the configuration cells above; it is placed on CUDA when available.
model = MortgageNetwork(num_features, embedding_size, hidden_dims,
                        dropout=dropout, activation=activation, use_cuda=True)

## Performance issue
The slowdown begins at around 12M examples: throughput (examples/s) starts to drop, and even after the epoch resets and the loader returns to the memory that used to be fast, performance stays poor and continues to degrade.

You can see epoch 2 is about 1/2 as fast and performance keeps getting worse.

I've also tried offsetting the dataloader so that it starts in that memory region; the slowdown is then immediate, starting at around 25K examples/s, which is even worse than the worst results shown here.

In [25]:
# Reproduce the slowdown using the batch-loader path (BatchDataLoader, no worker processes).
run_training(model, batch_dataload=True, num_workers=0, use_cuDF=False)

Epoch[1] Iteration[100/7324] Loss: 0.00000 Example/s: 86520.171 (Total examples: 204800)
Epoch[1] Iteration[200/7324] Loss: 0.00000 Example/s: 98507.323 (Total examples: 409600)
Epoch[1] Iteration[300/7324] Loss: 0.00000 Example/s: 103318.410 (Total examples: 614400)
Epoch[1] Iteration[400/7324] Loss: 0.00000 Example/s: 105624.233 (Total examples: 819200)
Epoch[1] Iteration[500/7324] Loss: 0.00000 Example/s: 107112.090 (Total examples: 1024000)
Epoch[1] Iteration[600/7324] Loss: 0.00000 Example/s: 108130.254 (Total examples: 1228800)
Epoch[1] Iteration[700/7324] Loss: 0.00000 Example/s: 108869.712 (Total examples: 1433600)
Epoch[1] Iteration[800/7324] Loss: 0.00000 Example/s: 109449.659 (Total examples: 1638400)
Epoch[1] Iteration[900/7324] Loss: 0.00000 Example/s: 109884.480 (Total examples: 1843200)
Epoch[1] Iteration[1000/7324] Loss: 0.00000 Example/s: 110242.788 (Total examples: 2048000)
Epoch[1] Iteration[1100/7324] Loss: 0.00000 Example/s: 110541.374 (Total examples: 2252800)
Epo