# Huge Tensor Bug

## Requirements
I'm running this on the standard 
[RAPIDS docker containers](https://hub.docker.com/r/rapidsai/rapidsai) and also
need the following `pip` dependencies installed:

In [1]:
!pip install torch==1.0.1 pytorch-ignite==0.1.2



If you're running this on your local machine you should have most things installed

## CODE

### Imports

In [1]:
#from collections import defaultdict, OrderedDict
#import glob
import os
#import re
#import subprocess
import time
from ignite.engine import create_supervised_evaluator, create_supervised_trainer, Events
from ignite.handlers import EarlyStopping as IgniteEarlyStopping
from ignite.metrics import Loss, Metric
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as torch_optim
from torch.utils import data as torch_data

In [2]:
%load_ext autoreload

In [3]:
%autoreload 2

## Configuration

#### ETL - Discretization

In [4]:
# Used when computing histograms of continuous features.
max_quantiles = 20
# When hashing features, the resulting ids fall in [0, num_features).
num_features = 1 << 22

#### Training - Model

In [5]:
# Network shape: embedding width, then four equally sized hidden layers.
embedding_size = 32
hidden_dims = [600] * 4

# Execution / regularisation settings.
device = "cuda"
dropout = None  # Set to a dropout probability in [0, 1] to enable dropout
activation = nn.ReLU()

#### Training - Optimization

In [6]:
epoch_size = 10 ** 7

train_batch_size = 2048
validation_batch_size = 2 * train_batch_size

# 250 iterations at the reference batch size of 2048, rescaled so the number of
# examples between two log lines stays the same for other batch sizes.
log_interval = (250 * 2048) // train_batch_size

learning_rate = 0.01
patience = 4
lr_multiplier = 0.5
max_epochs = 3  # Increase this for a more realistic training run

## PyTorch DNN Model

In [7]:
def _make_hidden_layer(in_dim, out_dim, activation, dropout=None):
    if dropout:
        return nn.Sequential(nn.Linear(in_dim, out_dim), activation, nn.Dropout(p=dropout))
    return nn.Sequential(nn.Linear(in_dim, out_dim), activation)


class MortgageNetwork(nn.Module):
    """Mortgage Delinquency DNN.

    An ``EmbeddingBag`` pools the embeddings of each example's feature ids, the
    pooled vector is passed through ``activation`` and then through the hidden
    stack, which ends in a single-unit linear layer.

    Args:
        num_features: Number of rows in the embedding table.
        embedding_size: Width of each embedding vector.
        hidden_dims: Sequence of hidden layer widths. When empty, no layers are
            added after the embedding and the activated embedding is the
            network output.
        use_cuda: Place the module on CUDA when it is available; otherwise
            (or when ``False``) the module lives on the CPU.
        activation: Module applied after the embedding and inside every hidden
            block (the same instance is shared by all of them).
        dropout: Dropout probability for the hidden blocks, or ``None``.
        embedding_bag_mode: Forwarded as ``mode`` to ``EmbeddingBag``.
    """

    def __init__(
        self,
        num_features,
        embedding_size,
        hidden_dims,
        use_cuda=True,
        activation=nn.ReLU(),
        dropout=None,
        embedding_bag_mode='mean'
    ):
        super(MortgageNetwork, self).__init__()
        self.input_size = num_features
        self.embedding_size = embedding_size
        if use_cuda and torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")
        self.activation = activation
        self.dropout = dropout

        self.embedding = nn.modules.EmbeddingBag(self.input_size, self.embedding_size,
                                                 mode=embedding_bag_mode)

        if len(hidden_dims) > 0:
            # list(...) so tuples (or other sequences) of widths work too.
            dims = [self.embedding_size] + list(hidden_dims)
            hidden_layers = [
                _make_hidden_layer(dims[i], dims[i + 1], self.activation, self.dropout)
                for i in range(len(dims) - 1)
            ]
            self.hidden_layers = nn.ModuleList(hidden_layers)
            # Final projection down to one output unit per example.
            self.hidden_layers.append(nn.Linear(dims[-1], 1))
        else:
            # Same container type as the non-empty case; holds no parameters.
            self.hidden_layers = nn.ModuleList()

        self.to(self.device)

    def forward(self, x):
        """Forward pass.

        Args:
            x: Feature-id tensor fed to the ``EmbeddingBag``.

        Returns:
            The output of the last layer with its trailing dimension removed
            when that dimension has size one, so ``(batch, 1)`` becomes
            ``(batch,)``.
        """
        out = self.embedding(x)
        out = self.activation(out)
        for layer in self.hidden_layers:
            out = layer(out)
        # Squeeze only the trailing unit dimension. A bare squeeze() would also
        # drop the batch dimension for a batch of one, producing a 0-d tensor
        # whose shape no longer matches a (1,)-shaped target.
        return out.squeeze(-1)

## Training 

In [8]:
def run_training(model, batch_dataload=False, num_workers=0, use_cuDF=False, shuffle=True):
    """Train ``model`` with an ignite supervised trainer and print throughput.

    Reads notebook-level settings (``out_dir``, ``epoch_size``,
    ``train_batch_size``, ``validation_batch_size``, ``learning_rate``,
    ``device``, ``log_interval``, ``max_epochs``) and uses the
    ``load_torch_dataset`` function defined elsewhere in the notebook.

    Args:
        model: Network to optimise.
        batch_dataload: When true, wrap a pre-batched training dataset in
            ``batch_dataloader.BatchDataLoader``; only the training loader is
            built on this path. Otherwise build train/validation/test
            ``torch`` ``DataLoader`` objects.
        num_workers: Worker count for the ``torch`` loaders (not used on the
            batch path).
        use_cuDF: Forwarded to ``load_torch_dataset`` on the batch path.
        shuffle: Forwarded to ``BatchDataLoader`` (not used on the other path).
    """
    train_dir = os.path.join(out_dir, "train")

    # --- Data ---
    if batch_dataload:
        train_dataset = load_torch_dataset(train_dir, epoch_size,
                                           batch_size=train_batch_size,
                                           use_cuDF=use_cuDF, num_files=1)
        # Validation and test loaders are not built on this path; only the
        # training loader is needed to drive the trainer below.
        train_loader = batch_dataloader.BatchDataLoader(train_dataset, shuffle=shuffle)
    else:
        train_dataset = load_torch_dataset(train_dir, epoch_size, shuffle_files=False)
        validation_dataset = load_torch_dataset(os.path.join(out_dir, "validation"))
        test_dataset = load_torch_dataset(os.path.join(out_dir, "test"))

        def _build_loader(dataset, batch_size):
            return torch_data.DataLoader(dataset,
                                         batch_size=batch_size,
                                         num_workers=num_workers)

        train_loader = _build_loader(train_dataset, train_batch_size)
        # Built but not consumed further down in this function.
        validation_loader = _build_loader(validation_dataset, validation_batch_size)
        test_loader = _build_loader(test_dataset, validation_batch_size)

    # --- Optimizer, loss and trainer ---
    optimizer = torch_optim.Adam(model.parameters(), lr=learning_rate)
    trainer = create_supervised_trainer(
        model=model,
        optimizer=optimizer,
        loss_fn=F.binary_cross_entropy_with_logits,
        device=device,
    )

    # --- Event handlers ---
    @trainer.on(Events.EPOCH_STARTED)
    def timer(engine):
        # Remember when the epoch began so throughput can be computed per epoch.
        engine.state.epoch_start = time.time()

    batches_per_epoch = len(train_loader)
    examples_per_epoch = batches_per_epoch * train_batch_size

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        state = engine.state
        # 1-based position of the current batch inside the running epoch.
        batch_in_epoch = (state.iteration - 1) % batches_per_epoch + 1
        if batch_in_epoch % log_interval != 0:
            return
        elapsed = time.time() - state.epoch_start
        total_examples = state.iteration * train_batch_size
        examples_this_epoch = total_examples - (state.epoch - 1) * examples_per_epoch
        rate = examples_this_epoch / elapsed
        print(
            "Epoch[{}] Iteration[{}/{}] Loss: {:.5f} Example/s: {:.3f} (Total examples: {})".format(
                state.epoch, batch_in_epoch, batches_per_epoch, state.output,
                rate, total_examples))

    trainer.run(train_loader, max_epochs=max_epochs)

## Script to load large dataset into GPU memory (Random LongTensor)
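#### Each `load_torch_dataset` function below represents a test I did to narrow down the cause of the issue. Run only one of them: they all share the same name, so whichever cell was executed last is the one `run_training` uses.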

In [9]:
import random_batch_dataset as batch_dataset, batch_dataloader

#### This function demonstrates the issue

In [10]:
# Initial test: Create a dataset of random tensors that fit the model (45 wide longtensor, 1 wide float tensor target)
def load_torch_dataset(root_path, num_samples=None, num_files=1, batch_size=1, use_cuDF=False):
    """Build the single-tensor random dataset that demonstrates the issue.

    Only ``batch_size`` is honoured. ``root_path``, ``num_samples``,
    ``num_files`` and ``use_cuDF`` are accepted so the call sites keep working
    but are ignored; the sample count is pinned at 15,000,000 and
    ``cpu_mem=False`` is always requested.
    """
    dataset_kwargs = dict(num_samples=15000000, batch_size=batch_size, cpu_mem=False)
    return batch_dataset.RandomLongBatchDataset(**dataset_kwargs)

#### This alternative function shows it's not an issue when the tensor is split up

In [11]:
# Secondary test: Splitting the single tensor into 3 tensors removes the issue
def load_torch_dataset(root_path, num_samples=None, num_files=1, batch_size=1, use_cuDF=False):
    """Build the multi-tensor random dataset (the variant without the issue).

    Only ``batch_size`` is honoured. ``root_path``, ``num_samples``,
    ``num_files`` and ``use_cuDF`` are accepted so the call sites keep working
    but are ignored; the sample count is pinned at 15,000,000 and
    ``cpu_mem=False`` is always requested.
    """
    dataset_kwargs = dict(num_samples=15000000, batch_size=batch_size, cpu_mem=False)
    return batch_dataset.MultiRandomLongBatchDataset(**dataset_kwargs)

#### This function shows that when the tensors are created separately and then concatenated the issue returns

In [13]:
# Third test: Concatenating the three tensors in the second test into a single tensor.  Issue shows up again
def load_torch_dataset(root_path, num_samples=None, num_files=1, batch_size=1, use_cuDF=False):
    """Build the concatenated random dataset (the issue shows up again).

    Only ``batch_size`` is honoured. ``root_path``, ``num_samples``,
    ``num_files`` and ``use_cuDF`` are accepted so the call sites keep working
    but are ignored; the sample count is pinned at 15,000,000 and
    ``cpu_mem=False`` is always requested.
    """
    dataset_kwargs = dict(num_samples=15000000, batch_size=batch_size, cpu_mem=False)
    return batch_dataset.ConcatRandomLongBatchDataset(**dataset_kwargs)

## Performance issue
The slowdown begins at around 12M examples: examples/s starts dropping, and even after the epoch resets and training returns to the region of memory that used to be fast, throughput stays poor and continues to degrade.

You can see epoch 2 is about 1/2 as fast and performance keeps getting worse.

I also tried offsetting the dataloader so that it starts in that memory region: the slowdown is then immediate, starting at around 25K examples/s, which is even worse than the worst results shown here.

**Note: further testing has shown that this effect only occurs when shuffling the dataset.** Set `shuffle=False` and the slowdown doesn't occur, so this likely has something to do with the random-permutation indexing. I've tried to test that below, but it didn't seem to have the same impact.

In [11]:
# Dataset root. Left empty: the load_torch_dataset variants defined in this
# notebook ignore the path they are given.
out_dir = ""

In [12]:
# Rebind to None first so a model left over from an earlier run of this cell is
# no longer referenced while the new one is constructed (presumably so its GPU
# memory can be released first — TODO confirm).
model = None
# Build the network from the configuration cells above.
model = MortgageNetwork(num_features, embedding_size, hidden_dims,
                        dropout=dropout, activation=activation, use_cuda=True)

In [13]:
# Reproduce the slowdown: pre-batched loader with shuffling switched on
# (with shuffle=False the slowdown was not observed).
run_training(model, batch_dataload=True, shuffle=True)

shuffling batch
Epoch[1] Iteration[250/7324] Loss: 0.00000 Example/s: 112718.623 (Total examples: 512000)
Epoch[1] Iteration[500/7324] Loss: 0.00000 Example/s: 113272.252 (Total examples: 1024000)
Epoch[1] Iteration[750/7324] Loss: 0.00000 Example/s: 113484.826 (Total examples: 1536000)
Epoch[1] Iteration[1000/7324] Loss: 0.00000 Example/s: 113583.441 (Total examples: 2048000)
Epoch[1] Iteration[1250/7324] Loss: 0.00000 Example/s: 113610.944 (Total examples: 2560000)
Epoch[1] Iteration[1500/7324] Loss: 0.00000 Example/s: 113637.484 (Total examples: 3072000)
Epoch[1] Iteration[1750/7324] Loss: 0.00000 Example/s: 113676.748 (Total examples: 3584000)
Epoch[1] Iteration[2000/7324] Loss: 0.00000 Example/s: 113687.516 (Total examples: 4096000)
Epoch[1] Iteration[2250/7324] Loss: 0.00000 Example/s: 113704.248 (Total examples: 4608000)
Epoch[1] Iteration[2500/7324] Loss: 0.00000 Example/s: 113719.250 (Total examples: 5120000)
Epoch[1] Iteration[2750/7324] Loss: 0.00000 Example/s: 113729.681 (T

# Testing tensor indexing

In [11]:
def index(tensor, rmin, rmax, batch_size=2048):
    """Benchmark slicing: square consecutive windows of ``tensor``.

    For every start row ``i`` in ``[rmin, rmax)`` the window
    ``tensor[i:i + batch_size]`` is sliced and multiplied by itself; the result
    is discarded.

    Args:
        tensor: Tensor to slice along its first dimension.
        rmin: First start row (inclusive).
        rmax: Last start row (exclusive).
        batch_size: Number of rows per window. Defaults to 2048, the width the
            benchmark originally hard-coded.

    Returns:
        None. The function exists only to be timed.
    """
    for start in range(rmin, rmax):
        window = tensor[start:start + batch_size]
        window = window * window
    # CUDA work is queued asynchronously, so without waiting here a timer
    # around this call would mostly measure how long it takes to enqueue the
    # kernels rather than how long they take to run.
    if tensor.is_cuda:
        torch.cuda.synchronize()

In [5]:
# 15,000,000 rows x 45 int64 columns (about 5.4 GB), allocated directly on the
# GPU and filled in place by random_(0, 2**22).
features = torch.empty(15000000, 45, dtype=torch.int64, device='cuda').random_(0, 2**22)
# No explicit transfer needed: the tensor is created with device='cuda' above.
#features.cuda()

In [12]:
# Baseline: windows starting at rows [0, 1M) of the freshly generated tensor.
%time index(features,0,1000000)   

CPU times: user 12 s, sys: 28 ms, total: 12.1 s
Wall time: 12.1 s


In [13]:
# Same loop over start rows [13M, 14M), past the ~12M-example mark where
# training throughput started to drop.
%time index(features,13000000,14000000)   

CPU times: user 12 s, sys: 68 ms, total: 12.1 s
Wall time: 12.1 s


In [16]:
# Shuffle the rows once with a random permutation before repeating the timings.
# NOTE(review): this rebinds `features`, so re-running the cell permutes the
# already-permuted tensor again; the indexed result presumably coexists with the
# original on the GPU until the old tensor is released — confirm memory headroom.
idx = torch.randperm(len(features), dtype=torch.int64, device='cuda')
features = features[idx]

In [17]:
# Repeat the [0, 1M) timing on the row-permuted tensor.
%time index(features,0,1000000)   

CPU times: user 11.9 s, sys: 20 ms, total: 11.9 s
Wall time: 11.9 s


In [18]:
# Repeat the [13M, 14M) timing on the row-permuted tensor.
%time index(features,13000000,14000000)   

CPU times: user 12.1 s, sys: 32 ms, total: 12.2 s
Wall time: 12.2 s
