# TODO:

1. [x] load dataset into tensor, convert to float32  
    - [ ] apply normalization
2. [ ] Implement DataParallel training  
    - [ ] increase minibatch size to 128 (32 samples per device)
3. [ ] try training and benchmark speed
4. [ ] fix simulation script to get the correct labels and retrain

GPU datasheet (we have the SXM version): https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf

TODO: switch from DataParallel to DistributedDataParallel at some point, and move everything from the notebook into a training script.

Links:
https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices


In [2]:
import numpy as np
import scipy.io
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, TensorDataset, DataLoader
from pathlib import Path
import h5py
import numpy as np
import dask.array as da
from torchvision.transforms import Normalize
from sklearn.model_selection import train_test_split
import sklearn
import pandas as pd

In [6]:
# Print the name of every CUDA device that PyTorch can see, in index order.
for device_index in range(torch.cuda.device_count()):
    device_name = torch.cuda.get_device_name(device_index)
    print(device_name)

# use devices 0-3 (per the output below, index 4 is the DGX Display adapter)

NVIDIA A100-SXM4-80GB
NVIDIA A100-SXM4-80GB
NVIDIA A100-SXM4-80GB
NVIDIA A100-SXM4-80GB
NVIDIA DGX Display


#### Load matlab data

In [7]:
# Open both MATLAB files read-only through h5py; the reprs below show that they
# are HDF5 containers. The handles are not closed in this cell: later cells wrap
# the datasets in dask arrays that read from these open files when computed.
feats = h5py.File('samplesChirp.mat', 'r')
labels = h5py.File('labelsChirp.mat', 'r')
# labels = scipy.io.loadmat('labelsChirp.mat')  # alternative loader, currently unused
feats, labels

(<HDF5 file "samplesChirp.mat" (mode r)>,
 <HDF5 file "labelsChirp.mat" (mode r)>)

In [8]:
# Peek at the raw HDF5 datasets. Per the output below, `samples` holds 1920
# entries of shape (300, 1024) while `position` has only 480 rows of 3 values,
# so there are 4x more samples than label rows.
feats['samples'], labels['labels']['position']

(<HDF5 dataset "samples": shape (1920, 300, 1024), type "<f8">,
 <HDF5 dataset "position": shape (480, 3), type "<f8">)

In [9]:
# Wrap the HDF5 samples lazily with dask and cast them to float32.
samples_f32 = da.from_array(feats['samples']).astype('float32')
# Insert a singleton channel axis after the sample axis: (N, H, W) -> (N, 1, H, W).
feats_da = samples_f32[:, np.newaxis, :, :]
feats_da

Unnamed: 0,Array,Chunk
Bytes,2.20 GiB,56.25 MiB
Shape,"(1920, 1, 300, 1024)","(48, 1, 300, 1024)"
Count,121 Tasks,40 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 2.20 GiB 56.25 MiB Shape (1920, 1, 300, 1024) (48, 1, 300, 1024) Count 121 Tasks 40 Chunks Type float32 numpy.ndarray",1920  1  1024  300  1,

Unnamed: 0,Array,Chunk
Bytes,2.20 GiB,56.25 MiB
Shape,"(1920, 1, 300, 1024)","(48, 1, 300, 1024)"
Count,121 Tasks,40 Chunks
Type,float32,numpy.ndarray


In [10]:
# Wrap the position labels lazily with dask and cast them to float32.
positions_f32 = da.from_array(labels['labels']['position']).astype('float32')
# Stop-gap until the simulation is fixed: expand the labels to the same length as
# the features by repeating every row 4 times.
# NOTE(review): repeat() emits each row 4 times back-to-back (row 0 x4, row 1 x4, ...);
# confirm that this matches the order in which the samples were generated.
labels_da = da.repeat(positions_f32, 4, axis=0)
labels_da

Unnamed: 0,Array,Chunk
Bytes,22.50 kiB,7.50 kiB
Shape,"(1920, 3)","(640, 3)"
Count,12 Tasks,3 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 22.50 kiB 7.50 kiB Shape (1920, 3) (640, 3) Count 12 Tasks 3 Chunks Type float32 numpy.ndarray",3  1920,

Unnamed: 0,Array,Chunk
Bytes,22.50 kiB,7.50 kiB
Shape,"(1920, 3)","(640, 3)"
Count,12 Tasks,3 Chunks
Type,float32,numpy.ndarray


In [10]:
# Materialise the lazy dask arrays (features first, then labels) and wrap each
# result in a torch tensor.
X, Y = (torch.Tensor(lazy_array.compute()) for lazy_array in (feats_da, labels_da))
# Sanity check: the sample counts should agree and both dtypes should be float32.
X.shape, Y.shape, X.dtype, Y.dtype

(torch.Size([1920, 1, 300, 1024]),
 torch.Size([1920, 3]),
 torch.float32,
 torch.float32)

In [11]:
# Hold out 20% of the samples for testing; random_state fixes the shuffle so the
# split is reproducible.
# NOTE(review): each label row was repeated 4x when labels_da was built, so samples
# that share a position can end up on both sides of this random split — confirm
# that this is acceptable for measuring generalisation.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [18]:
# Scalar statistics over every element of the training features. Only the
# training split is used, so nothing from the test split enters the normalisation.
X_train_mean = torch.mean(X_train)
X_train_std = torch.std(X_train)

NORMALIZATION:
1. create custom Dataset class based on TensorDataset that will apply a normalization transform if provided
2. create train and test datasets, pass in X_train_mean and X_train_std

In [31]:
class CustomTensorDataset(Dataset):
    """Dataset over parallel tensors with an optional transform on the features.

    ``tensors[0]`` is served as the feature sample and ``tensors[1]`` as its
    target. Any additional entries only take part in the length check.

    Args:
        tensors: sequence of tensors that all share the same first-dimension size.
        transforms: optional callable applied to each feature sample on access;
            the target is returned untouched.
    """

    def __init__(self, tensors, transforms=None):
        # Every tensor must hold exactly as many samples as the first one.
        for candidate in tensors:
            assert tensors[0].shape[0] == candidate.shape[0]
        self.tensors = tensors
        self.transforms = transforms

    def __len__(self):
        """Number of samples, taken from the first tensor's leading dimension."""
        features = self.tensors[0]
        return features.shape[0]

    def __getitem__(self, index):
        """Return ``(feature, target)`` for ``index``, transforming the feature if configured."""
        sample = self.tensors[0][index]
        if self.transforms is not None:
            sample = self.transforms(sample)
        return sample, self.tensors[1][index]

In [55]:
# Both splits are normalised with the statistics of the training split, so a
# single Normalize transform is built once and shared.
normalize = Normalize(X_train_mean, X_train_std)
train_dataset = CustomTensorDataset([X_train, Y_train], transforms=normalize)
test_dataset = CustomTensorDataset([X_test, Y_test], transforms=normalize)
train_dataset, test_dataset

(<__main__.CustomTensorDataset at 0x7f5a58098880>,
 <__main__.CustomTensorDataset at 0x7f5a58098e20>)

### sample shape for spectrogram dataset: (minibatch_size, 1, 300, 1024)

In [56]:
# Global minibatch size shared by both loaders.
BATCH_SIZE = 128

# Training batches are reshuffled every epoch.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# The test loss is accumulated over the whole test set, so sample order does not
# matter; keep evaluation deterministic and avoid consuming the global RNG.
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
train_loader, test_loader

(<torch.utils.data.dataloader.DataLoader at 0x7f5a58081520>,
 <torch.utils.data.dataloader.DataLoader at 0x7f5a58081640>)

### Define models and functions

In [11]:
class MyCNN(nn.Module):
    """Three-stage CNN regressor mapping a single-channel 2-D input to 3 outputs.

    Each convolutional stage is Conv2d(3x3, padding=1) -> BatchNorm2d -> ReLU ->
    MaxPool2d(2). The convolution keeps the spatial size and the pooling halves it,
    rounding down. Channel widths go 1 -> 16 -> 32 -> 64. The flattened feature map
    is then fed through linear layers of width 100, 20 and 3 with ReLU in between.
    Dropout is not used.

    Args:
        input_hw: ``(height, width)`` of the inputs the model will receive. It is
            only used to size the first linear layer. The default ``(300, 1024)``
            gives 37 * 128 * 64 = 303104 flattened features.

    Raises:
        ValueError: if ``input_hw`` is too small to survive the three poolings.
    """

    _NUM_POOLS = 3       # number of MaxPool2d(2) stages in ``seq``
    _OUT_CHANNELS = 64   # channels produced by the last convolution

    def __init__(self, input_hw=(300, 1024)):
        super().__init__()
        self.seq = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(32, self._OUT_CHANNELS, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(self._OUT_CHANNELS),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        linear_in_dim = self._flat_features(input_hw)
        self.linear1 = nn.Linear(linear_in_dim, 100)
        self.linear2 = nn.Linear(100, 20)
        self.linear3 = nn.Linear(20, 3)

    @staticmethod
    def _flat_features(input_hw):
        """Return the flattened feature count that ``seq`` produces for ``input_hw``."""
        height, width = input_hw
        # Each pooling floors odd sizes (e.g. 300 -> 150 -> 75 -> 37), so the
        # halving has to be applied step by step rather than dividing by 8 once.
        for _ in range(MyCNN._NUM_POOLS):
            height //= 2
            width //= 2
        if height < 1 or width < 1:
            raise ValueError(
                "input_hw={!r} is too small for {} 2x2 poolings".format(
                    tuple(input_hw), MyCNN._NUM_POOLS
                )
            )
        return MyCNN._OUT_CHANNELS * height * width

    def forward(self, x):
        """Map a ``(batch, 1, H, W)`` input to a ``(batch, 3)`` prediction."""
        out = self.seq(x)
        # flatten() also copes with non-contiguous feature maps, unlike view().
        out = out.flatten(start_dim=1)  # (batch size, features)
        out = F.relu(self.linear1(out))
        out = F.relu(self.linear2(out))
        out = self.linear3(out)
        return out
        
class SimpleNN(nn.Module):
    """Small fully connected baseline.

    The input is flattened to 768 features per sample and passed through linear
    layers of width 20, 10 and 3, with a ReLU after each of the first two.
    """

    def __init__(self):
        super().__init__()
        layers = [
            nn.Flatten(),
            nn.Linear(768, 20),
            nn.ReLU(),
            nn.Linear(20, 10),
            nn.ReLU(),
            nn.Linear(10, 3),
        ]
        self.seq = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the stack and return the ``(batch, 3)`` prediction."""
        prediction = self.seq(x)
        return prediction

def EucLoss(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    """Mean Euclidean (L2) distance between matching 3-vectors.

    Args:
        a: tensor whose last dimension has size 3.
        b: tensor with exactly the same shape as ``a``.

    Returns:
        Scalar tensor: the L2 norm of ``a - b`` over the last dimension,
        averaged over all remaining dimensions.

    Raises:
        AssertionError: if the shapes differ or the last dimension is not 3.
    """
    assert a.shape == b.shape
    assert b.shape[-1] == 3
    # vector_norm has a zero gradient where a == b. The hand-written
    # sqrt(sum(square)) backpropagates NaN there (inf * 0), which would poison
    # every parameter as soon as one prediction matches its label exactly.
    return torch.linalg.vector_norm(a - b, ord=2, dim=-1).mean()

def EucLossSquared(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    """Mean squared Euclidean distance between matching 3-vectors.

    Both tensors must have the same shape with a last dimension of size 3.
    The squared differences are summed over that last dimension and the
    resulting per-vector values are averaged into a scalar.
    """
    assert a.shape == b.shape
    assert b.shape[-1] == 3
    delta = a - b
    per_vector = torch.sum(torch.square(delta), dim=-1)
    return torch.mean(per_vector)


In [19]:
# Single-device alternative, kept for reference:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model = SimpleNN().to(device)

# Wrap MyCNN in DataParallel over GPUs 0-3, then move the wrapped module to CUDA.
gpu_ids = [0, 1, 2, 3]
model = nn.DataParallel(MyCNN(), device_ids=gpu_ids).cuda()
model

DataParallel(
  (module): MyCNN(
    (seq): Sequential(
      (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (4): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (5): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (6): ReLU()
      (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (8): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (9): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (10): ReLU()
      (11): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (linear1): Linear(in_features=303104, out_features=100, bias=True)
    (linear2): Linear(in_features=100, out_feature

In [20]:
# Training criterion: EucLoss (mean Euclidean distance between prediction and label).
crit = EucLoss
# crit = nn.L1Loss()

# SGD with momentum over all model parameters.
sgd_settings = dict(lr=0.01, momentum=0.9)
optimizer = torch.optim.SGD(model.parameters(), **sgd_settings)
# optimizer = torch.optim.RMSprop(model.parameters())


#### Training/Evaluating NN

In [21]:
num_epochs = 500
# One row per epoch; column 0 holds the training loss, column 1 the test loss.
loss_tracker = np.zeros((num_epochs, 2))

num_train_batches = len(train_loader)
num_test_batches = len(test_loader)

for epoch in range(num_epochs):
    train_loss = 0
    test_loss = 0

    # ---- training pass ----
    model = model.train()
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        predictions = model(inputs)
        targets = targets.cuda()
        batch_loss = crit(predictions, targets)
        batch_loss.backward()
        optimizer.step()
        # Weight the batch loss by the batch size so that dividing by the dataset
        # size below yields a per-sample average.
        train_loss += batch_loss.item() * inputs.shape[0]
    train_loss /= len(train_dataset)
    loss_tracker[epoch, 0] = train_loss

    # ---- evaluation pass (no gradients) ----
    model = model.eval()
    with torch.no_grad():
        for inputs, targets in test_loader:
            predictions = model(inputs)
            targets = targets.cuda()
            batch_loss = crit(predictions, targets)
            test_loss += batch_loss.item() * inputs.shape[0]
    test_loss /= len(test_dataset)
    loss_tracker[epoch, 1] = test_loss

    print('Epoch {} | Training loss = {} | Test loss = {}'.format(epoch, train_loss, test_loss))

# Plot both loss curves and write the figure to ./loss_plots.
img_dir = Path('./loss_plots')
img_dir.mkdir(parents=True, exist_ok=True)

fig, ax = plt.subplots()
ax.plot(loss_tracker)
ax.set_title('Training vs Testing Loss (Mean Loss Per Batch)')
ax.legend(['Training loss', 'Test loss'])
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
fig.savefig(img_dir / 'spectrogram_dset_dgx.png')

# Saving the trained weights is currently disabled:
# model_dir = Path('./models')
# model_dir.mkdir(parents=True, exist_ok=True)
# torch.save(model.state_dict(), model_dir / 'spectrogram_dset_dgx.pth')



NameError: name 'train_loader' is not defined

### Notes:

The CNN seemed to help accuracy, as did adding more linear layers. However, the model is overfitting heavily. Batchnorm didn't really make a difference, and dropout seems to make things worse. Next step: try running on Stampede.