## 0. Set Args

In [1]:
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser

# Declare every notebook-wide setting on the parser itself so each default is
# recorded in one place instead of being attached to the namespace afterwards.
parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
parser.add_argument('--data_dir', type=str, default='/datadrive',
                    help='root directory under which the toy dataset is written and read')
parser.add_argument('--seed', type=int, default=123,
                    help='seed used for dataset generation and torch.manual_seed')
parser.add_argument('--lr', type=float, default=0.01,
                    help='learning rate handed to the optimizers')

# Parse an explicit empty argument list so that only the defaults above apply.
args = parser.parse_args([])

## 1. Off the shelf implementation

In [2]:
import torch
from torch import nn
from torch.nn import functional as F

In [3]:
class LogisticRegressionLazy(nn.Module):
    '''
    Logistic regression assembled from stock PyTorch pieces:
    an `nn.Linear` scorer with a single output, followed by `torch.sigmoid`.
    '''

    def __init__(self, nx):
        '''
        nx: number of input features
        '''
        super().__init__()
        self.scorer = nn.Linear(in_features=nx, out_features=1)

    def forward(self, X):
        '''
        X has shape (m, nx)

        Returns a tuple (logits, probabilities), both of shape (m, 1).
        '''
        logits = self.scorer(X)
        probabilities = torch.sigmoid(logits)
        return logits, probabilities

## 2. With custom linear module and sigmoid function

In [4]:
# extending pytorch (demo of custom function with custom forward backward, custom LinearFunction)
# https://pytorch.org/docs/master/notes/extending.html
# https://github.com/pytorch/pytorch/blob/c9bb990707d4bfe524f3f1c4a77ff85fed1cd2a2/torch/csrc/api/include/torch/nn/functional/loss.h

# pytorch Autograd function (RELU example)
# https://pytorch.org/tutorials/beginner/examples_autograd/two_layer_net_custom_function.html

# discussion custom threshold forward and backward
# https://discuss.pytorch.org/t/how-to-call-only-backward-path-of-pytorch-function/22839/2

# Define custom autograd.Function and put the function in nn.Module
# https://discuss.pytorch.org/t/how-to-call-the-backward-function-of-a-custom-module/7853

class LogisticRegressionCustom(nn.Module):
    '''
    Logistic regression built from the hand-written pieces defined in this
    notebook: `CustomLinearLayer` as the scorer and `CustomSigmoidFunction`
    as the activation, both with custom backward passes.
    '''

    def __init__(self, nx, init_weight, init_bias):
        '''
        nx: number of input features
        init_weight, init_bias: initial parameter values forwarded to CustomLinearLayer
        '''
        super().__init__()
        self.scorer = CustomLinearLayer(nx, init_weight, init_bias)
        # keep the autograd Function's `apply` as a plain callable attribute
        self.sigmoid = CustomSigmoidFunction.apply

    def forward(self, X):
        '''
        X has shape (m, nx)

        Returns a tuple (logits, activations), both of shape (m, ny=1).
        '''
        logits = self.scorer(X)
        return logits, self.sigmoid(logits)

class CustomSigmoidFunction(torch.autograd.Function):
    '''
    Element-wise sigmoid with a hand-written backward pass.

    In this notebook the backward is for demonstration only: the training loop
    feeds the logits (not the activations) to the loss, so no gradient flows
    back through this function during training.
    '''

    @staticmethod
    def forward(ctx, inp):
        '''
        inp: shape(m, ny)

        Returns the activations A = 1 / (1 + exp(-inp)), same shape as inp.
        '''
        A = 1 / (1 + torch.exp(-inp))
        # Cache the output rather than the input: dA/dinp = A * (1 - A) depends
        # on A only, so backward does not have to recompute the exponential.
        ctx.save_for_backward(A)
        return A

    @staticmethod
    def backward(ctx, dA):
        '''
        dA: upstream gradient w.r.t. the activations, shape(m, ny)

        Returns the gradient w.r.t. inp, shape(m, ny).
        '''
        # retrieve cache
        A, = ctx.saved_tensors
        return A * (1 - A) * dA

class CustomLinearFunction(torch.autograd.Function):
    '''
    Affine map Z = W X + b with a hand-written backward pass.
    Examples are stored column-wise: X is (nx, m) and Z is (ny, m).
    '''

    @staticmethod
    def forward(ctx, inp, wt, b):
        '''
        inp: shape(nx, m)
        wt: shape(ny, nx)   (ny=1 for logistic regression)
        b: shape(ny, 1)

        Returns Z with shape(ny, m).
        '''
        # the bias value is not needed to compute any of the gradients
        ctx.save_for_backward(inp, wt)
        # (ny, m) = (ny, nx)(nx, m) + (ny, 1)
        z = wt.mm(inp) + b
        # sanity check on the output shape, valid for any number of output units
        assert z.shape == (wt.shape[0], inp.shape[1])
        return z

    @staticmethod
    def backward(ctx, dZ):
        '''
        dZ: upstream gradient w.r.t. Z, shape(ny, m)

        Returns (grad_inp, grad_wt, grad_b); an entry is None when the
        corresponding forward input does not require a gradient.
        '''
        # retrieve cache
        inp, wt = ctx.saved_tensors
        grad_inp, grad_wt, grad_b = None, None, None

        # Z = W X + b ; skip the products autograd will not use
        if ctx.needs_input_grad[0]:
            # shape(nx, m)
            grad_inp = wt.t().mm(dZ)
        if ctx.needs_input_grad[1]:
            # shape(ny, nx)
            grad_wt = dZ.mm(inp.t())
        if ctx.needs_input_grad[2]:
            # shape(ny, 1): the bias is broadcast over the m columns
            grad_b = torch.sum(dZ, dim=1, keepdim=True)

        return grad_inp, grad_wt, grad_b

    
class CustomLinearLayer(nn.Module):
    '''Linear layer whose forward/backward is implemented by CustomLinearFunction.'''

    def __init__(self, nx, init_weight, init_bias):
        '''
        nx: number of input features
        init_weight: initial weights, array-like or tensor of shape (ny, nx)
        init_bias: initial bias, array-like or tensor of shape (ny, 1)

        Raises ValueError when init_weight is not 2-D with nx columns.
        '''
        super(CustomLinearLayer, self).__init__()
        # Copy the initial values so the parameters never alias the caller's
        # data; detach().clone() also avoids the warning torch.tensor() emits
        # when it is handed an existing tensor.
        weight = torch.as_tensor(init_weight).detach().clone()
        bias = torch.as_tensor(init_bias).detach().clone()
        # Fail at construction time instead of deep inside the matrix product.
        if weight.dim() != 2 or weight.shape[1] != nx:
            raise ValueError(
                "init_weight must have shape (ny, {}), got {}".format(nx, tuple(weight.shape)))
        # init weight and bias
        self.weight = nn.Parameter(weight)
        self.bias = nn.Parameter(bias)

    def forward(self, X):
        '''
        X has shape (m, nx)

        Returns the pre-activations with shape (m, ny).
        '''
        # CustomLinearFunction works column-wise, so transpose in and out:
        # (m, nx) -> (nx, m) -> (ny, m) -> (m, ny)
        z = CustomLinearFunction.apply(X.t(), self.weight, self.bias).t()
        return z

In [5]:
# gradient check Sigmoid
# Create the input directly as a double-precision leaf tensor; calling
# .double() on a float tensor that requires grad yields a non-leaf copy instead.
inp_test = torch.rand(10, 1, dtype=torch.double, requires_grad=True)
assert torch.autograd.gradcheck(CustomSigmoidFunction.apply, (inp_test,), raise_exception=True)

In [6]:
# gradient check CustomLinear
# Create all inputs directly as double-precision leaf tensors; calling
# .double() on a float tensor that requires grad yields a non-leaf copy instead.
inp_test = torch.rand(5, 1000, dtype=torch.double, requires_grad=True)
wt_test = torch.rand(1, 5, dtype=torch.double, requires_grad=True)
b_test = torch.rand(1, 1, dtype=torch.double, requires_grad=True)
assert torch.autograd.gradcheck(CustomLinearFunction.apply, (inp_test, wt_test, b_test), raise_exception=True)

## 3. With custom loss function

In [7]:
# stable loss implementation
# tensorflow demo
# https://www.tensorflow.org/api_docs/python/tf/nn/sigmoid_cross_entropy_with_logits
# Pytorch source code
# https://github.com/pytorch/pytorch/blob/7d6d5f4be0da26079bc81ca49265cde713a75051/aten/src/ATen/native/Loss.cpp#L201

# how to write a pytorch loss autograd.function with backward vs nn.module with only forward
# https://discuss.pytorch.org/t/custom-loss-autograd-module-what-is-the-difference/69251

# DeepLearning Specialization Homework
# https://github.com/Chucooleg/DeepLearning_Specialization_Assignments/blob/master/course%201%20Assignments/Week%202/Logistic%20Regression%20as%20a%20Neural%20Network/Logistic_Regression_with_a_Neural_Network_mindset_v6a.ipynb

class CustomBCEWithLogitLoss(torch.autograd.Function):
    '''
    Custom Binary Cross Entropy Loss with Logits, averaged over all elements.
    Implementation Goal -- Numerically stable implementation.
    '''

    @staticmethod
    def forward(ctx, Z, Y):
        '''
        Z: Pre-Activations(i.e. Logits), shape(m, ny)
        Y: Target labels, same shape as Z

        Returns the scalar mean loss.
        '''
        ctx.save_for_backward(Z, Y)

        # The intuitive form
        #     A = sigmoid(Z); loss = -mean(Y * log(A) + (1 - Y) * log(1 - A))
        # is not numerically stable when Z is a large negative number.
        # Use the rearranged form from the tensorflow documentation instead:
        # https://www.tensorflow.org/api_docs/python/tf/nn/sigmoid_cross_entropy_with_logits
        #     max(Z, 0) - Z * Y + log(1 + exp(-|Z|))
        # clamp keeps the result on Z's device/dtype without allocating a zeros
        # tensor; log1p keeps precision when exp(-|Z|) is tiny.
        per_element = torch.clamp(Z, min=0) - Z * Y + torch.log1p(torch.exp(-torch.abs(Z)))
        return torch.mean(per_element)

    @staticmethod
    def backward(ctx, grad_output):
        '''
        grad_output: upstream gradient w.r.t. the scalar loss

        Returns (grad_Z, grad_Y); an entry is None when the corresponding
        forward input does not require a gradient.
        '''
        # retrieve cache
        Z, Y = ctx.saved_tensors
        grad_Z, grad_Y = None, None
        # forward averages over every element, so the divisor is the element
        # count (equal to m when ny == 1)
        n = Z.numel()

        # https://github.com/pytorch/pytorch/blob/7d6d5f4be0da26079bc81ca49265cde713a75051/aten/src/ATen/native/Loss.cpp#L226
        if ctx.needs_input_grad[0]:
            grad_Z = (torch.sigmoid(Z) - Y) * grad_output / n
        if ctx.needs_input_grad[1]:
            grad_Y = - Z * grad_output / n

        return grad_Z, grad_Y

In [8]:
# Gradcheck a custom loss function
# https://discuss.pytorch.org/t/how-to-check-the-gradients-of-custom-implemented-loss-function/8546

# gradient check CustomBCEWithLogitLoss
# Create both inputs directly as double-precision leaf tensors; calling
# .double() on a float tensor that requires grad yields a non-leaf copy instead.
Z_test = torch.rand(10, 1, dtype=torch.double, requires_grad=True)
Y_test = torch.rand(10, 1, dtype=torch.double, requires_grad=True)
assert torch.autograd.gradcheck(CustomBCEWithLogitLoss.apply, (Z_test, Y_test), raise_exception=True)

## 3. With custom optimizer

In [9]:
# Custom Optimizer Tutorial
# http://mcneela.github.io/machine_learning/2019/09/03/Writing-Your-Own-Optimizers-In-Pytorch.html
# https://huggingface.co/transformers/_modules/transformers/optimization.html#AdamW

from torch.optim import Optimizer

class CustomSGD(Optimizer):
    '''
    Plain stochastic gradient descent: p <- p - lr * grad(p).
    No momentum, weight decay or other extras.
    '''

    def __init__(self, params, lr=1e-3):
        '''
        params: iterable of parameters (or parameter-group dicts) to optimize
        lr: learning rate, must be >= 0

        Raises ValueError for a negative learning rate.
        '''
        if lr < 0.0:
            raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
        defaults = dict(lr=lr)
        super(CustomSGD, self).__init__(params, defaults)

    @torch.no_grad()
    def step(self, closure=None):
        '''
        Performs a single optimization step.

        closure: optional callable that re-evaluates the model and returns the
            loss; when given, it is called once (with gradients enabled) before
            the update and its result is returned.

        Returns the closure's loss, or None when no closure is supplied.
        Raises RuntimeError when a parameter has a sparse gradient.
        '''
        loss = None
        if closure is not None:
            # the update itself runs under no_grad, but the closure has to
            # build a graph to be able to call backward()
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            lr = group['lr']
            for p in group['params']:
                if p.grad is None:
                    continue
                if p.grad.is_sparse:
                    raise RuntimeError("CustomSGD does not support sparse gradients")
                # in-place update: p = p - lr * grad
                p.add_(p.grad, alpha=-lr)

        return loss

## 3. Main Train & Pred Loop

In [10]:
def train(model, train_loader, valid_loader, loss_criterion, optimizer, args, epochs=20):
    '''
    Run `epochs` passes over `train_loader`, validating after every pass.

    The model is expected to return (logits, activations); the loss is computed
    from the logits. `args` is accepted but not read by this function.

    Returns a history dict with per-epoch 'train_losses', 'valid_losses' and
    'valid_accuracy' (each the plain average of the per-batch values), plus
    'weights' and 'bias' snapshots taken once before training and once after
    every epoch.
    '''

    def _average(values):
        return sum(values) / len(values)

    history = dict(
        train_losses=[],
        valid_losses=[],
        valid_accuracy=[],
        weights=[],
        bias=[],
    )

    # snapshot of the initial parameters
    write_param_history(model, history)

    for _ in range(epochs):

        # --- training pass ---
        model.train()
        epoch_losses = []
        for batch in train_loader:
            logits, _ = model(batch['X'])
            loss = loss_criterion(logits, batch['y'])
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_losses.append(loss.item())
        history['train_losses'].append(_average(epoch_losses))

        # --- validation pass ---
        val_losses, val_accuracies = pred(model, valid_loader, loss_criterion)
        history['valid_losses'].append(_average(val_losses))
        history['valid_accuracy'].append(_average(val_accuracies))

        # snapshot of the parameters after this epoch
        write_param_history(model, history)

    return history

def write_param_history(model, history):
    '''
    Append independent numpy copies of `model.scorer.weight` and
    `model.scorer.bias` to history['weights'] and history['bias'].
    '''
    scorer = model.scorer
    for key, param in (('weights', scorer.weight), ('bias', scorer.bias)):
        # detach from autograd and clone so later in-place updates of the
        # parameter do not change the stored snapshot
        history[key].append(param.detach().clone().numpy())

In [11]:
@torch.no_grad()
def pred(model, test_loader, loss_criterion):
    '''
    Propagate forward on a dev or test set and report loss and accuracy.

    Puts the model in eval mode, then for every batch computes the loss from
    the logits and the accuracy of thresholding the activations at 0.5.

    Returns (batch_losses, batch_accuracies): two lists of Python floats with
    one entry per batch.
    '''
    model.eval()

    batch_losses, batch_accuracies = [], []
    for batch in test_loader:
        targets = batch['y']
        logits, activations = model(batch['X'])

        batch_losses.append(loss_criterion(logits, targets).item())

        # hard 0/1 predictions, compared element-wise with the labels
        predicted = (activations > 0.5).type(torch.FloatTensor)
        correct = predicted.eq(targets).type(torch.FloatTensor)
        batch_accuracies.append(torch.mean(correct).item())

    return batch_losses, batch_accuracies

## 4. Make Toy Dataset

In [12]:
# Pytorch Dataloader
# https://pytorch.org/tutorials/beginner/data_loading_tutorial.html

# Pytorch Data Collate (Further reading, not implemented here)
# https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader

import os
import numpy as np
from sklearn.datasets import make_classification
from torch.utils.data import Dataset, DataLoader

class ToyDataset(Dataset):
    """Toy dataset for Logistic Regression, backed by two .npy files."""

    def __init__(self, data_dir):
        """
        Args:
            data_dir (string): Path to the directory holding 'features.npy'
                and 'labels.npy'.
        """
        features_path = os.path.join(data_dir, 'features.npy')
        labels_path = os.path.join(data_dir, 'labels.npy')
        # shape (m, nx)
        self.X = np.load(features_path)
        # shape (m, ny=1)
        self.y = np.load(labels_path)

    def __len__(self):
        """Number of examples, i.e. rows of the feature matrix."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return {'X': features, 'y': labels} for `idx` as float tensors."""
        # tensor indices are converted to plain Python ints / lists first
        index = idx.tolist() if torch.is_tensor(idx) else idx

        features = torch.from_numpy(self.X[index, :]).type(torch.FloatTensor)
        labels = torch.from_numpy(self.y[index, :]).type(torch.FloatTensor)
        return {'X': features, 'y': labels}

In [13]:
# give permission to access /datadrive
# NOTE(review): this recursively makes everything under /datadrive readable,
# writable and executable by every user and requires sudo. Presumably this is a
# private scratch volume; confirm before running on a shared machine. The path
# is hard-coded here rather than taken from args.data_dir.
!sudo chmod -R 777 /datadrive 

In [17]:
# construct and save toydataset

m_train, m_valid, m_test = 90, 500, 500
m_total = m_train + m_valid + m_test

X, y = make_classification(
    n_samples=m_total, n_features=10, n_informative=10, n_redundant=0, n_repeated=0,
    n_classes=2, n_clusters_per_class=4, weights=None, flip_y=0.01, class_sep=1.0,
    hypercube=True, shift=0.0, scale=1.0, shuffle=True, random_state=args.seed)
# labels become a column vector: (m_total,) -> (m_total, 1)
y = np.expand_dims(y, -1)

# use the notebook-wide seed for the split as well, so it has a single source
np.random.seed(args.seed)
permutation = np.random.permutation(m_total)
print('First 10 training indices', permutation[:10])
print('X shape', X.shape)
print('y shape', y.shape)

# consecutive, non-overlapping slices of the permutation
train_indices = permutation[0:m_train]
valid_indices = permutation[m_train:m_train+m_valid]
test_indices = permutation[m_train+m_valid:]
assert len(train_indices) + len(valid_indices) + len(test_indices) == m_total

dataset_dir = 'toy_lr_1'
split_indices = {'train': train_indices, 'valid': valid_indices, 'test': test_indices}

# one sub-directory per split, each holding features.npy and labels.npy
for split_name, indices in split_indices.items():
    split_dir = os.path.join(args.data_dir, dataset_dir, split_name)
    os.makedirs(split_dir, mode = 0o777, exist_ok = True)
    np.save(os.path.join(split_dir, 'features.npy'), X[indices])
    np.save(os.path.join(split_dir, 'labels.npy'), y[indices])

First 10 training indices [1037  655  547  487  307  689  856  309  260  229]
X shape (1090, 10)
y shape (1090, 1)


## 5. Train and compare results on toy dataset

In [18]:
batch_size = 5

# all three splits are read from sub-directories of the same dataset folder
toy_root = os.path.join(args.data_dir, 'toy_lr_1')

# only the training batches are shuffled
training_set = ToyDataset(data_dir=os.path.join(toy_root, 'train'))
training_generator = DataLoader(training_set, batch_size=batch_size, shuffle=True)

validation_set = ToyDataset(data_dir=os.path.join(toy_root, 'valid'))
validation_generator = DataLoader(validation_set, batch_size=batch_size)

test_set = ToyDataset(data_dir=os.path.join(toy_root, 'test'))
test_generator = DataLoader(test_set, batch_size=batch_size)

# m training examples with nx features each; a single output unit
m, nx = training_set.X.shape[0], training_set.X.shape[1]
ny = 1

In [19]:
torch.manual_seed(args.seed)

# off-the-shelf baseline: stock model, stock loss and stock optimizer
model = LogisticRegressionLazy(nx)
loss_criterion_lazy = nn.BCEWithLogitsLoss(reduction='mean')
optimizer_lazy = torch.optim.SGD(model.parameters(), lr=args.lr)

history_off_the_shelf = train(
    model,
    train_loader=training_generator,
    valid_loader=validation_generator,
    loss_criterion=loss_criterion_lazy,
    optimizer=optimizer_lazy,
    args=args,
    epochs=10,
)

In [20]:
# Initial weights of the off-the-shelf model: `train` snapshots the parameters once before the first epoch, so index 0 is the pre-training value.
history_off_the_shelf['weights'][0]

array([[-0.12895013,  0.01047492, -0.15705723,  0.11925378, -0.26944348,
         0.23180881, -0.22984707, -0.25141433, -0.19982024,  0.1432175 ]],
      dtype=float32)

In [21]:
# Initial bias of the off-the-shelf model (snapshot taken by `train` before the first epoch).
history_off_the_shelf['bias'][0]

array([-0.11684369], dtype=float32)

In [22]:
torch.manual_seed(args.seed)

# Start the custom model from exactly the parameters the off-the-shelf model
# started from. Index 0 of its history is the snapshot `train` takes before the
# first epoch; reading it here avoids pasting printed values, which would go
# stale whenever the seed or the number of features changes.
wt_arr = history_off_the_shelf['weights'][0].tolist()
# the recorded bias has shape (1,); the custom layer expects (ny=1, 1)
bias_arr = history_off_the_shelf['bias'][0].reshape(1, 1).tolist()

# NOTE(review): the two runs share initial parameters and seed, yet the
# per-epoch numbers below differ slightly. Presumably the shuffled batch order
# differs because the off-the-shelf run draws random numbers for nn.Linear's
# initialisation after seeding and this run does not; confirm.

# set custom model, loss function and optimizer
model = LogisticRegressionCustom(nx, init_weight=wt_arr, init_bias=bias_arr)
loss_criterion = CustomBCEWithLogitLoss.apply
optimizer = CustomSGD(model.parameters(), lr=args.lr)

history_custom = train(model, training_generator, validation_generator, loss_criterion, optimizer, args, epochs=10)

### 5.1 Cross-comparison on train & valid loss, accuracy and parameter values

In [23]:
# Per-epoch mean training loss as (custom, off-the-shelf) pairs.
list(zip(history_custom['train_losses'], history_off_the_shelf['train_losses']))

[(0.5809669726424747, 0.577768819199668),
 (0.49381006757418316, 0.49348143819305634),
 (0.4390552697910203, 0.43901902271641624),
 (0.4032263747519917, 0.40312086708015865),
 (0.3805066785878605, 0.38032788783311844),
 (0.36376417097118163, 0.36256616645389134),
 (0.3502613811029328, 0.35040000246630776),
 (0.33932048827409744, 0.3394729180468453),
 (0.33107655743757886, 0.33110859327846104),
 (0.3242419502801365, 0.32440519788199)]

In [24]:
# Per-epoch mean validation loss as (custom, off-the-shelf) pairs.
list(zip(history_custom['valid_losses'], history_off_the_shelf['valid_losses']))

[(0.5576036085188388, 0.5574356658756733),
 (0.5193752017617226, 0.5199329514801502),
 (0.49643942549824716, 0.4968894647061825),
 (0.48121415615081786, 0.48186307355761526),
 (0.4706562738120556, 0.47149196460843085),
 (0.4629448476433754, 0.4638245105743408),
 (0.4572421546280384, 0.45781891606748104),
 (0.4529050077497959, 0.453350076302886),
 (0.4493377766013145, 0.44998479798436164),
 (0.4465303386002779, 0.44703921392560003)]

In [25]:
# Per-epoch validation accuracy (average of the per-batch accuracies) as (custom, off-the-shelf) pairs.
list(zip(history_custom['valid_accuracy'], history_off_the_shelf['valid_accuracy']))

[(0.7000000117719174, 0.7000000117719174),
 (0.7420000120997429, 0.7400000122189522),
 (0.7500000119209289, 0.7480000120401382),
 (0.7720000118017196, 0.7660000118613243),
 (0.7760000112652778, 0.7760000112652778),
 (0.7780000108480454, 0.7760000106692314),
 (0.788000010251999, 0.788000010251999),
 (0.7900000101327896, 0.7900000101327896),
 (0.7920000100135803, 0.7920000100135803),
 (0.796000010073185, 0.7980000099539757)]

In [26]:
# Weight snapshots as (custom, off-the-shelf) pairs; the first pair is the shared initial value, followed by one pair per epoch.
list(zip(history_custom['weights'], history_off_the_shelf['weights']))

[(array([[-0.12895013,  0.01047492, -0.15705723,  0.11925378, -0.26944348,
           0.23180881, -0.22984707, -0.25141433, -0.19982024,  0.1432175 ]],
        dtype=float32),
  array([[-0.12895013,  0.01047492, -0.15705723,  0.11925378, -0.26944348,
           0.23180881, -0.22984707, -0.25141433, -0.19982024,  0.1432175 ]],
        dtype=float32)),
 (array([[-0.1495411 , -0.02349061, -0.22837886,  0.07674928, -0.25467896,
           0.21042143, -0.23045243, -0.2415796 , -0.22928452,  0.05251009]],
        dtype=float32),
  array([[-0.15052389, -0.02369768, -0.22956073,  0.07711416, -0.2532521 ,
           0.20928231, -0.2290748 , -0.242651  , -0.22922324,  0.05096457]],
        dtype=float32)),
 (array([[-0.16749202, -0.05449502, -0.2866693 ,  0.04461355, -0.24168415,
           0.19611567, -0.23023418, -0.23195024, -0.252494  , -0.02308795]],
        dtype=float32),
  array([[-0.16806675, -0.05138322, -0.28755304,  0.04552607, -0.23789325,
           0.19817658, -0.22975099, -0.2333

In [27]:
# Bias snapshots as (custom, off-the-shelf) pairs; the first pair is the shared initial value, followed by one pair per epoch.
list(zip(history_custom['bias'], history_off_the_shelf['bias']))

[(array([[-0.11684369]], dtype=float32), array([-0.11684369], dtype=float32)),
 (array([[-0.11910512]], dtype=float32), array([-0.11937625], dtype=float32)),
 (array([[-0.12003764]], dtype=float32), array([-0.11988522], dtype=float32)),
 (array([[-0.11861362]], dtype=float32), array([-0.11849485], dtype=float32)),
 (array([[-0.11625292]], dtype=float32), array([-0.11595139], dtype=float32)),
 (array([[-0.11301926]], dtype=float32), array([-0.11235729], dtype=float32)),
 (array([[-0.10927813]], dtype=float32), array([-0.10855978], dtype=float32)),
 (array([[-0.10487188]], dtype=float32), array([-0.10431983], dtype=float32)),
 (array([[-0.09997234]], dtype=float32), array([-0.09948463], dtype=float32)),
 (array([[-0.09504167]], dtype=float32), array([-0.09434004], dtype=float32)),
 (array([[-0.08987144]], dtype=float32), array([-0.08915619], dtype=float32))]

## 6. Further reading

In [None]:
# Pytorch Data Collate (Further reading, not implemented here)
# https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader

# Karpathy's micrograd: a tiny from-scratch implementation of autodiff, for anyone interested. engine.py is where the meat is.
# https://github.com/karpathy/micrograd/blob/master/micrograd/engine.py