# ML Model with pyTorch

In [2]:
import os

import time
from time import gmtime, strftime

import pandas as pd
import numpy as np
import math
import json
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'


## Import the preprocessed data

In [5]:
features = pd.read_csv('data/features_completed.csv', index_col=0)
labels = pd.read_csv('data/labels_completed.csv', index_col=0)

In [6]:
features.shape, labels.shape

((63288, 16), (63288, 1))

In [7]:
features

Unnamed: 0,age,income,reward,difficulty,duration,email,mobile,social,web,F,M,O,U,bogo,discount,informational
0,0.180723,0.466667,5,5,120,1,1,1,1,0,1,0,0,1,0,0
1,0.072289,0.333333,5,5,120,1,1,1,1,1,0,0,0,1,0,0
2,0.445783,0.488889,5,5,120,1,1,1,1,1,0,0,0,1,0,0
3,0.433735,0.766667,5,5,120,1,1,1,1,0,1,0,0,1,0,0
4,0.530120,0.566667,5,5,120,1,1,1,1,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63283,0.438476,0.393389,10,10,168,1,1,1,0,0,0,0,1,1,0,0
63284,0.438476,0.393389,10,10,168,1,1,1,0,0,0,0,1,1,0,0
63285,0.438476,0.393389,10,10,168,1,1,1,0,0,0,0,1,1,0,0
63286,0.759036,0.388889,10,10,168,1,1,1,0,1,0,0,0,1,0,0


## Create Training, Validation and Testdata
To avoid overfitting I split the train data additional in validation data.

In [8]:
from sklearn.model_selection import train_test_split 

In [9]:
# We split the dataset into 2/3 training and 1/3 testing sets.
X_train, X_test, Y_train, Y_test = train_test_split(features, targets, test_size=0.33)

# Then we split the training set further into 2/3 training and 1/3 validation sets.
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.33)

In [10]:
X_train.shape, X_val.shape

((28409, 16), (13993, 16))

In [11]:
# Define the data directory and make sure that the directory exists
data_dir = 'data'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

## Create csv files for test, validation and train data

In [39]:
Y_test

Unnamed: 0,binary_target
29285,1
32350,1
34646,1
2864,0
39901,1
...,...
44106,0
11231,1
31283,0
62075,1


In [40]:
# We use pandas to save our test, train and validation data to csv files. Note that we make sure not to include header
# information or an index as this is required by the built in algorithms provided by Amazon. Also, for the train and
# validation data, it is assumed that the first entry in each row is the target variable.

pd.concat([Y_test, X_test], axis=1).to_csv(os.path.join(data_dir, 'test_completed_torch.csv'), header=False, index=False)

pd.concat([Y_val, X_val], axis=1).to_csv(os.path.join(data_dir, 'validation_completed_torch.csv'), header=False, index=False)
pd.concat([Y_train, X_train], axis=1).to_csv(os.path.join(data_dir, 'train_completed_torch.csv'), header=False, index=False)

### Create a PyTorch Estimator

You've had some practice instantiating built-in models in SageMaker. All estimators require some constructor arguments to be passed in. When a custom model is constructed in SageMaker, an **entry point** must be specified. The entry_point is the training script that will be executed when the model is trained; the `train.py` function you specified above! 

#### Model size
 I found an interisting article how to choose the number and size of hidden layers [choosing-number-hidden-layers-neurons-neural-networks](https://www.linkedin.com/pulse/choosing-number-hidden-layers-neurons-neural-networks-sachdev/)

#### Instance Types

It is suggested that you use instances that are available in the free tier of usage: `'ml.c4.xlarge'` for training and `'ml.t2.medium'` for deployment.

In [13]:
# number of input features
input_dim = features.shape[1]
# choose 3 hidden layers for input features count
# Start with 8 neurons for layer 1
hidden_1 = 8
hidden_2 = 4
hidden_3 = 2
output_dim = 1

In [14]:
input_dim, hidden_1, hidden_2, hidden_3, output_dim

(16, 8, 4, 2, 1)

In [247]:
# code to evaluate the endpoint on test data
# returns a variety of model metrics
def evaluate(predictor, test_features, test_labels, verbose=True):
    """
    Evaluate a model on a test set given the prediction endpoint.  
    Return binary classification metrics.
    :param predictor: A prediction endpoint
    :param test_features: Test features
    :param test_labels: Class labels for test data
    :param verbose: If True, prints a table of all performance metrics
    :return: A dictionary of performance metrics.
    """
    
    # rounding and squeezing array
    test_preds = np.squeeze(np.round(predictor.predict(test_features)))
    
    # calculate true positives, false positives, true negatives, false negatives
    tp = np.logical_and(test_labels, test_preds).sum()
    fp = np.logical_and(1-test_labels, test_preds).sum()
    tn = np.logical_and(1-test_labels, 1-test_preds).sum()
    fn = np.logical_and(test_labels, 1-test_preds).sum()
    
    # calculate binary classification metrics
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    accuracy = (tp + tn) / (tp + fp + tn + fn)
    
    # print metrics
    if verbose:
        print(pd.crosstab(test_labels, test_preds, rownames=['actuals'], colnames=['predictions']))
        print("\n{:<11} {:.3f}".format('Recall:', recall))
        print("{:<11} {:.3f}".format('Precision:', precision))
        print("{:<11} {:.3f}".format('Accuracy:', accuracy))
        print()
        
    return {'TP': tp, 'FP': fp, 'FN': fn, 'TN': tn, 
            'Precision': precision, 'Recall': recall, 'Accuracy': accuracy}




In [54]:
import torch
import torch.nn as nn
import torch.nn.functional as F

## TODO: Complete this classifier
class SimpleNet(nn.Module):
    
    ## TODO: Define the init function
    def __init__(self, input_dim, hidden_1, hidden_2, hidden_3, output_dim):
        '''Defines layers of a neural network.
           :param input_dim: Number of input features
           :param hidden_dim: Size of hidden layer(s)
           :param output_dim: Number of outputs
         '''
        super(SimpleNet, self).__init__()
        
        # define all layers, here
        self.fc1 = nn.Linear(input_dim, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, hidden_3)
        self.fc4 = nn.Linear(hidden_3, output_dim)
        # Define dropout
        self.drop = nn.Dropout(0.1)
        # Sigmoid Layer
        self.sig = nn.Sigmoid()
        
    
    ## TODO: Define the feedforward behavior of the network
    def forward(self, x):
        '''Feedforward behavior of the net.
           :param x: A batch of input features
           :return: A single, sigmoid activated value
         '''
        # your code, here
        out = F.relu(self.fc1(x))
        out = self.drop(out)
        out = F.relu(self.fc2(out))
        out = self.drop(out)
        out = F.relu(self.fc3(out))
        out = self.drop(out)
        out = self.fc4(out)
      
        return self.sig(out)

In [55]:
model = SimpleNet(input_dim, hidden_1, hidden_2, hidden_3, output_dim)
model

SimpleNet(
  (fc1): Linear(in_features=16, out_features=8, bias=True)
  (fc2): Linear(in_features=8, out_features=4, bias=True)
  (fc3): Linear(in_features=4, out_features=2, bias=True)
  (fc4): Linear(in_features=2, out_features=1, bias=True)
  (drop): Dropout(p=0.1, inplace=False)
  (sig): Sigmoid()
)

In [56]:
# Load the training data from a csv file
def _get_train_loader(batch_size, data_dir, file):
    print("Get train loader.")

    # read in csv file
    train_data = pd.read_csv(os.path.join(data_dir, file), header=None, names=None)

    # labels are first column
    train_y = torch.from_numpy(train_data[[0]].values).float()
    # features are the rest
    train_x = torch.from_numpy(train_data.drop([0], axis=1).values).float()

    # create dataset
    train_ds = torch.utils.data.TensorDataset(train_x, train_y)

    return torch.utils.data.DataLoader(train_ds, batch_size=batch_size)

# Load the training data from a csv file
def _get_validation_loader(batch_size, data_dir, file):
    print("Get validation loader.")

    # read in csv file
    val_data = pd.read_csv(os.path.join(data_dir, file), header=None, names=None)

    # labels are first column
    val_y = torch.from_numpy(val_data[[0]].values).float()
    # features are the rest
    val_x = torch.from_numpy(val_data.drop([0], axis=1).values).float()

    # create dataset
    val_ds = torch.utils.data.TensorDataset(val_x, val_y)

    return torch.utils.data.DataLoader(val_ds, batch_size=batch_size)

# Load the training data from a csv file
def _get_test_loader(batch_size, data_dir, file):
    print("Get test loader.")

    # read in csv file
    test_data = pd.read_csv(os.path.join(data_dir, file), header=None, names=None)

    # labels are first column
    test_y = torch.from_numpy(test_data[[0]].values).float()
    # features are the rest
    test_x = torch.from_numpy(test_data.drop([0], axis=1).values).float()

    # create dataset
    test_ds = torch.utils.data.TensorDataset(test_x, test_y)

    return torch.utils.data.DataLoader(test_ds, batch_size=batch_size)


# Provided train function
def train(model, train_loader, validation_loader, epochs, optimizer, criterion, device):
    """
    This is the training method that is called by the PyTorch training script. The parameters
    passed are as follows:
    model        - The PyTorch model that we wish to train.
    train_loader - The PyTorch DataLoader that should be used during training.
    epochs       - The total number of epochs to train for.
    optimizer    - The optimizer to use during training.
    criterion    - The loss function used for training. 
    device       - Where the model and data should be loaded (gpu or cpu).
    """
    valid_loss_min = np.Inf # track change in validation loss
    
    for epoch in range(1, epochs + 1):
        model.train()
        total_loss = 0
        valid_loss = 0
        for batch_idx, (data, target) in enumerate(train_loader, 1):
            # prep data
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad() # zero accumulated gradients
            # get output of SimpleNet
            output = model(data)
            # calculate loss and perform backprop
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
    
            total_loss += loss.item()

        ######################    
        # validate the model #
        ######################
        model.eval()
 
        for batch_idx, (data, target) in enumerate(validation_loader, 1):
            # prep data
            data, target = data.to(device), target.to(device)
            # get output of SimpleNet
            output = model(data)
            # calculate loss and perform backprop
            loss = criterion(output, target)
            # update average validation loss 
            valid_loss += loss.item()*data.size(0)

        # print loss stats
        print("Epoch: {}, Loss: {}".format(epoch, total_loss / len(train_loader)))

        # print training/validation statistics 
        print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
            epoch, total_loss / len(train_loader), valid_loss / len(validation_loader)))

        # save model if validation loss has decreased
        if valid_loss <= valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  \
                   Saving model ...'.format(valid_loss_min/len(validation_loader), valid_loss/len(validation_loader)))
            torch.save(model.state_dict(), 'model_cifar.pt')
            # Set model back to device after saving
            model.to(device)
            valid_loss_min = valid_loss        
        
    # save after all epochs
    save_model(model, model_dir)


# Provided model saving functions
def save_model(model, model_dir):
    print("Saving the model.")
    path = os.path.join(model_dir, 'model.pth')
    # save state dictionary
    torch.save(model.cpu().state_dict(), path)
    
def save_model_params(model, model_dir):
    model_info_path = os.path.join(args.model_dir, 'model_info.pth')
    with open(model_info_path, 'wb') as f:
        model_info = {
            'input_dim': args.input_dim,
            'hidden_1': args.hidden_1,
            'hidden_2': args.hidden_2,
            'hidden_3': args.hidden_3,
            'output_dim': args.output_dim
        }
        torch.save(model_info, f)




In [57]:
data_dir = 'data'
model_dir = 'data'
train_loader = _get_train_loader(batch_size=64, data_dir=data_dir, file='train_completed_torch.csv')
validation_loader = _get_validation_loader(batch_size=64, data_dir=data_dir, file='validation_completed_torch.csv')
test_loader = _get_test_loader(batch_size=64, data_dir=data_dir, file='test_completed_torch.csv')

Get train loader.
Get validation loader.
Get test loader.


In [58]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device


device(type='cpu')

In [59]:
%%time

import torch.optim as optim


## TODO: Define an optimizer and loss function for training
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()



train(model, train_loader, validation_loader, 80, optimizer, criterion, device)



Epoch: 1, Loss: 0.6905444115400314
Epoch: 1 	Training Loss: 0.690544 	Validation Loss: 42.688085
Validation loss decreased (inf --> 42.688085).                     Saving model ...
Epoch: 2, Loss: 0.665485772195163
Epoch: 2 	Training Loss: 0.665486 	Validation Loss: 41.989956
Validation loss decreased (42.688085 --> 41.989956).                     Saving model ...
Epoch: 3, Loss: 0.6574403902700355
Epoch: 3 	Training Loss: 0.657440 	Validation Loss: 41.389905
Validation loss decreased (41.989956 --> 41.389905).                     Saving model ...
Epoch: 4, Loss: 0.6500213009280127
Epoch: 4 	Training Loss: 0.650021 	Validation Loss: 40.930405
Validation loss decreased (41.389905 --> 40.930405).                     Saving model ...
Epoch: 5, Loss: 0.6379747110175656
Epoch: 5 	Training Loss: 0.637975 	Validation Loss: 39.683065
Validation loss decreased (40.930405 --> 39.683065).                     Saving model ...
Epoch: 6, Loss: 0.6280063281993609
Epoch: 6 	Training Loss: 0.628006 	Va

In [60]:
for data, labels in test_loader:
    print(data.shape)
    break

torch.Size([64, 16])


In [61]:
# Get test data loss and accuracy

test_losses = [] # track loss
num_correct = 0

# init hidden state
#h = net.init_hidden(batch_size)

model.eval()


for data, targets in test_loader:

    # get output of SimpleNet
    output = model(data)
    # calculate loss and perform backprop
    test_loss = criterion(output, targets)

    test_losses.append(test_loss.item())

    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())  # rounds to the nearest integer

    # compare predictions to true label
    correct_tensor = pred.eq(targets.float().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy())
    num_correct += np.sum(correct)


# -- stats! -- ##
# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))


Test loss: 0.592
Test accuracy: 0.697
