# ML Model with pyTorch

In [1]:
import os

import time
from time import gmtime, strftime

import pandas as pd
import numpy as np
import math
import json
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'


## Import the preprocessed data

In [2]:
features = pd.read_csv('data/preprocessed_features_viewed.csv', index_col=0)
targets = pd.read_csv('data/preprocessed_targets_viewed.csv', index_col=0)

In [4]:
features.shape, targets.shape

((76277, 10), (76277, 1))

In [5]:
features

Unnamed: 0,age,income,M,F,O,U,email,mobile,social,web
0,0.438476,0.393389,0,0,0,1,1.0,1.0,0.0,1.0
2,0.602410,0.444444,1,0,0,0,1.0,1.0,0.0,1.0
4,0.438476,0.393389,0,0,0,1,1.0,1.0,0.0,1.0
6,0.438476,0.393389,0,0,0,1,1.0,1.0,0.0,1.0
8,0.566265,0.255556,1,0,0,0,1.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
306125,0.132530,0.311111,0,1,0,0,1.0,1.0,1.0,1.0
306128,0.650602,0.388889,0,1,0,0,1.0,1.0,1.0,1.0
306130,0.438476,0.393389,0,0,0,1,1.0,1.0,1.0,1.0
306132,0.325301,0.266667,0,1,0,0,1.0,1.0,1.0,1.0


## Create Training, Validation and Testdata
To avoid overfitting I split the train data additional in validation data.

In [6]:
from sklearn.model_selection import train_test_split 

In [7]:
# We split the dataset into 2/3 training and 1/3 testing sets.
X_train, X_test, Y_train, Y_test = train_test_split(features, targets, test_size=0.33)

# Then we split the training set further into 2/3 training and 1/3 validation sets.
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.33)

In [8]:
X_train.shape, X_val.shape

((34240, 10), (16865, 10))

In [9]:
# Define the data directory and make sure that the directory exists
data_dir = 'data'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

In [10]:
df = pd.concat([targets, features], axis=1)
df.viewed.value_counts()
#df.query('viewed == 1 and completed == 1').social.value_counts()
#df.query('viewed == -1 and completed == 1').social.value_counts()
#df.query('viewed == 1 and completed == 0').social.value_counts()
#df.query('viewed == -1 and completed == 0').social.value_counts()

1    56895
0    19382
Name: viewed, dtype: int64

## Create csv files for test, validation and train data

In [68]:
pd.concat([Y_test, X_test])

Unnamed: 0,viewed,age,income,M,F,O,U,email,mobile,social,web
253181,1.0,,,,,,,,,,
178496,1.0,,,,,,,,,,
49,1.0,,,,,,,,,,
299287,1.0,,,,,,,,,,
239201,0.0,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
174983,,0.746988,0.500000,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
290422,,0.469880,0.388889,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
194609,,0.349398,0.400000,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0
252605,,0.710843,0.811111,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0


In [74]:
# We use pandas to save our test, train and validation data to csv files. Note that we make sure not to include header
# information or an index as this is required by the built in algorithms provided by Amazon. Also, for the train and
# validation data, it is assumed that the first entry in each row is the target variable.

pd.concat([Y_test, X_test], axis=1).to_csv(os.path.join(data_dir, 'test_viewed.csv'), header=False, index=False)

pd.concat([Y_val, X_val], axis=1).to_csv(os.path.join(data_dir, 'validation_viewed.csv'), header=False, index=False)
pd.concat([Y_train, X_train], axis=1).to_csv(os.path.join(data_dir, 'train_viewed.csv'), header=False, index=False)

## Import the sagemaker specific classes and functions

In [14]:
import sagemaker
from sagemaker import get_execution_role
#from sagemaker.amazon.amazon_estimator import get_image_uri

# This is an object that represents the SageMaker session that we are currently operating in. This
# object contains some useful information that we will need to access later such as our region.
session = sagemaker.Session()

# This is an object that represents the IAM role that we are currently assigned. When we construct
# and launch the training job later we will need to tell it what IAM role it should have. Since our
# use case is relatively simple we will simply assign the training job the role we currently have.
role = get_execution_role()

## Define a prefix for s3 data upload and upload the createrd files

In [15]:
prefix = 'capstone_completed'

test_location = session.upload_data(os.path.join(data_dir, 'test_viewed.csv'), key_prefix=prefix)
val_location = session.upload_data(os.path.join(data_dir, 'validation_viewed.csv'), key_prefix=prefix)
train_location = session.upload_data(os.path.join(data_dir, 'train_viewed.csv'), key_prefix=prefix)

In [19]:
# specify where to upload in S3
prefix = 'capstone_pytorch'
bucket = session.default_bucket()
# upload to S3
input_data = session.upload_data(path=data_dir, bucket=bucket, key_prefix=prefix)
print(input_data)

s3://sagemaker-eu-central-1-647915836300/capstone_pytorch


In [18]:
input_data

's3://sagemaker-eu-central-1-647915836300/capstone_pytorch'

### Create a PyTorch Estimator

You've had some practice instantiating built-in models in SageMaker. All estimators require some constructor arguments to be passed in. When a custom model is constructed in SageMaker, an **entry point** must be specified. The entry_point is the training script that will be executed when the model is trained; the `train.py` function you specified above! 

#### Model size
 I found an interisting article how to choose the number and size of hidden layers [choosing-number-hidden-layers-neurons-neural-networks](https://www.linkedin.com/pulse/choosing-number-hidden-layers-neurons-neural-networks-sachdev/)

#### Instance Types

It is suggested that you use instances that are available in the free tier of usage: `'ml.c4.xlarge'` for training and `'ml.t2.medium'` for deployment.

In [31]:
# number of input features
input_dim = features.shape[1]
# choose 3 hidden layers for input features count
# Start with 8 neurons for layer 1
hidden_1 = 8
hidden_2 = 4
hidden_3 = 2
output_dim = 1

In [32]:
input_dim, hidden_1, hidden_2, hidden_3, output_dim

(10, 8, 4, 2, 1)

In [39]:
container = '763104351884.dkr.ecr.eu-central-1.amazonaws.com/pytorch-training:1.9.0-gpu-py38-cu111-ubuntu20.04'
container


'763104351884.dkr.ecr.eu-central-1.amazonaws.com/pytorch-training:1.9.0-gpu-py38-cu111-ubuntu20.04'

In [146]:
# import a PyTorch wrapper
from sagemaker.pytorch import PyTorch

# specify an output path
output_path = 's3://{}/{}'.format(bucket, prefix)

# instantiate a pytorch estimator
estimator = PyTorch(entry_point='train.py',
                    source_dir='source',
                    role=role,
                    image_uri=container,
                    #framework_version='latest',
                    instance_count=1,
                    instance_type='ml.c4.xlarge',
                    output_path=output_path,
                    sagemaker_session=session,
                    hyperparameters={
                        'input_dim': input_dim,
                        'hidden_1': hidden_1,
                        'hidden_2': hidden_2,
                        'hidden_3': hidden_3,
                        'output_dim': 1,
                        'epochs': 80, # could change to higher
                        'batch_size': 100
                    })



## Train the Estimator

After instantiating your estimator, train it with a call to `.fit()`. The `train.py` file explicitly loads in `.csv` data, so you do not need to convert the input data to any other format.

In [210]:
%%time
#estimator.fit({'train': input_data})

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 7.39 µs


In [245]:
%%time
# importing PyTorchModel

container = '763104351884.dkr.ecr.eu-central-1.amazonaws.com/pytorch-inference:1.9.0-gpu-py38-cu111-ubuntu20.04'

from sagemaker.pytorch import PyTorchModel

# Create a model from the trained estimator data
# And point to the prediction script
model = PyTorchModel(model_data=estimator.model_data,
                     role=role,
                     #image_uri=container,
                     framework_version='1.0',
                     py_version='py3',
                     entry_point='predict.py',
                     source_dir='source')



CPU times: user 10.5 ms, sys: 3.68 ms, total: 14.2 ms
Wall time: 54.1 ms


In [246]:
%%time
# deploy and create a predictor
predictor = model.deploy(initial_instance_count=1, instance_type='ml.t2.medium')

---------------------------------*

UnexpectedStatusException: Error hosting endpoint sagemaker-pytorch-2021-09-22-14-08-23-539: Failed. Reason:  The primary container for production variant AllTraffic did not pass the ping health check. Please check CloudWatch logs for this endpoint..

In [247]:
# code to evaluate the endpoint on test data
# returns a variety of model metrics
def evaluate(predictor, test_features, test_labels, verbose=True):
    """
    Evaluate a model on a test set given the prediction endpoint.  
    Return binary classification metrics.
    :param predictor: A prediction endpoint
    :param test_features: Test features
    :param test_labels: Class labels for test data
    :param verbose: If True, prints a table of all performance metrics
    :return: A dictionary of performance metrics.
    """
    
    # rounding and squeezing array
    test_preds = np.squeeze(np.round(predictor.predict(test_features)))
    
    # calculate true positives, false positives, true negatives, false negatives
    tp = np.logical_and(test_labels, test_preds).sum()
    fp = np.logical_and(1-test_labels, test_preds).sum()
    tn = np.logical_and(1-test_labels, 1-test_preds).sum()
    fn = np.logical_and(test_labels, 1-test_preds).sum()
    
    # calculate binary classification metrics
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    accuracy = (tp + tn) / (tp + fp + tn + fn)
    
    # print metrics
    if verbose:
        print(pd.crosstab(test_labels, test_preds, rownames=['actuals'], colnames=['predictions']))
        print("\n{:<11} {:.3f}".format('Recall:', recall))
        print("{:<11} {:.3f}".format('Precision:', precision))
        print("{:<11} {:.3f}".format('Accuracy:', accuracy))
        print()
        
    return {'TP': tp, 'FP': fp, 'FN': fn, 'TN': tn, 
            'Precision': precision, 'Recall': recall, 'Accuracy': accuracy}




In [20]:
import torch
import torch.nn as nn
import torch.nn.functional as F

## TODO: Complete this classifier
class SimpleNet(nn.Module):
    
    ## TODO: Define the init function
    def __init__(self, input_dim, hidden_1, hidden_2, hidden_3, output_dim):
        '''Defines layers of a neural network.
           :param input_dim: Number of input features
           :param hidden_dim: Size of hidden layer(s)
           :param output_dim: Number of outputs
         '''
        super(SimpleNet, self).__init__()
        
        # define all layers, here
        self.fc1 = nn.Linear(input_dim, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, hidden_3)
        self.fc4 = nn.Linear(hidden_3, output_dim)
        # Define dropout
        self.drop = nn.Dropout(0.3)
        # Sigmoid Layer
        self.sig = nn.Sigmoid()
        
    
    ## TODO: Define the feedforward behavior of the network
    def forward(self, x):
        '''Feedforward behavior of the net.
           :param x: A batch of input features
           :return: A single, sigmoid activated value
         '''
        # your code, here
        out = F.relu(self.fc1(x))
        out = self.drop(out)
        out = F.relu(self.fc2(out))
        out = self.drop(out)
        out = F.relu(self.fc3(out))
        out = self.drop(out)
        out = self.fc4(out)
      
        return self.sig(out)

In [41]:
# Load the training data from a csv file
def _get_train_loader(batch_size, data_dir):
    print("Get train loader.")

    # read in csv file
    train_data = pd.read_csv(os.path.join(data_dir, "train_viewed.csv"), header=None, names=None)

    # labels are first column
    train_y = torch.from_numpy(train_data[[0]].values).float()
    # features are the rest
    train_x = torch.from_numpy(train_data.drop([0], axis=1).values).float()

    # create dataset
    train_ds = torch.utils.data.TensorDataset(train_x, train_y)

    return torch.utils.data.DataLoader(train_ds, batch_size=batch_size)

# Load the training data from a csv file
def _get_validation_loader(batch_size, data_dir):
    print("Get validation loader.")

    # read in csv file
    val_data = pd.read_csv(os.path.join(data_dir, "validation_viewed.csv"), header=None, names=None)

    # labels are first column
    val_y = torch.from_numpy(val_data[[0]].values).float()
    # features are the rest
    val_x = torch.from_numpy(val_data.drop([0], axis=1).values).float()

    # create dataset
    val_ds = torch.utils.data.TensorDataset(val_x, val_y)

    return torch.utils.data.DataLoader(val_ds, batch_size=batch_size)

# Load the training data from a csv file
def _get_test_loader(batch_size, data_dir):
    print("Get test loader.")

    # read in csv file
    test_data = pd.read_csv(os.path.join(data_dir, "test_viewed.csv"), header=None, names=None)

    # labels are first column
    test_y = torch.from_numpy(test_data[[0]].values).float()
    # features are the rest
    test_x = torch.from_numpy(test_data.drop([0], axis=1).values).float()

    # create dataset
    test_ds = torch.utils.data.TensorDataset(test_x, test_y)

    return torch.utils.data.DataLoader(test_ds, batch_size=batch_size)


# Provided train function
def train(model, train_loader, validation_loader, epochs, optimizer, criterion, device):
    """
    This is the training method that is called by the PyTorch training script. The parameters
    passed are as follows:
    model        - The PyTorch model that we wish to train.
    train_loader - The PyTorch DataLoader that should be used during training.
    epochs       - The total number of epochs to train for.
    optimizer    - The optimizer to use during training.
    criterion    - The loss function used for training. 
    device       - Where the model and data should be loaded (gpu or cpu).
    """
    valid_loss_min = np.Inf # track change in validation loss
    
    for epoch in range(1, epochs + 1):
        model.train()
        total_loss = 0
        valid_loss = 0
        for batch_idx, (data, target) in enumerate(train_loader, 1):
            # prep data
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad() # zero accumulated gradients
            # get output of SimpleNet
            output = model(data)
            # calculate loss and perform backprop
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
    
            total_loss += loss.item()

        ######################    
        # validate the model #
        ######################
        model.eval()
 
        for batch_idx, (data, target) in enumerate(validation_loader, 1):
            # prep data
            data, target = data.to(device), target.to(device)
            # get output of SimpleNet
            output = model(data)
            # calculate loss and perform backprop
            loss = criterion(output, target)
            # update average validation loss 
            valid_loss += loss.item()*data.size(0)

        # print loss stats
        print("Epoch: {}, Loss: {}".format(epoch, total_loss / len(train_loader)))

        # print training/validation statistics 
        print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
            epoch, total_loss / len(train_loader), valid_loss / len(validation_loader)))

        # save model if validation loss has decreased
        if valid_loss <= valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  \
                   Saving model ...'.format(valid_loss_min/len(validation_loader), valid_loss/len(validation_loader)))
            torch.save(model.state_dict(), 'model_cifar.pt')
            # Set model back to device after saving
            model.to(device)
            valid_loss_min = valid_loss        
        
    # save after all epochs
    save_model(model, model_dir)


# Provided model saving functions
def save_model(model, model_dir):
    print("Saving the model.")
    path = os.path.join(model_dir, 'model.pth')
    # save state dictionary
    torch.save(model.cpu().state_dict(), path)
    
def save_model_params(model, model_dir):
    model_info_path = os.path.join(args.model_dir, 'model_info.pth')
    with open(model_info_path, 'wb') as f:
        model_info = {
            'input_dim': args.input_dim,
            'hidden_1': args.hidden_1,
            'hidden_2': args.hidden_2,
            'hidden_3': args.hidden_3,
            'output_dim': args.output_dim
        }
        torch.save(model_info, f)




In [36]:
data_dir = 'data'
model_dir = 'data'
train_loader = _get_train_loader(batch_size=64, data_dir=data_dir)


Get train loader.


In [37]:
validation_loader = _get_validation_loader(batch_size=64, data_dir=data_dir)

Get validation loader.


In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device


device(type='cpu')

In [30]:
import torch
import torch.nn as nn
import torch.nn.functional as F

## TODO: Complete this classifier
class SimpleNet(nn.Module):
    
    ## TODO: Define the init function
    def __init__(self, input_dim, hidden_1, hidden_2, hidden_3, output_dim):
        '''Defines layers of a neural network.
           :param input_dim: Number of input features
           :param hidden_dim: Size of hidden layer(s)
           :param output_dim: Number of outputs
         '''
        super(SimpleNet, self).__init__()
        
        # define all layers, here
        self.fc1 = nn.Linear(input_dim, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, hidden_3)
        self.fc4 = nn.Linear(hidden_3, output_dim)
        # Define dropout
        self.drop = nn.Dropout(0.3)
        # Sigmoid Layer
        self.sig = nn.Sigmoid()
        
    
    ## TODO: Define the feedforward behavior of the network
    def forward(self, x):
        '''Feedforward behavior of the net.
           :param x: A batch of input features
           :return: A single, sigmoid activated value
         '''
        # your code, here
        out = F.relu(self.fc1(x))
        out = self.drop(out)
        out = F.relu(self.fc2(out))
        out = self.drop(out)
        out = F.relu(self.fc3(out))
        out = self.drop(out)
        out = self.fc4(out)
      
        return self.sig(out)

In [81]:
%%time
#def train(model, train_loader, validation_loader, epochs, optimizer, criterion, device):

model = SimpleNet(input_dim, hidden_1, hidden_2, hidden_3, output_dim)

import torch.optim as optim


## TODO: Define an optimizer and loss function for training
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()



train(model, train_loader, validation_loader, 20, optimizer, criterion, device)



Epoch: 1, Loss: 0.6257069819441465
Epoch: 1 	Training Loss: 0.625707 	Validation Loss: 37.938244
Validation loss decreased (inf --> 37.938244).                     Saving model ...
Epoch: 2, Loss: 0.5667100990487036
Epoch: 2 	Training Loss: 0.566710 	Validation Loss: 33.929926
Validation loss decreased (37.938244 --> 33.929926).                     Saving model ...
Epoch: 3, Loss: 0.5360852284409175
Epoch: 3 	Training Loss: 0.536085 	Validation Loss: 32.063407
Validation loss decreased (33.929926 --> 32.063407).                     Saving model ...
Epoch: 4, Loss: 0.5237849187071079
Epoch: 4 	Training Loss: 0.523785 	Validation Loss: 31.377724
Validation loss decreased (32.063407 --> 31.377724).                     Saving model ...
Epoch: 5, Loss: 0.5169930422974524
Epoch: 5 	Training Loss: 0.516993 	Validation Loss: 30.698896
Validation loss decreased (31.377724 --> 30.698896).                     Saving model ...
Epoch: 6, Loss: 0.5155244921969476
Epoch: 6 	Training Loss: 0.515524 	V

In [76]:
test_loader = _get_test_loader(batch_size=64, data_dir=data_dir)


Get test loader.


In [77]:
pd.read_csv('data/test_viewed.csv')

Unnamed: 0,1,0.2530120481927711,0.33333333333333337,0,1.1,0.1,0.2,1.0,1.0.1,1.0.2,0.0
0,1,0.438476,0.393389,0,0,0,1,1.0,1.0,1.0,1.0
1,1,0.438476,0.393389,0,0,0,1,1.0,1.0,0.0,1.0
2,1,0.349398,0.677778,1,0,0,0,1.0,1.0,1.0,1.0
3,0,0.433735,0.655556,0,1,0,0,1.0,1.0,1.0,0.0
4,1,0.686747,0.011111,0,1,0,0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
25166,1,0.746988,0.500000,1,0,0,0,1.0,1.0,1.0,1.0
25167,1,0.469880,0.388889,1,0,0,0,1.0,1.0,1.0,1.0
25168,1,0.349398,0.400000,0,1,0,0,1.0,1.0,1.0,1.0
25169,0,0.710843,0.811111,1,0,0,0,1.0,1.0,1.0,0.0


In [85]:
# Get test data loss and accuracy

test_losses = [] # track loss
num_correct = 0

# init hidden state
#h = net.init_hidden(batch_size)

model.eval()


for data, targets in test_loader:

    # get output of SimpleNet
    output = model(data)
    # calculate loss and perform backprop
    test_loss = criterion(output, targets)

    test_losses.append(test_loss.item())

    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())  # rounds to the nearest integer

    # compare predictions to true label
    correct_tensor = pred.eq(targets.float().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy())
    num_correct += np.sum(correct)


# -- stats! -- ##
# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))


Test loss: 0.473
Test accuracy: 0.765
