In [63]:
# imports
import torch
import torch.nn as nn
import numpy as np

# importing iris dataset
import seaborn as sns
# `iris_dataset` is handled as a pandas DataFrame by the later cells: its first
# four columns are used as the input features and its `species` column is used
# to build the class labels.
iris_dataset = sns.load_dataset('iris')

# direct access to torch.nn functions (without using classes)
import torch.nn.functional as F

import matplotlib.pyplot as plt
import matplotlib_inline
from IPython import display
%matplotlib inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

---
# Cross Validation!
#### What is cross validation?
A technique for detecting and reducing over-fitting by evaluating model performance on subsets of the data that were not used for training. The data is typically divided into three subsets:
1. *training set* (used to train model) - data the model is trained on, backprop is used, weights are changed.
2. *dev set (aka hold-out set)* - tests model accuracy, is used to fine-tune the model, no learning with this data.
3. *test set* - is used as the final test for the model, no learning with this data. Used to address overfitting, tests model generalizability (ability to perform on new data).

## In this Project:
#### Cross Validation will be performed by manually dividing up the data with numpy.

---
# Transforming Data

In [64]:
# Features: convert the pandas dataframe to a float32 pytorch tensor.
# Only the first 4 columns are inputs; the last column is the outcome variable.
feature_frame = iris_dataset.iloc[:, 0:4]
torched_iris_dataset = torch.tensor(feature_frame.to_numpy()).float()

# Labels: encode each species name as a class index (0-2).
# The label tensor starts out as all zeros, so 'setosa' rows simply keep 0 and
# only the two remaining species have to be written explicitly.
iris_labels = torch.zeros(len(torched_iris_dataset), dtype=torch.long)
for species_code, species_name in enumerate(('versicolor', 'virginica'), start=1):
    iris_labels[iris_dataset.species == species_name] = species_code

# Separating Iris Dataset into Training and Testing sets
#### To keep the subsections / sets of data **representative** of the *original dataset*, the training set needs to be **randomly sampled**.

In [65]:
# Data division parameters
train_data_ratio = 0.8 # 80% of the data will be used for training
number_of_samples = int(len(iris_labels)*train_data_ratio) # number of samples for training

# Seed for the train/test split, so that "Restart Kernel -> Run All" reproduces the
# same partition and the same statistics printed below.
# Set to None to draw a fresh random split on every run.
split_seed = 42
split_rng = np.random.default_rng(split_seed)

# Boolean mask over all samples: True -> sample is in the training set,
# False -> sample is in the testing set.
training_set_bool = np.zeros(len(iris_labels), dtype=bool)

# Randomly selecting samples for training (without replacement, so no sample is drawn twice)
random_training_samples = split_rng.choice(len(iris_labels), size=number_of_samples, replace=False)

training_set_bool[random_training_samples] = True

# The mask must select exactly the requested number of training samples.
assert training_set_bool.sum() == number_of_samples

# Testing if samples are representative of the dataset.
# An avg ~ 1 is expected (because of an equal distribution of 0, 1, 2 for the flower labels).
# The split is a plain random draw (not stratified by class), so the subset
# averages only approximate the raw average.
raw_dataset_distribution = torch.mean(iris_labels.float())
training_set_distribution = torch.mean(iris_labels[training_set_bool].float())
testing_set_distribution = torch.mean(iris_labels[~training_set_bool].float())

print('Raw dataset distribution: ', raw_dataset_distribution)
print('Training set distribution: ', training_set_distribution)
print('Testing set distribution: ', testing_set_distribution)

# Examining the shape of each dataset
print('Shape of iris dataset: ', torched_iris_dataset.shape)
print('Shape of training set: ', torched_iris_dataset[training_set_bool].shape)
print('Shape of testing set: ', torched_iris_dataset[~training_set_bool].shape)


Raw dataset distribution:  tensor(1.)
Training set distribution:  tensor(0.9833)
Testing set distribution:  tensor(1.0667)
Shape of iris dataset:  torch.Size([150, 4])
Shape of training set:  torch.Size([120, 4])
Shape of testing set:  torch.Size([30, 4])


### Model Architecture (not the focus of this experiment)

In [66]:
# Layer widths of the fully-connected classifier
n_inputs = 4     # one input per feature column
n_hidden = 64    # width shared by both hidden layers
n_outputs = 3    # one output per class label (0-2)

# Linear -> ReLU -> Linear -> ReLU -> Linear, assembled from a list of layers
ann_layers = [
    nn.Linear(n_inputs, n_hidden),
    nn.ReLU(),
    nn.Linear(n_hidden, n_hidden),
    nn.ReLU(),
    nn.Linear(n_hidden, n_outputs),
]
ANN_iris = nn.Sequential(*ann_layers)

# Loss function
loss_function = nn.CrossEntropyLoss()

# Optimizer: SGD over all model parameters with a learning rate of 0.01
optimizer = torch.optim.SGD(params=ANN_iris.parameters(), lr=0.01)

### Function to Train the Model
- The only major change to the function is that it now trains exclusively on the training data and the corresponding labels, instead of on the entire dataset.

In [67]:
def ANN_classification_training(model, loss_function, m_optimizer, input_data, input_labels, epochs = 1000):
    """Train a classifier with full-batch gradient steps and report its accuracy.

    Every epoch runs one forward pass over all of `input_data`, one backward
    pass and one optimizer step. `model` is updated in place.

    Args:
        model: callable network mapping `input_data` to per-class scores.
        loss_function: callable taking (scores, labels) and returning a scalar loss.
        m_optimizer: optimizer that updates the parameters of `model`.
        input_data: input samples to train on (training data only).
        input_labels: integer class labels matching `input_data`.
        epochs: number of training iterations (default 1000). With 0, no
            training is done and only the final evaluation pass is run.

    Returns:
        A tuple `(final_predictions, losses, per_epoch_accuracy, total_accuracy, y_hat)`:
        - final_predictions: model output on `input_data` after training
          (computed without gradient tracking).
        - losses: tensor of length `epochs` holding the loss of each epoch
          (plain values, detached from the autograd graph).
        - per_epoch_accuracy: list with the accuracy (in percent) of each epoch,
          measured on that epoch's forward pass, i.e. before its weight update.
        - total_accuracy: accuracy (in percent) of `final_predictions`.
        - y_hat: model output of the last training epoch, or None if `epochs` is 0.
    """

    # parameters
    losses = torch.zeros(epochs)
    per_epoch_accuracy = []
    y_hat = None  # stays None when epochs == 0, so the return below is always valid

    # training
    for epoch_IDX in range(epochs):
        # forward pass
        y_hat = model(input_data) # only training data

        # calculating loss
        loss = loss_function(y_hat, input_labels)
        # Store a detached copy: writing the graph-attached loss into `losses` would
        # make the whole history require grad and keep every epoch's graph alive.
        losses[epoch_IDX] = loss.detach()

        # backprop
        m_optimizer.zero_grad()
        loss.backward()
        m_optimizer.step()

        # calculating accuracy at each epoch
        matches = torch.argmax(y_hat, dim=1) == input_labels   # booleans (T / F)
        accuracy_percent = 100*torch.mean(matches.float())     # bools -> 0 / 1, then average
        per_epoch_accuracy.append(accuracy_percent)

    # final forward pass (evaluation only, so no autograd graph is needed)
    with torch.no_grad():
        final_predictions = model(input_data)

    # overall accuracy
    predicted_labels = torch.argmax(final_predictions, dim=1)
    total_accuracy = 100*torch.mean((predicted_labels==input_labels).float())

    return final_predictions, losses, per_epoch_accuracy, total_accuracy, y_hat

## Training the Model

In [68]:
# model params
set_learning_rate = 0.01

# The optimizer was constructed earlier with its own hard-coded learning rate, so
# write `set_learning_rate` into it here; otherwise changing the value above
# would have no effect on training.
for param_group in optimizer.param_groups:
    param_group['lr'] = set_learning_rate

# training model
# Note: ANN_iris is updated in place, so re-running this cell continues training
# from the current weights rather than starting over.
final_predictions, losses, per_epoch_accuracy, model_accuracy, y_hat = ANN_classification_training(
    model= ANN_iris,
    loss_function= loss_function,
    m_optimizer= optimizer,
    input_data= torched_iris_dataset[training_set_bool], # only training data
    input_labels= iris_labels[training_set_bool], # only training data labels
    epochs= 1000
)

### Examining Model Accuracy

In [69]:
def evaluate_accuracy(model, data, labels):
    """Run one evaluation-only forward pass and score it against `labels`.

    Args:
        model: callable network mapping `data` to per-class scores.
        data: input samples to evaluate.
        labels: integer class labels matching `data`.

    Returns:
        A tuple `(predictions, accuracy)`: the raw model output for `data` and
        the percentage of samples whose highest-scoring class equals the label.
    """
    # No learning happens here, so skip building an autograd graph.
    with torch.no_grad():
        predictions = model(data)
    accuracy = 100*torch.mean((torch.argmax(predictions, axis=1)==labels).float())
    return predictions, accuracy

# Final Forward Pass with training data
predictions, training_accuracy = evaluate_accuracy(
    ANN_iris, torched_iris_dataset[training_set_bool], iris_labels[training_set_bool])
print('Training accuracy: ', training_accuracy)

# Final Forward Pass with testing data (samples the model was never trained on)
predictions, testing_accuracy = evaluate_accuracy(
    ANN_iris, torched_iris_dataset[~training_set_bool], iris_labels[~training_set_bool])
print('Testing accuracy: ', testing_accuracy)

Training accuracy:  tensor(99.1667)
Testing accuracy:  tensor(93.3333)


### Notes
- An interesting result here is that occasionally the **testing** accuracy can *actually be higher* than the **training** accuracy!
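- With smaller models and smaller datasets this can happen, and it's not unusual in this situation: the testing set here contains only 30 samples, so a single sample changes the testing accuracy by about 3.3 percentage points and the random split alone can tip the result either way. This kind of behavior is not desirable in larger models with more data.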