In [1]:
"""
Last modified: 10/05/2021

@author: Daniel Rodriguez
"""
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
#from keras.utils import to_categorical
from Networks.Network import Network
from Utilities.utils import *
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from ucimlrepo import fetch_ucirepo, list_available_datasets


In [2]:
wine = fetch_ucirepo(name='Wine')
print(wine.metadata.keys())

dict_keys(['uci_id', 'name', 'repository_url', 'data_url', 'abstract', 'area', 'tasks', 'characteristics', 'num_instances', 'num_features', 'feature_types', 'demographics', 'target_col', 'index_col', 'has_missing_values', 'missing_values_symbol', 'year_of_dataset_creation', 'last_updated', 'dataset_doi', 'creators', 'intro_paper', 'additional_info'])


In [3]:
pd.set_option('display.max_columns', None)

print(wine.variables)

                            name     role         type demographic  \
0                          class   Target  Categorical        None   
1                        Alcohol  Feature   Continuous        None   
2                      Malicacid  Feature   Continuous        None   
3                            Ash  Feature   Continuous        None   
4              Alcalinity_of_ash  Feature   Continuous        None   
5                      Magnesium  Feature      Integer        None   
6                  Total_phenols  Feature   Continuous        None   
7                     Flavanoids  Feature   Continuous        None   
8           Nonflavanoid_phenols  Feature   Continuous        None   
9                Proanthocyanins  Feature   Continuous        None   
10               Color_intensity  Feature   Continuous        None   
11                           Hue  Feature   Continuous        None   
12  0D280_0D315_of_diluted_wines  Feature   Continuous        None   
13                  

In [4]:
X, y = wine.data.features, wine.data.targets.squeeze().to_numpy()
print(f"Dataset Shape (rows,cols): {X.shape}")
print(f"Target Shape (rows,cols): {y.shape}")

Dataset Shape (rows,cols): (178, 13)
Target Shape (rows,cols): (178,)


## Parameters

In [5]:
# PARAMETERS
RANDOM_STATE = 1
RUN_EXPERIMENTS = True # if True, it will run the experiments
BEST_MODEL = False # if True, it will run the best model
EPOCHS = 400000 # just one since we are using a stopping criteria

# Define hyperparameter grid for experiments
params = {
    'hidden_size': [5, 10, 20],
    'learning_rate': [1.0, 0.1, 0.01, 0.001]
}

## Data Preparation

In [6]:
# Since all input features are numerical, we can use the StandardScaler to normalize the data
numerical_preprocessing = Pipeline([
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer([
    ('numerical', numerical_preprocessing, X.columns)
])

# Train, test, validation split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.2, random_state=RANDOM_STATE)

# Device configuration
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Device: {device}\n")

# Preprocess the data and convert it to tensors
X_train = torch.tensor(preprocessor.fit_transform(X_train), dtype=torch.float32).to(device)
X_val = torch.tensor(preprocessor.transform(X_val), dtype=torch.float32).to(device)
X_test = torch.tensor(preprocessor.transform(X_test), dtype=torch.float32).to(device)

# One-hot encode the target
encoder = OneHotEncoder(sparse=False)
y_train_encoded = torch.tensor(encoder.fit_transform(y_train.reshape(-1, 1)), dtype=torch.long).to(device)
y_val_encoded = torch.tensor(encoder.transform(y_val.reshape(-1, 1)), dtype=torch.long).to(device)
y_test_encoded = torch.tensor(encoder.transform(y_test.reshape(-1, 1)), dtype=torch.long).to(device)

# Data shapes
print("Number of samples in each dataset:")
print(f"Train shape: {X_train.shape}")
print(f"Validation shape: {X_val.shape}")
print(f"Test shape: {X_test.shape}"); print()

# Target shapes
print("Shape of the target after one-hot encoding:")
print(f"Train shape: {y_train_encoded.shape}")
print(f"Validation shape: {y_val_encoded.shape}")
print(f"Test shape: {y_test_encoded.shape}"); print()

Device: cuda

Number of samples in each dataset:
Train shape: torch.Size([142, 13])
Validation shape: torch.Size([8, 13])
Test shape: torch.Size([28, 13])

Shape of the target after one-hot encoding:
Train shape: torch.Size([142, 3])
Validation shape: torch.Size([8, 3])
Test shape: torch.Size([28, 3])





## Testing Various Hyper-parameters

In [7]:
if RUN_EXPERIMENTS:
    print("RUNNING EXPERIMENTS")
    #output storage lists
    posttrain_loss_storage = torch.zeros(len(params['learning_rate']), len(params['hidden_size']), dtype = torch.float32)
    train_confusion_matrix_storage = np.empty((4,3,3,3), dtype = np.int_)
    test_confusion_matrix_storage = np.empty((4,3,3,3), dtype = np.int_)
    
    lr_count = 0
    hs_cound = 0
    for i in params['learning_rate']:
        hs_count = 0
        for j in params['hidden_size']:
            num_hidden = j
            model = Network(num_hidden)
            stopping_criteria = StopCriteria()

            #sending model to cuda
            model.to(device)
            #X_train.to(device)

            criterion = nn.CrossEntropyLoss() #cross-entropy loss
            optimizer = torch.optim.SGD(model.parameters(), lr = i) # implementing momentum for learning rate

            #Showing test set loss pre-training
            print("-----------------------------------------------------------------")
            print("Learning Rate: " + str(i) + "; " + "Hidden Layer PE: " + str(j))
            
            # Training model
            for epoch in range(EPOCHS):
                optimizer.zero_grad()
                output = model.forward(X_train)
                loss = criterion(output.squeeze(), torch.max(y_train_encoded, 1)[1]) 
                
                if epoch == 0:
                    print('Epoch: {}; before training loss: {}'.format(epoch,loss.item()))

                #implementing stopping criteria
                val_output = model.forward(X_val)
                val_loss = criterion(val_output.squeeze(), torch.max(y_val_encoded, 1)[1])

                if stopping_criteria.step(val_loss):
                    print('Epoch: {}; after train loss: {}'.format(epoch,loss.item()))
                    print()
                    break

                # #printing epoch and loss 
                # if epoch % 5000 == 0:
                #     print('Epoch: {} train loss: {}'.format(epoch,loss.item()))

                #backpropagation
                loss.backward()
                optimizer.step()
                
            # evaluating the model and storing relevant information
            posttrain_loss_storage[lr_count, hs_count] = loss
            
            model.eval()
            train_pred = model(X_train)
            test_pred = model(X_test)
            train_CM = ConfusionMatrix(model, train_pred, y_train_encoded)
            test_CM = ConfusionMatrix(model, test_pred, y_test_encoded)
            after_train = criterion(test_pred.squeeze(), torch.max(y_test_encoded, 1)[1])
            
            print('Test loss post training' , after_train.item())
            print()
            print("Training Confusion Matrix: \n" + str(train_CM))
            print()
            print("Test Confusion Matrix: \n" + str(test_CM))
            print()
            print("-----------------------------------------------------------------")
            
            #train_confusion_matrix_storage[lr_count,hs_count,:,:] = train_CM
            #test_confusion_matrix_storage[lr_count,hs_count,:,:] = test_CM
            
            hs_count += 1
        lr_count += 1
    
    print()
    print("DONE TESTING HYPERPARAMETERS")

RUNNING EXPERIMENTS
-----------------------------------------------------------------
Learning Rate: 1.0; Hidden Layer PE: 5
Test loss pre training: 1.09132981300354

Epoch: 0; before training loss: 1.177579402923584
Epoch: 16961; after train loss: 0.00011563058069441468

Test loss post training 0.004880609456449747

Training Confusion Matrix: 
tensor([[45,  0,  0],
        [ 0, 58,  0],
        [ 0,  0, 39]])

Test Confusion Matrix: 
tensor([[11,  0,  0],
        [ 0,  8,  0],
        [ 0,  0,  9]])

-----------------------------------------------------------------
-----------------------------------------------------------------
Learning Rate: 1.0; Hidden Layer PE: 10
Test loss pre training: 1.179295539855957

Epoch: 0; before training loss: 1.0941189527511597
Epoch: 17544; after train loss: 8.743775106268004e-05

Test loss post training 0.005258933175355196

Training Confusion Matrix: 
tensor([[45,  0,  0],
        [ 0, 58,  0],
        [ 0,  0, 39]])

Test Confusion Matrix: 
tensor

## Best Model

In [None]:
best_lr_index, best_hs_index = torch.argmin(posttrain_loss_storage).tolist()
best_lr = params['learning_rate'][best_lr_index]
best_hs = params['hidden_size'][best_hs_index]

print("Best Learning Rate: " + str(best_lr))    
print("Best Hidden Layer PE: " + str(best_hs))

In [8]:
if BEST_MODEL:
    print("RUNNING BEST MODEL")

    #Initializing model and stopping criteria classes
    model = Network(best_hs)
    stopping_criteria = StopCriteria()

    #sending model to cuda
    model.to(device)

    criterion = nn.CrossEntropyLoss() #cross-entropy loss
    optimizer = torch.optim.SGD(model.parameters(), lr = best_lr) # implementing momentum for learning rate

    # Training model
    for epoch in range(EPOCHS):
        optimizer.zero_grad()
        output = model.forward(X_train)
        loss = criterion(output.squeeze(), torch.max(y_train_encoded, 1)[1]) 

        #implementing stopping criteria
        val_output = model.forward(X_val)
        val_loss = criterion(val_output.squeeze(), torch.max(y_val_encoded, 1)[1])

        if stopping_criteria.step(val_loss):
            print('Epoch: {}; after train loss: {}'.format(epoch,loss.item()))
            break

        #printing epoch and loss 
        if epoch % 250 == 0:
            print('Epoch: {}; train loss: {}'.format(epoch,loss.item()))

        #backpropagation
        loss.backward()
        optimizer.step()

In [9]:
model.eval()
y_pred = model(X_test)
after_train = criterion(y_pred.squeeze(), torch.max(y_test_encoded, 1)[1]) 
print('Test loss after Training' , after_train.item())

Test loss after Training 0.0060800062492489815


In [10]:
test_CM = ConfusionMatrix(model, y_pred, y_test_encoded)
print("Test Confusion Matrix:")
print(test_CM)

Test Confusion Matrix:
tensor([[11,  0,  0],
        [ 0,  8,  0],
        [ 0,  0,  9]])


Best Model Discussion:

The overall best model in terms of test set loss after training is the model with a learning rate of 1 and 5 hidden PE. This model achieved a testing loss of 0.00117. This learning rate may be performing the best because it can quickly get over non-optimal local minima on the performance surface before the training is terminated by the stopping criteria. It may be possible to achieve lower train and test loss with more patience on the stopping criteria for lower learning rates - however this would increase the number of epochs required for convergence and increase computation time. Five PE in the hidden layer may be performing well because not all of the inputs are predictive of the wine cultivar. 

# Testing model with two hidden layers

In [11]:
class Network2(nn.Module):
    def __init__(self, num_hidden1, num_hidden2):
        super(Network2, self).__init__()
        self.num_hidden1 = num_hidden1
        self.num_hidden2 = num_hidden2
        
        # Inputs to hidden linear combination
        self.hidden1 = nn.Linear(13, self.num_hidden1)
        # hidden to output layer, 3 classes - one for each cultivar
        self.hidden2 = nn.Linear(self.num_hidden1, self.num_hidden2)
        self.out = nn.Linear(self.num_hidden2, 3)
        
        # Defining activation functions
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        z1 = self.hidden1(x)
        out1 = self.sigmoid(z1)
        z2 = self.hidden2(out1)
        out2 = self.sigmoid(z2)
        z3 = self.out(out2)
        
        return z3
    
    def prediction(self, output):
        preds = torch.zeros(1,output.shape[0]).flatten().long()
        for i in range(len(preds)):
            index = torch.argmax(output[i,:])
            
            if index == 0:
                preds[i] = 1
            elif index == 1:
                preds[i] = 2
            else:
                preds[i] = 3
                
        return preds

In [12]:
if RUN_EXPERIMENTS:
    #Hyper-Parameter sets
    learning_rates = np.array([1.0, 0.1])
    hidden_sizes1 = np.array([5, 10])
    hidden_sizes2 = np.array([5, 10])
    EPOCHS = 400000 # just one since we are using a stopping criteria
    
    #output storage lists
    """posttrain_loss_storage2 = torch.zeros(len(learning_rates), len(hidden_sizes), dtype = torch.float32)
    train_confusion_matrix_storage = np.empty((4,3,3,3), dtype = np.int_)
    test_confusion_matrix_storage = np.empty((4,3,3,3), dtype = np.int_)"""
    
    lr_count = 0
    for i in learning_rates:
        hs_count1 = 0
        for j in hidden_sizes1:
            num_hidden1 = j
            hs_count2 = 0
            for k in hidden_sizes2:
                num_hidden2 = k
                
                model2 = Network2(num_hidden1, num_hidden2)
                stopping_criteria = StopCriteria()

                #sending model to cuda
                device = torch.device("cuda:0")
                model2.to(device)
                #X_train.to(device)

                criterion = nn.CrossEntropyLoss() #cross-entropy loss
                optimizer = torch.optim.SGD(model2.parameters(), lr = i) # implementing momentum for learning rate

                #Showing test set loss pre-training
                print("-----------------------------------------------------------------")
                print("Learning Rate: " + str(i) + "; " + "Hidden Layer 1 PE: " + str(j) + "; " + "Hidden Layer 2 PE: " + str(k))

                model2.eval()
                y_pred = model2(X_test)
                before_train = criterion(y_pred.squeeze(), torch.max(y_test_encoded, 1)[1])
                print("Test loss pre training: " + str(before_train.item()))
                print()

                # Training model
                for epoch in range(EPOCHS):
                    optimizer.zero_grad()
                    output = model2.forward(X_train)
                    loss = criterion(output.squeeze(), torch.max(y_train_encoded, 1)[1]) 

                    if epoch == 0:
                        print('Epoch: {}; before training loss: {}'.format(epoch,loss.item()))

                    #implementing stopping criteria
                    val_output = model2.forward(X_val)
                    val_loss = criterion(val_output.squeeze(), torch.max(y_val_encoded, 1)[1])

                    if stopping_criteria.step(val_loss):
                        print('Epoch: {}; after train loss: {}'.format(epoch,loss.item()))
                        print()
                        break

                    """
                    #printing epoch and loss 
                    if epoch % 5000 == 0:
                        print('Epoch: {} train loss: {}'.format(epoch,loss.item()))
                    """

                    #backpropagation
                    loss.backward()
                    optimizer.step()

                # evaluating the model and storing relevant information
                #posttrain_loss_storage2[lr_count, hs_count] = loss

                model2.eval()
                train_pred = model2(X_train)
                test_pred = model2(X_test)
                train_CM = ConfusionMatrix(model2, train_pred, y_train_encoded)
                test_CM = ConfusionMatrix(model2, test_pred, y_test_encoded)
                after_train = criterion(test_pred.squeeze(), torch.max(y_test_encoded, 1)[1])

                print('Test loss post training' , after_train.item())
                print()
                print("Training Confusion Matrix: \n" + str(train_CM))
                print()
                print("Test Confusion Matrix: \n" + str(test_CM))
                print()
                print("-----------------------------------------------------------------")

                #train_confusion_matrix_storage[lr_count,hs_count,:,:] = train_CM
                #test_confusion_matrix_storage[lr_count,hs_count,:,:] = test_CM
                hs_count2 += 1
            hs_count1 += 1
        lr_count += 1
    
    print()
    print("DONE TESTING HYPERPARAMETERS")

-----------------------------------------------------------------
Learning Rate: 1.0; Hidden Layer 1 PE: 5; Hidden Layer 2 PE: 5
Test loss pre training: 1.2559295892715454

Epoch: 0; before training loss: 1.187574863433838


Epoch: 22661; after train loss: 6.319114618236199e-05

Test loss post training 0.0002485540753696114

Training Confusion Matrix: 
tensor([[45,  0,  0],
        [ 0, 58,  0],
        [ 0,  0, 39]])

Test Confusion Matrix: 
tensor([[11,  0,  0],
        [ 0,  8,  0],
        [ 0,  0,  9]])

-----------------------------------------------------------------
-----------------------------------------------------------------
Learning Rate: 1.0; Hidden Layer 1 PE: 5; Hidden Layer 2 PE: 10
Test loss pre training: 1.1136878728866577

Epoch: 0; before training loss: 1.090362310409546
Epoch: 6; after train loss: 1.084182620048523

Test loss post training 1.1220940351486206

Training Confusion Matrix: 
tensor([[ 0,  0,  0],
        [45, 58, 39],
        [ 0,  0,  0]])

Test Confusion Matrix: 
tensor([[ 0,  0,  0],
        [11,  8,  9],
        [ 0,  0,  0]])

-----------------------------------------------------------------
-----------------------------------------------------------------
Learning 

Discussion of Two Hidden Layer Models

Taking the best learning rates and layer size from the previous model and testing all combinations I found that introducing an additional layer to the model does not significantly increase model performance for classification but it does occasionaly perform better on test set loss when controlling for the train-validation-test split of the data. Both models are able to achieve perfect classification, when the stopping criteria isnt met early, in less than 5K epochs - with the 1 hidden layer model taking less epochs on average. 

Some combinations of hyper-parameters caused the model training to terminate early. This is likely due to the stop-criteria not giving the model enough time to get out of a local minima before the stop criteria was met. Increasing the stop-criteria to 10 does help reduce the number of models terminating early but some still terminate early.

## Testing model with no hidden-layers

In [13]:
class Network3(nn.Module):
    def __init__(self):
        super(Network3, self).__init__()
        
        # Inputs to outputs linear combination
        self.out = nn.Linear(13, 3)
        
        # Defining activation functions
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        z1 = self.out(x)
        
        return z1
    
    def prediction(self, output):
        preds = torch.zeros(1,output.shape[0]).flatten().long()
        for i in range(len(preds)):
            index = torch.argmax(output[i,:])
            
            if index == 0:
                preds[i] = 1
            elif index == 1:
                preds[i] = 2
            else:
                preds[i] = 3
                
        return preds

In [14]:
if RUN_EXPERIMENTS:
    #Hyper-Parameter sets
    learning_rates = np.array([1.0, 0.1, 0.01, 0.001])
    EPOCHS = 400000 # just one since we are using a stopping criteria
    
    lr_count = 0
    for i in learning_rates:
        model3 = Network3()
        stopping_criteria = StopCriteria(patience = 10)

        #sending model to cuda
        device = torch.device("cuda:0")
        model3.to(device)
        #X_train.to(device)

        criterion = nn.CrossEntropyLoss() #cross-entropy loss
        optimizer = torch.optim.SGD(model.parameters(), lr = i) # implementing momentum for learning rate

        #Showing test set loss pre-training
        print("-----------------------------------------------------------------")
        print("Learning Rate: " + str(i))

        model3.eval()
        y_pred = model3(X_test)
        before_train = criterion(y_pred.squeeze(), torch.max(y_test_encoded, 1)[1])
        print("Test loss pre training: " + str(before_train.item()))
        print()

        # Training model
        for epoch in range(EPOCHS):
            optimizer.zero_grad()
            output = model3.forward(X_train)
            loss = criterion(output.squeeze(), torch.max(y_train_encoded, 1)[1]) 

            if epoch == 0:
                print('Epoch: {}; before training loss: {}'.format(epoch,loss.item()))

            #implementing stopping criteria
            val_output = model3.forward(X_val)
            val_loss = criterion(val_output.squeeze(), torch.max(y_val_encoded, 1)[1])

            if stopping_criteria.step(val_loss):
                print('Epoch: {}; after train loss: {}'.format(epoch,loss.item()))
                print()
                break

            
            #printing epoch and loss 
            if epoch % 2 == 0:
                print('Epoch: {} train loss: {}'.format(epoch,loss.item()))
            

            #backpropagation
            loss.backward()
            optimizer.step()

        # evaluating the model and storing relevant information
        model3.eval()
        train_pred = model3(X_train)
        test_pred = model3(X_test)
        train_CM = ConfusionMatrix(model3, train_pred, y_train_encoded)
        test_CM = ConfusionMatrix(model3, test_pred, y_test_encoded)
        after_train = criterion(test_pred.squeeze(), torch.max(y_test_encoded, 1)[1])

        print('Test loss post training' , after_train.item())
        print()
        print("Training Confusion Matrix: \n" + str(train_CM))
        print()
        print("Test Confusion Matrix: \n" + str(test_CM))
        print()
        print("-----------------------------------------------------------------")

        #train_confusion_matrix_storage[lr_count,hs_count,:,:] = train_CM
        #test_confusion_matrix_storage[lr_count,hs_count,:,:] = test_CM

    lr_count += 1
    
print()
print("DONE TESTING HYPERPARAMETERS")

-----------------------------------------------------------------
Learning Rate: 1.0
Test loss pre training: 1.6084026098251343

Epoch: 0; before training loss: 1.4049665927886963
Epoch: 0 train loss: 1.4049665927886963
Epoch: 2 train loss: 1.4049665927886963
Epoch: 4 train loss: 1.4049665927886963
Epoch: 6 train loss: 1.4049665927886963
Epoch: 8 train loss: 1.4049665927886963
Epoch: 10; after train loss: 1.4049665927886963



TypeError: CrossEntropyLoss.forward() takes 3 positional arguments but 4 were given

Discussion of no hidden layer model:

A model with no hidden layers is unable to classify the Wine.