In [1]:
"""
Last modified: 10/05/2021

@author: Daniel Rodriguez
"""
import itertools
import sys
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from Networks.Network import Network
from Utilities.utils import *
from Utilities.run_experiments import *
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from ucimlrepo import fetch_ucirepo, list_available_datasets


In [2]:
# Fetch the dataset registered under the name 'Wine' through ucimlrepo and
# list which metadata fields came back with it.
wine = fetch_ucirepo(name='Wine')
metadata_fields = wine.metadata.keys()
print(metadata_fields)

dict_keys(['uci_id', 'name', 'repository_url', 'data_url', 'abstract', 'area', 'tasks', 'characteristics', 'num_instances', 'num_features', 'feature_types', 'demographics', 'target_col', 'index_col', 'has_missing_values', 'missing_values_symbol', 'year_of_dataset_creation', 'last_updated', 'dataset_doi', 'creators', 'intro_paper', 'additional_info'])


In [3]:
# Remove pandas' column-count display limit (None = no limit) before printing
# the per-variable description table.
pd.options.display.max_columns = None

variables_table = wine.variables
print(variables_table)

                            name     role         type demographic  \
0                          class   Target  Categorical        None   
1                        Alcohol  Feature   Continuous        None   
2                      Malicacid  Feature   Continuous        None   
3                            Ash  Feature   Continuous        None   
4              Alcalinity_of_ash  Feature   Continuous        None   
5                      Magnesium  Feature      Integer        None   
6                  Total_phenols  Feature   Continuous        None   
7                     Flavanoids  Feature   Continuous        None   
8           Nonflavanoid_phenols  Feature   Continuous        None   
9                Proanthocyanins  Feature   Continuous        None   
10               Color_intensity  Feature   Continuous        None   
11                           Hue  Feature   Continuous        None   
12  0D280_0D315_of_diluted_wines  Feature   Continuous        None   
13                  

In [4]:
# Features stay as provided by the dataset object; the target table is
# squeezed and converted to a NumPy array (shape (178,) in the recorded output).
X = wine.data.features
y = wine.data.targets.squeeze().to_numpy()

print(f"Dataset Shape (rows,cols): {X.shape}")
print(f"Target Shape (rows,cols): {y.shape}")

Dataset Shape (rows,cols): (178, 13)
Target Shape (rows,cols): (178,)


## Data Preparation

In [5]:
RANDOM_STATE = 1

# Every input feature is numeric, so one StandardScaler step is applied to all columns.
numerical_preprocessing = Pipeline([
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer([
    ('numerical', numerical_preprocessing, X.columns)
])

# Two-stage split: 80% train / 20% hold-out, then the hold-out is itself split
# 80/20 into test and validation, so validation is the smallest of the three sets.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.2, random_state=RANDOM_STATE)

# Device configuration: use CUDA when available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Device: {device}\n")

# The scaler is fitted on the training split only and then re-used, unchanged,
# for the validation and test splits. Results are moved to `device` as float32 tensors.
X_train = torch.tensor(preprocessor.fit_transform(X_train), dtype=torch.float32).to(device)
X_val = torch.tensor(preprocessor.transform(X_val), dtype=torch.float32).to(device)
X_test = torch.tensor(preprocessor.transform(X_test), dtype=torch.float32).to(device)

# One-hot encode the target.
# The `sparse` keyword of OneHotEncoder was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4. Relying on the default (sparse) output
# and densifying with .toarray() works on both sides of that rename.
encoder = OneHotEncoder()
y_train_encoded = torch.tensor(encoder.fit_transform(y_train.reshape(-1, 1)).toarray(), dtype=torch.long).to(device)
y_val_encoded = torch.tensor(encoder.transform(y_val.reshape(-1, 1)).toarray(), dtype=torch.long).to(device)
y_test_encoded = torch.tensor(encoder.transform(y_test.reshape(-1, 1)).toarray(), dtype=torch.long).to(device)

# Data shapes
print("Number of samples in each dataset:")
print(f"Train shape: {X_train.shape}")
print(f"Validation shape: {X_val.shape}")
print(f"Test shape: {X_test.shape}"); print()

# Target shapes
print("Shape of the target after one-hot encoding:")
print(f"Train shape: {y_train_encoded.shape}")
print(f"Validation shape: {y_val_encoded.shape}")
print(f"Test shape: {y_test_encoded.shape}"); print()

Device: cuda

Number of samples in each dataset:
Train shape: torch.Size([142, 13])
Validation shape: torch.Size([8, 13])
Test shape: torch.Size([28, 13])

Shape of the target after one-hot encoding:
Train shape: torch.Size([142, 3])
Validation shape: torch.Size([8, 3])
Test shape: torch.Size([28, 3])





## Network with One Hidden Layer

### Testing Various Hyper-parameters

In [6]:
# ---- Global switches ----
RUN_EXPERIMENTS = True   # when True, the hyper-parameter experiment cells run
BEST_MODEL = False       # when True, the best-model training cell runs
EPOCHS = 400_000         # epoch count handed to the training code

# ---- Settings local to this section ----
NUM_LAYERS = 1
NUM_INPUTS = X_train.shape[1]            # one input per feature column
NUM_OUTPUTS = y_train_encoded.shape[1]   # one output per one-hot target column

# Grid of hyper-parameters explored by the experiments below
params = dict(
    hidden_size=[[5], [10], [20]],
    learning_rate=[1.0, 0.1, 0.01],
)

In [7]:
def accuracy_from_confusion_matrix(cm):
    """Return (sum of the main diagonal) / (sum of all entries) of a confusion matrix.

    The diagonal is summed by indexing over ``cm.shape[0]`` classes, so the
    computation is not tied to a three-class problem.
    """
    num_classes = cm.shape[0]
    correct = sum(cm[k, k] for k in range(num_classes))
    return correct / torch.sum(cm)


if RUN_EXPERIMENTS:
    print("Running Experiments...")

    # Each outer list receives one sub-list per hidden size; each sub-list holds
    # one entry per learning rate (same nesting as the two loops below).
    training_losses, validation_losses, training_accuracies, validation_accuracies = [], [], [], []
    test_losses, test_accuracies = [], []
    for hs in params['hidden_size']:
        sub_training_losses, sub_validation_losses, sub_training_accuracies, sub_validation_accuracies = [], [], [], []
        sub_test_loss, sub_test_accuracy = [], []
        for lr in params['learning_rate']:

            # Create the model
            model = Network(NUM_INPUTS, NUM_OUTPUTS, NUM_LAYERS, hs).to(device)

            print("---------------------------------------------")
            print(f"Hidden Layer Shape: {hs}; Learning Rate: {lr}\n")

            # Train the model. The elapsed time is kept under its own name so it
            # cannot shadow a `time` module pulled in by the star imports.
            model, train_loss, val_loss, train_cm, val_cm, epoch_stop, elapsed_seconds = train_model(model, EPOCHS, lr, X_train, y_train_encoded, X_val, y_val_encoded, verbose=True)
            training_accuracy = accuracy_from_confusion_matrix(train_cm)
            validation_accuracy = accuracy_from_confusion_matrix(val_cm)

            print(f"Training stopped at epoch {epoch_stop} after {elapsed_seconds:.2f} seconds\n")

            # Evaluate the model on the test set
            test_loss, test_cm = evaluate_model(model, X_test, y_test_encoded)
            test_accuracy = accuracy_from_confusion_matrix(test_cm)

            # Saving some results
            sub_training_losses.append(train_loss)
            sub_validation_losses.append(val_loss)
            sub_training_accuracies.append(training_accuracy)
            sub_validation_accuracies.append(validation_accuracy)
            sub_test_loss.append(test_loss)
            sub_test_accuracy.append(test_accuracy)

            print("Post-training Results:")
            print(f"Train Loss: {train_loss[-1]:.4f}; Validation Loss: {val_loss[-1]:.4f}; Test Loss: {test_loss:.4f}")
            print(f"Training Accuracy: {training_accuracy:.4f}; Test Accuracy: {test_accuracy:.4f}"); print()
            print(f"Train Confusion Matrix:\n{train_cm}"); print()
            print(f"Test Confusion Matrix:\n{test_cm}"); print()

            print("---------------------------------------------")

        # Saving some results
        training_losses.append(sub_training_losses)
        validation_losses.append(sub_validation_losses)
        training_accuracies.append(sub_training_accuracies)
        validation_accuracies.append(sub_validation_accuracies)
        test_losses.append(sub_test_loss)
        test_accuracies.append(sub_test_accuracy)

    # results dictionary
    results = {
        'training_losses': training_losses,
        'validation_losses': validation_losses,
        'training_accuracies': training_accuracies,
        'validation_accuracies': validation_accuracies,
        'test_losses': test_losses,
        'test_accuracies': test_accuracies
    }

    print("Experiments Completed")

Running Experiments...
---------------------------------------------
Hidden Layer Shape: [5]; Learning Rate: 1.0



Initial training loss: 1.0928658246994019
Training stopped at epoch 52343 after 59.06 seconds

Post-training Results:
Train Loss: 0.0000; Validation Loss: 0.0041; Test Loss: 0.0018
Training Accuracy: 1.0000; Test Accuracy: 1.0000

Train Confusion Matrix:
tensor([[45,  0,  0],
        [ 0, 58,  0],
        [ 0,  0, 39]])

Test Confusion Matrix:
tensor([[11,  0,  0],
        [ 0,  8,  0],
        [ 0,  0,  9]])

---------------------------------------------
---------------------------------------------
Hidden Layer Shape: [10]; Learning Rate: 1.0

Initial training loss: 1.1048319339752197
Training stopped at epoch 63806 after 78.38 seconds

Post-training Results:
Train Loss: 0.0000; Validation Loss: 0.0026; Test Loss: 0.0053
Training Accuracy: 1.0000; Test Accuracy: 1.0000

Train Confusion Matrix:
tensor([[45,  0,  0],
        [ 0, 58,  0],
        [ 0,  0, 39]])

Test Confusion Matrix:
tensor([[11,  0,  0],
        [ 0,  8,  0],
        [ 0,  0,  9]])

----------------------------------

In [9]:
# Convert the nested result lists with convert_all_results and show the keys of
# the returned mapping (per metric, then per hidden-layer shape in the recorded output).
# Note: `results` is only assigned when RUN_EXPERIMENTS is True.
converted_results = convert_all_results(params, results)
print(converted_results.keys())
print(converted_results['training_losses'].keys())

# NOTE(review): sys.exit() raises SystemExit inside the kernel (see this cell's
# recorded output). It looks like a temporary stop so that the older cells
# below are not executed while the refactor described in the TODOs is
# unfinished — confirm, and remove it once those cells have been updated.
sys.exit()
# TODO: Modify results storage to save results in a dictionary with a unique key for each combination of hyperparameters
# TODO: Finish updating the code to use the new functions, create experiment analysis method and identify best model, add plots, create README.md, and push to GitHub 
# TODO: later, update ConfusionMatrix (and any other functions) to work with K classes

dict_keys(['training_losses', 'validation_losses', 'training_accuracies', 'validation_accuracies', 'test_losses', 'test_accuracies'])
dict_keys(['[5]', '[10]', '[20]'])


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
if RUN_EXPERIMENTS:
    print("RUNNING EXPERIMENTS")

    # Class indices recovered from the one-hot targets. They do not change
    # during training, so they are computed once instead of on every epoch.
    train_targets = torch.max(y_train_encoded, 1)[1]
    val_targets = torch.max(y_val_encoded, 1)[1]
    test_targets = torch.max(y_test_encoded, 1)[1]

    # Outer index: learning rate; inner index: hidden size.
    all_training_loss = []
    train_cm_storage = []
    test_cm_storage = []
    for lr in params['learning_rate']:
        training_loss = []
        train_cm = []
        test_cm = []
        for size in params['hidden_size']:

            # Fresh model and stopping criterion for every configuration
            model = Network(NUM_INPUTS, NUM_OUTPUTS, NUM_LAYERS, size)
            stopping_criteria = StopCriteria()

            # Move the model to the selected device
            model.to(device)

            criterion = nn.CrossEntropyLoss()  # cross-entropy loss
            optimizer = torch.optim.SGD(model.parameters(), lr=lr)  # plain SGD (no momentum argument is passed)

            print("-----------------------------------------------------------------")
            print(f"Learning Rate: {lr}; Hidden Layer Architecture: {size}")

            # Full-batch training loop
            for epoch in range(EPOCHS):
                optimizer.zero_grad()
                output = model(X_train).squeeze()
                loss = criterion(output, train_targets)

                if epoch == 0:
                    print('Epoch: {}; before training loss: {}'.format(epoch,loss.item()))

                # Validation loss feeds the stopping criterion only; it is never
                # back-propagated, so no autograd graph is built for it.
                with torch.no_grad():
                    val_output = model(X_val)
                    val_loss = criterion(val_output.squeeze(), val_targets)

                # step() returning a truthy value ends training for this configuration
                if stopping_criteria.step(val_loss):
                    print('Epoch: {}; after train loss: {}'.format(epoch,loss.item()))
                    print()
                    break

                # Backpropagation and parameter update
                loss.backward()
                optimizer.step()

            # Training loss from the last forward pass that was executed
            training_loss.append(loss.item())

            # Evaluate the trained model; gradients are not needed for inference
            model.eval()
            with torch.no_grad():
                train_pred = model(X_train)
                test_pred = model(X_test)
                after_train = criterion(test_pred.squeeze(), test_targets)
            train_CM = ConfusionMatrix(model, train_pred, y_train_encoded)
            test_CM = ConfusionMatrix(model, test_pred, y_test_encoded)

            print('Test loss post training: ' , after_train.item())
            print()
            print("Training Confusion Matrix: \n" + str(train_CM))
            print()
            print("Test Confusion Matrix: \n" + str(test_CM))
            print()
            print("-----------------------------------------------------------------")

            train_cm.append(train_CM)
            test_cm.append(test_CM)

        all_training_loss.append(training_loss)
        train_cm_storage.append(train_cm)
        test_cm_storage.append(test_cm)

    # Convert the nested lists to arrays (rows: learning rates, columns: hidden sizes)
    all_training_loss = np.array(all_training_loss)
    train_cm_storage = np.array(train_cm_storage)
    test_cm_storage = np.array(test_cm_storage)
    print()
    print("DONE TESTING HYPERPARAMETERS")

RUNNING EXPERIMENTS
-----------------------------------------------------------------
Learning Rate: 1.0; Hidden Layer Architecture: [5]
Epoch: 0; before training loss: 1.1604379415512085
Epoch: 26749; after train loss: 7.276311953319237e-05

Test loss post training 0.002410204615443945

Training Confusion Matrix: 
tensor([[45,  0,  0],
        [ 0, 58,  0],
        [ 0,  0, 39]])

Test Confusion Matrix: 
tensor([[11,  0,  0],
        [ 0,  8,  0],
        [ 0,  0,  9]])

-----------------------------------------------------------------
-----------------------------------------------------------------
Learning Rate: 1.0; Hidden Layer Architecture: [10]
Epoch: 0; before training loss: 1.1342799663543701
Epoch: 49331; after train loss: 2.768605190794915e-05

Test loss post training 0.0075431871227920055

Training Confusion Matrix: 
tensor([[45,  0,  0],
        [ 0, 58,  0],
        [ 0,  0, 39]])

Test Confusion Matrix: 
tensor([[11,  0,  0],
        [ 0,  8,  0],
        [ 0,  0,  9]])

KeyboardInterrupt: 

## Best Model

In [None]:
# np.argmin returns one index into the flattened array, which cannot be
# unpacked into two values. np.unravel_index turns it into (row, column).
# Rows follow params['learning_rate'] and columns follow params['hidden_size'],
# matching the loop nesting that filled all_training_loss.
best_lr_index, best_hs_index = np.unravel_index(np.argmin(all_training_loss), np.shape(all_training_loss))
best_lr = params['learning_rate'][best_lr_index]
best_hs = params['hidden_size'][best_hs_index]

print("Best Learning Rate: " + str(best_lr))    
print("Best Hidden Layer PE: " + str(best_hs))

In [None]:
if BEST_MODEL:
    print("RUNNING BEST MODEL")

    # Class indices recovered from the one-hot targets; computed once because
    # they do not change between epochs.
    train_targets = torch.max(y_train_encoded, 1)[1]
    val_targets = torch.max(y_val_encoded, 1)[1]

    # Build the network with the same four-argument constructor call used by
    # the experiment cells, plugging in the selected hidden-layer size.
    model = Network(NUM_INPUTS, NUM_OUTPUTS, NUM_LAYERS, best_hs)
    stopping_criteria = StopCriteria()

    # Move the model to the selected device
    model.to(device)

    criterion = nn.CrossEntropyLoss()  # cross-entropy loss
    optimizer = torch.optim.SGD(model.parameters(), lr=best_lr)  # plain SGD (no momentum argument is passed)

    # Full-batch training loop
    for epoch in range(EPOCHS):
        optimizer.zero_grad()
        output = model(X_train)
        loss = criterion(output.squeeze(), train_targets)

        # Validation loss is only used by the stopping criterion, so it is
        # computed without building an autograd graph.
        with torch.no_grad():
            val_output = model(X_val)
            val_loss = criterion(val_output.squeeze(), val_targets)

        # step() returning a truthy value ends training
        if stopping_criteria.step(val_loss):
            print('Epoch: {}; after train loss: {}'.format(epoch,loss.item()))
            break

        # Progress report every 250 epochs
        if epoch % 250 == 0:
            print('Epoch: {}; train loss: {}'.format(epoch,loss.item()))

        # Backpropagation and parameter update
        loss.backward()
        optimizer.step()

In [None]:
# Test-set loss of the current `model`. Inference only, so the forward pass and
# loss are computed without tracking gradients.
# NOTE(review): when BEST_MODEL is False, `model` and `criterion` are whatever an
# earlier cell left in the namespace — confirm that this is intended.
model.eval()
with torch.no_grad():
    y_pred = model(X_test).squeeze()
    after_train = criterion(y_pred, torch.max(y_test_encoded, 1)[1])
print('Test loss after Training' , after_train.item())

Test loss after Training 0.0060800062492489815


In [None]:
# Confusion matrix built from the test predictions computed in the previous cell,
# printed under a one-line heading.
test_CM = ConfusionMatrix(model, y_pred, y_test_encoded)
print("Test Confusion Matrix:", test_CM, sep="\n")

Test Confusion Matrix:
tensor([[11,  0,  0],
        [ 0,  8,  0],
        [ 0,  0,  9]])


Best Model Discussion:

The overall best model in terms of test-set loss after training uses a learning rate of 1.0 and five processing elements (PEs) in the hidden layer. This model achieved a test loss of 0.00117. This learning rate may perform best because it can quickly escape non-optimal local minima on the performance surface before training is terminated by the stopping criterion. Lower learning rates might reach lower training and test losses if the stopping criterion were given more patience; however, this would increase the number of epochs required for convergence and therefore the computation time. Five PEs in the hidden layer may perform well because not all of the inputs are predictive of the wine cultivar.

## Testing model with two hidden layers

In [None]:
# LOCAL PARAMETERS
NUM_LAYERS = 2
NUM_INPUTS = X_train.shape[1]
NUM_OUTPUTS = y_train_encoded.shape[1]

# Every ordered (first layer, second layer) width pair drawn from the two
# candidate widths, in the same order itertools.product would yield them.
layer_widths = [5, 10]
pairs = [(first, second) for first in layer_widths for second in layer_widths]
print(pairs)

params = dict(
    learning_rate=[1.0, 0.1, 0.01, 0.001],
    hidden_size=pairs,
)

[(5, 5), (5, 10), (10, 5), (10, 10)]


In [None]:
if RUN_EXPERIMENTS:
    print("RUNNING EXPERIMENTS")

    # Class indices recovered from the one-hot targets. They do not change
    # during training, so they are computed once instead of on every epoch.
    train_targets = torch.max(y_train_encoded, 1)[1]
    val_targets = torch.max(y_val_encoded, 1)[1]
    test_targets = torch.max(y_test_encoded, 1)[1]

    for lr in params['learning_rate']:
        for size in params['hidden_size']:

            # Fresh model and stopping criterion for every configuration
            model2 = Network(NUM_INPUTS, NUM_OUTPUTS, NUM_LAYERS, size)
            stopping_criteria = StopCriteria()

            # Move the model to the selected device
            model2.to(device)

            criterion = nn.CrossEntropyLoss()  # cross-entropy loss
            optimizer = torch.optim.SGD(model2.parameters(), lr=lr)  # plain SGD (no momentum argument is passed)

            print("-----------------------------------------------------------------")
            print(f"Learning Rate: {lr}; Hidden Layer Architecture: {size}")

            # Full-batch training loop
            for epoch in range(EPOCHS):
                optimizer.zero_grad()
                output = model2(X_train).squeeze()
                loss = criterion(output, train_targets)

                if epoch == 0:
                    print('Epoch: {}; before training loss: {}'.format(epoch,loss.item()))

                # Validation loss feeds the stopping criterion only; it is never
                # back-propagated, so no autograd graph is built for it.
                with torch.no_grad():
                    val_output = model2(X_val)
                    val_loss = criterion(val_output.squeeze(), val_targets)

                # step() returning a truthy value ends training for this configuration
                if stopping_criteria.step(val_loss):
                    print('Epoch: {}; after train loss: {}'.format(epoch,loss.item()))
                    print()
                    break

                # Backpropagation and parameter update
                loss.backward()
                optimizer.step()

            # Evaluate the trained model; gradients are not needed for inference
            model2.eval()
            with torch.no_grad():
                train_pred = model2(X_train)
                test_pred = model2(X_test)
                after_train = criterion(test_pred.squeeze(), test_targets)
            train_CM = ConfusionMatrix(model2, train_pred, y_train_encoded)
            test_CM = ConfusionMatrix(model2, test_pred, y_test_encoded)

            print('Test loss post training' , after_train.item())
            print()
            print("Training Confusion Matrix: \n" + str(train_CM))
            print()
            print("Test Confusion Matrix: \n" + str(test_CM))
            print()
            print("-----------------------------------------------------------------")

    print()
    print("DONE TESTING HYPERPARAMETERS")

RUNNING EXPERIMENTS
-----------------------------------------------------------------
Learning Rate: 1.0; Hidden Layer Architecture: (5, 5)
Epoch: 0; before training loss: 1.1175674200057983
Epoch: 6; after train loss: 1.085118293762207

Test loss post training 1.124037265777588

Training Confusion Matrix: 
tensor([[ 0,  0,  0],
        [45, 58, 39],
        [ 0,  0,  0]])

Test Confusion Matrix: 
tensor([[ 0,  0,  0],
        [11,  8,  9],
        [ 0,  0,  0]])

-----------------------------------------------------------------
-----------------------------------------------------------------
Learning Rate: 1.0; Hidden Layer Architecture: (5, 10)
Epoch: 0; before training loss: 1.1287072896957397
Epoch: 45010; after train loss: 2.2319885829347186e-05

Test loss post training 0.0008015120984055102

Training Confusion Matrix: 
tensor([[45,  0,  0],
        [ 0, 58,  0],
        [ 0,  0, 39]])

Test Confusion Matrix: 
tensor([[11,  0,  0],
        [ 0,  8,  0],
        [ 0,  0,  9]])

--

## Testing model with no hidden layers

In [None]:
# LOCAL PARAMETERS
NUM_LAYERS = 0   # no hidden layers
SIZE = list()    # empty hidden-size specification to go with NUM_LAYERS = 0
NUM_INPUTS = X_train.shape[1]
NUM_OUTPUTS = y_train_encoded.shape[1]

# Only the learning rate is varied in this section
params = dict(
    learning_rate=[1.0, 0.1, 0.01, 0.001],
)

In [None]:
if RUN_EXPERIMENTS:
    print("RUNNING EXPERIMENTS")

    # Class indices recovered from the one-hot targets. They do not change
    # during training, so they are computed once instead of on every epoch.
    train_targets = torch.max(y_train_encoded, 1)[1]
    val_targets = torch.max(y_val_encoded, 1)[1]
    test_targets = torch.max(y_test_encoded, 1)[1]

    for lr in params['learning_rate']:
        # Fresh model and stopping criterion for every learning rate
        model3 = Network(NUM_INPUTS, NUM_OUTPUTS, NUM_LAYERS, SIZE)
        stopping_criteria = StopCriteria(patience = 10)

        # Move the model to the selected device
        model3.to(device)

        # Loss and optimizer
        criterion = nn.CrossEntropyLoss()  # cross-entropy loss
        optimizer = torch.optim.SGD(model3.parameters(), lr=lr)  # plain SGD (no momentum argument is passed)

        print("-----------------------------------------------------------------")
        print(f"Learning Rate: {lr}; Hidden Layer Architecture: {SIZE}")

        # Full-batch training loop
        for epoch in range(EPOCHS):
            optimizer.zero_grad()
            output = model3(X_train).squeeze()
            loss = criterion(output, train_targets)

            if epoch == 0:
                print('Epoch: {}; before training loss: {}'.format(epoch,loss.item()))

            # Validation loss feeds the stopping criterion only; it is never
            # back-propagated, so no autograd graph is built for it.
            with torch.no_grad():
                val_output = model3(X_val)
                val_loss = criterion(val_output.squeeze(), val_targets)

            # step() returning a truthy value ends training for this learning rate
            if stopping_criteria.step(val_loss):
                print('Epoch: {}; after train loss: {}'.format(epoch,loss.item()))
                print()
                break

            # Backpropagation and parameter update
            loss.backward()
            optimizer.step()

        # Evaluate the trained model; gradients are not needed for inference
        model3.eval()
        with torch.no_grad():
            train_pred = model3(X_train)
            test_pred = model3(X_test)
            after_train = criterion(test_pred.squeeze(), test_targets)
        train_CM = ConfusionMatrix(model3, train_pred, y_train_encoded)
        test_CM = ConfusionMatrix(model3, test_pred, y_test_encoded)

        print('Test loss post training' , after_train.item())
        print()
        print("Training Confusion Matrix: \n" + str(train_CM))
        print()
        print("Test Confusion Matrix: \n" + str(test_CM))
        print()
        print("-----------------------------------------------------------------")

    # Reported only when the experiments actually ran, as in the other experiment cells
    print()
    print("DONE TESTING HYPERPARAMETERS")

RUNNING EXPERIMENTS
-----------------------------------------------------------------
Learning Rate: 1.0; Hidden Layer Architecture: []
Epoch: 0; before training loss: 1.4978491067886353
Epoch: 38384; after train loss: 0.00011088434257544577

Test loss post training 0.014064214192330837

Training Confusion Matrix: 
tensor([[45,  0,  0],
        [ 0, 58,  0],
        [ 0,  0, 39]])

Test Confusion Matrix: 
tensor([[11,  0,  0],
        [ 0,  8,  0],
        [ 0,  0,  9]])

-----------------------------------------------------------------
-----------------------------------------------------------------
Learning Rate: 0.1; Hidden Layer Architecture: []
Epoch: 0; before training loss: 1.202933430671692
Epoch: 79037; after train loss: 0.0005150689394213259

Test loss post training 0.011468115262687206

Training Confusion Matrix: 
tensor([[45,  0,  0],
        [ 0, 58,  0],
        [ 0,  0, 39]])

Test Confusion Matrix: 
tensor([[11,  0,  0],
        [ 0,  8,  0],
        [ 0,  0,  9]])

--

Discussion of no hidden layer model:

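In the runs recorded above, the model with no hidden layers classified every training and test sample correctly at learning rates of 1.0 and 0.1, so the earlier conclusion that a model with no hidden layers is unable to classify the Wine data does not hold for the standardized inputs used here.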