In [111]:
# imports
import pandas as pd
import numpy as np

In [112]:
# loading in the data

columns = [
    'variance',
    'skewness',
    'curtosis',
    'entropy',
    'label'
]

train_data = pd.read_csv('bank-note/train.csv', names=columns,header=None)
test_data = pd.read_csv('bank-note/test.csv', names=columns, header=None)

train_data

#Do i need to change labels?

Unnamed: 0,variance,skewness,curtosis,entropy,label
0,3.848100,10.15390,-3.85610,-4.22280,0
1,4.004700,0.45937,1.36210,1.61810,0
2,-0.048008,-1.60370,8.47560,0.75558,0
3,-1.266700,2.81830,-2.42600,-1.88620,1
4,2.203400,5.99470,0.53009,0.84998,0
...,...,...,...,...,...
867,0.273310,4.87730,-4.91940,-5.81980,1
868,1.063700,3.69570,-4.15940,-1.93790,1
869,-1.242400,-1.71750,-0.52553,-0.21036,1
870,1.837300,6.12920,0.84027,0.55257,0


In [113]:
x_train = train_data.drop('label', axis=1).values
y_train = train_data['label'].values

x_test = test_data.drop('label', axis=1).values
y_test = test_data['label'].values

# Three Layer Artificial Network

In [114]:
def sigmoid_activation(x):
    return 1 / (1 + np.exp(-x))

In [115]:
# the input should be segmoid_activation
def sigmoid_activation_derivative(x):
    return sigmoid_activation(x) * (1 - sigmoid_activation(x))

In [141]:
#making a class definition to initializze easier and practice notation for pytorch

class NeuralNetwork:

    def __init__(self, feature_size, hidden_size1, hidden_size2, output_size):
        self.input_size = input_size
        self.hidden_size1 = hidden_size1
        self.hidden_size2 = hidden_size2
        self.output_size = output_size

        # Initialize weights with random values
        self.weights_input_hidden1 = np.random.randn(self.input_size, self.hidden_size1)
        self.weights_hidden1_hidden2 = np.random.randn(self.hidden_size1, self.hidden_size2)
        self.weights_hidden2_output = np.random.randn(self.hidden_size2, self.output_size)

        # Bias terms initialized as 1
        self.bias_hidden1 = np.ones((1, self.hidden_size1))
        self.bias_hidden2 = np.ones((1, self.hidden_size2))

    def forward_pass(self, features):
        # Forward pass
        self.hidden_input1 = np.dot(features, self.weights_input_hidden1)
        self.hidden_output1 = sigmoid_activation(self.hidden_input1)

        self.hidden_input2 = np.dot(self.hidden_output1, self.weights_hidden1_hidden2)
        self.hidden_output2 = sigmoid_activation(self.hidden_input2)

        # NO ACTIVATION FOR OUTPUT NODE
        self.output = (np.dot(self.hidden_output2, self.weights_hidden2_output))
        return self.output

    def backpropagation(self, features, labels, learning_rate):
        #so labels has the right shape for opperations
        labels = labels.reshape(-1, 1)

        self.forward_pass(features)
        
        
        # Backward pass
        output_error = labels - self.output

         # Calculate deltas starting from output layer
        hidden2_error = np.dot(output_error, self.weights_hidden2_output.T)
        hidden2_delta = hidden2_error * sigmoid_activation_derivative(self.hidden_input2)

        hidden1_error = np.dot(hidden2_delta, self.weights_hidden1_hidden2.T)
        hidden1_delta = hidden1_error * sigmoid_activation_derivative(self.hidden_input1)

        # # Update weights
        # self.weights_hidden2_output +=   np.dot(self.hidden_output2.T, output_error)
        # self.weights_hidden1_hidden2 += np.dot(self.hidden_output1.T, hidden2_delta)
        # self.weights_input_hidden1 +=  np.dot(features.T, hidden1_delta)

        # Update biases
        self.bias_hidden2 += hidden2_delta.sum(axis=0)  # Update bias for hidden layer 2
        self.bias_hidden1 += hidden1_delta.sum(axis=0)  # Update bias for hidden layer 1

        # Update weights having the learning rate there for second question part but will be 1 for first question
        self.weights_hidden2_output += learning_rate * np.dot(self.hidden_output2.T, output_error)
        self.weights_hidden1_hidden2 += learning_rate * np.dot(self.hidden_output1.T, hidden2_delta)
        self.weights_input_hidden1 += learning_rate * np.dot(features.T, hidden1_delta)
        return self.weights_hidden2_output, self.weights_hidden1_hidden2, self.weights_input_hidden1, self.bias_hidden2,  self.bias_hidden1


    def stochastic_gradient_descent(self, x_train, y_train, x_test, y_test, width, learning_rate, d, epochs, zero_weights =False ):
        # Initialize weights with random values for the specified width
        if zero_weights == False: 
            self.weights_input_hidden1 = np.random.randn(self.input_size, width)
            self.weights_hidden1_hidden2 = np.random.randn(width, width)
            self.weights_hidden2_output = np.random.randn(width, self.output_size)
        else:
            #for problem 3
            self.weights_input_hidden1 = np.zeros((self.input_size, width))
            self.weights_hidden1_hidden2 = np.zeros((width, width))
            self.weights_hidden2_output = np.zeros((width, self.output_size))
        # Initialize biases
        self.bias_hidden1 = np.ones((1, width))
        self.bias_hidden2 = np.ones((1, width))

        # Variables to track objective function values during training
        train_err = []
        test_err = []
        train_accuracy = []
        test_accuracy = []

        # Training loop
        for epoch in range(epochs):
           #shuffle time
            shuffled_idx = np.random.permutation(len(x_train))
            x_train_shuffled = x_train[shuffled_idx]
            y_train_shuffled = y_train[shuffled_idx]

         
            LR = learning_rate / (1 + (learning_rate / d) * epoch)

            # Stochastic Gradient Descent
            for i in range(len(x_train_shuffled)):
                sample_feature = x_train_shuffled[i].reshape(1, -1)
                sample_label = y_train_shuffled[i].reshape(-1, 1)

               
                self.backpropagation(sample_feature, sample_label, LR)

      
        train_output = self.forward_pass(x_train)
        train_loss = np.mean(np.square(y_train - train_output))

        test_output = self.forward_pass(x_test)
        test_loss = np.mean(np.square(y_test - test_output))
            
        train_err.append(train_loss)
        test_err.append(test_loss)
        return train_err, test_err


In [103]:
# Want to compute the gradeint with respect to one training example
input_size = 4
hidden_size1 = 3
hidden_size2 = 3
output_size = 1

x_single_sample = x_train[0].reshape(1, -1) 
y_single_sample = y_train[0].reshape(-1, 1)

one_sample_network = NeuralNetwork(input_size, hidden_size1, hidden_size2, output_size)
weights_hidden2_output, weights_hidden1_hidden2, weights_input_hidden1, bias_hidden2, bias_hidden1 = one_sample_network.backpropagation(x_single_sample, y_single_sample, 1)

print("Weight Gradients:", weights_hidden2_output, weights_hidden1_hidden2,weights_input_hidden1)
print("Bias:", bias_hidden2, bias_hidden1)

Weight Gradients: [[-0.4129254 ]
 [-0.49615295]
 [ 0.22821415]] [[-2.07411556  0.74958547  0.59384841]
 [-0.47715872  0.1859279   0.65521873]
 [-0.85218659 -0.3995431  -0.36171508]] [[ 0.01321342  0.24258624  1.33412005]
 [-1.6529981   0.88631881 -0.6636908 ]
 [-0.32629197  1.00022033 -1.07157695]
 [ 0.27040572  1.05403659 -0.934335  ]]
Bias: [[0.87305472 0.74164657 0.91161177]] [[1.         0.99257931 1.0002443 ]]


In [104]:
#little experiement to find the best d and LR

LR_values = [0.1, 0.01, 0.001]
d_values = [0.01, 0.01, 100]
width_values = [5, 10, 25, 50, 100]
epochs = 100
input_size = 4
hidden_size1 = 3
hidden_size2 = 3
output_size = 1

best_train_err = float('inf')
best_test_err = float('inf')
best_learning_rate = None
best_d = None
best_width = None

for gamma in LR_values:
    for d_val in d_values:
        for width_val in width_values:
            
            nn = NeuralNetwork(input_size, hidden_size1, hidden_size2, output_size)

            #do gradient descent
            train_err, test_err = nn.stochastic_gradient_descent(x_train, y_train, x_test, y_test, width_val, gamma, d_val, epochs)

            #now see whihc has the lowest err
            if train_err[-1] < best_train_err and test_err[-1] < best_test_err:
                best_train_err = train_err[-1]
                best_test_err = test_err[-1]
                best_learning_rate = gamma
                best_d = d_val
                best_width = width_val

# Output the best ones found
print(f"Best Configuration - Gamma: {best_learning_rate}, d: {best_d}, Width: {best_width}")
print(f"Best Training Error: {best_train_err}")
print(f"Best Test Error: {best_test_err}")


  return 1 / (1 + np.exp(-x))


Best Configuration - Gamma: 0.001, d: 0.01, Width: 5
Best Training Error: 0.2902630334632157
Best Test Error: 0.2871435392484001


In [140]:
#part b

widths = [5,10,25,50,100]
d = 0.01
learning_rate = 0.001
input_size = 4
hidden_size1 = 3
hidden_size2 = 3
output_size = 1
epochs = 100

for width in widths:
    network =  NeuralNetwork(input_size, hidden_size1, hidden_size2, output_size)
    train_err, test_err = network.stochastic_gradient_descent(x_train, y_train, x_test, y_test, width, learning_rate, d, epochs)
    print(
        f"ERR Width: {width}, Training err: {train_err}, Test err: {test_err}")


ERR Width: 5, Training err: [0.2949359456583213], Test err: [0.29445387428394526]
ERR Width: 10, Training err: [0.4166329963106501], Test err: [0.410998943323258]
ERR Width: 25, Training err: [0.520486313861385], Test err: [0.5012146017915575]
ERR Width: 50, Training err: [0.5873224199956868], Test err: [0.5726848990407992]
ERR Width: 100, Training err: [0.5838476187837106], Test err: [0.6396642814391519]


In [142]:
for width in widths:
    network =  NeuralNetwork(input_size, hidden_size1, hidden_size2, output_size)
    train_err, test_err = network.stochastic_gradient_descent(x_train, y_train, x_test, y_test, width, learning_rate, d, epochs, zero_weights=True)
    print(
        f"Width: {width}, Zero-Initialized Weights, Training Err: {train_err}, Test Err: {test_err}")

Width: 5, Zero-Initialized Weights, Training Accuracy: [0.24709490626882546], Test Accuracy: [0.2466517593255136]
Width: 10, Zero-Initialized Weights, Training Accuracy: [0.2470948919353979], Test Accuracy: [0.24665327268552806]
Width: 25, Zero-Initialized Weights, Training Accuracy: [0.24709488972901356], Test Accuracy: [0.2466525819683823]
Width: 50, Zero-Initialized Weights, Training Accuracy: [0.24709489002089896], Test Accuracy: [0.24665254383559532]
Width: 100, Zero-Initialized Weights, Training Accuracy: [0.24709488984537384], Test Accuracy: [0.24665256605547442]


# TORCH TIME

In [107]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd

#loading in data for pytorch use
def create_dataloader(csv_file, batch_size=32):
    data = pd.read_csv(csv_file)
    features = torch.tensor(data.iloc[:, :-1].values, dtype=torch.float32)
    labels = torch.tensor(data.iloc[:, -1].values, dtype=torch.int64)
    dataset = TensorDataset(features, labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)


train_loader = create_dataloader('bank-note/train.csv')
test_loader = create_dataloader('bank-note/test.csv')

class PTNeuralNetwork(nn.Module):
    #constructor
    def __init__(self, input_size, output_size, hidden_layers, activation_fn):
        super(PTNeuralNetwork, self).__init__()
        self.layers = nn.ModuleList()
        for i in range(len(hidden_layers)):
            if i == 0:
                self.layers.append(nn.Linear(input_size, hidden_layers[i]))
            else:
                self.layers.append(nn.Linear(hidden_layers[i - 1], hidden_layers[i]))
        self.layers.append(nn.Linear(hidden_layers[-1], output_size))
        self.activation_fn = activation_fn

    #forward Pass method
    def forward(self, x):
        for i, layer in enumerate(self.layers):
            x = layer(x)
            if i < len(self.layers) - 1:
                x = self.activation_fn(x)
        return x


#wanna see the value that the model is correct
def evaluate_accuracy(model, data_loader):
    model.eval()  # Set the model to evaluation mode
    correct = 0
    total = 0
    with torch.no_grad():
        for data, target in data_loader:
            outputs = model(data)
            _, predicted = torch.max(outputs.data, 1)
            total += target.size(0)
            correct += (predicted == target).sum().item()
    return 100 * correct / total


def train_and_evaluate(depths, widths, activation_fns, init_methods, train_loader, test_loader):
    input_size = 4  
    output_size = 2  
    num_epochs = 20  

    for depth in depths:
        for width in widths:
            for activation_fn, init_method in zip(activation_fns, init_methods):
                hidden_layers = [width] * depth
                model = PTNeuralNetwork(input_size, output_size, hidden_layers, activation_fn)
                for layer in model.layers:
                    if isinstance(layer, nn.Linear):
                        init_method(layer.weight)
                criterion = nn.CrossEntropyLoss()
                optimizer = optim.Adam(model.parameters(), lr=1e-3)

                for epoch in range(num_epochs):
                    #need to tell the model it is training
                    model.train()  
                    for data, label in train_loader:
                        #WAY EASIER WAY TO DO GRADIENTS 
                        optimizer.zero_grad()
                        output = model(data)
                        loss = criterion(output, label)
                        loss.backward()
                        optimizer.step()

                train_accuracy = evaluate_accuracy(model, train_loader)
                test_accuracy = evaluate_accuracy(model, test_loader)
                print(f"Depth: {depth}, Width: {width}, Activation: {activation_fn.__name__}, "
                      f"Train Accuracy: {train_accuracy:.2f}%, Test Accuracy: {test_accuracy:.2f}%")


# Activation functions and initialization methods
activation_fns = [torch.tanh, torch.relu]
init_methods = [nn.init.xavier_normal_, nn.init.kaiming_normal_]

train_and_evaluate([3, 5, 9], [5, 10, 25, 50, 100], activation_fns, init_methods, train_loader, test_loader)

Depth: 3, Width: 5, Activation: tanh, Train Accuracy: 99.20%, Test Accuracy: 98.60%
Depth: 3, Width: 5, Activation: relu, Train Accuracy: 73.94%, Test Accuracy: 69.54%
Depth: 3, Width: 10, Activation: tanh, Train Accuracy: 100.00%, Test Accuracy: 100.00%
Depth: 3, Width: 10, Activation: relu, Train Accuracy: 100.00%, Test Accuracy: 100.00%
Depth: 3, Width: 25, Activation: tanh, Train Accuracy: 100.00%, Test Accuracy: 100.00%
Depth: 3, Width: 25, Activation: relu, Train Accuracy: 100.00%, Test Accuracy: 100.00%
Depth: 3, Width: 50, Activation: tanh, Train Accuracy: 100.00%, Test Accuracy: 100.00%
Depth: 3, Width: 50, Activation: relu, Train Accuracy: 100.00%, Test Accuracy: 100.00%
Depth: 3, Width: 100, Activation: tanh, Train Accuracy: 100.00%, Test Accuracy: 100.00%
Depth: 3, Width: 100, Activation: relu, Train Accuracy: 100.00%, Test Accuracy: 100.00%
Depth: 5, Width: 5, Activation: tanh, Train Accuracy: 100.00%, Test Accuracy: 100.00%
Depth: 5, Width: 5, Activation: relu, Train Accu