MLP allows for 2 linear lines to categorize separate parts of T/F
Adam instead of SGD
ReLu instead of Sigmoid

In [1]:
import torch
from torch.nn.modules.loss import BCELoss
import torch.optim as optim

In [2]:
X = torch.FloatTensor([[0, 0], [0, 1], [1, 0], [1, 1]])
Y = torch.FloatTensor([[0], [1], [1], [0]])

linear = torch.nn.Linear(2, 1, bias=True) #2x1 shape
sigmoid = torch.nn.Sigmoid()

model = torch.nn.Sequential(linear, sigmoid) #hidden, input, activation function

Loss = BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

for epoch in range(10001):
  hypothesis = model(X)
  cost = Loss(hypothesis, Y)

  optimizer.zero_grad() #cost value using gradient updated every epoch
  cost.backward()
  optimizer.step() #finds most optimized parameters (w and b)

  if epoch % 500 == 0:
    print(epoch, cost.item())

0 0.8879363536834717
500 0.6990728974342346
1000 0.6939902305603027
1500 0.6936231851577759
2000 0.6934486627578735
2500 0.6933406591415405
3000 0.6932724118232727
3500 0.6932288408279419
4000 0.6932008266448975
4500 0.6931825876235962
5000 0.6931706666946411
5500 0.693162739276886
6000 0.6931576132774353
6500 0.6931542158126831
7000 0.6931518912315369
7500 0.6931502819061279
8000 0.693149209022522
8500 0.6931485533714294
9000 0.693148136138916
9500 0.6931478381156921
10000 0.6931476593017578


In [3]:
with torch.no_grad():
  hypothesis = model(X)
  pred = (hypothesis > 0.5).float()
  accuracy = (pred==Y).float().mean()
  print("Hypothesis: ", hypothesis, "Correct: ", pred, "Testing Acc: ", accuracy)

Hypothesis:  tensor([[0.5008],
        [0.5001],
        [0.5002],
        [0.4995]]) Correct:  tensor([[1.],
        [1.],
        [1.],
        [0.]]) Testing Acc:  tensor(0.7500)


#MLP for XOR

In [4]:
#optimized neuron amount is another hyperparameter
#go from larger number ang go down by 2^n
linear1 = torch.nn.Linear(2, 4, bias=True) #2x4 shape for 1st hidden layer, 2 comes from input
linear2 = torch.nn.Linear(4, 1, bias=True) #4x1 shape, must be 1 because of 1 output
sigmoid = torch.nn.Sigmoid()

model = torch.nn.Sequential(linear1, linear2, sigmoid) #order here and adding sigmoid after linear 1 is possible

Loss = BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

for epoch in range(10001):
  hypothesis = model(X)
  cost = Loss(hypothesis, Y)

  optimizer.zero_grad() #cost value using gradient updated every epoch
  cost.backward()
  optimizer.step() #finds most optimized parameters (w and b)

  if epoch % 500 == 0:
    print(epoch, cost.item())

0 0.696247398853302
500 0.6951537132263184
1000 0.6945216059684753
1500 0.694121241569519
2000 0.6938543319702148
2500 0.6936694383621216
3000 0.6935377717018127
3500 0.6934420466423035
4000 0.6933712959289551
4500 0.6933184266090393
5000 0.6932785511016846
5500 0.6932482719421387
6000 0.6932251453399658
6500 0.6932073831558228
7000 0.693193793296814
7500 0.6931832432746887
8000 0.693175196647644
8500 0.6931688189506531
9000 0.6931639909744263
9500 0.6931601762771606
10000 0.6931572556495667


MLP for Mnist

In [5]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [6]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import torch.optim as optim
import random
import torchvision.datasets as dsets
import torchvision.transforms as transforms #should be used to convert images to pytorch tensors

In [7]:
device = 'cuda' if torch.cuda.is_available else 'cpu'
if device == 'cuda':
  torch.cuda.torch.manual_seed(777) #random data sample is the same

In [39]:
#runs faster with lower training epoch and larger batch size and learning rate
training_epoch = 10
batch_size = 8   #2^10
learning_rate = 0.1

In [33]:
mnist_train = dsets.MNIST(root='/content/drive/MyDrive/CSCE464/datasets/MNIST',
                          train= True,
                          transform= transforms.ToTensor(),
                          download= True)

mnist_test = dsets.MNIST(root='/content/drive/MyDrive/CSCE464/datasets/MNIST',
                          train= False,
                          transform= transforms.ToTensor(),
                          download= True)

In [40]:
data_loader = torch.utils.data.DataLoader(dataset= mnist_train,
                                          batch_size= batch_size,
                                          shuffle = True,
                                          drop_last = True)

#Build the MLP Model

In [27]:
#vanishing gradient if too many hidden layers
linear1 = torch.nn.Linear(28*28, 1024, bias=True)
linear2 = torch.nn.Linear(1024, 512, bias=True)
linear3 = torch.nn.Linear(512, 10, bias=True)
#linear4 = torch.nn.Linear(256, 10, bias=True)
#linear5 = torch.nn.Linear(128, 64, bias=True)
#linear6 = torch.nn.Linear(64, 32, bias=True)
#linear7 = torch.nn.Linear(32, 16, bias=True)
#linear8 = torch.nn.Linear(16, 10, bias=True) #last number always amount to output (labels)

sigmoid = torch.nn.Sigmoid()

#Weight Initialization (optional but improves accuracy)

In [28]:
torch.nn.init.normal_(linear1.weight) #initializes neural network parameters
torch.nn.init.normal_(linear2.weight) #normal_ initializes all weights from the normal distribution with mean 0 and SD 1
torch.nn.init.normal_(linear3.weight)
#torch.nn.init.normal_(linear4.weight)
#torch.nn.init.normal_(linear5.weight)
#torch.nn.init.normal_(linear6.weight)
#torch.nn.init.normal_(linear7.weight)
#torch.nn.init.normal_(linear8.weight)

Parameter containing:
tensor([[ 0.5669,  1.5091,  0.6193,  ..., -1.3701, -0.0553, -1.4825],
        [ 0.9003,  1.0279,  0.0821,  ..., -0.3997,  0.3397,  1.2447],
        [ 0.2654, -0.2413,  0.6570,  ..., -1.9665,  0.3017, -0.6566],
        ...,
        [-0.0153,  0.3929, -0.8454,  ...,  0.0826,  1.5703, -0.0914],
        [-0.4802,  0.1001,  0.0326,  ...,  0.9314, -0.7708,  0.3308],
        [-2.4604, -1.7602, -1.2877,  ...,  0.0163, -1.3503,  1.8696]],
       requires_grad=True)

In [29]:
model = torch.nn.Sequential(linear1, sigmoid, linear2, sigmoid, linear3, sigmoid).to(device)
#sigmoid after each hidden layer can reduce cost

In [30]:
loss = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [41]:
total_batch = len(data_loader)

for epoch in range(training_epoch):
  avg_cost = 0

  for X, Y in data_loader:
    X = X.view(-1, 28*28).to(device)
    Y = Y.to(device)

    pred = model(X).to(device)
    cost = loss(pred, Y).to(device)
    correct_pred = torch.torch.argmax(pred, axis=1) == Y
    accuracy = correct_pred.float().mean()

    optimizer.zero_grad()
    cost.backward()
    optimizer.step()

    avg_cost += cost / total_batch

  print("Epoch {:2d} / {} Cost: {:.5f} Training ACC: {:.2f}%".format(epoch+1, 10, avg_cost, accuracy * 100))

Epoch  1 / 10 Cost: 1.63151 Training ACC: 75.00%
Epoch  2 / 10 Cost: 1.62371 Training ACC: 100.00%
Epoch  3 / 10 Cost: 1.61763 Training ACC: 75.00%
Epoch  4 / 10 Cost: 1.61253 Training ACC: 87.50%
Epoch  5 / 10 Cost: 1.60847 Training ACC: 87.50%
Epoch  6 / 10 Cost: 1.60518 Training ACC: 50.00%
Epoch  7 / 10 Cost: 1.60241 Training ACC: 87.50%
Epoch  8 / 10 Cost: 1.60012 Training ACC: 87.50%
Epoch  9 / 10 Cost: 1.59825 Training ACC: 75.00%
Epoch 10 / 10 Cost: 1.59655 Training ACC: 87.50%


In [None]:
#test the model
with torch.no_grad():
  x_test = mnist_test.test_data.view(-1, 28*28).float().to(device)
  y_test = mnist_test.test_labels.to(device)

  pred = model(x_test)
  correct_prediction = torch.torch.argmax(pred, axis=1) == y_test
  accuracy = correct_prediction.float().mean()
  print("Testing Accuracy {:.2f}".format(accuracy.item()*100))

#MLP for MNIST with ReLU and Adam Optimizer

In [42]:
#runs faster with lower training epoch and larger batch size and learning rate
training_epoch = 10
batch_size = 8   #2^10
learning_rate = 0.1

In [43]:
mnist_train = dsets.MNIST(root='/content/drive/MyDrive/CSCE464/datasets/MNIST',
                          train= True,
                          transform= transforms.ToTensor(),
                          download= True)

mnist_test = dsets.MNIST(root='/content/drive/MyDrive/CSCE464/datasets/MNIST',
                          train= False,
                          transform= transforms.ToTensor(),
                          download= True)

In [44]:
data_loader = torch.utils.data.DataLoader(dataset= mnist_train,
                                          batch_size= batch_size,
                                          shuffle = True,
                                          drop_last = True)


In [49]:
#vanishing gradient if too many hidden layers
linear1 = torch.nn.Linear(28*28, 1024, bias=True)
linear2 = torch.nn.Linear(1024, 512, bias=True)
linear3 = torch.nn.Linear(512, 256, bias=True)
linear4 = torch.nn.Linear(256, 10, bias=True)
#linear5 = torch.nn.Linear(128, 64, bias=True)
#linear6 = torch.nn.Linear(64, 32, bias=True)
#linear7 = torch.nn.Linear(32, 16, bias=True)
#linear8 = torch.nn.Linear(16, 10, bias=True) #last number always amount to output (labels)

relu = torch.nn.ReLU()

In [50]:
torch.nn.init.normal_(linear1.weight) #initializes neural network parameters
torch.nn.init.normal_(linear2.weight) #normal_ initializes all weights from the normal distribution with mean 0 and SD 1
torch.nn.init.normal_(linear3.weight)
torch.nn.init.normal_(linear4.weight)
#torch.nn.init.normal_(linear5.weight)
#torch.nn.init.normal_(linear6.weight)
#torch.nn.init.normal_(linear7.weight)
#torch.nn.init.normal_(linear8.weight)

Parameter containing:
tensor([[-0.8759,  1.1599, -0.4306,  ...,  1.4843,  1.0364, -1.2166],
        [ 0.6667,  0.0450,  1.8502,  ...,  0.6450,  0.0819,  1.3105],
        [-1.1029,  0.2350, -2.0204,  ...,  0.9263, -0.9921,  0.8881],
        ...,
        [-0.0828, -0.3181, -1.1015,  ...,  0.4314,  0.4562, -0.1117],
        [ 1.2712, -0.3389,  0.4084,  ..., -1.6928,  0.1476,  0.1778],
        [ 0.8001,  1.1416,  0.8854,  ...,  0.1375,  1.7652, -0.1413]],
       requires_grad=True)

In [51]:
model = torch.nn.Sequential(linear1, relu, linear2, relu, linear3, relu, linear4, relu).to(device)
#sigmoid after each hidden layer can reduce cost

In [53]:
loss = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [1]:
total_batch = len(data_loader)

for epoch in range(training_epoch):
  avg_cost = 0

  for X, Y in data_loader:
    X = X.view(-1, 28*28).to(device)
    Y = Y.to(device)

    pred = model(X).to(device)
    cost = loss(pred, Y).to(device)
    correct_pred = torch.torch.argmax(pred, axis=1) == Y
    accuracy = correct_pred.float().mean()

    optimizer.zero_grad()
    cost.backward()
    optimizer.step()

    avg_cost += cost / total_batch

  print("Epoch {:2d} / {} Cost: {:.5f} Training ACC: {:.2f}%".format(epoch+1, 10, avg_cost, accuracy * 100))

NameError: name 'data_loader' is not defined