# COSC401 Assignment 2
## Andrew French 
### ID: 11147452

Below is all of the libraries and modules needed for the entire program being imported. 
Make sure to run this step before running any of the feature code.

In [6]:
import numpy as np
import pandas as pd
import torch
import random

Below is all of the variables set that are global to the program.

In [2]:
np.random.seed(19680801)
plt.rcParams['figure.figsize'] = (10,5)

## Part 1 - Gradient-based Learning with Tensors

In [43]:
DATA_SIZE = 1000
RANDOM_STD_DEV = 0.01

In [47]:
def noiseFunction():
    return random.gauss(1, RANDOM_STD_DEV)

def ran_int():
    return random.randint(1, 100)

def inputFunc1(x1, x2, x4):
    return x1 + x2 - x4 + noiseFunction()

def inputFunc2(x2, x3):
    return x2 * 3 + x3 + noiseFunction()

def inputFunc3(x2, x4):
    return x2 + x4 + 2 + noiseFunction()

def inputFunc4(x1, x3):
    return (x1 - x3) * 2 + noiseFunction()

def inputFunc5(x1, x4):
    return (x4 + x1) - 6 + noiseFunction()

def modelFunc(x, w, b):
    """Performs matrix multiplication..."""
    return x @ w.t() + b

def mse(t1, t2):
    """Mean Squared Error loss function"""
    diff = t1 - t2
    return torch.sum(diff * diff) / diff.numel()
    

In [48]:
input_data_list = []
target_data_list = []

for i in range(DATA_SIZE):
    x1, x2, x3, x4 = ran_int(), ran_int(), ran_int(), ran_int()
    input_data_list.append([x1, x2, x3, x4])
    target_data_list.append([inputFunc1(x1, x2, x4), inputFunc2(x2, x3), inputFunc3(x2, x4), inputFunc4(x1, x3), 
                      inputFunc5(x1, x4)])
    
input_data = np.array(input_data_list, dtype='float32')
target_data = np.array(target_data_list, dtype='float32')

#Convert to tensors
inputs = torch.from_numpy(input_data)
targets = torch.from_numpy(target_data)

In [50]:
# Create random Tensors for weights; setting requires_grad=True means that we
# want to compute gradients for these Tensors during the backward pass.
w = torch.randn(5, 4, requires_grad=True)
b = torch.randn(5, requires_grad=True)

preds = modelFunc(inputs, w, b)
loss = mse(preds, targets)

learning_rate = 1e-4
count = 0
while count < 1000000 and loss.item() > 0.04:
    preds = modelFunc(inputs, w, b)
    loss = mse(preds, targets)
    loss.backward()
    with torch.no_grad():
        w -= learning_rate * w.grad
        b -= learning_rate * b.grad
        w.grad.zero_()
        b.grad.zero_()
    if count % 10000 == 0:
        print(f"Iteration: {count}, Loss {loss.item()}")
    count += 1
    


Iteration: 0, Loss 24602.76171875
Iteration: 10000, Loss 0.7094482779502869
Iteration: 20000, Loss 0.6653133034706116
Iteration: 30000, Loss 0.6239269375801086
Iteration: 40000, Loss 0.5851123332977295
Iteration: 50000, Loss 0.5487135648727417
Iteration: 60000, Loss 0.5145819783210754
Iteration: 70000, Loss 0.4825710952281952
Iteration: 80000, Loss 0.4525541067123413
Iteration: 90000, Loss 0.42440658807754517
Iteration: 100000, Loss 0.3980015218257904
Iteration: 110000, Loss 0.3732510209083557
Iteration: 120000, Loss 0.3500339090824127
Iteration: 130000, Loss 0.3282581865787506
Iteration: 140000, Loss 0.3078409731388092
Iteration: 150000, Loss 0.2886984348297119
Iteration: 160000, Loss 0.2707436680793762
Iteration: 170000, Loss 0.2539035677909851
Iteration: 180000, Loss 0.23811420798301697
Iteration: 190000, Loss 0.2233061045408249
Iteration: 200000, Loss 0.2094200700521469
Iteration: 210000, Loss 0.19639381766319275
Iteration: 220000, Loss 0.18419933319091797
Iteration: 230000, Loss 0

## Part 2 - Transfer Learning

In [2]:
from torchvision.datasets import MNIST, CIFAR10

# Code in file nn/two_layer_net_nn.py
import torch

device = torch.device('cpu')
# device = torch.device('cuda') # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Use the nn package to define our model as a sequence of layers. nn.Sequential
# is a Module which contains other Modules, and applies them in sequence to
# produce its output. Each Linear Module computes output from input using a
# linear function, and holds internal Tensors for its weight and bias.
# After constructing the model we use the .to() method to move it to the
# desired device.
model = torch.nn.Sequential(
          torch.nn.Linear(D_in, H),
          torch.nn.ReLU(),
          torch.nn.Linear(H, D_out),
        ).to(device)

# The nn package also contains definitions of popular loss functions; in this
# case we will use Mean Squared Error (MSE) as our loss function. Setting
# reduction='sum' means that we are computing the *sum* of squared errors rather
# than the mean; this is for consistency with the examples above where we
# manually compute the loss, but in practice it is more common to use mean
# squared error as a loss by setting reduction='elementwise_mean'.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for t in range(500):
  # Forward pass: compute predicted y by passing x to the model. Module objects
  # override the __call__ operator so you can call them like functions. When
  # doing so you pass a Tensor of input data to the Module and it produces
  # a Tensor of output data.
  y_pred = model(x)

  # Compute and print loss. We pass Tensors containing the predicted and true
  # values of y, and the loss function returns a Tensor containing the loss.
  loss = loss_fn(y_pred, y)
  print(t, loss.item())
  
  # Zero the gradients before running the backward pass.
  model.zero_grad()

  # Backward pass: compute gradient of the loss with respect to all the learnable
  # parameters of the model. Internally, the parameters of each Module are stored
  # in Tensors with requires_grad=True, so this call will compute gradients for
  # all learnable parameters in the model.
  loss.backward()

  # Update the weights using gradient descent. Each parameter is a Tensor, so
  # we can access its data and gradients like we did before.
  with torch.no_grad():
    for param in model.parameters():
      param.data -= learning_rate * param.grad

0 742.8016967773438
1 686.142578125
2 636.8507080078125
3 593.2828979492188
4 554.7335815429688
5 520.5545043945312
6 489.7171936035156
7 461.5309143066406
8 435.5517883300781
9 411.4443054199219
10 388.98382568359375
11 367.82623291015625
12 347.8412780761719
13 329.0830993652344
14 311.28863525390625
15 294.4093017578125
16 278.3799743652344
17 263.15179443359375
18 248.65359497070312
19 234.85372924804688
20 221.78561401367188
21 209.33975219726562
22 197.5129852294922
23 186.20469665527344
24 175.49124145507812
25 165.32415771484375
26 155.71221923828125
27 146.64251708984375
28 138.01995849609375
29 129.85888671875
30 122.15640258789062
31 114.8960952758789
32 108.03673553466797
33 101.56907653808594
34 95.47881317138672
35 89.73491668701172
36 84.31725311279297
37 79.21713256835938
38 74.43209838867188
39 69.92500305175781
40 65.68978881835938
41 61.71757888793945
42 57.97899627685547
43 54.474082946777344
44 51.19098663330078
45 48.10993576049805
46 45.21963882446289
47 42.51014

423 6.775365181965753e-05
424 6.594568549189717e-05
425 6.419258716050535e-05
426 6.248038698686287e-05
427 6.082204345148057e-05
428 5.9202029660809785e-05
429 5.7630848459666595e-05
430 5.6098793720593676e-05
431 5.46075934835244e-05
432 5.316117312759161e-05
433 5.174609032110311e-05
434 5.0373935664538294e-05
435 4.903993249172345e-05
436 4.7737135901115835e-05
437 4.647467358154245e-05
438 4.5241689804242924e-05
439 4.404642095323652e-05
440 4.2877749365288764e-05
441 4.174300192971714e-05
442 4.063723827130161e-05
443 3.956340151489712e-05
444 3.851810470223427e-05
445 3.7500787584576756e-05
446 3.65080704796128e-05
447 3.5547658626455814e-05
448 3.460848529357463e-05
449 3.369620753801428e-05
450 3.2808748073875904e-05
451 3.194475721102208e-05
452 3.110207399004139e-05
453 3.0281004001153633e-05
454 2.9485132472473197e-05
455 2.8706846933346242e-05
456 2.7952033633482642e-05
457 2.721598866628483e-05
458 2.649847920110915e-05
459 2.5804109100135975e-05
460 2.5124618332483806e-0

In [2]:
# Code in file autograd/tf_two_layer_net.py
import tensorflow as tf
import numpy as np

# First we set up the computational graph:

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create placeholders for the input and target data; these will be filled
# with real data when we execute the graph.
x = tf.placeholder(tf.float32, shape=(None, D_in))
y = tf.placeholder(tf.float32, shape=(None, D_out))

# Create Variables for the weights and initialize them with random data.
# A TensorFlow Variable persists its value across executions of the graph.
w1 = tf.Variable(tf.random_normal((D_in, H)))
w2 = tf.Variable(tf.random_normal((H, D_out)))

# Forward pass: Compute the predicted y using operations on TensorFlow Tensors.
# Note that this code does not actually perform any numeric operations; it
# merely sets up the computational graph that we will later execute.
h = tf.matmul(x, w1)
h_relu = tf.maximum(h, tf.zeros(1))
y_pred = tf.matmul(h_relu, w2)

# Compute loss using operations on TensorFlow Tensors
loss = tf.reduce_sum((y - y_pred) ** 2.0)

# Compute gradient of the loss with respect to w1 and w2.
grad_w1, grad_w2 = tf.gradients(loss, [w1, w2])

# Update the weights using gradient descent. To actually update the weights
# we need to evaluate new_w1 and new_w2 when executing the graph. Note that
# in TensorFlow the the act of updating the value of the weights is part of
# the computational graph; in PyTorch this happens outside the computational
# graph.
learning_rate = 1e-6
new_w1 = w1.assign(w1 - learning_rate * grad_w1)
new_w2 = w2.assign(w2 - learning_rate * grad_w2)

# Now we have built our computational graph, so we enter a TensorFlow session to
# actually execute the graph.
with tf.Session() as sess:
  # Run the graph once to initialize the Variables w1 and w2.
  sess.run(tf.global_variables_initializer())

  # Create numpy arrays holding the actual data for the inputs x and targets y
  x_value = np.random.randn(N, D_in)
  y_value = np.random.randn(N, D_out)
  for _ in range(500):
    # Execute the graph many times. Each time it executes we want to bind
    # x_value to x and y_value to y, specified with the feed_dict argument.
    # Each time we execute the graph we want to compute the values for loss,
    # new_w1, and new_w2; the values of these Tensors are returned as numpy
    # arrays.
    loss_value, _, _ = sess.run([loss, new_w1, new_w2],
                                feed_dict={x: x_value, y: y_value})
    print(loss_value)

AttributeError: module 'tensorflow' has no attribute 'placeholder'

## Part 3 - Small Research/Thinking Activity - Cost-Sensitive Learning

Suppose you are given a k-by-k cost matrix C for a classification task which
has k classes. The element Ci,j is the cost of classifying an instance of class j
as class i. You want to train a classifier that minimises the expected cost of
predictions. Do the following:
1. Write a pseudocode or Python code for a loss function that, once optimised
over all examples, achieves minimum expected cost of prediction.
2. Assuming that a learning algorithm which minimises the classification
error is given to you (and that you cannot supply your own loss function
to the algorithm), think and write about a different way of achieving
minimum expected cost of prediction.
Additional notes or requirements:
• Consider cases with k ≥ 2 and briefly discuss if the two methods are
scalable.
• You do not have to use tensors in this questions.
• If you choose to write in Python, you do not have to run your program
on any particular input. You can also use “pseudo-Python” if you wish.

### Part 3.2 - Achieving minimym expected cost of prediction
Idea: if you have a classifer with lots of data and one without, reduce the size of the data to match the smaller one.