# MLP 

In [1]:
import torch

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# for reproducibility
torch.manual_seed(777)
if device == 'cuda':
    torch.cuda.manual_seed_all(777)

In [3]:
X = torch.FloatTensor([[0, 0], [0, 1], [1, 0], [1, 1]]).to(device)
Y = torch.FloatTensor([[0], [1], [1], [0]]).to(device)

In [4]:
w1 = torch.Tensor(2, 2).to(device)
b1 = torch.Tensor(2).to(device)
w2 = torch.Tensor(2, 1).to(device)
b2 = torch.Tensor(1).to(device)

In [5]:
def sigmoid(x):
  return 1.0 / (1.0 + torch.exp(-x))

In [6]:
def sigmoid_prime(x):
  return sigmoid(x) * (1 - sigmoid(x))

In [22]:
learning_rate = 10

for step in range(10000):
  # Forward
  l1 = torch.add(torch.matmul(X, w1), b1)
  a1 = sigmoid(l1)
  l2 = torch.add(torch.matmul(a1, w2), b2)
  Y_pred = sigmoid(l2)

  cost = -torch.mean(Y * torch.log(Y_pred) + (1-Y) * torch.log(1-Y_pred))

  # Backpropagation
  d_Y_pred = (Y_pred - Y) / (Y_pred * (1.0 - Y_pred))

  d_l2 = d_Y_pred * sigmoid_prime(l2)
  d_b2 = d_l2
  d_w2 = torch.matmul(torch.transpose(a1, 0, 1), d_b2)
  d_a1 = torch.matmul(d_b2, torch.transpose(w2, 0, 1))
  d_l1 = d_a1 * sigmoid_prime(l1)
  d_b1 = d_l1
  d_w1 = torch.matmul(torch.transpose(X, 0, 1), d_b1)

  w1 = w1 - learning_rate * d_w1
  b1 = b1 - learning_rate * torch.mean(d_b1, 0)
  w2 = w2 - learning_rate * d_w2
  b2 = b2 - learning_rate * torch.mean(d_b2, 0)

  if step % 100 == 0:
    print(step, cost.item())

0 0.6931471824645996
100 0.6931471824645996
200 0.6931471824645996
300 0.6931471824645996
400 0.6931471824645996
500 0.6931471824645996
600 0.6931471824645996
700 0.6931471824645996
800 0.6931471824645996
900 0.6931471824645996
1000 0.6931471824645996
1100 0.6931471824645996
1200 0.6931471824645996
1300 0.6931471824645996
1400 0.6931471824645996
1500 0.6931471824645996
1600 0.6931471824645996
1700 0.6931471824645996
1800 0.6931471824645996
1900 0.6931471824645996
2000 0.6931471824645996
2100 0.6931471824645996
2200 0.6931471824645996
2300 0.6931471824645996
2400 0.6931471824645996
2500 0.6931471824645996
2600 0.6931471824645996
2700 0.6931471824645996
2800 0.6931471824645996
2900 0.6931471824645996
3000 0.6931471824645996
3100 0.6931471824645996
3200 0.6931471824645996
3300 0.6931471824645996
3400 0.6931471824645996
3500 0.6931471824645996
3600 0.6931471824645996
3700 0.6931471824645996
3800 0.6931471824645996
3900 0.6931471824645996
4000 0.6931471824645996
4100 0.6931471824645996
4200

# XOR NN

In [None]:
linear1 = torch.nn.Linear(2, 2, bias=True)
linear2 = torch.nn.Linear(2, 1, bias=True)
sigmoid = torch.nn.Sigmoid()

In [None]:
model = torch.nn.Sequential(linear1, sigmoid, linear2, sigmoid).to(device)

In [None]:
criterion = torch.nn.BCELoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=1) 

In [None]:
for step in range(10001):
    optimizer.zero_grad()
    hypothesis = model(X)

    # cost/loss function
    cost = criterion(hypothesis, Y)
    cost.backward()
    optimizer.step()

    if step % 100 == 0:
        print(step, cost.item())

0 0.7434073090553284
100 0.693165123462677
200 0.6931577920913696
300 0.6931517124176025
400 0.6931463479995728
500 0.6931411027908325
600 0.6931357383728027
700 0.6931295394897461
800 0.6931220889091492
900 0.6931126117706299
1000 0.6930999755859375
1100 0.693082332611084
1200 0.6930569410324097
1300 0.6930191516876221
1400 0.6929606199264526
1500 0.6928660273551941
1600 0.6927032470703125
1700 0.6923960447311401
1800 0.6917302012443542
1900 0.6899654269218445
2000 0.6838315725326538
2100 0.6561667323112488
2200 0.43110138177871704
2300 0.13489334285259247
2400 0.06630435585975647
2500 0.042168181389570236
2600 0.030453868210315704
2700 0.023665912449359894
2800 0.01927776448428631
2900 0.016224022954702377
3000 0.013983809389173985
3100 0.012273931875824928
3200 0.010928118601441383
3300 0.009842473082244396
3400 0.008949032984673977
3500 0.008201321586966515
3600 0.007566752843558788
3700 0.007021686062216759
3800 0.006548595614731312
3900 0.006134253926575184
4000 0.005768344737589

# XOR deep

In [None]:
linear1 = torch.nn.Linear(2, 10, bias=True)
linear2 = torch.nn.Linear(10, 10, bias=True)
linear3 = torch.nn.Linear(10, 10, bias=True)
linear4 = torch.nn.Linear(10, 1, bias=True)
sigmoid = torch.nn.Sigmoid()

In [None]:
model = torch.nn.Sequential(linear1, sigmoid, linear2, sigmoid, linear3, sigmoid, linear4, sigmoid).to(device)

In [None]:
criterion = torch.nn.BCELoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=1)

In [None]:
for step in range(10001):
    optimizer.zero_grad()
    hypothesis = model(X)

    # cost/loss function
    cost = criterion(hypothesis, Y)
    cost.backward()
    optimizer.step()

    if step % 100 == 0:
        print(step, cost.item())

0 0.6978083848953247
100 0.6931105256080627
200 0.6931020021438599
300 0.6930921673774719
400 0.6930808424949646
500 0.6930676698684692
600 0.6930519938468933
700 0.69303297996521
800 0.6930099129676819
900 0.6929808855056763
1000 0.6929439902305603
1100 0.6928958296775818
1200 0.6928311586380005
1300 0.6927415132522583
1400 0.6926121115684509
1500 0.692415714263916
1600 0.6920979619026184
1700 0.6915386915206909
1800 0.6904336214065552
1900 0.6878381371498108
2000 0.6797102689743042
2100 0.6408315300941467
2200 0.5488754510879517
2300 0.5083184242248535
2400 0.487520694732666
2500 0.49485543370246887
2600 0.02016819454729557
2700 0.007714720442891121
2800 0.004507085308432579
2900 0.0031063584610819817
3000 0.0023384878877550364
3100 0.0018596660811454058
3200 0.0015350303146988153
3300 0.001301785814575851
3400 0.0011267174268141389
3500 0.000990909757092595
3600 0.0008827425772324204
3700 0.0007947051781229675
3800 0.0007217370439320803
3900 0.0006604348309338093
4000 0.000608202069