# Using NumPy

In [5]:
import numpy as np

# Two-layer fully connected network (linear -> ReLU -> linear) fitted to random
# data with hand-written gradient descent in NumPy.
batch_size, in_dim, hidden_dim, out_dim = 64, 1000, 100, 10

# Random inputs and regression targets.
x = np.random.randn(batch_size, in_dim)
y = np.random.randn(batch_size, out_dim)

# Randomly initialised weights for the two layers.
w1 = np.random.randn(in_dim, hidden_dim)
w2 = np.random.randn(hidden_dim, out_dim)

learning_rate = 1e-6
for t in range(500):
    # Forward pass.
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Sum of squared errors between prediction and target.
    residual = y_pred - y
    loss = np.square(residual).sum()
    print(t, loss)

    # Backward pass: chain rule applied by hand, from the loss back to w1.
    grad_y_pred = 2.0 * residual
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU passes no gradient where its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # Gradient-descent update, in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

(0, 30346765.621983021)
(1, 27528796.929087717)
(2, 27660176.46765976)
(3, 26571625.931840777)
(4, 22402830.411030799)
(5, 15974332.34572117)
(6, 9903021.788522074)
(7, 5674214.1750736879)
(8, 3273808.0383201297)
(9, 2020701.9381504485)
(10, 1373158.0146945729)
(11, 1018788.6522110815)
(12, 806210.08844596706)
(13, 664786.09872368071)
(14, 562077.31186583685)
(15, 482588.61287565983)
(16, 418506.96848381928)
(17, 365489.24416790192)
(18, 320946.67816742574)
(19, 283110.29715778178)
(20, 250695.21426017766)
(21, 222720.40103667916)
(22, 198456.78438444808)
(23, 177324.52498630324)
(24, 158867.00455420383)
(25, 142686.00412946154)
(26, 128449.76797435193)
(27, 115877.69352342954)
(28, 104749.74557612099)
(29, 94871.865332593647)
(30, 86086.84646354233)
(31, 78252.063558112946)
(32, 71240.182122840182)
(33, 64949.617924866674)
(34, 59296.214243623755)
(35, 54208.637445514614)
(36, 49622.145177021521)
(37, 45478.927960500689)
(38, 41730.234339375238)
(39, 38332.854716093454)
(40, 35248.442

(324, 0.013556657339691988)
(325, 0.01296452922693501)
(326, 0.012398260206125056)
(327, 0.011856865280760295)
(328, 0.011339324543381178)
(329, 0.010844674926899512)
(330, 0.010371385246646346)
(331, 0.0099187976279624772)
(332, 0.0094861745927643532)
(333, 0.0090724471221427759)
(334, 0.0086767490529356707)
(335, 0.0082984015871371605)
(336, 0.0079367695540459825)
(337, 0.0075908555223848581)
(338, 0.0072601262459859586)
(339, 0.006943779975207059)
(340, 0.0066413333283553734)
(341, 0.0063520759224780093)
(342, 0.0060754202116012259)
(343, 0.0058110078553697282)
(344, 0.005558343514525433)
(345, 0.0053164681507105521)
(346, 0.0050850703361263313)
(347, 0.0048637916521749418)
(348, 0.0046522168913061511)
(349, 0.0044498816986826718)
(350, 0.0042563694984661237)
(351, 0.0040713071437373802)
(352, 0.0038942987483510621)
(353, 0.0037250357783894425)
(354, 0.0035631410502548911)
(355, 0.0034082862563336543)
(356, 0.0032602076671417995)
(357, 0.0031186098473333472)
(358, 0.0029831254911136

# Using Tensors

In [8]:
import torch

# Two-layer network (linear -> ReLU -> linear) trained with manually derived
# gradients, using torch tensors in place of NumPy arrays.
dtype = torch.float
# Prefer the first CUDA device, but fall back to the CPU so the cell also runs
# on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

batch_size, in_dim, hidden_dim, out_dim = 64, 1000, 100, 10

# Random inputs and regression targets.
x = torch.randn(batch_size, in_dim, device=device, dtype=dtype)
y = torch.randn(batch_size, out_dim, device=device, dtype=dtype)

# Randomly initialised weights for the two layers.
w1 = torch.randn(in_dim, hidden_dim, device=device, dtype=dtype)
w2 = torch.randn(hidden_dim, out_dim, device=device, dtype=dtype)

# Set the step size here rather than relying on a value left in the kernel by
# another cell (same value as the NumPy example).
learning_rate = 1e-6
for t in range(500):
    # Forward pass.
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)

    # Sum of squared errors, converted to a Python number for printing.
    loss = (y_pred - y).pow(2).sum().item()
    print(t, loss)

    # Backward pass: chain rule applied by hand, from the loss back to w1.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0  # ReLU passes no gradient where its input was negative
    grad_w1 = x.t().mm(grad_h)

    # Gradient-descent update, in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

(0, 37231640.0)
(1, 33843512.0)
(2, 32377314.0)
(3, 28075042.0)
(4, 20323622.0)
(5, 12474194.0)
(6, 6913551.0)
(7, 3865556.75)
(8, 2350301.5)
(9, 1598494.0)
(10, 1192875.25)
(11, 948017.3125)
(12, 781926.0)
(13, 659126.9375)
(14, 563227.25)
(15, 485777.03125)
(16, 421830.28125)
(17, 368368.21875)
(18, 323192.625)
(19, 284737.78125)
(20, 251821.171875)
(21, 223484.53125)
(22, 198940.59375)
(23, 177572.59375)
(24, 158911.296875)
(25, 142574.390625)
(26, 128208.109375)
(27, 115532.0546875)
(28, 104316.1015625)
(29, 94365.5859375)
(30, 85540.859375)
(31, 77661.7890625)
(32, 70620.328125)
(33, 64315.828125)
(34, 58662.38671875)
(35, 53588.6875)
(36, 49013.50390625)
(37, 44891.4453125)
(38, 41161.1953125)
(39, 37782.10546875)
(40, 34715.83203125)
(41, 31928.52734375)
(42, 29393.912109375)
(43, 27086.28515625)
(44, 24981.212890625)
(45, 23057.9609375)
(46, 21298.1171875)
(47, 19689.296875)
(48, 18215.7578125)
(49, 16863.38671875)
(50, 15621.322265625)
(51, 14480.14453125)
(52, 13431.430664062

(431, 0.00022342789452522993)
(432, 0.00021795392967760563)
(433, 0.0002134047244908288)
(434, 0.00020799392950721085)
(435, 0.00020326378580648452)
(436, 0.0001991498575080186)
(437, 0.00019393957336433232)
(438, 0.00019049318507313728)
(439, 0.0001868521503638476)
(440, 0.00018208172696176916)
(441, 0.00017870446026790887)
(442, 0.00017522249254398048)
(443, 0.00017044956621248275)
(444, 0.00016761431470513344)
(445, 0.00016440801846329123)
(446, 0.00016117074119392782)
(447, 0.0001574320631334558)
(448, 0.00015432345389854163)
(449, 0.00015178618195932359)
(450, 0.0001482305524405092)
(451, 0.00014535643276758492)
(452, 0.00014252385881263763)
(453, 0.00013970016152597964)
(454, 0.00013725823373533785)
(455, 0.0001347455254290253)
(456, 0.00013210033648647368)
(457, 0.00012952068937011063)
(458, 0.00012684729881584644)
(459, 0.00012482795864343643)
(460, 0.0001223530271090567)
(461, 0.00012030984362354502)
(462, 0.00011838111822726205)
(463, 0.00011610313958954066)
(464, 0.000113889

In [10]:
import torch

# Same two-layer network, but with gradients computed by autograd instead of
# by hand.
dtype = torch.float
# Prefer the first CUDA device, but fall back to the CPU so the cell also runs
# on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

batch_size, in_dim, hidden_dim, out_dim = 64, 1000, 100, 10

# Random inputs and regression targets; no gradients are needed for these.
x = torch.randn(batch_size, in_dim, device=device, dtype=dtype)
y = torch.randn(batch_size, out_dim, device=device, dtype=dtype)

# requires_grad=True makes autograd track operations on the weights so that
# loss.backward() fills in w1.grad and w2.grad.
w1 = torch.randn(in_dim, hidden_dim, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(hidden_dim, out_dim, device=device, dtype=dtype, requires_grad=True)

# Set the step size here rather than relying on a value left in the kernel by
# another cell (same value as the NumPy example).
learning_rate = 1e-6

for t in range(500):
    # Forward pass: linear -> ReLU (clamp at 0) -> linear.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)
    # Sum of squared errors, kept as a tensor so it can be back-propagated.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())
    loss.backward()
    # The update itself must not be recorded by autograd.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        # backward() accumulates into .grad, so reset it for the next step.
        w1.grad.zero_()
        w2.grad.zero_()

(0, 35193776.0)
(1, 30790712.0)
(2, 28676862.0)
(3, 24784416.0)
(4, 18645538.0)
(5, 12128247.0)
(6, 7218638.0)
(7, 4227169.5)
(8, 2612540.75)
(9, 1758789.875)
(10, 1289622.25)
(11, 1008121.0625)
(12, 822126.8125)
(13, 688418.4375)
(14, 586074.5625)
(15, 504609.9375)
(16, 437900.5)
(17, 382394.09375)
(18, 335714.3125)
(19, 295989.09375)
(20, 261986.328125)
(21, 232727.34375)
(22, 207390.890625)
(23, 185341.6875)
(24, 166085.65625)
(25, 149189.46875)
(26, 134321.71875)
(27, 121184.4296875)
(28, 109540.5859375)
(29, 99193.265625)
(30, 89975.2734375)
(31, 81745.0234375)
(32, 74386.4921875)
(33, 67785.8984375)
(34, 61851.97265625)
(35, 56504.10546875)
(36, 51678.4140625)
(37, 47318.00390625)
(38, 43370.4765625)
(39, 39790.15625)
(40, 36539.90234375)
(41, 33585.13671875)
(42, 30895.984375)
(43, 28444.419921875)
(44, 26207.849609375)
(45, 24165.712890625)
(46, 22303.4296875)
(47, 20598.580078125)
(48, 19036.34765625)
(49, 17603.724609375)
(50, 16288.390625)
(51, 15079.8955078125)
(52, 13968.6

(391, 0.00031801106524653733)
(392, 0.00030974240507930517)
(393, 0.00030186231015250087)
(394, 0.0002949918562080711)
(395, 0.00028714380459859967)
(396, 0.00028000236488878727)
(397, 0.00027338473591953516)
(398, 0.00026658212300390005)
(399, 0.0002598441205918789)
(400, 0.0002533080114517361)
(401, 0.00024718474014662206)
(402, 0.00024095755361486226)
(403, 0.00023496084031648934)
(404, 0.00022964300296735018)
(405, 0.00022405190975405276)
(406, 0.00021840805129613727)
(407, 0.00021399446995928884)
(408, 0.00020923782722093165)
(409, 0.00020495813805609941)
(410, 0.0002003757399506867)
(411, 0.00019600984524004161)
(412, 0.00019159844669047743)
(413, 0.00018687402189243585)
(414, 0.0001839596952777356)
(415, 0.00017970566113945097)
(416, 0.00017546387971378863)
(417, 0.00017190459766425192)
(418, 0.00016815125127322972)
(419, 0.00016462230996694416)
(420, 0.00016069023695308715)
(421, 0.0001580353855388239)
(422, 0.0001545690029161051)
(423, 0.00015208066906780005)
(424, 0.000148616

# The nn module in PyTorch

In [11]:
import torch

# Same two-layer network, now built from torch.nn layers instead of raw weight
# tensors.
batch_size, in_dim, hidden_dim, out_dim = 64, 1000, 100, 10

# Random inputs and regression targets.
x = torch.randn(batch_size, in_dim)
y = torch.randn(batch_size, out_dim)

# linear -> ReLU -> linear; each Linear layer holds its own weight and bias.
model = torch.nn.Sequential(
        torch.nn.Linear(in_dim, hidden_dim),
        torch.nn.ReLU(),
        torch.nn.Linear(hidden_dim, out_dim))
# reduction='sum' gives the summed squared error; it replaces the deprecated
# size_average=False argument.
loss_func = torch.nn.MSELoss(reduction='sum')
learning_rate = 1e-4
for t in range(500):
    y_pred = model(x)
    loss = loss_func(y_pred, y)
    print(t, loss.item())
    # Clear gradients left over from the previous step before back-propagating.
    model.zero_grad()
    loss.backward()
    # Plain gradient-descent step on every parameter, not tracked by autograd.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

(0, 668.6790161132812)
(1, 621.2335205078125)
(2, 580.0513916015625)
(3, 543.7630004882812)
(4, 511.7993469238281)
(5, 483.0644836425781)
(6, 456.6650390625)
(7, 432.0572814941406)
(8, 409.314697265625)
(9, 388.319580078125)
(10, 368.7706604003906)
(11, 350.1435241699219)
(12, 332.4015808105469)
(13, 315.62677001953125)
(14, 299.68365478515625)
(15, 284.5004577636719)
(16, 270.0298767089844)
(17, 256.1666564941406)
(18, 242.90594482421875)
(19, 230.219482421875)
(20, 218.09829711914062)
(21, 206.50244140625)
(22, 195.40171813964844)
(23, 184.68804931640625)
(24, 174.4719696044922)
(25, 164.73341369628906)
(26, 155.42881774902344)
(27, 146.59317016601562)
(28, 138.2058563232422)
(29, 130.2658233642578)
(30, 122.7441635131836)
(31, 115.6112289428711)
(32, 108.87203216552734)
(33, 102.49501037597656)
(34, 96.43890380859375)
(35, 90.70771789550781)
(36, 85.29005432128906)
(37, 80.17744445800781)
(38, 75.37023162841797)
(39, 70.8411636352539)
(40, 66.57575225830078)
(41, 62.554073333740234)

(379, 0.0009813044453039765)
(380, 0.000957754033152014)
(381, 0.0009347869781777263)
(382, 0.0009123848867602646)
(383, 0.0008905156282708049)
(384, 0.0008692105766385794)
(385, 0.0008484175777994096)
(386, 0.0008281099726445973)
(387, 0.0008083191351033747)
(388, 0.0007890223641879857)
(389, 0.0007701974827796221)
(390, 0.000751811487134546)
(391, 0.000733872118871659)
(392, 0.0007163730333559215)
(393, 0.0006993016577325761)
(394, 0.0006826472235843539)
(395, 0.0006663870881311595)
(396, 0.0006505261408165097)
(397, 0.0006350568728521466)
(398, 0.0006199560593813658)
(399, 0.0006052230019122362)
(400, 0.0005908519378863275)
(401, 0.0005768249393440783)
(402, 0.0005631278618238866)
(403, 0.0005497708334587514)
(404, 0.000536739535164088)
(405, 0.0005240158061496913)
(406, 0.0005115962121635675)
(407, 0.0004994833143427968)
(408, 0.0004876558086834848)
(409, 0.0004761159361805767)
(410, 0.00046484541962854564)
(411, 0.0004538604407571256)
(412, 0.00044313495163805783)
(413, 0.00043266

In [13]:
# Export this notebook to a plain Python script (introducing_Tensors.py).
! jupyter nbconvert --to=python introducing_Tensors.ipynb

[NbConvertApp] Converting notebook introducing_Tensors.ipynb to python
[NbConvertApp] Writing 3094 bytes to introducing_Tensors.py
