In [1]:
# First we need to download the data 
# We will use shakespeare data
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-01-01 20:47:03--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8001::154, 2606:50c0:8003::154, 2606:50c0:8000::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8001::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-01-01 20:47:05 (1.64 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [13]:
!pip install torch torchvision torchaudio

Collecting torch
  Downloading torch-2.1.2-cp310-none-macosx_11_0_arm64.whl.metadata (25 kB)
Collecting torchvision
  Downloading torchvision-0.16.2-cp310-cp310-macosx_11_0_arm64.whl.metadata (6.6 kB)
Collecting torchaudio
  Downloading torchaudio-2.1.2-cp310-cp310-macosx_11_0_arm64.whl.metadata (6.4 kB)
Collecting filelock (from torch)
  Downloading filelock-3.13.1-py3-none-any.whl.metadata (2.8 kB)
Collecting sympy (from torch)
  Downloading sympy-1.12-py3-none-any.whl (5.7 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.7/5.7 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m:01[0m
[?25hCollecting networkx (from torch)
  Downloading networkx-3.2.1-py3-none-any.whl.metadata (5.2 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2023.12.2-py3-none-any.whl.metadata (6.8 kB)
Collecting mpmath>=0.19 (from sympy->torch)
  Downloading mpmath-1.3.0-py3-none-any.whl (536 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━

In [136]:
device = "cuda" if torch.cuda.is_available() else "cpu"
batch_size = 64 # how many independent sequences will we process in parallel?
block_size = 256 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
eval_iters = 200
n_embed = 384
n_head = 6
n_layer = 6
dropout = 0.2

In [4]:
# Read it in to inspect it 
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()
print(f"Length of dataset in characters : {len(text)}")

Length of dataset in characters : 1115394


In [5]:
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [6]:
# Getting the vocabulary- Explore all new advancements 
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(f"Vocab size is {vocab_size}")


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Vocab size is 65


In [8]:
# Tokenization
# TODO: do the survey eg. google's sentencepiece(sub-word), openai tiktoken(byte-pair)
stoi = {ch:i for i, ch in enumerate(chars)}
itos = {i:ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s] # encoder take the string, convert it to list of integers 
decode = lambda l: ''.join(itos[i] for i in l) # decoder: take the list of integers, convert into a string 

print(encode("hello there!"))
print(decode(encode("hello there!")))

[46, 43, 50, 50, 53, 1, 58, 46, 43, 56, 43, 2]
hello there!


In [14]:
# tokenize the entire dataset 
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [15]:
# let us split the data into training and validation sets 
n = int(0.9*len(data)) # using first 90% for training 
train_data = data[:n]
val_data = data[n:]

In [70]:
# In order to train the transformers,
# we can't send the entirety of the train section at once as it would be computationally prohitive
# we'll divide the training dataset into the chunks of inputs known as "sequences" or "blocks"
# Think of these sequences as time dimension or sequence-length
block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [52]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"When input is {context}, the target: {target}")
    

When input is tensor([18]), the target: 47
When input is tensor([18, 47]), the target: 56
When input is tensor([18, 47, 56]), the target: 57
When input is tensor([18, 47, 56, 57]), the target: 58
When input is tensor([18, 47, 56, 57, 58]), the target: 1
When input is tensor([18, 47, 56, 57, 58,  1]), the target: 15
When input is tensor([18, 47, 56, 57, 58,  1, 15]), the target: 47
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), the target: 58


In [53]:
# We need batching to properly utilize the training hardware and speed up the process
# 
torch.manual_seed(42)
batch_size = 4
block_size = 8

def get_batch(split):
    # generate a small batch of data of inputs X and outputs y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size, ))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y
    
xb, yb = get_batch('train')
print('inputs: ')
print(xb.shape)
print(xb)

print('targets: ')
print(yb.shape)
print(yb)

print ("--------------")
for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f"When the input is {context.tolist()}, the output is {target}")

inputs: 
torch.Size([4, 8])
tensor([[57,  1, 46, 47, 57,  1, 50, 53],
        [ 1, 58, 46, 43, 56, 43,  1, 41],
        [17, 26, 15, 17, 10,  0, 32, 53],
        [57, 58,  6,  1, 61, 47, 58, 46]])
targets: 
torch.Size([4, 8])
tensor([[ 1, 46, 47, 57,  1, 50, 53, 60],
        [58, 46, 43, 56, 43,  1, 41, 39],
        [26, 15, 17, 10,  0, 32, 53,  1],
        [58,  6,  1, 61, 47, 58, 46,  0]])
--------------
When the input is [57], the output is 1
When the input is [57, 1], the output is 46
When the input is [57, 1, 46], the output is 47
When the input is [57, 1, 46, 47], the output is 57
When the input is [57, 1, 46, 47, 57], the output is 1
When the input is [57, 1, 46, 47, 57, 1], the output is 50
When the input is [57, 1, 46, 47, 57, 1, 50], the output is 53
When the input is [57, 1, 46, 47, 57, 1, 50, 53], the output is 60
When the input is [1], the output is 58
When the input is [1, 58], the output is 46
When the input is [1, 58, 46], the output is 43
When the input is [1, 58, 46, 

In [137]:
@torch.no_grad()
def estimate_loss(model):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [138]:
class Head(torch.nn.Module):
    """One head of self attention"""
    def __init__(self, head_size):
        super().__init__()
        self.key = torch.nn.Linear(n_embed, head_size, bias=False)
        self.query = torch.nn.Linear(n_embed, head_size, bias=False)
        self.value = torch.nn.Linear(n_embed, head_size, bias=False)
        self.dropout = torch.nn.Dropout(dropout)

        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
    def forward(self, x):
        B,T,C = x.shape
        
        k = self.key(x) # (B, T, 16)
        q = self.query(x) # (B, T, 16)
        wei = q @ k.transpose(-2, -1) * C **-0.5 # (B, T, 16) x (B, 16, T) -> (B, T, T)
        
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        sei = self.dropout(wei)
        v = self.value(x)
        
        out = wei @ v
        return out
        

In [139]:
class MultiHeadAttention(torch.nn.Module):
    """
    Using multiple heads and concatenating the result 
    """
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = torch.nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = torch.nn.Linear(n_embed, n_embed)
        self.dropout = torch.nn.Dropout(dropout)
        
    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim = -1)
        out = self.dropout(self.proj(out))
        return out
        

In [140]:
class FeedForward(torch.nn.Module):
    """
    Simple linear layer followed by a simple non-linearity 
    """
    def __init__(self, n_embed):
        super().__init__()
        self.net = torch.nn.Sequential(
            torch.nn.Linear(n_embed, 4 * n_embed),
            torch.nn.ReLU(),
            torch.nn.Linear(4 * n_embed, n_embed),
            torch.nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)


In [141]:
class Block(torch.nn.Module):
    """
    Single transformer decoder block
    (Without the cross attention section obviously, because we have a decoder only network)
    """
    def __init__(self, n_embed, n_head):
        super().__init__()
        assert n_embed%n_head ==0, "The embedding size must be exactly divisible by number of heads"
        head_size = n_embed // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embed)
        self.ln1 = torch.nn.LayerNorm(n_embed)
        self.ln2 = torch.nn.LayerNorm(n_embed)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x 
        

In [142]:
# lets start with the bigram model 
import torch 
from torch.nn import functional as F
torch.manual_seed(42)

class BigramLanguageModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        
        #each token directly reads off the logits for the next token from a lookup table 
        self.token_embedding_table = torch.nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = torch.nn.Embedding(block_size, n_embed)
        # self.sa_head = Head(n_embed)
        # self.sa_heads = MultiHeadAttention(4, n_embed/4) # four heads each with head_size (output) 1/4th of the embed size
        # self.ffwd = FeedForward(n_embed)
        self.blocks = torch.nn.Sequential(*[Block(n_embed, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = torch.nn.LayerNorm(n_embed)
        # self.blocks = torch.nn.Sequential(
        #     Block(n_embed, n_head=4),
        #     Block(n_embed, n_head=4),
        #     Block(n_embed, n_head=4),
        #     torch.nn.LayerNorm(n_embed), # there needs to be a Layernorm at the end of transformer blocks as well ?
        # )

        # final decoding head, do not touch
        self.lm_head = torch.nn.Linear(n_embed, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))
        x = tok_emb + pos_emb # (B, T, C) - broadcast happens 
        # x = self.sa_head(x)
        # x = self.sa_heads(x)
        # x = self.ffwd(x)
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(-1)
    
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is the (B, T) tensor of integers 
        for _ in range(max_new_tokens):
            # crop idx to last block_size tokens 
            idx_cond = idx[:,Z -block_size:]
            # get the predictions
            logits, loss = self(idx_cond) # Logits are (B,T,C)
            # focus only on the last time step 
            logits = logits[:, -1, :]
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim = -1) # (B,C)
            # sample from the distribution
            idx_next =  torch.multinomial(probs, num_samples=1) # (B, 1)
            # append the sampled index to the running sequence 
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

m = BigramLanguageModel()
logits, loss = m(xb, yb)
print(logits.shape, loss)

idx = torch.zeros((1,1), dtype=torch.long)
# print(decode(m.generate(idx, max_new_tokens=100)[0].tolist()))

# Obviously this is gonna be gibberish because we haven't trained our model yet 


torch.Size([256, 65]) tensor(4.3782, grad_fn=<NllLossBackward0>)


In [None]:
# Create an optimizer 
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)
batch_size = 32
for step in range(10000):
    if step % eval_interval == 0:
        losses = estimate_loss(m)
        print(f"Step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    #sample a batch of training data 
    xb, yb = get_batch('train')

    # evaluate the loss 
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())
# TODO learn how to properly record the losses and trends with tensorboard

In [127]:
print(decode(m.generate(idx, max_new_tokens=1000)[0].tolist()))


H's my and mere.

Sechang dray honad,--
And throng,
Bugh we'll noo,
Nurreeing on broint the light time,
And race oven th's farering ethiany thy vity putle by any all in was no thraight us istorom the bust mastorn anture this bestrugh Yorthe Mour upon said dombny. Must way is my lades no complanzed
Wach there thenge,
Anded befe; 'Twimenjeckion.

ANGELO:
That bore nigh the queestank bust is thy you, would then
commanus grif so six''.

GRATHAM:
My rues oth say thy friend for like there;
And we to-ming were would are time?

LERRT:
Ay commind thy calousen are as night may ger brods
Comes honound fight,
unctis.

WERK:
Some, play OF WARGAMNE:
Why lieve queest shought the zell livenk at both,
Duke: paind Gloh outteng of sheas on goison.

Sy gooldfes this a king to may mittell, 'the sech, no.

MERCUTINIUS:
O ploor Henrys laster'd haspech
matimy Roughdams, 'tis have
Siddrain proses arm thrate hisee,
One.

ANTENBELLO:
Yet moves: in you of Menrad.

First trotal their to skeed his you; bear be for

In [58]:
# Mathematical trick in self attention
torch.manual_seed(42)
B,T,C = 4,8,2
x = torch.randn(B,T,C) # Batch, time, channels 
x.shape

torch.Size([4, 8, 2])

In [60]:
# xbow[b, t] is the average of all the tokens before t in batch b 
xbow = torch.zeros(B, T, C)
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1] # (t, C)
        xbow[b, t] = torch.mean(xprev, 0)

In [61]:
x[0]

tensor([[ 1.9269,  1.4873],
        [ 0.9007, -2.1055],
        [ 0.6784, -1.2345],
        [-0.0431, -1.6047],
        [-0.7521,  1.6487],
        [-0.3925, -1.4036],
        [-0.7279, -0.5594],
        [-0.7688,  0.7624]])

In [62]:
xbow[0]

tensor([[ 1.9269,  1.4873],
        [ 1.4138, -0.3091],
        [ 1.1687, -0.6176],
        [ 0.8657, -0.8644],
        [ 0.5422, -0.3617],
        [ 0.3864, -0.5354],
        [ 0.2272, -0.5388],
        [ 0.1027, -0.3762]])

The matmul trick is using lower triangular matrix instead of ones so that we can ignore the tokens following the current token in the computation process 

In [64]:
torch.manual_seed(42)
a = torch.tril(torch.ones(3,3))
b = torch.randint(0, 10, (3,2)).float()
c = a@b

print("a = \n", a)
print("-------")
print("b = \n", b)
print("-------")
print("c = \n", c)

a = 
 tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])
-------
b = 
 tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
-------
c = 
 tensor([[ 2.,  7.],
        [ 8., 11.],
        [14., 16.]])


In [66]:
# Now lets apply this trick to our xbow computation 
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim=True)
xbow2 = wei @ x # (B, T, T) @ (B, T, C) --> (B, T, C)
torch.allclose(xbow, xbow2)

True

In [69]:
# version 3: use softmax 
# The reason to do this is we are trying to formulate self attention 
# tril makes sure only the past tokens factor in the context of current token
# wei initialized to 0 here is obtained from the data to record the affinity between the tokens 
tril = torch.tril(torch.ones(T,T))
wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
torch.allclose(xbow, xbow3)

True

In [97]:
# Single head self-attention
torch.manual_seed(1337)
B,T,C = 4, 8, 32
x = torch.randn(B, T, C)

head_size = 16
key = torch.nn.Linear(C, head_size, bias=False)
query = torch.nn.Linear(C, head_size, bias=False)
value = torch.nn.Linear(C, head_size, bias=False)
k = key(x) # (B, T, 16)
q = query(x) # (B, T, 16)
wei = q @ k.transpose(-2, -1) # (B, T, 16) x (B, 16, T) -> (B, T, T)
tril = torch.tril(torch.ones(T, T))
# wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
v = value(x)

out = wei @ v
out.shape

# Note 1:
# Attention is simply a communication network. In a strictly recurrent network, 
# there are edges between consecutive tokens only. In self-attention, we expand this 
# to allow any tokens to have information flow from any of the past tokens. 
# What's better, we weigh the edges by the importance each token should be getting from each of the previous tokens 
# Note 2:
# There is no notion of space. the tokens themselves contain no information on where they are in a sequence, sentence, or paragraph.
# That's why we need to add positional embedding with token embeddings at the beginning.
# Note 3:
# There is no communication along batch dimension even though they are computed together 
# Does batchnorm change this????
# Note 4:
# In *encoder* simply change tril to ones to allow tokens from *future* to influence the token in the past
# thereby enabling the bidirectional communication. 
# Note 5:
# The self-attention just means keys, queries, and values come from the same source
# In enc-dec transformers queries comes from x, but keys and values come from whole different source, sometimes from the output of encoder blocks 
 

tensor([[[-1.7629e+00, -1.3011e+00,  5.6516e-01,  2.1616e+00, -1.0674e+00,
           1.9632e+00,  1.0765e+00, -4.5295e-01],
         [-3.3334e+00, -1.6556e+00,  1.0405e-01,  3.3782e+00, -2.1825e+00,
           1.0415e+00, -5.5714e-02,  2.9273e-01],
         [-1.0226e+00, -1.2606e+00,  7.6228e-02, -3.8125e-01, -9.8430e-01,
          -1.4303e+00,  7.4921e-02, -9.5465e-01],
         [ 7.8359e-01, -8.0143e-01, -3.3680e-01, -8.4963e-01, -5.6023e-01,
          -1.1701e+00, -1.2927e+00, -1.0260e+00],
         [-1.2566e+00,  1.8719e-02, -7.8797e-01, -1.3204e+00,  2.0363e+00,
           8.6381e-01,  3.7188e-01,  9.2577e-01],
         [-3.1262e-01,  2.4152e+00, -1.1059e-01, -9.9305e-01,  3.3449e+00,
          -2.5229e+00,  1.4187e+00,  1.2196e+00],
         [ 1.0876e+00,  1.9652e+00, -2.6213e-01, -3.1579e-01,  6.0905e-01,
           1.2616e+00, -5.4841e-01,  8.0485e-01],
         [-1.8044e+00, -4.1260e-01, -8.3061e-01,  5.8985e-01, -7.9869e-01,
          -5.8560e-01,  6.4332e-01,  6.3028e-01]],

torch.Size([4, 8, 16])

In [96]:
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)

In [None]:
Residual connections 