Musing 1: Create Conv1D Layer and test its equivalency to torch implementation

In [695]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm

In [671]:
#Creating example input
in_channels = 3
sample_input = torch.randn((32, in_channels, 10))
print(sample_input.shape)

torch.Size([32, 3, 10])


In [672]:
#Hyperparameters used to compare torch and custom implementation
out_channels = 6
kernel_size = 3
stride = 4
padding = 1
dilation = 2

In [677]:
#For the purposes of this implementation, ignoring: groups > 2 and padding_mode != zeros
torch_implementation = nn.Conv1d(in_channels, out_channels, kernel_size, stride = stride, padding = padding, dilation = dilation, bias = True)
print(torch_implementation(sample_input).shape)
print(torch_implementation(sample_input).sum())

torch.Size([32, 6, 2])
tensor(4.0717, grad_fn=<SumBackward0>)


In [694]:
print(torch_implementation.weight.shape)

torch.Size([6, 3, 3])


In [678]:
print(torch_implementation.bias)

Parameter containing:
tensor([-0.0237,  0.0176,  0.0527, -0.0591, -0.0552,  0.3033],
       requires_grad=True)


In [679]:
class My_Conv1d:
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation):
        self.filters = torch.randn((out_channels, in_channels, kernel_size))
        self.biases = torch.randn((out_channels, 1))

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size #size of filter
        self.stride = stride #amount filter slides
        self.padding = padding 
        self.dilation = dilation #space between individual weights in filter

    def __call__(self, input):
        #Output Length formula from PyTorch documentation
        output_length = int((((input.shape[-1]) + (2*self.padding) - self.dilation * (self.kernel_size - 1) - 1)/self.stride) + 1)
        output = torch.zeros((input.shape[0], self.out_channels, output_length)) #N, OC, L
        if self.padding != 0:
            #Will add the desired amount of zeros to both sides (didnt implement one-side padding)
            input = torch.cat([torch.zeros((input.shape[0], self.in_channels, self.padding)), input, torch.zeros((input.shape[0], self.in_channels, self.padding))], dim = 2)
        
        if self.dilation != 1:
            #Dilation = N, adds N zeros between all weights, but first and last weight do not get appending or trailing zero
            dilation_kernel_size = self.kernel_size + (self.dilation-1)*(self.kernel_size-1)
            dilation_filters = torch.zeros(self.out_channels, self.in_channels, dilation_kernel_size)
            dilation_filters[:, :, torch.arange(0, dilation_kernel_size, self.dilation)] = self.filters

        for out_channel in range(self.out_channels):
            for in_channel in range(self.in_channels):
                #out_idx corresponds to output matrix, and in_idx corresponds to input matrix
                for out_idx, in_idx in enumerate(range(0, input.shape[-1], self.stride)):
                    if out_idx == output_length: #Terminates instances in which filter only partially covers input
                        break
                    if self.dilation !=1:
                        output[:, out_channel, out_idx] += (input[:, in_channel, in_idx:in_idx+(dilation_kernel_size)]*dilation_filters[out_channel, in_channel, :]).sum(1)
                    else:
                        # += is needed since for each output channel, need to do element wise mul and sum using a unique filter for each input channel
                        #                                                 N, 1, kernel_size                            1, 1, kernel_size
                        output[:, out_channel, out_idx] += (input[:, in_channel, in_idx:in_idx+self.kernel_size]*self.filters[out_channel, in_channel, :]).sum(1)
        output += self.biases
        return output
    
    def parameters(self):
        return [self.filters, self.biases]

In [688]:
my_implementation = My_Conv1d(in_channels, out_channels, kernel_size, stride = stride, padding = padding, dilation = dilation)
print(my_implementation(sample_input).shape)

torch.Size([32, 6, 2])


In [689]:
print(my_implementation(sample_input).sum())

tensor(1.7867)


In [690]:
print(my_implementation(sample_input).sum() == torch_implementation(sample_input).sum())

tensor(False)


In [691]:
#To check if operations are equivlanet assign the params used in the torch implementation to custom implementation
#then check if sums of outputs match

my_implementation.filters = torch.tensor(list(torch_implementation.parameters())[0])
my_implementation.biases = torch.tensor(list(torch_implementation.parameters())[1]).view(-1,1)

  my_implementation.filters = torch.tensor(list(torch_implementation.parameters())[0])
  my_implementation.biases = torch.tensor(list(torch_implementation.parameters())[1]).view(-1,1)


In [692]:
print(my_implementation(sample_input).sum() == torch_implementation(sample_input).sum())

#Thus my_implementation is equivlant to PyTorch

tensor(True)


An interesting idea is that, in the context of autoregressive character-level models, convolutions allow for more efficient training of linear layers relative to traditional MLPs, as do not need to break single sentences into multiple training examples (instead can train all at once. For more information:  https://www.kilians.net/post/convolution-in-autoregressive-neural-networks/


Musing 2: Exploit Normal Conv Layers to train 8-Context autoregressive language model and show it is equivalent to MLP generated in the lecture. Also try out dilated Conv layers which progressively fuse the input

In [398]:
#Creating Dataset
words = open("video_2_dependencies/names.txt").read().splitlines()
chars = ["."] + sorted(list(set("".join(words)))) + [";"]
#Notice, I added ";" as a potential char, it will be used as a STOP token, further explanation in create_datasets line
stoi = {char:idx for idx, char in enumerate(chars)}
itos = {idx: char for char, idx in stoi.items()}

In [399]:
print(len(words))

32033


To create the dataset for conv layers such that it can be trained more efficiently, the batched input sentences must be of equivalent size. However, this is not the case, as sentences differ in size. To remedy this, the function below indentifies the sentence of longest length and pads the remaining sentences to be
of equal length with STOP ";" token. I opted to add this new token instead of using "." token, such that model learns the presence of "." means to predict an actual letter and ";" means to predict no more letters and instead just continue to predict more ";" stop tokens. If I used only "." for both start and stop, it risks the model predicting letters after the previous output predicted stop

In [528]:
def create_dataset(words, block_size = 8, max_len = 0):
    X = []
    Y = []
    for word in words:
        context = [0]*block_size
        for ch in word:
            context.append(stoi[ch])
        X.append(context)
        Y.append(0)
        if len(context) > max_len:
            max_len = len(context)

    for idx, x in tqdm(enumerate(X)):
        if len(x) == max_len:
            print(idx)
        X[idx] = x + [27]*(max_len - len(x))
        Y[idx] = X[idx][block_size:] + [27]
    
    print("Creating Tensors")
    X = torch.tensor(X).view(len(words), max_len)
    Y = torch.tensor(Y)
    return X,Y

In [403]:
import random
random.seed(42)
random.shuffle(words)

In [529]:
n1 = int(len(words)*0.8)
n2 = int(len(words)*0.9)

X_tr, Y_tr = create_dataset(words[:n1])
X_val, Y_val = create_dataset(words[n1:n2], 8, X_tr.shape[-1])
X_te, Y_te = create_dataset(words[n2:], 8, X_tr.shape[-1])

25626it [00:00, 969286.71it/s]


17026
23789
Creating Tensors


3203it [00:00, 1072260.81it/s]


Creating Tensors


3204it [00:00, 1109065.78it/s]

Creating Tensors





In [530]:
print(X_tr.shape, X_val.shape, X_te.shape)

torch.Size([25626, 23]) torch.Size([3203, 23]) torch.Size([3204, 23])


In [531]:
X_tr

tensor([[ 0,  0,  0,  ..., 27, 27, 27],
        [ 0,  0,  0,  ..., 27, 27, 27],
        [ 0,  0,  0,  ..., 27, 27, 27],
        ...,
        [ 0,  0,  0,  ..., 27, 27, 27],
        [ 0,  0,  0,  ..., 27, 27, 27],
        [ 0,  0,  0,  ..., 27, 27, 27]])

In [532]:
X_tr[17026], Y_tr[17026]

(tensor([ 0,  0,  0,  0,  0,  0,  0,  0, 13, 21,  8,  1, 13, 13,  1,  4, 13, 21,
         19, 20,  1,  6,  1]),
 tensor([13, 21,  8,  1, 13, 13,  1,  4, 13, 21, 19, 20,  1,  6,  1, 27]))

In [533]:
print(Y_tr.shape, Y_val.shape, Y_te.shape)

torch.Size([25626, 16]) torch.Size([3203, 16]) torch.Size([3204, 16])


In [534]:
Y_tr[0]

tensor([25, 21,  8,  5, 14,  7, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27])

In [633]:
class CNN(nn.Module):
    def __init__(self, vocab_size, block_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, 10)
        #Will have output of N, vocab_size, 16 (16 determined by using max_len = 23 and blcok_size = 8; euqal to Y target size)
        #Treat as a multi-class multi-output problem
        #Will have vocab_size output channel, where we can softmax over to determine highest prob next char and compute loss
        #Since output will have 16 multi-class outputs simultaneously, it is a multi-output problem

        #Need stride = 10, since each letters embedding is 10-d, and want filters to slide over indiv. chars only
        self.convolution = nn.Conv1d(1, vocab_size, 10*block_size, 10)

    def __call__(self, input):
        x = self.embedding(input)
        #Will create N, 1, 80 (block_size * embedding -> 8*10) input
        x = x.view(input.shape[0], 1, -1)
        output = self.convolution(x)
        return output


In [634]:
cnn_model = CNN(len(chars), 8)

In [637]:
iterations = 20000
optimizer = torch.optim.SGD(cnn_model.parameters(), lr = 0.1)
for i in tqdm(range(iterations)):
    batch_idx = torch.randint(0, len(X_tr), (32,))
    logits = cnn_model(X_tr[batch_idx])
    loss = F.cross_entropy(logits, Y_tr[batch_idx])
    if i == 15000:
        optimizer = torch.optim.SGD(cnn_model.parameters(), lr = 0.01)
    optimizer.zero_grad()
    loss.backward()
    if i % 1000 == 0:
        print(loss)
    optimizer.step()


  2%|▏         | 308/20000 [00:00<00:12, 1546.56it/s]

tensor(3.4435, grad_fn=<NllLoss2DBackward0>)


  6%|▋         | 1275/20000 [00:00<00:11, 1596.77it/s]

tensor(1.1061, grad_fn=<NllLoss2DBackward0>)


 11%|█         | 2230/20000 [00:01<00:11, 1573.76it/s]

tensor(1.1053, grad_fn=<NllLoss2DBackward0>)


 16%|█▌        | 3177/20000 [00:02<00:10, 1564.97it/s]

tensor(1.0793, grad_fn=<NllLoss2DBackward0>)


 21%|██▏       | 4283/20000 [00:02<00:10, 1530.43it/s]

tensor(1.0954, grad_fn=<NllLoss2DBackward0>)


 26%|██▌       | 5213/20000 [00:03<00:09, 1535.63it/s]

tensor(1.0500, grad_fn=<NllLoss2DBackward0>)


 32%|███▏      | 6301/20000 [00:04<00:08, 1540.72it/s]

tensor(0.9798, grad_fn=<NllLoss2DBackward0>)


 36%|███▌      | 7238/20000 [00:04<00:08, 1539.58it/s]

tensor(0.9851, grad_fn=<NllLoss2DBackward0>)


 41%|████      | 8169/20000 [00:05<00:07, 1543.00it/s]

tensor(1.0715, grad_fn=<NllLoss2DBackward0>)


 46%|████▌     | 9246/20000 [00:05<00:07, 1508.03it/s]

tensor(1.0896, grad_fn=<NllLoss2DBackward0>)


 51%|█████     | 10166/20000 [00:06<00:06, 1517.06it/s]

tensor(1.0234, grad_fn=<NllLoss2DBackward0>)


 56%|█████▌    | 11244/20000 [00:07<00:05, 1519.15it/s]

tensor(0.9675, grad_fn=<NllLoss2DBackward0>)


 61%|██████    | 12165/20000 [00:07<00:05, 1528.08it/s]

tensor(1.0069, grad_fn=<NllLoss2DBackward0>)


 66%|██████▌   | 13233/20000 [00:08<00:04, 1446.68it/s]

tensor(1.0089, grad_fn=<NllLoss2DBackward0>)


 71%|███████   | 14152/20000 [00:09<00:04, 1409.28it/s]

tensor(1.0958, grad_fn=<NllLoss2DBackward0>)


 76%|███████▌  | 15217/20000 [00:09<00:03, 1470.12it/s]

tensor(1.0172, grad_fn=<NllLoss2DBackward0>)


 81%|████████▏ | 16285/20000 [00:10<00:02, 1521.19it/s]

tensor(0.9229, grad_fn=<NllLoss2DBackward0>)


 86%|████████▌ | 17192/20000 [00:11<00:02, 1397.66it/s]

tensor(1.0363, grad_fn=<NllLoss2DBackward0>)


 91%|█████████▏| 18260/20000 [00:12<00:01, 1509.01it/s]

tensor(1.0427, grad_fn=<NllLoss2DBackward0>)


 96%|█████████▌| 19168/20000 [00:12<00:00, 1505.93it/s]

tensor(0.9861, grad_fn=<NllLoss2DBackward0>)


100%|██████████| 20000/20000 [00:13<00:00, 1518.98it/s]


In [638]:
with torch.no_grad():
    train_logits = cnn_model(X_tr)
    train_loss = F.cross_entropy(train_logits, Y_tr)
    print(train_loss)

tensor(1.0138)


In [639]:
with torch.no_grad():
    val_logits = cnn_model(X_val)
    val_loss = F.cross_entropy(val_logits, Y_val)
    print(val_loss)

tensor(1.0075)


You will notice the loss are much smaller then the lexture loss, this is because through padding the inputs, some portions of the sentence are "a ; ; ; ; ; ;" in which the model probably easily learned to predict ";". inflating perceived performance

In [584]:
#Converting datasets used in Conv to function in normal MLP

def create_mlp_datasets(dataset_X, dataset_Y, block_size = 8):
    X = []
    Y = []
    for instance_x, instance_y in zip(dataset_X, dataset_Y):
        instance_x = instance_x.view(-1).tolist()
        for chr_idx in range(0, len(instance_x) - block_size + 1):
            X.append(instance_x[chr_idx:chr_idx+block_size])
            Y.append(instance_y[chr_idx])
    return torch.tensor(X), torch.tensor(Y)

In [585]:
mlp_X_tr, mlp_Y_tr = create_mlp_datasets(X_tr, Y_tr, block_size = 8)
mlp_X_val, mlp_Y_val = create_mlp_datasets(X_val, Y_val, block_size = 8)

In [586]:
print(mlp_X_tr.shape)

torch.Size([410016, 8])


In [578]:
mlp_X_tr[:10]

tensor([[ 0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0,  0, 25],
        [ 0,  0,  0,  0,  0,  0, 25, 21],
        [ 0,  0,  0,  0,  0, 25, 21,  8],
        [ 0,  0,  0,  0, 25, 21,  8,  5],
        [ 0,  0,  0, 25, 21,  8,  5, 14],
        [ 0,  0, 25, 21,  8,  5, 14,  7],
        [ 0, 25, 21,  8,  5, 14,  7, 27],
        [25, 21,  8,  5, 14,  7, 27, 27],
        [21,  8,  5, 14,  7, 27, 27, 27]])

In [472]:
mlp_Y_tr

tensor([25, 21,  8,  ..., 27, 27, 27])

In [559]:
class MLP(nn.Module):
    def __init__(self, vocab_size, block_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, 10)
        self.linear = nn.Linear(block_size*10, vocab_size)
    
    def __call__(self, input):
        x = self.embedding(input)
        x = x.view(input.shape[0], -1)
        output = self.linear(x)
        return output

In [697]:
model = MLP(len(chars), 8 )

Showing how trained parameters used in Conv can be used in simple MLP and achieve the same performance

In [698]:
model.embedding.weight.data = list(cnn_model.embedding.parameters())[0]

In [699]:
model.linear.weight.data = list(cnn_model.convolution.parameters())[0].reshape(len(chars), 80)

In [700]:
model.linear.bias.data = list(cnn_model.convolution.parameters())[1].reshape(28)

In [701]:
with torch.no_grad():
    train_logits = model(mlp_X_tr)
    train_loss = F.cross_entropy(train_logits, mlp_Y_tr)
    print(train_loss)

tensor(1.0138)


In [702]:
with torch.no_grad():
    train_logits = model(mlp_X_val)
    train_loss = F.cross_entropy(train_logits, mlp_Y_val)
    print(train_loss)

tensor(1.0075)


We get the exact same train and val performance, proving in autoregressive setting, can make CONV == MLP

Now lets used a dilated CNN to progressively fuse the inputs (similiar to lecture)

In [724]:
class dilated_CNN(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, 10)
        self.convolution = nn.Sequential(
        nn.Conv1d(1, 100, 10*2, stride = 10), nn.Tanh(),  #Embeddings are 10-dim per char, so filter of 20 looks at 2 chars, we use stride of 10 to slide over each char (which is needed for the parallel multi-output training for each time step)
        nn.Conv1d(100, 100, 2, stride = 1, dilation = 2), nn.Tanh(), #Each bigram char has length 1 and channel dim 100, a filter of 2 would consider a 4-gram. We use a dilation factor of 2, such that two 2-grams composed of 4 different characters are considered. If no dilation, would consider a bigram like abcd: "ab bc" but since dilation = 2 ensures the filter weight skips 1 time step, uses "ab cd" instead. We need stride to = 1 for a similiar reason as above
        nn.Conv1d(100, vocab_size, 2, stride = 1, dilation = 4), #Each fourgram has length 1 and channel dim of 100, a filter of 2 would consider a 8-gram. Use dilation = 4, such that two fourgrams composed of 8 different chars/positions are considered
        )

    def __call__(self, input):
        x = self.embedding(input)
        x = x.view(input.shape[0], 1, -1)
        output = self.convolution(x)
        return output

In [725]:
dilated_cnn_model = dilated_CNN(len(chars))

In [726]:
dilated_cnn_model(X_tr[:10]).shape

torch.Size([10, 28, 16])

In [727]:
iterations = 20000
optimizer = torch.optim.SGD(dilated_cnn_model.parameters(), lr = 0.1)
for i in tqdm(range(iterations)):
    batch_idx = torch.randint(0, len(X_tr), (32,))
    logits = dilated_cnn_model(X_tr[batch_idx])
    loss = F.cross_entropy(logits, Y_tr[batch_idx])
    if i == 15000:
        optimizer = torch.optim.SGD(dilated_cnn_model.parameters(), lr = 0.01)
    optimizer.zero_grad()
    loss.backward()
    if i % 1000 == 0:
        print(loss)
    optimizer.step()

  0%|          | 12/20000 [00:00<05:49, 57.22it/s]

tensor(3.2740, grad_fn=<NllLoss2DBackward0>)


  5%|▌         | 1010/20000 [00:20<05:38, 56.05it/s]

tensor(1.0191, grad_fn=<NllLoss2DBackward0>)


 10%|█         | 2011/20000 [00:38<05:51, 51.15it/s]

tensor(0.9787, grad_fn=<NllLoss2DBackward0>)


 15%|█▌        | 3012/20000 [00:57<05:10, 54.65it/s]

tensor(1.0505, grad_fn=<NllLoss2DBackward0>)


 20%|██        | 4008/20000 [01:16<04:47, 55.57it/s]

tensor(0.9523, grad_fn=<NllLoss2DBackward0>)


 25%|██▌       | 5011/20000 [01:34<04:38, 53.91it/s]

tensor(0.9466, grad_fn=<NllLoss2DBackward0>)


 30%|███       | 6011/20000 [01:53<04:12, 55.32it/s]

tensor(1.0167, grad_fn=<NllLoss2DBackward0>)


 35%|███▌      | 7010/20000 [02:11<04:18, 50.27it/s]

tensor(1.0019, grad_fn=<NllLoss2DBackward0>)


 40%|████      | 8010/20000 [02:30<03:27, 57.82it/s]

tensor(0.9970, grad_fn=<NllLoss2DBackward0>)


 45%|████▌     | 9008/20000 [02:47<03:10, 57.67it/s]

tensor(1.0279, grad_fn=<NllLoss2DBackward0>)


 50%|█████     | 10007/20000 [03:05<02:52, 57.81it/s]

tensor(0.9725, grad_fn=<NllLoss2DBackward0>)


 55%|█████▌    | 11012/20000 [03:23<02:32, 58.77it/s]

tensor(0.9704, grad_fn=<NllLoss2DBackward0>)


 60%|██████    | 12016/20000 [03:40<01:33, 85.09it/s]

tensor(0.9152, grad_fn=<NllLoss2DBackward0>)


 65%|██████▌   | 13013/20000 [03:52<01:30, 77.59it/s]

tensor(1.0233, grad_fn=<NllLoss2DBackward0>)


 70%|███████   | 14010/20000 [04:05<01:17, 77.31it/s]

tensor(1.0073, grad_fn=<NllLoss2DBackward0>)


 75%|███████▌  | 15016/20000 [04:18<01:04, 77.47it/s]

tensor(0.9321, grad_fn=<NllLoss2DBackward0>)


 80%|████████  | 16017/20000 [04:31<00:48, 82.95it/s]

tensor(0.9201, grad_fn=<NllLoss2DBackward0>)


 85%|████████▌ | 17009/20000 [04:43<00:36, 82.77it/s]

tensor(0.8832, grad_fn=<NllLoss2DBackward0>)


 90%|█████████ | 18017/20000 [04:55<00:24, 81.50it/s]

tensor(0.8821, grad_fn=<NllLoss2DBackward0>)


 95%|█████████▌| 19012/20000 [05:08<00:12, 79.48it/s]

tensor(0.9268, grad_fn=<NllLoss2DBackward0>)


100%|██████████| 20000/20000 [05:21<00:00, 62.25it/s]


In [728]:
with torch.no_grad():
    train_logits = dilated_cnn_model(X_tr)
    train_loss = F.cross_entropy(train_logits, Y_tr)
    print(train_loss)

tensor(0.9333)


In [729]:
with torch.no_grad():
    val_logits = dilated_cnn_model(X_val)
    val_loss = F.cross_entropy(val_logits, Y_val)
    print(val_loss)

tensor(0.9351)


Note: We used much more parameters and non-linear functions, so its not an apt comparison to the simple Conv Network, but anyhow, I am getting nice performance gains through this dilated network