# Seminar 5 & 6

by Hekmat Taherinejad

# Recurrent neural networks (RNN)

In [None]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms

In [None]:
# Global training configuration shared by the MNIST recurrent models below.
batch_size = 100   # samples per mini-batch
n_iters = 3000     # total number of optimisation steps budgeted per training run
num_classes = 10   # number of target classes



In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# MNIST train / test splits, each image converted to a tensor.
# Only the training split requests a download into ./data.
train_dataset = torchvision.datasets.MNIST(
    root='./data', train=True, transform=transforms.ToTensor(), download=True)
test_dataset = torchvision.datasets.MNIST(
    root='./data', train=False, transform=transforms.ToTensor())

# Mini-batch iterators: the training loader shuffles, the test loader keeps
# the dataset order.
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False)


Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 74168834.89it/s]


Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 56658416.19it/s]

Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz



100%|██████████| 1648877/1648877 [00:00<00:00, 27897120.28it/s]


Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 4613835.98it/s]

Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw






In [None]:
# `train_data` is a deprecated alias that emits a warning; `data` is the raw image tensor.
print(train_dataset.data.size())


torch.Size([60000, 28, 28])




In [None]:
# `train_labels` is a deprecated alias that emits a warning; `targets` is the label tensor.
print(train_dataset.targets.size())

torch.Size([60000])




In [None]:
# `test_data` is a deprecated alias that emits a warning; `data` is the raw image tensor.
print(test_dataset.data.size())

torch.Size([10000, 28, 28])




In [None]:
# `test_labels` is a deprecated alias that emits a warning; `targets` is the label tensor.
print(test_dataset.targets.size())

torch.Size([10000])




<img src="https://raw.githubusercontent.com/ritchieng/deep-learning-wizard/dc6fb5ccfaf6ca4760f673c2384330d5b2069bf2/docs/deep_learning/practical_pytorch/images/rnn4n.png" alt="Deep Recurrent Neural Networks">

In [None]:
# Convert the iteration budget into whole epochs: one epoch takes
# len(train_dataset) / batch_size optimisation steps.
num_epochs = int(n_iters / (len(train_dataset) / batch_size))
learning_rate = 0.001

# Sequence geometry and model width for the recurrent models below.
sequence_length = 28   # time steps per sample
input_size = 28        # features per time step
hidden_size = 128      # width of the recurrent hidden state
num_layers = 1         # number of stacked recurrent layers

In [None]:
class RNN(nn.Module):
    """Many-to-one recurrent classifier built on ``nn.RNN``.

    The whole sequence is run through the recurrent stack; the top-layer
    output at the last time step is mapped to class scores by a linear layer.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the recurrent hidden state.
        num_layers: Number of stacked recurrent layers.
        num_classes: Number of output classes.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()

        # Hidden dimensions
        self.hidden_size = hidden_size

        # Number of hidden layers
        self.num_layers = num_layers

        # batch_first=True causes input/output tensors to be of shape:
        # (batch_dim, seq_dim, input_dim) -> x needs to be: (batch_size, seq, input_size)
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)

        # Fully connected readout layer
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Classify a batch of sequences.

        Args:
            x: Tensor of shape (batch_size, seq_length, input_size).

        Returns:
            Tensor of shape (batch_size, num_classes) with unnormalised scores.
        """
        # Initial hidden state of zeros: (num_layers, batch_size, hidden_size).
        # new_zeros allocates directly on x's device with x's dtype, so no
        # CPU->GPU copy is needed and non-float32 inputs keep working.
        h0 = x.new_zeros(self.num_layers, x.size(0), self.hidden_size)

        # Forward propagate RNN
        # out: (batch_size, seq_length, hidden_size)
        out, _ = self.rnn(x, h0)

        # Decode the hidden state of the last time step
        # out: (batch_size, hidden_size)
        out = out[:, -1, :]

        # out: (batch_size, num_classes)
        out = self.fc(out)
        return out


In [None]:
model = RNN(input_size, hidden_size, num_layers, num_classes).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
model.train()  # make the training mode explicit
n_total_steps = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # origin shape: [N, 1, 28, 28]
        # resized: [N, 28, 28]
        images = images.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % 100 == 0:
            print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')

# Test the model
# Switch to evaluation mode; gradients are not needed here (memory efficiency)
model.eval()
with torch.no_grad():
    n_correct = 0
    n_samples = 0
    for images, labels in test_loader:
        images = images.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)
        outputs = model(images)
        # index of the highest score per sample is the predicted class
        predicted = outputs.argmax(dim=1)
        n_samples += labels.size(0)
        n_correct += (predicted == labels).sum().item()

    acc = 100.0 * n_correct / n_samples
    print(f'Accuracy of the network on the 10000 test images: {acc} %')

Epoch [1/5], Step [100/600], Loss: 1.2213
Epoch [1/5], Step [200/600], Loss: 1.2950
Epoch [1/5], Step [300/600], Loss: 0.9491
Epoch [1/5], Step [400/600], Loss: 0.8533
Epoch [1/5], Step [500/600], Loss: 1.1332
Epoch [1/5], Step [600/600], Loss: 0.5480
Epoch [2/5], Step [100/600], Loss: 0.5500
Epoch [2/5], Step [200/600], Loss: 0.4251
Epoch [2/5], Step [300/600], Loss: 0.4668
Epoch [2/5], Step [400/600], Loss: 0.2962
Epoch [2/5], Step [500/600], Loss: 0.3938
Epoch [2/5], Step [600/600], Loss: 0.3181
Epoch [3/5], Step [100/600], Loss: 0.2425
Epoch [3/5], Step [200/600], Loss: 0.2255
Epoch [3/5], Step [300/600], Loss: 0.3977
Epoch [3/5], Step [400/600], Loss: 0.3861
Epoch [3/5], Step [500/600], Loss: 0.2767
Epoch [3/5], Step [600/600], Loss: 0.2349
Epoch [4/5], Step [100/600], Loss: 0.2043
Epoch [4/5], Step [200/600], Loss: 0.4115
Epoch [4/5], Step [300/600], Loss: 0.2341
Epoch [4/5], Step [400/600], Loss: 0.1383
Epoch [4/5], Step [500/600], Loss: 0.2944
Epoch [4/5], Step [600/600], Loss:

### Deep RNN

<img src="https://raw.githubusercontent.com/ritchieng/deep-learning-wizard/dc6fb5ccfaf6ca4760f673c2384330d5b2069bf2/docs/deep_learning/practical_pytorch/images/rnn6.png" alt="Deep Recurrent Neural Networks">

In [11]:
# Deep RNN experiment: use two recurrent layers instead of one.
# This value is passed to the model constructor in the next cell.
num_layers = 2

In [12]:
model = RNN(input_size, hidden_size, num_layers, num_classes).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
model.train()  # make the training mode explicit
n_total_steps = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # origin shape: [N, 1, 28, 28]
        # resized: [N, 28, 28]
        images = images.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % 100 == 0:
            print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')

# Test the model
# Switch to evaluation mode; gradients are not needed here (memory efficiency)
model.eval()
with torch.no_grad():
    n_correct = 0
    n_samples = 0
    for images, labels in test_loader:
        images = images.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)
        outputs = model(images)
        # index of the highest score per sample is the predicted class
        predicted = outputs.argmax(dim=1)
        n_samples += labels.size(0)
        n_correct += (predicted == labels).sum().item()

    acc = 100.0 * n_correct / n_samples
    print(f'Accuracy of the network on the 10000 test images: {acc} %')

Epoch [1/5], Step [100/600], Loss: 1.1134
Epoch [1/5], Step [200/600], Loss: 0.9162
Epoch [1/5], Step [300/600], Loss: 0.5617
Epoch [1/5], Step [400/600], Loss: 0.3885
Epoch [1/5], Step [500/600], Loss: 0.4364
Epoch [1/5], Step [600/600], Loss: 0.3897
Epoch [2/5], Step [100/600], Loss: 0.3577
Epoch [2/5], Step [200/600], Loss: 0.1870
Epoch [2/5], Step [300/600], Loss: 0.1890
Epoch [2/5], Step [400/600], Loss: 0.1543
Epoch [2/5], Step [500/600], Loss: 0.1740
Epoch [2/5], Step [600/600], Loss: 0.2211
Epoch [3/5], Step [100/600], Loss: 0.2751
Epoch [3/5], Step [200/600], Loss: 0.2018
Epoch [3/5], Step [300/600], Loss: 0.2035
Epoch [3/5], Step [400/600], Loss: 0.1073
Epoch [3/5], Step [500/600], Loss: 0.0875
Epoch [3/5], Step [600/600], Loss: 0.2582
Epoch [4/5], Step [100/600], Loss: 0.2509
Epoch [4/5], Step [200/600], Loss: 0.1978
Epoch [4/5], Step [300/600], Loss: 0.0949
Epoch [4/5], Step [400/600], Loss: 0.2005
Epoch [4/5], Step [500/600], Loss: 0.0550
Epoch [4/5], Step [600/600], Loss:

### Bidirectional RNN
<img src="https://media.geeksforgeeks.org/wp-content/uploads/20230302163012/Bidirectional-Recurrent-Neural-Network-2.png" alt="Bidirectional Recurrent Neural Networks" width="600">

In [18]:
num_layers = 1


class BiRNN(nn.Module):
    """Many-to-one bidirectional RNN classifier.

    The sequence is read left-to-right and right-to-left by ``nn.RNN``; the
    final state of each direction is concatenated and mapped to class scores.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the hidden state of each direction.
        num_layers: Number of stacked recurrent layers.
        num_classes: Number of output classes.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(BiRNN, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size

        # bidirectional=True adds a second RNN that reads the sequence backwards
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_size * 2, num_classes)  # Multiply by 2 for bidirectional

    def forward(self, x):
        """Classify a batch of sequences.

        Args:
            x: Tensor of shape (batch_size, seq_length, input_size).

        Returns:
            Tensor of shape (batch_size, num_classes) with unnormalised scores.
        """
        # One initial state per layer and direction:
        # (num_layers * 2, batch_size, hidden_size), on x's device and dtype.
        h0 = x.new_zeros(self.num_layers * 2, x.size(0), self.hidden_size)

        # out: (batch_size, seq_length, 2 * hidden_size); the first half of the
        # feature axis is the forward direction, the second half the backward one.
        out, _ = self.rnn(x, h0)

        # The forward direction has seen the whole sequence at the LAST step,
        # the backward direction at the FIRST step. Taking out[:, -1, :] for
        # both would use a backward state that has seen one time step only.
        forward_final = out[:, -1, :self.hidden_size]
        backward_final = out[:, 0, self.hidden_size:]

        out = self.fc(torch.cat((forward_final, backward_final), dim=1))
        return out


In [19]:
model = BiRNN(input_size, hidden_size, num_layers, num_classes).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
model.train()  # make the training mode explicit
n_total_steps = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # origin shape: [N, 1, 28, 28]
        # resized: [N, 28, 28]
        images = images.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % 100 == 0:
            print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')

# Test the model
# Switch to evaluation mode; gradients are not needed here (memory efficiency)
model.eval()
with torch.no_grad():
    n_correct = 0
    n_samples = 0
    for images, labels in test_loader:
        images = images.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)
        outputs = model(images)
        # index of the highest score per sample is the predicted class
        predicted = outputs.argmax(dim=1)
        n_samples += labels.size(0)
        n_correct += (predicted == labels).sum().item()

    acc = 100.0 * n_correct / n_samples
    print(f'Accuracy of the network on the 10000 test images: {acc} %')

Epoch [1/5], Step [100/600], Loss: 1.2801
Epoch [1/5], Step [200/600], Loss: 1.0338
Epoch [1/5], Step [300/600], Loss: 0.8040
Epoch [1/5], Step [400/600], Loss: 0.8008
Epoch [1/5], Step [500/600], Loss: 0.7756
Epoch [1/5], Step [600/600], Loss: 0.6269
Epoch [2/5], Step [100/600], Loss: 0.5818
Epoch [2/5], Step [200/600], Loss: 0.3815
Epoch [2/5], Step [300/600], Loss: 0.3869
Epoch [2/5], Step [400/600], Loss: 0.4511
Epoch [2/5], Step [500/600], Loss: 0.3980
Epoch [2/5], Step [600/600], Loss: 0.3471
Epoch [3/5], Step [100/600], Loss: 0.4219
Epoch [3/5], Step [200/600], Loss: 0.3822
Epoch [3/5], Step [300/600], Loss: 0.2486
Epoch [3/5], Step [400/600], Loss: 0.3476
Epoch [3/5], Step [500/600], Loss: 0.3780
Epoch [3/5], Step [600/600], Loss: 0.2647
Epoch [4/5], Step [100/600], Loss: 0.2536
Epoch [4/5], Step [200/600], Loss: 0.2840
Epoch [4/5], Step [300/600], Loss: 0.4722
Epoch [4/5], Step [400/600], Loss: 0.2085
Epoch [4/5], Step [500/600], Loss: 0.1898
Epoch [4/5], Step [600/600], Loss:

For more information:
[deeplearningwizard](https://www.deeplearningwizard.com/deep_learning/practical_pytorch/pytorch_recurrent_neuralnetwork/)


# LSTM

<img src="https://colah.github.io/posts/2015-08-Understanding-LSTMs/img/LSTM3-chain.png" alt="LSTM" width="600">



<img src="https://www.deeplearningwizard.com/deep_learning/practical_pytorch/images/lstm2.png" alt="LSTM" width="800">


In [15]:
class LSTM(nn.Module):
    """Many-to-one LSTM classifier.

    The whole sequence is run through ``nn.LSTM``; the top-layer output at the
    last time step is mapped to class scores by a linear readout layer.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the hidden and cell states.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output classes.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(LSTM, self).__init__()

        # Hidden dimensions
        self.hidden_size = hidden_size

        # Number of hidden layers
        self.num_layers = num_layers

        # batch_first=True causes input/output tensors to be of shape
        # (batch_dim, seq_dim, feature_dim)
        # -> x needs to be: (batch_size, seq, input_size)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)

        # Readout layer
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Classify a batch of sequences.

        Args:
            x: Tensor of shape (batch_size, seq_length, input_size).

        Returns:
            Tensor of shape (batch_size, num_classes) with unnormalised scores.
        """
        # Initial hidden and cell states of zeros, each of shape
        # (num_layers, batch_size, hidden_size). new_zeros allocates on x's
        # device with x's dtype, avoiding a CPU->GPU copy and dtype mismatches.
        h0 = x.new_zeros(self.num_layers, x.size(0), self.hidden_size)
        c0 = x.new_zeros(self.num_layers, x.size(0), self.hidden_size)

        # Forward propagate LSTM
        # out: (batch_size, seq_length, hidden_size)
        out, _ = self.lstm(x, (h0, c0))

        # Decode the hidden state of the last time step
        # out: (batch_size, hidden_size)
        out = out[:, -1, :]

        # out: (batch_size, num_classes)
        out = self.fc(out)
        return out


model = LSTM(input_size, hidden_size, num_layers, num_classes).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
model.train()  # make the training mode explicit
n_total_steps = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # origin shape: [N, 1, 28, 28]
        # resized: [N, 28, 28]
        images = images.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % 100 == 0:
            print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')

# Test the model
# Switch to evaluation mode; gradients are not needed here (memory efficiency)
model.eval()
with torch.no_grad():
    n_correct = 0
    n_samples = 0
    for images, labels in test_loader:
        images = images.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)
        outputs = model(images)
        # index of the highest score per sample is the predicted class
        predicted = outputs.argmax(dim=1)
        n_samples += labels.size(0)
        n_correct += (predicted == labels).sum().item()

    acc = 100.0 * n_correct / n_samples
    print(f'Accuracy of the network on the 10000 test images: {acc} %')

Epoch [1/5], Step [100/600], Loss: 0.7685
Epoch [1/5], Step [200/600], Loss: 0.5941
Epoch [1/5], Step [300/600], Loss: 0.2579
Epoch [1/5], Step [400/600], Loss: 0.4308
Epoch [1/5], Step [500/600], Loss: 0.3272
Epoch [1/5], Step [600/600], Loss: 0.1775
Epoch [2/5], Step [100/600], Loss: 0.1761
Epoch [2/5], Step [200/600], Loss: 0.1087
Epoch [2/5], Step [300/600], Loss: 0.1787
Epoch [2/5], Step [400/600], Loss: 0.2940
Epoch [2/5], Step [500/600], Loss: 0.1122
Epoch [2/5], Step [600/600], Loss: 0.1067
Epoch [3/5], Step [100/600], Loss: 0.0744
Epoch [3/5], Step [200/600], Loss: 0.0936
Epoch [3/5], Step [300/600], Loss: 0.0784
Epoch [3/5], Step [400/600], Loss: 0.1471
Epoch [3/5], Step [500/600], Loss: 0.0537
Epoch [3/5], Step [600/600], Loss: 0.0688
Epoch [4/5], Step [100/600], Loss: 0.0291
Epoch [4/5], Step [200/600], Loss: 0.0859
Epoch [4/5], Step [300/600], Loss: 0.0745
Epoch [4/5], Step [400/600], Loss: 0.0880
Epoch [4/5], Step [500/600], Loss: 0.0455
Epoch [4/5], Step [600/600], Loss:

### Bidirectional LSTM

<img src="https://www.baeldung.com/wp-content/uploads/sites/4/2022/01/bilstm-1-1024x384.png" alt="BiLSTM" width="600">


In [16]:
class BiLSTM(nn.Module):
    """Many-to-one bidirectional LSTM classifier.

    The sequence is read in both directions by ``nn.LSTM``; the final state of
    each direction is concatenated and mapped to class scores.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the hidden/cell state of each direction.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output classes.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(BiLSTM, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size

        # Initialize bidirectional LSTM
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, bidirectional=True, batch_first=True)

        # Fully connected layer
        self.fc = nn.Linear(hidden_size * 2, num_classes)  # Multiply by 2 for bidirectional

    def forward(self, x):
        """Classify a batch of sequences.

        Args:
            x: Tensor of shape (batch_size, seq_length, input_size).

        Returns:
            Tensor of shape (batch_size, num_classes) with unnormalised scores.
        """
        # Initial hidden and cell states, one per layer and direction:
        # (num_layers * num_directions, batch_size, hidden_size),
        # allocated on x's device with x's dtype.
        h0 = x.new_zeros(self.num_layers * 2, x.size(0), self.hidden_size)
        c0 = x.new_zeros(self.num_layers * 2, x.size(0), self.hidden_size)

        # Forward propagate LSTM
        # out: (batch_size, seq_length, hidden_size * 2); the first half of the
        # feature axis is the forward direction, the second half the backward one.
        out, _ = self.lstm(x, (h0, c0))

        # The forward direction has seen the whole sequence at the LAST step,
        # the backward direction at the FIRST step. Taking out[:, -1, :] for
        # both would use a backward state that has seen one time step only.
        forward_final = out[:, -1, :self.hidden_size]
        backward_final = out[:, 0, self.hidden_size:]

        # out: (batch_size, num_classes)
        out = self.fc(torch.cat((forward_final, backward_final), dim=1))
        return out


In [17]:
model = BiLSTM(input_size, hidden_size, num_layers, num_classes).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
model.train()  # make the training mode explicit
n_total_steps = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # origin shape: [N, 1, 28, 28]
        # resized: [N, 28, 28]
        images = images.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % 100 == 0:
            print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')

# Test the model
# Switch to evaluation mode; gradients are not needed here (memory efficiency)
model.eval()
with torch.no_grad():
    n_correct = 0
    n_samples = 0
    for images, labels in test_loader:
        images = images.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)
        outputs = model(images)
        # index of the highest score per sample is the predicted class
        predicted = outputs.argmax(dim=1)
        n_samples += labels.size(0)
        n_correct += (predicted == labels).sum().item()

    acc = 100.0 * n_correct / n_samples
    print(f'Accuracy of the network on the 10000 test images: {acc} %')

Epoch [1/5], Step [100/600], Loss: 0.7727
Epoch [1/5], Step [200/600], Loss: 0.3035
Epoch [1/5], Step [300/600], Loss: 0.3172
Epoch [1/5], Step [400/600], Loss: 0.3497
Epoch [1/5], Step [500/600], Loss: 0.1562
Epoch [1/5], Step [600/600], Loss: 0.2453
Epoch [2/5], Step [100/600], Loss: 0.1293
Epoch [2/5], Step [200/600], Loss: 0.3081
Epoch [2/5], Step [300/600], Loss: 0.0880
Epoch [2/5], Step [400/600], Loss: 0.1361
Epoch [2/5], Step [500/600], Loss: 0.0611
Epoch [2/5], Step [600/600], Loss: 0.1257
Epoch [3/5], Step [100/600], Loss: 0.0849
Epoch [3/5], Step [200/600], Loss: 0.1246
Epoch [3/5], Step [300/600], Loss: 0.1182
Epoch [3/5], Step [400/600], Loss: 0.0985
Epoch [3/5], Step [500/600], Loss: 0.1464
Epoch [3/5], Step [600/600], Loss: 0.0697
Epoch [4/5], Step [100/600], Loss: 0.1079
Epoch [4/5], Step [200/600], Loss: 0.0199
Epoch [4/5], Step [300/600], Loss: 0.1342
Epoch [4/5], Step [400/600], Loss: 0.1825
Epoch [4/5], Step [500/600], Loss: 0.0611
Epoch [4/5], Step [600/600], Loss:

For more information:
[deeplearningwizard](https://www.deeplearningwizard.com/deep_learning/practical_pytorch/pytorch_lstm_neuralnetwork/)

# Building a GPT


In [None]:
# Download the tiny shakespeare dataset; the next cell reads it back as input.txt
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [None]:
# Load the whole corpus into memory as one string so it can be inspected
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [None]:
# Report the corpus size, counted in characters
print("length of dataset in characters: ", len(text))

In [None]:
# Preview the first 1000 characters of the corpus
print(text[:1000])

In [None]:
# Vocabulary: every distinct character that occurs in the text, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)

In [None]:
# Character-level tokenizer: two lookup tables built from the vocabulary
stoi = {ch: i for i, ch in enumerate(chars)}   # character -> integer id
itos = dict(enumerate(chars))                  # integer id -> character


def encode(s):
    """Encoder: take a string, output a list of integers."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integers, output a string."""
    return ''.join([itos[i] for i in l])


print(encode("hii there"))
print(decode(encode("hii there")))

In [None]:
# Encode the entire text dataset and store it in a torch.Tensor of integer token ids
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000]) # the 1000 characters we looked at earlier will look like this to the GPT

In [None]:
# Hold out the last 10% of the encoded data for validation; train on the first 90%
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]

In [None]:
# Context length used in the examples below. Display block_size + 1 tokens:
# a chunk of that length yields block_size (context, next-token) pairs.
block_size = 8
train_data[:block_size+1]

In [None]:
# Inputs are the first block_size tokens; targets are the same window shifted by one
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    # every prefix of x is a training context whose target is the token that follows it
    context, target = x[:t+1], y[t]
    print(f"when input is {context} the target: {target}")

In [None]:
torch.manual_seed(1337)
batch_size = 4 # number of independent sequences processed in parallel
block_size = 8 # maximum context length for predictions


def get_batch(split):
    """Sample a random mini-batch of inputs and next-token targets.

    Args:
        split: 'train' selects the training data; any other value selects
            the validation data.

    Returns:
        Tuple (x, y) of tensors, each stacking batch_size windows of
        block_size tokens; y is x shifted one position to the right.
    """
    source = train_data if split == 'train' else val_data
    # random start offsets, chosen so that the shifted target window still fits
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s+block_size] for s in starts])
    targets = torch.stack([source[s+1:s+block_size+1] for s in starts])
    return inputs, targets


xb, yb = get_batch('train')
for title, batch in (('inputs:', xb), ('targets:', yb)):
    print(title)
    print(batch.shape)
    print(batch)

print('----')

# spell out every (context, target) pair contained in the batch
for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context, target = xb[b, :t+1], yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

In [None]:
print(xb) # our input to the model: the batch of token ids sampled by get_batch above

### N-gram Language Models

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)


class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits are looked up from the current token alone."""

    def __init__(self, vocab_size):
        super().__init__()
        # row i of this table holds the logits for the token that follows token i
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B,T) tensor of integer token ids.
            targets: Optional (B,T) tensor of integer token ids.

        Returns:
            (logits, loss). Without targets, logits is (B,T,C) and loss is
            None. With targets, logits is flattened to (B*T,C) and loss is the
            cross-entropy against the flattened targets.
        """
        logits = self.token_embedding_table(idx) # (B,T,C)
        if targets is None:
            return logits, None

        batch, time, channels = logits.shape
        # F.cross_entropy expects (N, C) scores and (N,) class indices
        logits = logits.view(batch * time, channels)
        loss = F.cross_entropy(logits, targets.view(batch * time))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend idx by max_new_tokens sampled tokens.

        Args:
            idx: (B, T) tensor of token ids forming the current context.
            max_new_tokens: Number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor of token ids.
        """
        for _ in range(max_new_tokens):
            logits, _loss = self(idx)
            # only the last time step is used to predict the next token
            last_step_logits = logits[:, -1, :] # (B, C)
            probs = F.softmax(last_step_logits, dim=-1) # (B, C)
            # draw one token per sequence from the predicted distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

# Untrained model: check the logits shape and the initial loss on one batch
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

# Sample 100 tokens starting from a single token with id 0 and decode them to text
start_tokens = torch.zeros((1, 1), dtype=torch.long)
sampled_ids = m.generate(idx = start_tokens, max_new_tokens=100)
print(decode(sampled_ids[0].tolist()))


In [None]:
 # create a PyTorch AdamW optimizer over the bigram model's parameters (learning rate 1e-3)
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [None]:
batch_size = 32  # get_batch reads this module-level value, so batches now hold 32 sequences
for steps in range(100): # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    # clear old gradients, backpropagate, then apply one optimizer update
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# loss of the final mini-batch only
print(loss.item())

In [None]:
# Sample 500 new tokens, starting from a single token with id 0, and decode them to text
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))



### The mathematical trick in self-attention

In [None]:
# consider the following toy example:
# a random (B, T, C) tensor used by the aggregation demos in the following cells

torch.manual_seed(1337)
B,T,C = 4,8,2 # batch, time, channels
x = torch.randn(B,T,C)
x.shape

In [None]:
# toy example: multiplying by a row-normalised lower-triangular matrix
# computes running averages ("weighted aggregation") of the rows of b
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
a = a / a.sum(dim=1, keepdim=True)  # each row now sums to 1
b = torch.randint(0,10,(3,2)).float()
c = a @ b
# print the three matrices, separated by '--' lines
print('a=', a, '--', 'b=', b, '--', 'c=', c, sep='\n')

In [None]:
# version 2: using matrix multiply for a weighted aggregation
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim=True)  # row t puts equal weight on positions 0..t
xbow2 = wei @ x # (T, T) broadcast over the batch: (B, T, T) @ (B, T, C) ----> (B, T, C)

In [None]:
# version 3: use Softmax
tril = torch.tril(torch.ones(T, T))
# start from all-zero scores and set every position above the diagonal to -inf;
# after softmax those entries get weight 0, so a token never draws on later positions
wei = torch.zeros((T,T)).masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
# same result as the explicit averaging of version 2
torch.allclose(xbow2, xbow3)

In [None]:
# version 4: self-attention!
torch.manual_seed(1337)
B,T,C = 4,8,32 # batch, time, channels
x = torch.randn(B,T,C)

# a single Head performing self-attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k, q = key(x), query(x) # each (B, T, 16)

# data-dependent affinities replace the all-zero scores of version 3
wei = torch.matmul(q, k.transpose(-2, -1)) # (B, T, 16) @ (B, 16, T) ---> (B, T, T)

# causal mask followed by a row-wise softmax
tril = torch.tril(torch.ones(T, T))
wei = F.softmax(wei.masked_fill(tril == 0, float('-inf')), dim=-1)

# aggregate the projected values rather than the raw inputs x
v = value(x)
out = torch.matmul(wei, v)

out.shape

In [None]:
# (T, T) attention weights of the first sequence in the batch; row t holds the weights used by position t
wei[0]

In [None]:
# softmax of small-magnitude scores: the result stays fairly diffuse
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim=-1)

In [None]:
# the same scores multiplied by 8: softmax sharpens towards the largest entry
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])*8, dim=-1)

In [None]:
torch.manual_seed(1337)
B,T,C = 4,8,32 # batch, time, channels
x = torch.randn(B,T,C)

# let's see a single Head perform scaled self-attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x)   # (B, T, 16)
q = query(x) # (B, T, 16)
# Scale by 1/sqrt(d_k), where d_k = head_size is the dimension of the query/key
# vectors being dotted (not the input channel count C). This keeps the scores
# at unit variance so the softmax below does not saturate.
wei = q @ k.transpose(-2,-1) * head_size**-0.5 # (B, T, 16) @ (B, 16, T) ---> (B, T, T)

# causal mask: positions above the diagonal get -inf, i.e. weight 0 after softmax
tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)

# aggregate the projected values (not the raw inputs x)
v = value(x)
out = wei @ v

out.shape


In [None]:
# (T, T) attention weights of the first sequence in the batch, now computed from scaled scores
wei[0]

Notes:
- Attention is a **communication mechanism**. Can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.

* sentence 1: The bank1 of the river.
* sentence 2: Money in the bank2.

![alt text](https://files.readme.io/298afce-image.png)

![alt text](https://files.readme.io/5f8c5fb-image.png)

- There is no notion of space. Attention simply acts over a set of vectors. This is why we need to positionally encode tokens.
- Examples across the batch dimension are, of course, processed completely independently and never "talk" to each other.
- In an "encoder" attention block just delete the single line that does masking with `tril`, allowing all tokens to communicate. This block here is called a "decoder" attention block because it has triangular masking, and is usually used in autoregressive settings, like language modeling.
- "self-attention" just means that the keys and values are produced from the same source as queries. In "cross-attention", the queries still get produced from x, but the keys and values come from some other, external source (e.g. an encoder module)
- "Scaled" attention additionally multiplies `wei` by 1/sqrt(head_size). This makes it so that when the inputs Q, K have unit variance, `wei` has unit variance too, so Softmax stays diffuse and does not saturate too much.

## Attention
An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors.  The output is computed as a weighted sum of the values, where the weight assigned to each value is computed by a compatibility function of the query with the corresponding key.

We call our particular attention "Scaled Dot-Product Attention".   The input consists of queries and keys of dimension $d_k$, and values of dimension $d_v$.  We compute the dot products of the query with all keys, divide each by $\sqrt{d_k}$, and apply a softmax function to obtain the weights on the values.


![alt text](https://raw.githubusercontent.com/harvardnlp/annotated-transformer/master/images/ModalNet-19.png)


In practice, we compute the attention function on a set of queries simultaneously, packed together into a matrix $Q$.   The keys and values are also packed together into matrices $K$ and $V$.  We compute the matrix of outputs as:

$$
   \mathrm{Attention}(Q, K, V) = \mathrm{softmax}(\frac{QK^T}{\sqrt{d_k}})V
$$

![alt text](https://lena-voita.github.io/resources/lectures/seq2seq/transformer/qkv_explained-min.png)
Source: [Lena Voita's Lecture about Seq2Seq](https://lena-voita.github.io/nlp_course/seq2seq_and_attention.html)

In [25]:
import torch
from torch import nn

class Attention(nn.Module):
    """Single-head scaled dot-product self-attention.

    One linear layer produces queries, keys and values from the same input;
    every position then attends to every position (no masking is applied).
    """

    def __init__(self, dim):
        super().__init__()
        self.dim = dim
        # 1 / sqrt(d_k): keeps the dot products in a range where softmax does not saturate.
        self.scale = dim ** -0.5
        # A single projection emits [q | k | v] side by side along the last axis.
        self.qkv = nn.Linear(dim, dim * 3)

    def forward(self, x):
        '''
        Args:
            x: Tensor of shape (batch_size, seq_len, input_dim)

        Returns:
            Tensor of shape (batch_size, seq_len, input_dim)
        '''
        n_batch, n_tokens, _ = x.shape
        # (B, N, 3 * dim) -> (B, N, 3, dim), then split along the axis of size 3.
        fused = self.qkv(x).reshape(n_batch, n_tokens, 3, self.dim)
        queries, keys, values = fused.unbind(dim=2)
        # (B, N, dim) @ (B, dim, N) -> (B, N, N) pairwise compatibility scores.
        scores = (queries * self.scale) @ keys.transpose(-2, -1)
        # Normalise over the key axis so every query's weights sum to one.
        weights = scores.softmax(dim=-1)
        # Weighted sum of the values: (B, N, N) @ (B, N, dim) -> (B, N, dim).
        return weights @ values

In [22]:
# Smoke test: self-attention must return a tensor shaped exactly like its input.
x = torch.ones(11, 12, 8)
assert Attention(8)(x).shape == x.shape

# Multi-head attention


![](Attention.png)

- Divide each vector in a sequence into `num_heads` vectors ($d$ mod `num_heads` = 0)
- Apply attention layers independently, concatenate the result
$$\text{head}_i = \text{Attention}(Q_i, K_i, V_i)$$
$$ \textrm{concat} \left( \text{head}_1, \text{head}_2, \ldots, \text{head}_h \right) $$
- Apply an extra linear layer to mix independent attention branches
- **How to implement without loops?**

![alt text](https://uvadlc-notebooks.readthedocs.io/en/latest/_images/multihead_attention.svg)


In [26]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention, computed without Python loops.

    A single bias-free linear layer produces queries, keys and values for all
    heads at once. Each head attends independently over ``dim // num_heads``
    features, the heads are concatenated, and a final linear layer mixes them.

    Raises:
        ValueError: if ``dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, dim, num_heads=8):
        super().__init__()
        if dim % num_heads != 0:
            raise ValueError('dim % num_heads != 0')
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        # Scaling uses the per-head width, since each dot product runs over head_dim features.
        self.scale = self.head_dim ** -0.5
        self.qkv = nn.Linear(dim, dim * 3, bias=False)
        self.proj = nn.Linear(dim, dim)

    def forward(self, x):
        '''
        Args:
            x: Tensor of shape (batch_size, seq_len, input_dim)

        Returns:
            Tensor of shape (batch_size, seq_len, input_dim)
        '''
        n_batch, n_tokens, width = x.shape

        # (B, N, 3 * dim) -> (B, N, 3, H, Dh): expose the q/k/v axis and the head axis.
        fused = self.qkv(x).reshape(n_batch, n_tokens, 3, self.num_heads, self.head_dim)
        # (B, N, 3, H, Dh) -> (3, B, H, N, Dh): heads become a batch-like axis, so one
        # batched matmul handles every head at the same time.
        queries, keys, values = fused.permute(2, 0, 3, 1, 4).unbind(dim=0)

        # (B, H, N, Dh) @ (B, H, Dh, N) -> (B, H, N, N)
        scores = (queries * self.scale) @ keys.transpose(-2, -1)
        weights = scores.softmax(dim=-1)

        # (B, H, N, N) @ (B, H, N, Dh) -> (B, H, N, Dh)
        per_head = weights @ values

        # (B, H, N, Dh) -> (B, N, H, Dh) -> (B, N, H * Dh): concatenate the heads.
        merged = per_head.transpose(1, 2).reshape(n_batch, n_tokens, width)

        # Final linear layer mixes information across the independent heads.
        return self.proj(merged)

In [27]:
# Shape check: 8 heads over a 128-wide embedding; the output keeps the input's (11, 12, 128) shape.
MultiHeadAttention(128, 8)(torch.ones(11, 12, 128)).shape

torch.Size([11, 12, 128])

# ViT

### Einops.rearrange

https://github.com/arogozhnikov/einops

In [28]:
!python3 -m pip install einops -q
from einops import rearrange

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [29]:
# Transposition:
rearrange(torch.arange(1024).reshape(2, 4, 8, 16), 'aa b c d -> d c b aa').shape

torch.Size([16, 8, 4, 2])

In [30]:
# Decomposition: split the last axis (size 6) into two axes of sizes b=2 and c=3.
res = rearrange(torch.arange(30).reshape(5, 6), 'a (b c) -> a b c', b=2, c=3)
res.shape

torch.Size([5, 2, 3])

## Patches crafting

In [34]:
! python3 -m pip install einops -q
from einops import rearrange

def img2patches(img, patch_size=8):
    '''
    Split a batch of images into flattened, non-overlapping square patches.

    Args:
        img: (batch_size, c, h, w) Tensor; h and w must be multiples of patch_size
        patch_size: side length of every square patch, in pixels

    Returns:
        (batch_size, num_patches, vectorized_patch) Tensor, where
        num_patches = (h // patch_size) * (w // patch_size) and
        vectorized_patch = patch_size * patch_size * c

    Raises:
        ValueError: if img is not 4-dimensional, patch_size is not positive,
            or h / w is not divisible by patch_size
    '''
    if img.ndim != 4:
        raise ValueError(f'expected a (batch_size, c, h, w) tensor, got shape {tuple(img.shape)}')
    height, width = img.shape[-2:]
    if patch_size <= 0 or height % patch_size or width % patch_size:
        raise ValueError(
            f'image size ({height}, {width}) is not divisible by patch_size={patch_size}'
        )
    # (h p1) / (w p2) factor each spatial axis into "which patch" and "offset inside the patch";
    # patches are ordered row-major, and each one is flattened as (row, column, channel).
    return rearrange(img, 'b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=patch_size, p2=patch_size)
# 264 / 8 = 33 patches per side -> 33 * 33 = 1089 patches, each holding 8 * 8 * 3 = 192 values.
img2patches(torch.ones(2,3,264,264)).shape

torch.Size([2, 1089, 192])

##  Build ViT

<img src="https://raw.githubusercontent.com/oseledets/dl2023/b5018a354b1a10e7f498d3a8649f604f4d63d920/seminars/seminar-9/vit.webp" alt="ViT" width="600">


* Split an image into patches

* Flatten the patches

* Produce lower-dimensional linear embeddings from the flattened patches

* Add positional embeddings

* Feed the sequence as an input to a standard transformer encoder

* Pretrain the model with image labels (fully supervised on a huge dataset)

* Finetune on the downstream dataset for image classification



In [35]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Block(nn.Module):

    def __init__(
            self,
            dim,
            num_heads,
            mlp_ratio=4,  # ratio between hidden_dim and input_dim in MLP
            act_layer=nn.GELU,
            norm_layer=nn.LayerNorm
    ):
        super().__init__()

        self.norm1 = norm_layer(dim)
        self.attn = nn.MultiheadAttention(dim, num_heads)
        self.norm2 = norm_layer(dim)

        hidden_dim = dim * mlp_ratio
        self.mlp = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            act_layer(),
            nn.Linear(hidden_dim, dim)
        )

    def forward(self, x):
        # Multi-head self-attention mechanism
        attn_output, _ = self.attn(self.norm1(x), self.norm1(x), self.norm1(x))
        # Add the output of attention mechanism to the input (with normalization)
        x = x + attn_output

        # MLP
        mlp_output = self.mlp(self.norm2(x))
        # Add the output of MLP to the input (with normalization)
        x = x + mlp_output

        return x


In [36]:
# Stack `depth` independently initialised Transformer blocks (dim=128, 8 heads) into one module.
depth = 12
many_layers = nn.Sequential(*[Block(128, 8) for _ in range(depth)])

![](vit.webp)

- CLS token: an extra learnable token
- Position embeddings: `x = x + pos_embedding`, where `pos_embedding` is a learned vector for every element in a sequence

In [39]:
class ViT(nn.Module):
    """Minimal Vision Transformer for image classification.

    The image is cut into non-overlapping patches, every flattened patch is
    linearly projected to ``embed_dim``, learned positional embeddings are added,
    a learnable CLS token is prepended, the sequence goes through ``depth``
    Transformer blocks, and the CLS token is fed to a linear classification head.

    Args:
        img_size: (height, width) of the input images.
        patch_size: side length of the square patches used for tokenization.
        in_chans: number of image channels.
        num_classes: number of output classes.
        embed_dim: width of the token embeddings.
        depth: number of Transformer blocks.
        num_heads: number of attention heads in every block.
        mlp_ratio: ratio between hidden and input width of each block's MLP.
        norm_layer: constructor of the normalisation layers used in the blocks.
        act_layer: constructor of the activation used in the blocks' MLPs.

    Raises:
        ValueError: if ``img_size`` is not divisible by ``patch_size``.
    """

    def __init__(
            self,
            img_size=(224, 224),
            patch_size=16,
            in_chans=3,
            num_classes=10,
            embed_dim=768,
            depth=12,
            num_heads=12,
            mlp_ratio=4,
            norm_layer=nn.LayerNorm,
            act_layer=nn.GELU
    ):
        super().__init__()
        height, width = img_size[0], img_size[1]
        if height % patch_size or width % patch_size:
            raise ValueError(
                f'img_size {tuple(img_size)} is not divisible by patch_size={patch_size}'
            )

        # Size of patches used for tokenization.
        self.patch_size = patch_size
        # Sequential container for the Transformer blocks.
        self.blocks = nn.Sequential(*[
            Block(embed_dim, num_heads, mlp_ratio, act_layer, norm_layer) for _ in range(depth)
        ])
        # Projection layer for patches; the input width follows in_chans rather than assuming RGB.
        self.patch_proj = nn.Linear(in_chans * patch_size * patch_size, embed_dim)
        # Number of patches per image = rows of patches * columns of patches.
        self.embed_len = (height // patch_size) * (width // patch_size)
        # One learned positional vector per patch, initialised with small noise.
        self.pos_embed = nn.Parameter(torch.randn(1, self.embed_len, embed_dim) * .02)
        # Learnable token for classification.
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        # Linear layer for classification.
        self.head = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        '''
        Args:
            x: (batch_size, in_channels, img_size[0], img_size[1])

        Return:
            (batch_size, num_classes)

        '''
        # Convert input image into patches: (B, num_patches, patch_size * patch_size * C).
        x = img2patches(x, patch_size=self.patch_size)
        # Project patches into the embedding space.
        x = self.patch_proj(x)
        # Add positional embeddings; this happens before the CLS token is prepended,
        # so the CLS token itself carries no positional embedding.
        x = x + self.pos_embed
        # Prepend the CLS token, broadcast over the batch without copying it.
        x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
        # Pass through Transformer blocks.
        x = self.blocks(x)
        # Keep only the CLS token, which sits at sequence position 0.
        x = x[:, 0, :]
        # Pass the CLS token through the classification layer.
        return self.head(x)

In [40]:
# Shape check with the default configuration: 5 images in, one row of 10 class scores per image out.
ViT()(torch.ones(5, 3, 224, 224)).shape

torch.Size([5, 10])

https://github.com/lucidrains/vit-pytorch

In [43]:
!pip install vit-pytorch

Collecting vit-pytorch
  Downloading vit_pytorch-1.6.5-py3-none-any.whl (100 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/100.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━[0m [32m92.2/100.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.3/100.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10->vit-pytorch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10->vit-pytorch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10->vit-pytorch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from tor

In [44]:
import torch
from vit_pytorch import ViT

# NOTE(review): `ViT` here is the class imported from vit_pytorch just above, which
# shadows the `ViT` class defined earlier in this notebook. Re-running an earlier
# cell that calls the hand-written ViT after this one will hit the wrong class.
v = ViT(
    image_size = 256,
    patch_size = 32,
    num_classes = 1000,
    dim = 1024,
    depth = 6,
    heads = 16,
    mlp_dim = 2048,
    dropout = 0.1,
    emb_dropout = 0.1
)

# One random 3-channel 256x256 image, matching image_size above.
img = torch.randn(1, 3, 256, 256)

# One row of num_classes scores per input image.
preds = v(img) # (1, 1000)

In [45]:
preds

tensor([[ 2.0762e-02, -8.6543e-01, -1.6649e-01,  4.7023e-01,  2.1664e-01,
          2.3016e-01,  8.7242e-02, -6.8354e-02,  4.6087e-01,  6.1433e-01,
          9.7971e-01, -5.7961e-01,  2.3350e-01, -6.0116e-01,  6.5677e-01,
         -8.1762e-01, -5.7986e-01, -4.3245e-01,  8.9714e-01, -1.9794e-01,
         -2.4843e-01, -8.0247e-01,  9.4122e-01,  7.1552e-01, -2.6166e-01,
          2.9301e-01, -5.6692e-01, -6.0118e-01, -1.9850e-01,  9.7364e-01,
          1.5168e-01, -9.4772e-01, -4.7470e-02, -1.0317e+00,  1.3323e-01,
         -5.9982e-01, -6.5861e-01, -4.6528e-01, -4.3678e-01,  5.9566e-01,
         -2.3113e-01, -3.2104e-01, -5.0885e-01,  3.9568e-01, -1.2112e-01,
          3.5049e-01,  6.6690e-01, -3.8649e-01,  2.5779e-02,  3.7177e-02,
         -7.9264e-01,  1.7746e-01,  6.6669e-01,  1.1157e-01,  9.8442e-02,
         -8.0961e-01, -1.0259e+00,  7.1363e-02,  7.9590e-01,  2.7726e-01,
          3.1313e-01, -3.7796e-01, -6.6821e-01, -1.4317e+00, -2.4080e-01,
         -2.3442e-02,  3.9605e-01,  5.

# HuggingFace

Hugging Face 🤗 Transformers is an open-source library that provides easy access to state-of-the-art transformer-based models for NLP tasks. It offers a comprehensive set of tools for working with these models, including loading pre-trained models, fine-tuning on custom datasets, and deploying models for inference.

### Getting started on a task with a pipeline

The easiest way to use a pre-trained model on a given task is to use pipeline(). 🤗 Transformers provides the following tasks out of the box:

1. Sentiment analysis: is a text positive or negative?
2. Text generation: provide a prompt and the model will generate what follows.
3. Named entity recognition (NER): in an input sentence, label each word with the entity it represents (person, place, etc.)
4. Question answering: provide the model with some context and a question, extract the answer from the context.
5. Filling masked text: given a text with masked words (e.g., replaced by [MASK]), fill the blanks.
6. Summarization: generate a summary of a long text.
7. Language Translation: translate a text into another language.
8. Feature extraction: return a tensor representation of the text.

In [None]:
!pip install transformers

### GPT2

#### Model description

**GPT-2** is a transformers model pretrained on a very large corpus of English data in a self-supervised fashion. This means it was pretrained on the raw texts only, with no humans labelling them in any way (which is why it can use lots of publicly available data) with an automatic process to generate inputs and labels from those texts. More precisely, it was trained to guess the next word in sentences.

Concretely, inputs are sequences of continuous text of a certain length and the targets are the same sequence, shifted one token (word or piece of word) to the right. Internally, the model uses a masking mechanism to make sure the predictions for token i only use the inputs from 1 to i and not the future tokens.

This way, the model learns an inner representation of the English language that can then be used to extract features useful for downstream tasks. The model is best at what it was pretrained for however, which is generating texts from a prompt.

### Text generation

In [None]:
from transformers import pipeline, set_seed
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Build a text-generation pipeline backed by the pretrained GPT-2 checkpoint.
generator = pipeline('text-generation', model='gpt2')
# Fix the random seed before generating so the continuations can be reproduced.
set_seed(42)
# Request 7 alternative continuations of the same prompt.
generator("Hello, in this seminar we will learn how to,", max_length=60, num_return_sequences=7)

In [None]:
# Reuse the GPT-2 pipeline with a much smaller max_length and 5 returned sequences.
generator("Machine learning is evolving technology", max_length=10, num_return_sequences=5)

### Sentiment analysis

In [None]:
# Allocate a pipeline for sentiment-analysis
# No model is named, so the pipeline presumably falls back to the library's
# default checkpoint for this task (NOTE(review): pin `model=` for reproducibility).
classifier = pipeline('sentiment-analysis')
classifier('The weather is awesome!')

### Question Answering

In [None]:
# Allocate a pipeline for question-answering
# Extractive QA: the pipeline is given a question plus the context text to answer it from.
question_answerer = pipeline('question-answering')
question_answerer({
    'question': 'What is the Newtons third law of motion?',
    'context': 'Newton’s third law of motion states that, "For every action there is equal and opposite reaction"'})

In [None]:
# A second pipeline for the same task as `question_answerer` in the previous cell, bound to a new name.
nlp = pipeline("question-answering")

context = r"""
Micorsoft was founded by Bill gates and Paul allen in the year 1975.
The property of being prime (or not) is called primality.
A simple but slow method of verifying the primality of a given number n is known as trial division.
It consists of testing whether n is a multiple of any integer between 2 and itself.
Algorithms much more efficient than trial division have been devised to test the primality of large numbers.
These include the Miller–Rabin primality test, which is fast but has a small probability of error, and the AKS primality test, which always produces the correct answer in polynomial time but is too slow to be practical.
Particularly fast methods are available for numbers of special forms, such as Mersenne numbers.
As of January 2016, the largest known prime number has 22,338,618 decimal digits.
"""

# Question 1: answered from the primality part of `context`.
result = nlp(question="What is a simple method to verify primality?", context=context)

# Only the extracted answer text is printed.
print(f"Answer 1: '{result['answer']}'")

# Question 2: answered from the first sentence of `context`; `result` is overwritten.
result = nlp(question="When did Bill gates founded Microsoft?", context=context)

print(f"Answer 2: '{result['answer']}'")

### BERT

The BERT model was proposed in BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. It’s a bidirectional transformer pretrained using a combination of masked language modeling objective and next sentence prediction on a large corpus comprising the Toronto Book Corpus and Wikipedia.

The abstract from the paper is the following:

> We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation models, BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result, the pre-trained BERT model can be fine-tuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial task-specific architecture modifications.

BERT is conceptually simple and empirically powerful. It obtains new state-of-the-art results on eleven natural language processing tasks, including pushing the GLUE score to 80.5% (7.7% point absolute improvement), MultiNLI accuracy to 86.7% (4.6% absolute improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5 point absolute improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute improvement).

### Text prediction

In [None]:
# Fill-mask pipeline on the cased BERT checkpoint: asks for candidates to replace [MASK].
unmasker = pipeline('fill-mask', model='bert-base-cased')
unmasker("Hello, My name is [MASK].")

### Text Summarization

In [None]:
# Summarization pipeline; no model is named, so presumably the library's default checkpoint is used.
summarizer = pipeline("summarization")

ARTICLE = """The Apollo program, also known as Project Apollo, was the third United States human spaceflight program carried out by the National Aeronautics and Space Administration (NASA), which accomplished landing the first humans on the Moon from 1969 to 1972.
First conceived during Dwight D. Eisenhower's administration as a three-man spacecraft to follow the one-man Project Mercury which put the first Americans in space,
Apollo was later dedicated to President John F. Kennedy's national goal of "landing a man on the Moon and returning him safely to the Earth" by the end of the 1960s, which he proposed in a May 25, 1961, address to Congress.
Project Mercury was followed by the two-man ProjectGemini (1962–66).
The first manned flight of Apollo was in 1968.
Apollo ran from 1961 to 1972, and was supported by the two-man Gemini program which ran concurrently with it from 1962 to 1966.
Gemini missions developed some of the space travel techniques that were necessary for the success of the Apollo missions.
Apollo used Saturn family rockets as launch vehicles.
Apollo/Saturn vehicles were also used for an Apollo Applications Program, which consisted of Skylab, a space station that supported three manned missions in 1973–74, and the Apollo–Soyuz Test Project, a joint Earth orbit mission with the Soviet Union in 1975.
 """

# Summarize ARTICLE with sampling turned off (do_sample=False); [0] takes the first
# (and here only) result, since a single text was passed in.
summary=summarizer(ARTICLE, max_length=130, min_length=30, do_sample=False)[0]

print(summary['summary_text'])

### Translation (English to German and English to French)

In [None]:
# English to German
# English to German: the task name selects the language pair; print only the translated text.
translator_ger = pipeline("translation_en_to_de")
print("German: ",translator_ger("Joe Biden became the 46th president of U.S.A.", max_length=40)[0]['translation_text'])

# English to French: same pattern with a second pipeline for the en->fr task.
translator_fr = pipeline('translation_en_to_fr')
print("French: ",translator_fr("Joe Biden became the 46th president of U.S.A",  max_length=40)[0]['translation_text'])

### Fill MASK

In [None]:
from transformers import pipeline
# NOTE: rebinds `unmasker`, which the "Text prediction" cell bound to the bert-base-cased pipeline;
# from here on the name refers to the uncased checkpoint.
unmasker = pipeline('fill-mask', model='bert-base-uncased')
unmasker("Artificial Intelligence [MASK] take over the world.")

In conclusion, Hugging Face provides a user-friendly interface for working with transformer-based models in NLP. We've covered how to load pre-trained models, fine-tune them for specific tasks, and even implement custom transformers. With its extensive documentation and active community, Hugging Face is an invaluable tool for NLP practitioners.

# Mamba

<img src="https://github.com/state-spaces/mamba/blob/main/assets/selection.png?raw=true" alt="Mamba" >

In [None]:
!pip install causal-conv1d>=1.2.0
!pip install mamba-ssm


In [None]:
from transformers import MambaConfig, MambaForCausalLM, AutoTokenizer
import torch

# Load the tokenizer and the causal language model from the same pretrained checkpoint id.
tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-2.8b-hf")
model = MambaForCausalLM.from_pretrained("state-spaces/mamba-2.8b-hf")
# Tokenize the prompt and keep only the input ids, returned as a PyTorch tensor.
input_ids = tokenizer("LOOK!", return_tensors="pt")["input_ids"]

# Generate at most 10 new tokens after the prompt, then decode the ids back to text.
out = model.generate(input_ids, max_new_tokens=10)
print(tokenizer.batch_decode(out))
