In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import math

class PositionalEncoding(nn.Module):
    """Fixed (non-trainable) sinusoidal positional encoding.

    A table of shape ``[1, max_len, d_model]`` is precomputed once. Even
    feature columns hold ``sin(pos * freq)`` and odd feature columns hold
    ``cos(pos * freq)``, where ``freq = 10000 ** (-2i / d_model)`` for the
    i-th sin/cos pair.

    Args:
        d_model: Size of the feature (embedding) dimension. May be odd.
        max_len: Number of positions to precompute; ``forward`` rejects
            longer sequences.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        # Column vector of positions: [max_len, 1]
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # One frequency per sin/cos pair: ceil(d_model / 2) entries
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # [max_len, ceil(d_model / 2)]

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even column, so
        # the last frequency has no cos slot; trim it to avoid a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Register as a buffer so it's part of the state_dict but not a trainable parameter
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` with the positional table added.

        Args:
            x: Tensor whose dim 1 is the sequence dimension and whose last
                dim is ``d_model`` (e.g. ``[batch, seq_len, d_model]``).

        Raises:
            RuntimeError: If the sequence is longer than ``max_len``.
        """
        seq_len = x.size(1)
        max_len = self.pe.size(1)
        if seq_len > max_len:
            # RuntimeError matches what the implicit broadcasting failure
            # raised before, but with an actionable message. It also catches
            # the max_len == 1 case, which used to broadcast silently.
            raise RuntimeError(
                f"Sequence length {seq_len} exceeds the positional encoding max_len {max_len}"
            )
        # The leading dim of size 1 broadcasts the table across the batch.
        return x + self.pe[:, :seq_len, :]

class MiniGPT(nn.Module):
    """Tiny GPT-style next-token model.

    Pipeline: token embedding -> positional encoding -> stack of
    ``nn.TransformerEncoderLayer`` run under a causal mask -> linear
    projection to vocabulary logits.

    Args:
        vocab_size: Number of distinct token ids.
        d_model: Embedding / hidden size.
        nhead: Number of attention heads.
        num_layers: Number of stacked transformer layers.
        max_len: Number of positions handed to ``PositionalEncoding``.
        dim_feedforward: Width of the feed-forward sublayer in each layer.
        dropout: Dropout probability used inside each transformer layer.
    """

    def __init__(self, vocab_size, d_model, nhead, num_layers, max_len=100,
                 dim_feedforward=256, dropout=0.1):
        super().__init__()
        self.d_model = d_model

        # 1. Embedding Layer
        self.embedding = nn.Embedding(vocab_size, d_model)

        # 2. Positional Encoding
        self.pos_encoder = PositionalEncoding(d_model, max_len)

        # 3. Transformer Block (Decoder-style)
        # We use TransformerEncoderLayer but apply a causal mask in forward()
        # so each position can only attend to itself and earlier positions.
        encoder_layers = nn.TransformerEncoderLayer(
            d_model,
            nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)

        # 4. Output Head
        self.fc_out = nn.Linear(d_model, vocab_size)

    def _generate_square_subsequent_mask(self, sz, device=None):
        """Build an additive causal attention mask of shape ``[sz, sz]``.

        Entries strictly above the diagonal are ``-inf`` (future positions are
        blocked); the diagonal and everything below it are ``0.0``.

        Args:
            sz: Sequence length.
            device: Optional device to create the mask on (default: torch's
                current default device).
        """
        # triu(diagonal=1) keeps the -inf values strictly above the diagonal
        # and sets every other entry to 0.
        blocked = torch.full((sz, sz), float('-inf'), dtype=torch.float, device=device)
        return torch.triu(blocked, diagonal=1)

    def forward(self, src):
        """Compute next-token logits.

        Args:
            src: Integer token ids of shape ``[batch_size, seq_len]``.

        Returns:
            Tensor of shape ``[batch_size, seq_len, vocab_size]``.
        """
        # Create the causal mask directly on the input's device
        seq_len = src.size(1)
        mask = self._generate_square_subsequent_mask(seq_len, device=src.device)

        # Embed (scaled by sqrt(d_model)) and add position info
        hidden = self.embedding(src) * math.sqrt(self.d_model)
        hidden = self.pos_encoder(hidden)

        # Pass through Transformer with the mask
        hidden = self.transformer_encoder(hidden, mask=mask)

        # Project to vocabulary size
        return self.fc_out(hidden)

# --- hyper parameters ---
seed = 42         # Fixed RNG seed so a fresh "Restart & Run All" repeats the same run
vocab_size = 100  # Tiny vocabulary for demonstration
d_model = 64      # Embedding dimension
nhead = 4         # Number of attention heads
num_layers = 2    # Number of transformer layers
learning_rate = 0.001
epochs = 100
log_every = 20       # Print the loss every `log_every` epochs
num_new_tokens = 10  # Number of tokens to generate after the prompt

# Seed before the model is built: weight init and dropout both draw from this RNG.
torch.manual_seed(seed)

# --- Data Preparation ---
# One training sequence that counts 1..10 and then wraps around to 1..4,
# so the model also sees the "10 -> 1" transition.
data = torch.tensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4]], dtype=torch.long)
x = data[:, :-1] # Input:  1, 2, 3...
y = data[:, 1:]  # Target: 2, 3, 4... (Next token prediction)

# --- Model Initialization ---
model = MiniGPT(vocab_size, d_model, nhead, num_layers)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# --- Training Loop ---
print("Training...")
model.train()
for epoch in range(epochs):
    optimizer.zero_grad()
    output = model(x) # Output shape: [batch, seq_len, vocab_size]

    # Flatten for CrossEntropyLoss. `y` is a slice of `data` and therefore not
    # contiguous; reshape() handles that for any batch size, whereas view()
    # only happened to work because the batch size is 1.
    loss = criterion(output.reshape(-1, vocab_size), y.reshape(-1))
    loss.backward()
    optimizer.step()

    if (epoch+1) % log_every == 0:
        print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

# --- Text Generation (Inference) ---
print("\nGenerating...")
model.eval()  # Switch off dropout for deterministic decoding
start_sequence = [1, 2] # Start with "1, 2"
input_tensor = torch.tensor([start_sequence], dtype=torch.long)

# Generate the next `num_new_tokens` tokens.
# Note: prompt length + num_new_tokens must stay within the model's max_len.
with torch.no_grad():
    # A named loop variable avoids clobbering IPython's `_` (last output).
    for step in range(num_new_tokens):
        # Forward pass over everything generated so far
        output = model(input_tensor)

        # Get the logits for the last token only
        next_token_logits = output[:, -1, :]

        # Greedy decoding: pick the token with highest probability.
        # keepdim=True yields [batch, 1], which concatenates correctly along
        # dim=1 for any batch size (unsqueeze(0) gave [1, batch]).
        next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)

        # Append prediction to input for next iteration
        input_tensor = torch.cat((input_tensor, next_token), dim=1)

print(f"Input: {start_sequence}")
print(f"Generated: {input_tensor.tolist()[0]}")

Training...
Epoch 20, Loss: 0.7689
Epoch 40, Loss: 0.2190
Epoch 60, Loss: 0.1011
Epoch 80, Loss: 0.0638
Epoch 100, Loss: 0.0447

Generating...
Input: [1, 2]
Generated: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2]


In [2]:
import time 
import numpy as np

# Small fixed operands for the operation being timed.
matrix = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
vector = np.array([1, 2, 3])

# How many times the timed statement is repeated.
n_iterations = 1000

# Record the time before running the code.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() can be too coarse for sub-millisecond work and
# may jump if the system clock is adjusted.
start_time = time.perf_counter()

# place code to run here:
for i in range(n_iterations):
    result = matrix.dot(vector)

# Record the time after running the code
end_time = time.perf_counter()

# Elapsed seconds for all `n_iterations` repetitions (not per iteration)
diff = end_time - start_time
diff


0.0010018348693847656