<a href="https://colab.research.google.com/github/Castorres/Memory-Power-Performance-Analysis-of-Transformer-on-a-traditional-computer/blob/main/Memory_Power_Performance_Analysis_of_Transformer_on_a_traditional_computer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
import torch
# Report whether a GPU is usable. get_device_name() raises when no CUDA
# device is present, so it is only queried after the availability check.
cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)
device_name = torch.cuda.get_device_name(0) if cuda_available else "cpu (no CUDA device found)"
print("Device:", device_name)

CUDA available: True
Device: Tesla T4


In [32]:
# Download the Tiny Shakespeare corpus quietly and save it as tiny_shakespeare.txt.
!wget --quiet --output-document=tiny_shakespeare.txt https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt


In [33]:
# Confirm the download exists and show its size in human-readable units.
!ls -l -h tiny_shakespeare.txt

-rw-r--r-- 1 root root 1.1M Sep 30 18:24 tiny_shakespeare.txt


In [34]:
# Read the whole corpus into a single string. The `with` block closes the
# file handle as soon as the read finishes; UTF-8 keeps special characters intact.
with open("tiny_shakespeare.txt", "r", encoding="utf-8") as f:
  text = f.read()

# Index of the 90% mark. int() truncates because slice bounds must be whole numbers.
split = int(0.9 * len(text))

# The first 90% of the characters train the model; the remaining 10% are held
# out to measure how well it does on text it has not seen.
train_text = text[:split]
val_text = text[split:]

# Save each split to its own file ("w" creates or overwrites the file).
with open("train.txt", "w", encoding="utf-8") as f:
  f.write(train_text)

with open("val.txt", "w", encoding="utf-8") as f:
  f.write(val_text)

# Sanity check: the two counts should add up to len(text).
print("training characters:", len(train_text))
print("validation characters:", len(val_text))

training characters: 1003854
validation characters: 111540


In [35]:
# Reload both splits from disk; each `with` block closes its file handle
# once the contents have been read into a single string.
with open("train.txt", "r", encoding="utf-8") as f:
  train_text = f.read()
with open("val.txt", "r", encoding="utf-8") as f:
  val_text = f.read()

# Vocabulary: every distinct character in the training text. set() removes
# duplicates and sorted() orders them by Unicode code point, so each
# character's index is stable from run to run.
# NOTE: the vocabulary comes from the training split only, so a character that
# appears only in val.txt would have no index.
chars = sorted(set(train_text))
vocab_size = len(chars)  # number of distinct characters in the training text

print("Characters in vocab:", chars)
print("Vocabulary size:", vocab_size)

Characters in vocab: ['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
Vocabulary size: 65


In [36]:
# Lookup tables between characters and their position in `chars`.
stoi = dict(zip(chars, range(len(chars))))  # string to index: char -> int
itos = dict(enumerate(chars))               # index to string: int -> char

def encode(s):
  """Encode a string into a list of integers, one `stoi` index per character.

  Raises KeyError if `s` contains a character that is not a key of `stoi`.
  """
  token_ids = []
  for character in s:
    token_ids.append(stoi[character])
  return token_ids
def decode(l):
  """Decode a list of integers back into a string using the `itos` mapping."""
  pieces = (itos[token_id] for token_id in l)
  return ''.join(pieces)

# Quick round-trip check: decoding the encoded ids should give back the input.
sample_ids = encode("ROMEO")
print("Encoded:", sample_ids)
print("Decoded:", decode(sample_ids))

Encoded: [30, 27, 25, 17, 27]
Decoded: ROMEO


In [37]:
import torch
#this loads PyTorch

# Turn each split into a 1-D tensor of 64-bit integer token ids:
# encode() maps the text to a list of ints, torch.tensor() wraps that list.
train_data, val_data = (
    torch.tensor(encode(corpus), dtype=torch.long)
    for corpus in (train_text, val_text)
)

print("Training data tensor shape:", train_data.shape)
print("validation data tensor shape:", val_data.shape)

Training data tensor shape: torch.Size([1003854])
validation data tensor shape: torch.Size([111540])


In [38]:
batch_size = 64     # number of sequences per batch
block_size = 128    # length of each sequence, in characters

def get_batch(split):
  """Sample a random batch of (input, target) sequences.

  `split` selects the source: 'train' reads from train_data, any other value
  reads from val_data. Returns two tensors of shape (batch_size, block_size);
  the target is the input shifted one position to the right, so the model
  learns to predict the next character at every position.
  """
  if split == 'train':
    source = train_data
  else:
    source = val_data
  # randint's upper bound is exclusive, so every start leaves room for the
  # block_size-long input window plus the one extra character the target needs.
  starts = torch.randint(len(source) - block_size, (batch_size,))
  inputs, targets = [], []
  for start in starts:
    stop = start + block_size
    inputs.append(source[start:stop])
    targets.append(source[start + 1:stop + 1])
  return torch.stack(inputs), torch.stack(targets)

In [39]:
# Create the project folders (src, data, logs, output). The explicit `!` shell
# escape is used instead of relying on IPython automagic; -p makes re-runs harmless.
!mkdir -p /content/transformer_shakespeare/src /content/transformer_shakespeare/data /content/transformer_shakespeare/logs /content/transformer_shakespeare/output

In [40]:
# Move the corpus and both splits into the project's data folder. The explicit
# `!` shell escape is used instead of relying on IPython automagic.
!mv train.txt val.txt tiny_shakespeare.txt /content/transformer_shakespeare/data/

In [41]:
%%writefile /content/transformer_shakespeare/src/model.py

import torch    #this is the main PyTorch libary
import torch.nn as nn   #this has prebuilt neural network layers (linear, embedding, layerNorm)
import torch.nn.functional as F   #this has lower level functions that are used inside models

# Hyperparameters, kept at module level so other scripts can import them.
batch_size = 64    # sequences per batch
block_size = 128   # maximum sequence length; also the size of the position table
n_embd = 256       # embedding dimension (hidden size of every token vector)
n_head = 4         # number of attention heads (not referenced in this file yet)
n_layer = 4        # number of transformer layers
dropout = 0.1      # dropout rate against overfitting (not referenced in this file yet)


class TransformerModel(nn.Module):
  """Character-level language model skeleton.

  Token and position embeddings are summed, passed through `n_layer` blocks,
  layer-normalised and projected to vocabulary logits. The blocks are
  currently `nn.Identity` placeholders, to be replaced by real
  attention + MLP blocks.

  Args:
    vocab_size: number of unique tokens in the dataset.
  """

  def __init__(self, vocab_size):
    super().__init__()

    # 1) Token and positional embeddings.
    self.token_embedding = nn.Embedding(vocab_size, n_embd)     # token id -> vector
    self.position_embedding = nn.Embedding(block_size, n_embd)  # position -> vector

    # 2) Transformer blocks (identity placeholders for now).
    self.blocks = nn.Sequential(*[nn.Identity() for _ in range(n_layer)])

    # 3) LayerNorm before the final projection.
    self.ln_f = nn.LayerNorm(n_embd)

    # 4) Final linear layer: projects each position back to vocabulary logits.
    self.head = nn.Linear(n_embd, vocab_size)

  def forward(self, idx):
    """Compute next-token logits.

    Args:
      idx: integer tensor of token ids with shape (B, T), where T <= block_size.

    Returns:
      Tensor of logits with shape (B, T, vocab_size).

    Raises:
      ValueError: if T exceeds block_size (the position table has only
        block_size rows).
    """
    B, T = idx.shape  # batch size and sequence length
    if T > block_size:
      raise ValueError(
          f"sequence length {T} exceeds block_size {block_size}")

    tok_emb = self.token_embedding(idx)                 # (B, T, n_embd)
    positions = torch.arange(T, device=idx.device)
    pos_emb = self.position_embedding(positions)        # (T, n_embd)
    x = tok_emb + pos_emb                               # broadcasts over the batch

    # The attribute is `self.blocks`; the previous `self.block` raised
    # AttributeError on every forward pass.
    x = self.blocks(x)

    x = self.ln_f(x)
    logits = self.head(x)                               # (B, T, vocab_size)
    return logits

Overwriting /content/transformer_shakespeare/src/model.py


In [58]:
# Run train.py with the src folder as the working directory; `&&` skips the
# script if the cd fails.
# NOTE(review): the captured log prints every epoch twice (once with Val Loss,
# once without) - presumably a leftover print in train.py; verify.
!cd /content/transformer_shakespeare/src && python train.py

Epoch 1/100 | Train Loss: 4.3657 | Val Loss: 3.9945
Epoch 1/100 | Train Loss: 4.3657
Epoch 2/100 | Train Loss: 3.9734 | Val Loss: 3.7235
Epoch 2/100 | Train Loss: 3.9734
Epoch 3/100 | Train Loss: 3.7212 | Val Loss: 3.5422
Epoch 3/100 | Train Loss: 3.7212
Epoch 4/100 | Train Loss: 3.5789 | Val Loss: 3.4522
Epoch 4/100 | Train Loss: 3.5789
Epoch 5/100 | Train Loss: 3.4464 | Val Loss: 3.3477
Epoch 5/100 | Train Loss: 3.4464
Epoch 6/100 | Train Loss: 3.3161 | Val Loss: 3.3314
Epoch 6/100 | Train Loss: 3.3161
Epoch 7/100 | Train Loss: 3.2702 | Val Loss: 3.2870
Epoch 7/100 | Train Loss: 3.2702
Epoch 8/100 | Train Loss: 3.2673 | Val Loss: 3.2392
Epoch 8/100 | Train Loss: 3.2673
Epoch 9/100 | Train Loss: 3.2173 | Val Loss: 3.1984
Epoch 9/100 | Train Loss: 3.2173
Epoch 10/100 | Train Loss: 3.1656 | Val Loss: 3.1823
Epoch 10/100 | Train Loss: 3.1656
Epoch 11/100 | Train Loss: 3.1509 | Val Loss: 3.1660
Epoch 11/100 | Train Loss: 3.1509
Epoch 12/100 | Train Loss: 3.1278 | Val Loss: 3.1406
Epoch 12

In [59]:
# Run generate.py with the src folder as the working directory; `&&` skips the
# script if the cd fails.
!cd /content/transformer_shakespeare/src && python generate.py

ROMEO: his Wherd, tes mlor f mf bororanorake noud hangy in he theret wl yom thakis Thee t mendore m cas.
LI thabllontho d se onerteangounesu


ENLowhatoul wsos thean; in st t st tce ilmea arelllliche a s fo
