In [7]:
# Load the whole book into memory as a single string.
# NOTE: the file starts with a UTF-8 byte-order mark; plain 'utf-8' keeps it as
# the character '\ufeff' at text[0], so it also ends up in the vocabulary
# (encoding='utf-8-sig' would strip it).
with open('wizard_of_oz.txt', mode='r', encoding='utf-8') as f:
    text = f.read()

print(len(text))   # total number of characters read
print(text[:200])  # preview: the first 200 characters of text

232309
﻿DOROTHY AND THE WIZARD IN OZ

  BY

  L. FRANK BAUM

  AUTHOR OF THE WIZARD OF OZ, THE LAND OF OZ, OZMA OF OZ, ETC.

  ILLUSTRATED BY JOHN R. NEILL

  BOOKS OF WONDER WILLIAM MORROW & CO., INC. NEW Y


In [8]:
# Vocabulary: every distinct character that occurs in the text, in sorted order.
chars = list(set(text))
chars.sort()
print(chars)
print('length of chars list: ', len(chars))

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']
length of chars list:  81


In [9]:
# character level tokenizer (small vocabulary with a lot of tokens to convert)
def encode(inputString):
    """Encode a string as a list of integer token ids.

    Every character is replaced by its index in the global `chars` list
    (a mapping such as {a:0, b:1, c:2}); the ids are returned in the same
    order as the input characters. A character that is not in `chars`
    raises KeyError.
    """
    string_to_int = dict(zip(chars, range(len(chars))))
    encoded = []
    for ch in inputString:
        encoded.append(string_to_int[ch])
    return encoded

# sanity check: encode a sample word and show its integer ids
encodeHello=encode('hello')
print(encodeHello)

def decode(encodedList):
    """Decode a list of integer token ids back into a string.

    Each id is looked up as an index into the global `chars` list and the
    resulting characters are joined in order. An id that is not a valid
    vocabulary index raises KeyError.
    """
    int_to_string = dict(enumerate(chars))
    pieces = []
    for token_id in encodedList:
        pieces.append(int_to_string[token_id])
    return ''.join(pieces)

# sanity check: decoding the ids produced above should give back the original word
decodeHello=decode(encodeHello)
print(decodeHello)

# this is an example of a CHARACTER LEVEL TOKENIZER where we have a small vocabulary (81 characters in this case) and a lot of tokens to encode
# a WORD LEVEL TOKENIZER has a very large vocabulary but a small(er) amount of tokens to encode
# a SUBWORD LEVEL TOKENIZER is something in between a character level and word level tokenizer

[61, 58, 65, 65, 68]
hello


In [10]:
# pytorch will help with a lot of the calculations/linear algebra
# tensors: data structures representing multi-dimensional inputs (matrices, vectors, scalars)

import torch

# Encode the whole book and store the ids as one long 1-D tensor of integers
# (torch.long is an integer dtype, not floating point).
token_ids = encode(text)
data = torch.tensor(token_ids, dtype=torch.long)
print(data[:100])  # the first 100 token ids

tensor([80, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,  1, 47,
        33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26, 49,  0,
         0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,  0,  0,
         1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32, 29,  1, 47, 33,
        50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32, 29,  1, 36, 25,
        38, 28,  1, 39, 30,  1, 39, 50,  9,  1])


In [11]:
# 80/20 split: the first 80% of the tokens are used for training,
# the remaining 20% are held out for validation
n = int(len(data) * 0.8)
train_data, val_data = data[:n], data[n:]

# bigram language model: given a certain character, predict the next
# hello
# start -> h, h->e, e->l, l->l, l->o

# block size: a hyperparameter determining the size of a data chunk used in batch training/processing
# we're using it to build the inputs (what we predict from) and the targets (what we try to predict) from the text
# inputs:  [5,67,21,58,40] 35   [:block_size]
# targets: 5 [67,21,58,40,35]   [1:block_size+1]
# in the bigram language model we're seeing how far the prediction is from the target and will try to reduce that error

block_size_test = 8

# x is the first block_size_test tokens; y is the same window shifted one
# position to the right, so y[t] is the token that comes right after x[:t+1]
x = train_data[:block_size_test]       # [80, 28, 39, 42, 39, 44, 32, 49] 1
y = train_data[1:block_size_test + 1]  # 80 [28, 39, 42, 39, 44, 32, 49,  1]

# one block yields block_size_test (context, target) training examples
for t in range(block_size_test):
    context, target = x[:t + 1], y[t]
    print('when input is', context, 'target is', target)

when input is tensor([80]) target is tensor(28)
when input is tensor([80, 28]) target is tensor(39)
when input is tensor([80, 28, 39]) target is tensor(42)
when input is tensor([80, 28, 39, 42]) target is tensor(39)
when input is tensor([80, 28, 39, 42, 39]) target is tensor(44)
when input is tensor([80, 28, 39, 42, 39, 44]) target is tensor(32)
when input is tensor([80, 28, 39, 42, 39, 44, 32]) target is tensor(49)
when input is tensor([80, 28, 39, 42, 39, 44, 32, 49]) target is tensor(1)


In [12]:
# GPU (graphics processing unit): designed to efficiently process large blocks of data simultaneously/in parallel
# can do a simple task very efficiently and multiple at the same time 
# by putting blocks in a stack, we can give it to the GPU which can perform processing on the blocks scaled way up
# GPUs differ from CPUs which can do sequential complex tasks

# batch size: a hyperparameter telling how many blocks we are processing at the same time
# batch size tells us the height of a block while block_size tells us the length of it

# [1,2,3,4,5]
# [1,2,3,4,5]
# [1,2,3,4,5]
# [1,2,3,4,5]

# unable to get the GPU to work, so I'm stuck with the cpu (slower runtimes)
# https://github.com/Infatoshi/fcc-intro-to-llms
# use the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# diagnostics for the GPU setup
print(device)
print(torch.version.cuda)
print(torch.cuda.is_available())
print("Device Count:", torch.cuda.device_count())

cpu
11.8
False
Device Count: 0


In [14]:
# hyperparameters for batching (see the notes on block size and batch size above)
block_size = 8  # length of each block (chunk of data)
batch_size = 4  # how many blocks are processed at the same time

In [15]:
# 6 random integers drawn from [-100, 100): the lower bound is inclusive,
# the upper bound is exclusive (so 100 itself never appears)
randint = torch.randint(low=-100, high=100, size=(6,))
randint

tensor([ 61,  53, -33,  -5,  -4,  14])

In [18]:
# creating a 3x2 tensor (3 rows, 2 columns) with float values
tensor = torch.tensor([
    [0.1, 1.2],
    [2.2, 3.1],
    [4.9, 5.2],
])
tensor

tensor([[0.1000, 1.2000],
        [2.2000, 3.1000],
        [4.9000, 5.2000]])

In [35]:
# PyTorch tensor-creation functions

zeros = torch.zeros((2, 3))  # zeros(rows, columns): a tensor filled with zeros
print(zeros)

ones = torch.ones((3, 4))  # same idea, filled with ones
print(ones)

empty = torch.empty((2, 3))  # uninitialized - see the note on torch.empty() below
print(empty)

arange = torch.arange(0, 5)  # the integers 0, 1, 2, 3, 4 (the end value is exclusive)
print(arange)

# 5 evenly spaced values from 3 to 10, both endpoints included
linespace = torch.linspace(start=3, end=10, steps=5)
print(linespace)

# 5 powers of 10 whose exponents are evenly spaced from -10 to 10:
# 10^-10, 10^-5, 10^0, 10^5, 10^10
logspace = torch.logspace(start=-10, end=10, steps=5)
print(logspace)

eye = torch.eye(5)  # eye(k) returns the k x k identity matrix (ones on the diagonal, zeros elsewhere)
print(eye)

# torch.empty() and torch.empty_like() create tensors with uninitialized values -
# whatever appears in them is just leftover memory contents, not meaningful data
a = torch.empty(2, 3, dtype=torch.int64)
print(a)
empty_like = torch.empty_like(a)  # same shape and dtype as `a`, also uninitialized
print(empty_like)

tensor([[0., 0., 0.],
        [0., 0., 0.]])
tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]])
tensor([[-4.8935e+32,  7.5950e-43,  1.0000e+00],
        [ 1.0000e+05,  1.0000e+10,  0.0000e+00]])
tensor([0, 1, 2, 3, 4])
tensor([ 3.0000,  4.7500,  6.5000,  8.2500, 10.0000])
tensor([1.0000e-10, 1.0000e-05, 1.0000e+00, 1.0000e+05, 1.0000e+10])
tensor([[1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1.]])
tensor([[      2331995282736,                   1,                   2],
        [                  3,                   4, 4575657222473777152]])
tensor([[4613937818241073152, 4617034042984890368, 4619004367821864960],
        [4620833955170484224, 4621819117588971520, 4575657222473777152]])


In [45]:
# linear algebra torch functions

# draw 10 indices (with replacement) according to the given probabilities:
# index 0 comes up with probability 0.1, index 1 with probability 0.9
probs = torch.tensor([0.1, 0.9])  # probabilities that add up to 1
samples = torch.multinomial(probs, num_samples=10, replacement=True)
print(samples)

first = torch.tensor([1, 2, 3, 4])
out = torch.cat([first, torch.tensor([5])], dim=0)  # concatenation of tensors
print(out)

# figuring out probabilities, then concatenating the predictions

ones_5x5 = torch.ones(5, 5)

tril = torch.tril(ones_5x5)  # tril = triangle lower - looks similar to how predictions build on each other
print(tril)

triu = torch.triu(ones_5x5)  # triu = triangle upper
print(triu)

# masked_fill?

test = torch.zeros(2, 3, 4)
# transposing swaps two dimensions - here dimensions 0 and 2, so shape (2, 3, 4) becomes (4, 3, 2)
test1 = test.transpose(0, 2)
print(test1.shape)

# stacking tensors together to make the giant stack to be passed to the GPU
tensor1 = torch.tensor([1, 2, 3])
tensor2 = torch.tensor([4, 5, 6])
tensor3 = torch.tensor([7, 8, 9])

stacked_tensor = torch.stack((tensor1, tensor2, tensor3))
print(stacked_tensor)

tensor([1, 1, 1, 0, 1, 1, 1, 1, 1, 0])
tensor([1, 2, 3, 4, 5])
tensor([[1., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1.]])
tensor([[1., 1., 1., 1., 1.],
        [0., 1., 1., 1., 1.],
        [0., 0., 1., 1., 1.],
        [0., 0., 0., 1., 1.],
        [0., 0., 0., 0., 1.]])
torch.Size([4, 3, 2])
tensor([[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]])


In [47]:
import torch.nn as nn

sample = torch.tensor([10.0, 10.0, 10.0])
# nn.Linear: a layer of learnable weights mapping 3 inputs to 3 outputs;
# bias=False leaves out the additive bias term
linear = nn.Linear(in_features=3, out_features=3, bias=False)
print(linear(sample))

tensor([ 8.9204, -0.5037, -1.5343], grad_fn=<SqueezeBackward4>)


In [49]:
# exponentiating involving e ≈ 2.718 (Euler's number)

import torch.nn.functional as F

tensor1a = torch.tensor([1.0, 2.0, 3.0])

# softmax converts a vector of numbers into a vector of probabilities:
# each value is exponentiated, then divided by the sum of all the exponentials
softmax_output = F.softmax(input=tensor1a, dim=0)

print(softmax_output)

tensor([0.0900, 0.2447, 0.6652])


In [50]:
# embedding vectors

# using nn.Embedding on a character level
# an example is if our alphabet is 26 characters and we want vector embeddings of size 5 (a hyperparameter) for each character
# the nn.Embedding layer can be visualized as a table with 26 rows and 5 columns

# character --> embedding vector
# 'a' --> [0.2,0.1,0.5,0.3,0.9] # these values are learned weights updated during back propagation
# 'b' --> [0.7,0.8,0.2,0.6,0.4]
# ...
# 'z' --> [0.3,0.5,0.6,0.8,0.2]

vocab_size = 1000    # how many distinct tokens the table has a row for
embedding_dim = 100  # how many dimensions will a vector embedding be?

# creates an embedding layer (analogous to a lookup table) turning each token into a vector embedding of size embedding_dim
# embedding layer contains matrix of learned weights with a shape of (vocab_size, embedding_dim)
embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

# a tensor containing indices of tokens that you want to look up in the embedding layer
input_indices = torch.tensor([1, 5, 3, 2], dtype=torch.long)

# embedding lookup - get me the rows of the weight matrix at index 1, 5, 3, and 2
embedded_output = embedding(input_indices)

# the shape is (number of indices, embedding_dim): 4 indices here, each represented by 100 dimensions
print(embedded_output.shape)

# weight*x + bias = y
# more specifically sigmoid(w_1*a_1+w_2*a_2+w_3*a_3+...+w_n*a_n + bias)
# sigmoid is a function with a range of (0,1)
# weight is a weight matrix, multiplied by the input data x (a value or vector)
# bias is an additive learned parameter (learned like the weights, not a hyperparameter) helping the model fit data
# this is the function describing how data moves from one layer to another in a neural network

torch.Size([4, 100])


In [52]:
# matrix multiplication: a (3x2) matrix times a (2x3) matrix gives a (3x3) matrix
a = torch.tensor([[1, 2],
                  [3, 4],
                  [5, 6]])
b = torch.tensor([[7, 8, 9],
                  [10, 11, 12]])
print(a @ b)               # the @ operator ...
print(torch.matmul(a, b))  # ... gives the same result as torch.matmul

tensor([[ 27,  30,  33],
        [ 61,  68,  75],
        [ 95, 106, 117]])
tensor([[ 27,  30,  33],
        [ 61,  68,  75],
        [ 95, 106, 117]])


In [58]:
# a 3 by 2 matrix of random integers from 0 to 4 (the upper bound, 5, is exclusive)
int_64 = torch.randint(0, 5, (3, 2))
print(int_64)
# a 2 by 3 matrix of random floating point numbers
float_32 = torch.rand((2, 3))
print(float_32)

# you can cast integer to float to make matrix multiplication work
int_as_float = int_64.float()
print(float_32 @ int_as_float)

tensor([[4, 0],
        [1, 0],
        [2, 3]])
tensor([[0.5381, 0.1094, 0.2450],
        [0.7957, 0.1353, 0.1590]])
tensor([[2.7517, 0.7349],
        [3.6361, 0.4771]])
