In [1]:
# Read the whole corpus into memory as a single string, decoded as UTF-8
with open('GreenFinance.txt', 'r', encoding='utf-8') as f:
    text=f.read()

In [2]:
# Sanity check: report how many characters the corpus contains.
# NOTE(review): 'Lenght' in the label is a typo for 'Length'; left as-is so the recorded output still matches.
print('Lenght of dataset in characters: ', len(text))

Lenght of dataset in characters:  156194


## All the unique characters that occur in this text

In [3]:
# Vocabulary: every distinct character of the corpus, sorted by code point
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !#$%&'()*+,-./0123456789:;=>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz£·ö–—‘’“”•€−
98


## Tokenisation

Create a mapping from characters to integers, and the reverse mapping from integers back to characters

In [4]:
# Character-level tokeniser: two lookup tables built from the sorted vocabulary
stoi = {ch: idx for idx, ch in enumerate(chars)}  # character -> integer id
itos = dict(enumerate(chars))                     # integer id -> character


def encode(s):
    """Take a string and return the list of integer ids of its characters."""
    return [stoi[c] for c in s]


def decode(l):
    """Take a list of integer ids and return the string they spell."""
    return ''.join(itos[i] for i in l)


# Round-trip check: decoding an encoded string must give the string back
print(encode('hi THERE!'))
print(decode(encode('hi THERE!')))

[67, 68, 1, 50, 38, 35, 48, 35, 2]
hi THERE!


Now let's encode the entire text and store the result in a `torch.tensor`

In [6]:
import torch
# Encode the full corpus and hold it as a 1-D tensor of int64 token ids
data = torch.tensor(encode(text), dtype = torch.long)
print(data.shape, data.dtype)
# Peek at the first 1000 token ids only rather than printing the whole tensor
print(data[:1000])

torch.Size([156194]) torch.int64
tensor([31, 61, 78, 79, 77, 60, 62, 79,  1,  0, 50, 67, 68, 78,  1, 75, 60, 75,
        64, 77,  1, 77, 64, 81, 68, 64, 82, 78,  1, 79, 67, 64,  1, 64, 83, 68,
        78, 79, 68, 73, 66,  1, 77, 64, 78, 64, 60, 77, 62, 67,  1, 74, 73,  1,
        66, 77, 64, 64, 73,  1, 65, 68, 73, 60, 73, 62, 64, 14,  1, 39, 79,  1,
        68, 63, 64, 73, 79, 68, 65, 68, 64, 78,  1, 79, 67, 64,  1, 68, 72, 75,
        74, 77, 79, 60, 73, 79,  1, 79, 67, 64, 72, 64, 78,  1, 68, 73,  1,  0,
        79, 67, 64,  1, 66, 77, 64, 64, 73,  1, 65, 68, 73, 60, 73, 62, 64,  1,
        71, 68, 79, 64, 77, 60, 79, 80, 77, 64, 12,  1, 75, 60, 77, 79, 68, 62,
        80, 71, 60, 77, 71, 84, 12,  1, 79, 67, 64,  1, 78, 79, 77, 60, 79, 64,
        66, 68, 64, 78,  1, 79, 74,  1, 68, 73, 62, 77, 64, 60, 78, 64,  1, 66,
        77, 64, 64, 73,  1, 65, 68, 73, 60, 73, 62, 68, 73, 66, 27,  1, 64, 65,
        65, 74, 77, 79, 78,  1, 79, 74,  1,  0, 72, 60, 70, 64,  1, 66, 77, 64,
       

## Splitting the data

Split the data into a training set and a validation set

In [7]:
# Keep the first 90% of the tokens for training and hold out the last 10% for validation

n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [8]:
# We do not train on the whole text at once but on short chunks sampled from it;
# block_size is the length of such a chunk (the maximum context length).

block_size = 8
# Take block_size + 1 tokens: each target is the token one position after its
# context, so block_size + 1 tokens yield block_size (context, target) pairs
train_data[:block_size + 1]

tensor([31, 61, 78, 79, 77, 60, 62, 79,  1])

A simple illustration of how one block of the specified block size is used

Every prefix of the block serves as a context, and the character that follows it is the target to be predicted

In [9]:
# x is the first block_size tokens; y is the same window shifted one token to the right
x, y = train_data[:block_size], train_data[1:block_size + 1]
for t in range(block_size):
    # The context grows by one token per step; the target is the token right after it
    context, target = x[:t + 1], y[t]
    print(f"When input is {context} the target is:{target}")

When input is tensor([31]) the target is:61
When input is tensor([31, 61]) the target is:78
When input is tensor([31, 61, 78]) the target is:79
When input is tensor([31, 61, 78, 79]) the target is:77
When input is tensor([31, 61, 78, 79, 77]) the target is:60
When input is tensor([31, 61, 78, 79, 77, 60]) the target is:62
When input is tensor([31, 61, 78, 79, 77, 60, 62]) the target is:79
When input is tensor([31, 61, 78, 79, 77, 60, 62, 79]) the target is:1


Now pick random blocks from the data, each starting at a random position

In [11]:
torch.manual_seed(1020)  # Fix the RNG seed so the random batch sampled below is reproducible
batch_size = 4  # How many independent sequences will we process in parallel?
block_size = 8  # What is the maximum context length for prediction?

def get_batch(split):
    """Sample a random batch of input windows and their next-token targets.

    Args:
        split: 'train' selects train_data; any other value selects val_data.

    Returns:
        A pair (x, y) of tensors, each of shape (batch_size, block_size).
        y is x shifted one position to the right in the underlying data.
    """
    source = train_data if split == 'train' else val_data
    # randint's upper bound is exclusive, so every start offset leaves room
    # for the block_size + 1 tokens that the input/target pair needs
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

# Draw one training batch and show its shapes and contents
xb, yb = get_batch('train')
print('Inputs: ')
print(xb.shape)
print(xb)
print('Output: ')
print(yb.shape)
print(yb)

print('_______')

# Spell out every (context, target) training example packed into the batch
for b in range(batch_size):  # Batch dimension
    for t in range(block_size):  # Time dimension
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"When input is {context.tolist()} target is:{target}")

Inputs: 
torch.Size([4, 8])
tensor([[62, 71, 68, 72, 60, 79, 64, 61],
        [ 1, 68, 78,  1, 63, 68, 65, 65],
        [80, 68, 78, 78, 64, 14, 62, 74],
        [79, 68, 73, 66, 14, 75, 63, 65]])
Output: 
torch.Size([4, 8])
tensor([[71, 68, 72, 60, 79, 64, 61, 74],
        [68, 78,  1, 63, 68, 65, 65, 68],
        [68, 78, 78, 64, 14, 62, 74, 72],
        [68, 73, 66, 14, 75, 63, 65,  1]])
_______
When input is [62] target is:71
When input is [62, 71] target is:68
When input is [62, 71, 68] target is:72
When input is [62, 71, 68, 72] target is:60
When input is [62, 71, 68, 72, 60] target is:79
When input is [62, 71, 68, 72, 60, 79] target is:64
When input is [62, 71, 68, 72, 60, 79, 64] target is:61
When input is [62, 71, 68, 72, 60, 79, 64, 61] target is:74
When input is [1] target is:68
When input is [1, 68] target is:78
When input is [1, 68, 78] target is:1
When input is [1, 68, 78, 1] target is:63
When input is [1, 68, 78, 1, 63] target is:68
When input is [1, 68, 78, 1, 63, 68] t

## Feeding to the model

Let us start with constructing a simple bigram model

In [17]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1020)  # Re-seed so the model's random initial weights are reproducible

class BigramLanguageModel(nn.Module):
    """Bigram character model: next-token logits depend only on the current token.

    The whole model is one ``vocab_size x vocab_size`` embedding table; row ``i``
    holds the unnormalised scores (logits) of every token that may follow token ``i``.
    """

    def __init__(self, vocab_size):
        """Create the lookup table.

        Args:
            vocab_size: number of distinct tokens. It is used both as the number
                of rows (current token) and as the row width (logits over the
                next token).
        """
        super().__init__()

        # Each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, target=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            target: optional (B, T) tensor of integer token ids to predict.
                Defaults to None so the model can also be called for inference,
                where no targets exist.

        Returns:
            A pair ``(logits, loss)``.
            With ``target``: logits flattened to (B*T, C) and the mean
            cross-entropy loss as a scalar tensor.
            Without ``target``: logits of shape (B, T, C) and ``None``.
        """
        # idx and target are both (B,T) tensors of integers
        logits = self.token_embedding_table(idx)  # (B,T,C)

        # No targets means there is nothing to score: hand back the raw logits
        if target is None:
            return logits, None

        # F.cross_entropy expects (N, C) logits and (N,) class indices,
        # so fold the batch and time dimensions together
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        target = target.view(B * T)
        loss = F.cross_entropy(logits, target)

        return logits, loss

# Instantiate the model and run one forward pass on the batch sampled earlier
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
# logits come back flattened to (batch_size * block_size, vocab_size)
print(logits.shape)
# For reference: a uniform guess over V tokens has a cross-entropy of ln(V)
print(loss)

torch.Size([32, 98])
tensor(5.2132, grad_fn=<NllLossBackward0>)
