In [31]:
import pandas as pd
# Read the Reuters headlines CSV and preview the first two headlines.
df = pd.read_csv("../data/reuters_headlines.csv")
df['Headlines'].head(2)

0    TikTok considers London and other locations fo...
1    Disney cuts ad spending on Facebook amid growi...
Name: Headlines, dtype: object

In [32]:
# Experiment: glue every headline into one long string, with ' <> ' between
# consecutive headlines. NOTE: using <> as a separator is not really a good idea!
text = ' <> '.join(df['Headlines'])
text[:201]

'TikTok considers London and other locations for headquarters <> Disney cuts ad spending on Facebook amid growing boycott: WSJ <> Trail of missing Wirecard executive leads to Belarus, Der Spiegel report'

In [33]:
# Vocabulary: every distinct character of the corpus, sorted.
# `chars_list` is the list form, `chars` the same characters as one string.
chars_list = sorted(set(text))
chars = ''.join(chars_list)
(len(chars), chars)

(98,
 ' !"#$%&\'()*+,-./0123456789:;<>?ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz\xa0£´çéó\u200b\u200d–‘’“”€')

In [34]:
# Used for testing purposes

#chars_idx = list(range(len(chars)))
#chars_map = list(zip(chars_idx, chars))
#chars_map[0:5],chars_idx[0:5]
#test = list(map(str, chars_idx))


In [35]:
# Encode and decode the given text
def encode(text, vocab=None):
    """Map each character of ``text`` to its integer index in the vocabulary.

    Args:
        text: String (or iterable of single characters) to encode.
        vocab: Optional vocabulary given as a string or sequence of
            characters; a character's code is the position of its first
            occurrence. When omitted, the notebook-level ``chars`` string
            is used.

    Returns:
        A list of integer indices, one per character of ``text``.

    Raises:
        ValueError: If ``text`` contains a character that is not in the
            vocabulary.
    """
    if vocab is None:
        vocab = chars

    # Build the lookup table once so that every character costs O(1) instead
    # of a linear scan of the vocabulary per character.
    lookup = {}
    for idx, ch in enumerate(vocab):
        lookup.setdefault(ch, idx)  # keep the first occurrence of a character

    try:
        return [lookup[ch] for ch in text]
    except KeyError as err:
        # A "not found" code of -1 would be a valid negative index and would
        # silently decode to the wrong character, so fail loudly instead.
        raise ValueError(
            f"character {err.args[0]!r} is not in the vocabulary"
        ) from None

def decode(text, chars_list):
    """Turn a sequence of vocabulary indices back into a string.

    Each element of ``text`` is used as an index into ``chars_list`` and the
    selected characters are concatenated in order.
    """
    return ''.join(chars_list[idx] for idx in text)

# Round-trip check on the first 30 characters: encode, decode, then show the
# original slice, its integer codes and the decoded string on separate lines.
text_encode = encode(text[:30])
text_decode = decode(text_encode, chars_list)

print(f"{text[:30]}\n{text_encode}\n{text_decode}")

TikTok considers London and ot
[50, 66, 68, 50, 72, 68, 0, 60, 72, 71, 76, 66, 61, 62, 75, 76, 0, 42, 72, 71, 61, 72, 71, 0, 58, 71, 61, 0, 72, 77]
TikTok considers London and ot


In [36]:
# Load to torch
import torch 
# Encode the whole corpus into a tensor of int64 character indices, then show
# its shape and the first 30 entries.
x = torch.tensor(encode(text), dtype=torch.long)
print(x.shape, x[:30], sep='\n')

torch.Size([2270648])
tensor([50, 66, 68, 50, 72, 68,  0, 60, 72, 71, 76, 66, 61, 62, 75, 76,  0, 42,
        72, 71, 61, 72, 71,  0, 58, 71, 61,  0, 72, 77])


In [37]:
# Train/validation split: the first 90% of the encoded corpus is used for
# training, the remaining tail for validation.
split = int(0.9 * len(x))
x_train, x_valid = x[:split], x[split:]
(len(x_train), len(x_valid))

(2043583, 227065)

In [38]:
batch_size = 5   # number of windows sampled per batch
block_size = 10  # number of tokens in each window


def load_batch(x):
    """Sample a random batch of input/target windows from a 1-D token tensor.

    Args:
        x: 1-D tensor of token indices with more than ``block_size`` elements.

    Returns:
        ``(x_batch, y_batch)``, both of shape ``(batch_size, block_size)``.
        Row ``k`` of ``y_batch`` is row ``k`` of ``x_batch`` shifted one
        position to the right in ``x``, i.e. the next token for every input
        position.

    Raises:
        ValueError: If ``x`` is too short to hold one input window plus its
            shifted target window.
    """
    # A start index i is only usable when x[i + 1 : i + block_size + 1] is a
    # full window, so starts must lie in [0, len(x) - block_size). Sampling
    # from [0, len(x)) can yield truncated slices that torch.stack rejects.
    n_starts = len(x) - block_size
    if n_starts <= 0:
        raise ValueError(
            f"need more than block_size={block_size} tokens, got {len(x)}"
        )

    index_rand = torch.randint(0, n_starts, (batch_size,))
    x_batch = torch.stack([x[i:i + block_size] for i in index_rand])
    y_batch = torch.stack([x[i + 1:i + block_size + 1] for i in index_rand])
    return x_batch, y_batch


# Draw one example batch from the training split and display it: the first
# tensor holds the input windows, the second the same windows shifted by one
# position (the next-character targets).
x_batch, y_batch = load_batch(x_train)
x_batch, y_batch

(tensor([[ 0, 28, 29,  0, 52, 72, 69, 68, 76, 80],
         [82,  0, 71, 62, 81, 77,  0, 82, 62, 58],
         [ 0, 80, 66, 73, 62,  0,  4, 19, 16,  0],
         [76, 78, 70, 62,  0, 13,  0, 76, 72, 78],
         [35, 78, 75, 72, 63, 66, 64, 65, 77, 62]]),
 tensor([[28, 29,  0, 52, 72, 69, 68, 76, 80, 58],
         [ 0, 71, 62, 81, 77,  0, 82, 62, 58, 75],
         [80, 66, 73, 62,  0,  4, 19, 16,  0, 59],
         [78, 70, 62,  0, 13,  0, 76, 72, 78, 75],
         [78, 75, 72, 63, 66, 64, 65, 77, 62, 75]]))

In [52]:
import torch.nn as nn
from torch.nn import functional as F

# Bigram language model implemented as a PyTorch module:
class BigramLanguageModel(nn.Module):
    """Bigram model: a token's embedding row is used directly as the logits
    for the token that follows it.

    Args:
        vocab_size: Number of distinct tokens. The embedding table has shape
            ``(vocab_size, vocab_size)``: one row of next-token logits per
            input token.
    """

    def __init__(self, vocab_size: int) -> None:
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, input, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            input: Integer tensor of token indices, e.g. of shape (B, T)
                (batch, block length).
            targets: Optional integer tensor with the same shape as ``input``
                holding the expected next token for every position.

        Returns:
            ``(logits, loss)``. ``logits`` is flattened to shape (N, C), where
            N is the number of tokens in ``input`` and C the vocabulary size.
            ``loss`` is the mean cross-entropy over all positions, or ``None``
            when ``targets`` is omitted.
        """
        # Look up one row of next-token scores per input token: (B, T, C).
        logits = self.token_embedding_table(input)

        # F.cross_entropy wants (N, C) logits with (N,) class indices, so
        # merge the leading dimensions: (B, T, C) -> (B*T, C).
        logits = logits.reshape(-1, logits.shape[-1])

        if targets is None:
            # Inference: there is nothing to score the predictions against.
            return logits, None

        loss = F.cross_entropy(logits, targets.reshape(-1))
        return logits, loss

# Build the model with one embedding row per vocabulary character and run a
# single forward pass on the example batch. No training has happened yet, so
# the displayed loss only reflects the randomly initialised weights.
model = BigramLanguageModel(len(chars))
logits, loss = model(x_batch, y_batch)
logits, loss

(tensor([[ 1.1564,  0.7610,  0.9967,  ...,  0.2704, -1.5716, -1.1562],
         [ 0.0680, -0.8400, -0.3601,  ...,  1.3283,  2.1320,  0.4959],
         [-0.5556,  0.1151,  0.1085,  ..., -1.6258, -0.7202, -0.7451],
         ...,
         [ 1.7350, -1.2608, -0.3377,  ...,  0.4802,  2.5273, -0.7180],
         [ 0.1624,  1.1256,  0.8528,  ...,  0.4465,  0.3264, -0.7047],
         [ 0.6640, -0.5125, -0.1376,  ...,  0.6319,  0.5815, -0.0959]],
        grad_fn=<ReshapeAliasBackward0>),
 tensor(5.1286, grad_fn=<NllLossBackward0>))