In [81]:
import pandas as pd
# Read the Reuters headlines CSV and preview the first two headlines
df = pd.read_csv("../data/reuters_headlines.csv")
df['Headlines'].head(2)

0    TikTok considers London and other locations fo...
1    Disney cuts ad spending on Facebook amid growi...
Name: Headlines, dtype: object

In [82]:
# Join every headline into one corpus string, using ' @ ' as the separator.
# NOTE: this is not really a good idea!
text = ' @ '.join(df['Headlines'].to_list())
text[:201]

'TikTok considers London and other locations for headquarters @ Disney cuts ad spending on Facebook amid growing boycott: WSJ @ Trail of missing Wirecard executive leads to Belarus, Der Spiegel reports '

In [83]:
# Build the vocabulary: every distinct character of the corpus, in sorted order
chars_list = sorted(set(text))
chars = ''.join(chars_list)
(len(chars), chars)

(97,
 ' !"#$%&\'()*+,-./0123456789:;?@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz\xa0£´çéó\u200b\u200d–‘’“”€')

In [84]:
# Used for testing purposes

#chars_idx = list(range(len(chars)))
#chars_map = list(zip(chars_idx, chars))
#chars_map[0:5],chars_idx[0:5]
#test = list(map(str, chars_idx))


In [85]:
# Encode and decode the given text
def encode(text, vocab=None):
    """Convert a string into a list of integer character indices.

    Args:
        text: The string to encode, one index per character.
        vocab: Optional sequence of characters (string or list) defining
            the index of each symbol. When omitted, the notebook-level
            ``chars`` vocabulary string is used.

    Returns:
        A list of ints; element ``k`` is the position of ``text[k]`` in
        the vocabulary (the first position if a symbol is repeated).

    Raises:
        ValueError: If ``text`` contains a character that is not in the
            vocabulary. The previous ``str.find`` lookup returned -1 here,
            which silently decoded to the last vocabulary symbol.
    """
    if vocab is None:
        vocab = chars

    # Build the lookup once so each character costs a single dict access.
    # setdefault keeps the first occurrence, matching str.find semantics.
    index_of = {}
    for position, symbol in enumerate(vocab):
        index_of.setdefault(symbol, position)

    text_encode = []
    for symbol in text:
        if symbol not in index_of:
            raise ValueError(f"character {symbol!r} is not in the vocabulary")
        text_encode.append(index_of[symbol])
    return text_encode

def decode(text, chars_list):
    """Map a sequence of integer indices back to a string.

    Args:
        text: Iterable of integer indices into ``chars_list``.
        chars_list: Indexable vocabulary; ``chars_list[i]`` is the
            character for index ``i``.

    Returns:
        The characters selected by each index, concatenated in order.
    """
    return ''.join(chars_list[index] for index in text)

# Round-trip check: the decoded sample should match the original 30 characters
text_encode = encode(text[:30])
text_decode = decode(text_encode, chars_list)

print(text[:30], text_encode, text_decode, sep='\n')

TikTok considers London and ot
[49, 65, 67, 49, 71, 67, 0, 59, 71, 70, 75, 65, 60, 61, 74, 75, 0, 41, 71, 70, 60, 71, 70, 0, 57, 70, 60, 0, 71, 76]
TikTok considers London and ot


In [86]:
# Load to torch
import torch 
# Encode the full corpus into a 1-D tensor of character indices
x = torch.tensor(encode(text), dtype=torch.long)
print(x.shape, x[:30], sep='\n')

torch.Size([2237879])
tensor([49, 65, 67, 49, 71, 67,  0, 59, 71, 70, 75, 65, 60, 61, 74, 75,  0, 41,
        71, 70, 60, 71, 70,  0, 57, 70, 60,  0, 71, 76])


In [87]:
# Keep the first 90% of the encoded corpus for training, the rest for validation
split = int(0.9 * len(x))
x_train, x_valid = x[:split], x[split:]
(len(x_train), len(x_valid))

(2014091, 223788)

In [88]:
# batch_size: sequences drawn per batch; block_size: tokens per sequence
batch_size, block_size = 5, 10


def load_batch(x, batch=None, block=None):
    """Sample a random batch of input/target windows from a 1-D tensor.

    Each target row is its input row shifted one position to the right,
    so ``y_batch[r, t]`` is the token that follows ``x_batch[r, t]``.

    Args:
        x: 1-D tensor of token indices to sample from.
        batch: Number of windows to draw. Defaults to the notebook-level
            ``batch_size``.
        block: Length of each window. Defaults to the notebook-level
            ``block_size``.

    Returns:
        Tuple ``(x_batch, y_batch)``, each of shape ``(batch, block)``.

    Raises:
        ValueError: If ``x`` has fewer than ``block + 1`` elements, so no
            full input/target pair can be cut from it.
    """
    batch = batch_size if batch is None else batch
    block = block_size if block is None else block

    if len(x) <= block:
        raise ValueError(
            f"need more than {block} tokens to build a batch, got {len(x)}"
        )

    # The exclusive upper bound len(x) - block keeps both x[i:i+block] and
    # the shifted target x[i+1:i+block+1] inside the data. Sampling up to
    # len(x) produced short slices that torch.stack cannot combine.
    starts = torch.randint(0, len(x) - block, (batch,)).tolist()
    x_batch = torch.stack([x[i:i + block] for i in starts])
    y_batch = torch.stack([x[i + 1:i + block + 1] for i in starts])
    return x_batch, y_batch


# Draw one random batch from the training split and display inputs and targets
x_batch, y_batch = load_batch(x_train)
x_batch, y_batch

(tensor([[71, 70, 61,  0, 79, 71, 61, 75, 26,  0],
         [67, 61, 75,  0, 71, 70, 59, 61,  0, 77],
         [64, 65, 76,  0, 74, 61, 59, 71, 74, 60],
         [59, 71, 74, 74, 71, 60, 61, 60,  0, 72],
         [57, 59, 65, 68, 65, 76, 65, 61, 75,  0]]),
 tensor([[70, 61,  0, 79, 71, 61, 75, 26,  0, 31],
         [61, 75,  0, 71, 70, 59, 61,  0, 77, 70],
         [65, 76,  0, 74, 61, 59, 71, 74, 60, 12],
         [71, 74, 74, 71, 60, 61, 60,  0, 72, 65],
         [59, 65, 68, 65, 76, 65, 61, 75,  0, 29]]))

In [94]:
import torch.nn as nn
from torch.nn import functional as F

# PyTorch BigramLanguageModel model:
class BigramLanguageModel(nn.Module):
    """Bigram language model backed by a single embedding table.

    The table has shape (vocab_size, vocab_size): looking up a token index
    returns a row of vocab_size values that is used directly as the logits
    for the next token.
    """

    def __init__(self, vocab_size) -> None:
        """Create the (vocab_size x vocab_size) next-token logit table."""
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: Long tensor of token indices, shape (B, T).
            targets: Optional long tensor of the same shape as ``idx``
                holding the expected next token for each position.

        Returns:
            Tuple ``(logits, loss)``. Without ``targets`` the logits have
            shape (B, T, C) and ``loss`` is None. With ``targets`` the
            logits are returned flattened to (B*T, C) and ``loss`` is the
            scalar cross-entropy.
        """
        # TODO: Read about logits and embedding table
        logits = self.token_embedding_table(idx)  # (B, T, C) - (batch, time, chars)

        loss = None
        if targets is not None:
            # F.cross_entropy expects (N, C) logits and (N,) class indices,
            # so flatten (B, T, C) -> (B*T, C) and (B, T) -> (B*T,).
            logits = logits.reshape(-1, logits.shape[2])
            targets = targets.reshape(-1)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling never backpropagates; skip autograd bookkeeping
    def generate(self, idx, limit_new_tokens):
        """Extend each sequence in ``idx`` by sampling new tokens.

        Args:
            idx: Long tensor of shape (B, T) holding the starting context.
            limit_new_tokens: Number of tokens to append to every sequence.

        Returns:
            Long tensor of shape (B, T + limit_new_tokens): the original
            context followed by the sampled tokens.
        """
        for _ in range(limit_new_tokens):
            logits, _loss = self(idx)          # (B, T, C) predictions
            logits = logits[:, -1, :]          # keep only the last time step -> (B, C)
            probs = F.softmax(logits, dim=-1)  # normalise over the vocabulary
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)

        return idx


# Instantiate the untrained model and run one forward pass on the sample batch
model = BigramLanguageModel(len(chars))
logits, loss = model(x_batch, y_batch)

# For now '@' plays the role of the "newline char": seed generation with it
idx = torch.tensor([encode('@')], dtype=torch.long)

print(loss, logits.shape, sep='\n')
print(decode(model.generate(idx, limit_new_tokens=120)[0].tolist(), chars_list))


tensor(4.9400, grad_fn=<NllLossBackward0>)
torch.Size([50, 97])
@S+2T´g‍EW5tA)8Aé$y‘€çQVçQ vR:6Dk;´Jyv)´eI:6+! Dc/6"Fv;TI9Kó‘(rNCSu(U01@')kç_i6I9Q9"$T€5xer$F:€´.4iu$Y Qq:KZzuCe77) ?t“7E
