In [262]:
import pandas as pd
# Reuters headlines dataset; preview the first two entries of the 'Headlines' column.
df_news = pd.read_csv("../data/reuters_headlines.csv")
df_news['Headlines'].head(2)

0    TikTok considers London and other locations fo...
1    Disney cuts ad spending on Facebook amid growi...
Name: Headlines, dtype: object

In [263]:
# LJ Speech metadata: pipe-separated with no header row, so columns are numbered.
# Preview the first two entries of column 1.
df_lj = pd.read_csv("../data/lj_speech.csv", sep='|', header=None)
df_lj[1].head(2)

0    Printing, in the only sense with which we are ...
1                       in being comparatively modern.
Name: 1, dtype: object

In [264]:
def df_to_text(df, colum, sep=' @ '):
    """Join every value of one DataFrame column into a single string.

    Args:
        df: pandas DataFrame holding the data.
        colum: Label of the column to flatten.
        sep: Separator inserted between consecutive values. Defaults to
            ' @ ', the record separator used throughout this notebook.

    Returns:
        str: The column's values joined by `sep`. Empty string if the column
        has no non-missing values.

    Notes:
        Missing values (NaN/None) are dropped and remaining values are
        converted with `str`, because `str.join` raises TypeError on anything
        that is not already a string.
    """
    values = df[colum].dropna().astype(str)
    return sep.join(values.tolist())

In [265]:
# NOTE: ' @ ' is used as the record separator while experimenting.
# Exactly one corpus is active at a time; the alternatives are kept commented out.

#text = df_to_text(df_news, 'Headlines')
text = df_to_text(df_news, 'Time')
#text = df_to_text(df_news, 'Description')
#text = df_to_text(df_lj, 1)

# Peek at the first 201 characters of the joined corpus.
text[:201]

'Jul 18 2020 @ Jul 18 2020 @ Jul 18 2020 @ Jul 18 2020 @ Jul 17 2020 @ Jul 17 2020 @ Jul 17 2020 @ Jul 17 2020 @ Jul 17 2020 @ Jul 17 2020 @ Jul 17 2020 @ Jul 17 2020 @ Jul 17 2020 @ Jul 17 2020 @ Jul 1'

In [266]:
# Build the vocabulary: every distinct character in `text`, in sorted order.
# `chars_list` is the list form (index -> char); `chars` is the same thing as a string.
chars_list = sorted(set(text))
chars = ''.join(chars_list)
len(chars), chars

(34, ' 0123456789@ADFJMNOSabceglnoprtuvy')

In [267]:
# Used for testing purposes

#chars_idx = list(range(len(chars)))
#chars_map = list(zip(chars_idx, chars))
#chars_map[0:5],chars_idx[0:5]
#test = list(map(str, chars_idx))


In [268]:
# Encode and decode the given text
def encode(text, vocab=None):
    """Map each character of `text` to its integer index in the vocabulary.

    Args:
        text: String (or iterable of single characters) to encode.
        vocab: Optional ordered vocabulary, given as a string or a sequence of
            characters. Defaults to the notebook-level `chars`, which keeps
            existing `encode(text)` calls working unchanged.

    Returns:
        list[int]: One vocabulary index per input character.

    Raises:
        ValueError: If a character is not in the vocabulary. `str.find` would
            return -1 here, which `decode` silently maps to the last
            vocabulary entry via negative indexing.
    """
    if vocab is None:
        vocab = chars

    # Build the char -> index table once per call. setdefault keeps the first
    # occurrence, which matches what str.find would have reported.
    lookup = {}
    for position, symbol in enumerate(vocab):
        lookup.setdefault(symbol, position)

    try:
        return [lookup[symbol] for symbol in text]
    except KeyError as err:
        raise ValueError(
            f"character {err.args[0]!r} is not in the vocabulary"
        ) from None

def decode(text, chars_list):
    """Turn a sequence of vocabulary indices back into a string.

    Args:
        text: Iterable of integer indices (despite the name, not a string).
        chars_list: Indexable vocabulary; position i holds the character for
            index i.

    Returns:
        str: The looked-up characters concatenated in order.
    """
    return ''.join([chars_list[token] for token in text])

# Round-trip check on the first 30 characters: the decoded text should match the input.
text_encode = encode(text[:30])
text_decode = decode(text_encode, chars_list)

print(text[:30], text_encode, text_decode, sep='\n')

Jul 18 2020 @ Jul 18 2020 @ Ju
[15, 31, 25, 0, 2, 9, 0, 3, 1, 3, 1, 0, 11, 0, 15, 31, 25, 0, 2, 9, 0, 3, 1, 3, 1, 0, 11, 0, 15, 31]
Jul 18 2020 @ Jul 18 2020 @ Ju


In [269]:
# Load to torch
import torch 
# Encode the whole corpus once and keep it as a 1-D tensor of int64 token ids.
x = torch.tensor(encode(text), dtype=torch.long)
print(x.shape, x[:30], sep='\n')

torch.Size([458777])
tensor([15, 31, 25,  0,  2,  9,  0,  3,  1,  3,  1,  0, 11,  0, 15, 31, 25,  0,
         2,  9,  0,  3,  1,  3,  1,  0, 11,  0, 15, 31])


In [270]:
# Hold out the tail of the encoded corpus: first 90% for training, the rest for validation.
split = int(0.9 * len(x))
x_train, x_valid = x[:split], x[split:]
len(x_train), len(x_valid)

(412899, 45878)

In [271]:
batch_size = 5    # number of sequences sampled per batch
block_size = 10   # number of tokens in each sequence

# Why start indices are drawn from [0, len(x) - block_size):
# the target window x[i+1 : i+block_size+1] must fit inside x, so the largest
# valid start is len(x) - block_size - 1. torch.randint's upper bound is
# exclusive, so len(x) - block_size is exactly the right limit. A larger bound
# lets slices run off the end and come back short, which makes torch.stack
# fail with "stack expects each tensor to be equal size".

def load_batch(x, n_sequences=None, seq_len=None):
    """Sample a random batch of input/target windows from a 1-D token tensor.

    Args:
        x: 1-D tensor of token ids to sample from.
        n_sequences: Number of windows to draw. Defaults to the notebook-level
            `batch_size`.
        seq_len: Length of every window. Defaults to the notebook-level
            `block_size`.

    Returns:
        tuple: `(x_batch, y_batch)`, each of shape `(n_sequences, seq_len)`.
        `y_batch` is `x_batch` shifted one position to the right in `x`, i.e.
        the next-token target for every input position.

    Raises:
        ValueError: If `x` has fewer than `seq_len + 1` tokens, so no full
            input/target pair exists.
    """
    if n_sequences is None:
        n_sequences = batch_size
    if seq_len is None:
        seq_len = block_size

    max_start = len(x) - seq_len  # exclusive upper bound for start indices
    if max_start <= 0:
        raise ValueError(
            f"need at least {seq_len + 1} tokens to build a batch, got {len(x)}"
        )

    starts = torch.randint(0, max_start, (n_sequences,)).tolist()
    x_batch = torch.stack([x[i:i + seq_len] for i in starts])
    y_batch = torch.stack([x[i + 1:i + seq_len + 1] for i in starts])
    return x_batch, y_batch


# Draw one random batch from the training split and display the inputs
# alongside their targets (the same windows shifted by one token).
x_batch, y_batch = load_batch(x_train)
x_batch, y_batch

(tensor([[ 3,  1,  3,  1,  0, 11,  0, 15, 31, 25],
         [ 2,  3,  0,  3,  1,  2, 10,  0, 11,  0],
         [11,  0, 16, 20, 29,  0,  2,  1,  0,  3],
         [ 0, 18, 22, 30,  0,  3,  9,  0,  3,  1],
         [ 3,  1,  2,  9,  0, 11,  0, 17, 27, 32]]),
 tensor([[ 1,  3,  1,  0, 11,  0, 15, 31, 25,  0],
         [ 3,  0,  3,  1,  2, 10,  0, 11,  0, 15],
         [ 0, 16, 20, 29,  0,  2,  1,  0,  3,  1],
         [18, 22, 30,  0,  3,  9,  0,  3,  1,  2],
         [ 1,  2,  9,  0, 11,  0, 17, 27, 32,  0]]))

In [272]:
import torch.nn as nn
from torch.nn import functional as F

# PyTorch bigram language model:
class BigramLanguageModel(nn.Module):
    """Bigram language model backed by a single embedding table.

    The only parameter is a `(vocab_size, vocab_size)` embedding: looking up
    token `i` yields the unnormalised scores (logits) for the token that
    follows `i`. Each prediction therefore depends on the current token only.
    """

    def __init__(self, vocab_size) -> None:
        """Create the `(vocab_size, vocab_size)` next-token logit table."""
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: Integer tensor of token ids, shape (B, T) = (batch, time).
            targets: Optional integer tensor of the same shape as `idx`
                holding the expected next token for every position.

        Returns:
            tuple: `(logits, loss)`.
            Without `targets`, `logits` has shape (B, T, C) with
            C = vocab_size, and `loss` is None.
            With `targets`, `logits` is returned flattened to (B*T, C) and
            `loss` is the scalar cross-entropy over all B*T positions.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)

        loss = None
        if targets is not None:
            # F.cross_entropy wants (N, C) logits and (N,) class indices, so
            # fold batch and time together:
            # (B, T, C) -> (B*T, C) and (B, T) -> (B*T,).
            logits = logits.reshape(-1, logits.shape[-1])
            targets = targets.reshape(-1)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling never backpropagates; skip building autograd graphs
    def generate(self, idx, limit_new_tokens):
        """Extend `idx` by sampling `limit_new_tokens` tokens autoregressively.

        Args:
            idx: Integer tensor of shape (B, T) holding the prompt tokens.
            limit_new_tokens: Number of tokens to append to every sequence.

        Returns:
            Integer tensor of shape (B, T + limit_new_tokens): the prompt
            followed by the sampled tokens.
        """
        for _ in range(limit_new_tokens):
            logits, _loss = self(idx)                # (B, T, C)
            last_logits = logits[:, -1, :]           # keep the final time step: (B, C)
            probs = F.softmax(last_logits, dim=-1)   # normalise over the vocabulary
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)

        return idx


# Instantiate an untrained model and evaluate it on the batch sampled earlier.
model = BigramLanguageModel(len(chars))
logits, loss = model(x_batch, y_batch)

# Generation prompt: a single '@', which acts as the "newline" character for now.
idx = torch.tensor([encode('@')], dtype=torch.long)

print(loss, logits.shape, sep='\n')
print(decode(model.generate(idx, limit_new_tokens=120)[0].tolist(), chars_list))


tensor(3.9338, grad_fn=<NllLossBackward0>)
torch.Size([50, 34])
@oFbMlM@4Fc6n9D rl55aoSNvr782O8NON61MleF5rON0 7MlA6nu0uMl@FONp4o946n75OF1M17Oe1lll@MO8026SoOt49l5uve2cg c6SN00ce1ve4l5too


In [273]:
# AdamW over all model parameters with a learning rate of 1e-3.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [274]:

# Train for 10,000 steps on the training split only. Sampling from the full
# `x` would feed the held-out validation tokens to the optimizer.
for _ in range(10000):
    # Draw a fresh random batch of (input, next-token target) windows.
    x_batch, y_batch = load_batch(x_train)

    # Forward pass, then one optimizer step on the batch loss.
    logits, loss = model(x_batch, y_batch)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Loss of the final training batch.
print(loss.item())

1.1247494220733643


In [275]:
# Sample five independent 120-token continuations of the prompt from the trained model.
for _ in range(5):
    print(decode(model.generate(idx, limit_new_tokens=120)[0].tolist(), chars_list))

@ 18 Oc Apr 018 28 018 2020 Jun 19 2019 2019 202020 Seb @ @ 24 20 Jun @ 07 1 18 28 @ Apr Sect 018 19 20 019 @ 3 May 19 20
@ 2015 @ 19 2019 @ 201 Seb 2017 201 Aug @ 20 @ 2019 27 Aun Oct Ju6Seb 13019 Jug 2018 @ Sepr Dect 2019 Ap 16 @ @ Oc @ 208 
@ 20 29n @ 2011 @ @ @ Jun 2014 @ 120120202 @ @ 202018 Mar 020 @ Aun 202020 20 @ @ @ 206 109 2n Jul @ @ 2019 02029 @ 2019 
@ @ Nov @ @ @ MarAun @ 24 Oc 20 20 DeJun Jay 207 20 Seb @ @ 202019 Jull Mar 3018 @ Jun 20 2013 18 20 @ 1 @ Nov 201 @ 2018
@ 2018 019 20120 2012014 Jar 2 19 2018 @ @ 20 2019 20 Jun May 19 @ Jan Jug Ap 209 2020206 @ 0 206 Jay 2019 1 20 @ 18 Ap J
