In [17]:
import pandas as pd
df = pd.read_csv("../data/reuters_headlines.csv")
df['Headlines'][0:2]

0    TikTok considers London and other locations fo...
1    Disney cuts ad spending on Facebook amid growi...
Name: Headlines, dtype: object

In [18]:
# Testing with using <> as a separator, NOTE: this is not really a good idea!
text = ' <> '.join(df['Headlines'].tolist())
text[0:201]

'TikTok considers London and other locations for headquarters <> Disney cuts ad spending on Facebook amid growing boycott: WSJ <> Trail of missing Wirecard executive leads to Belarus, Der Spiegel report'

In [19]:
# Compute chars
chars_list = sorted(list(set(text)))
chars = ''.join(chars_list)
len(chars), chars

(98,
 ' !"#$%&\'()*+,-./0123456789:;<>?ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz\xa0£´çéó\u200b\u200d–‘’“”€')

In [20]:
# Used for testing purposes

#chars_idx = list(range(len(chars)))
#chars_map = list(zip(chars_idx, chars))
#chars_map[0:5],chars_idx[0:5]
#test = list(map(str, chars_idx))


In [21]:
# Encode and decode the given text
def encode(text):
    text_encode = []
    for i in text:
        text_encode.append(chars.find(i))
    return text_encode

def decode(text, chars_list):
    text_decode = []
    for i in text:
        text_decode.append(chars_list[i])
    return ''.join(text_decode)

text_encode = encode(text[0:30])
text_decode = decode(text_encode, chars_list)

print(text[0:30])
print(text_encode)
print(text_decode)

TikTok considers London and ot
[50, 66, 68, 50, 72, 68, 0, 60, 72, 71, 76, 66, 61, 62, 75, 76, 0, 42, 72, 71, 61, 72, 71, 0, 58, 71, 61, 0, 72, 77]
TikTok considers London and ot


In [22]:
# Load to torch
import torch 
x = torch.tensor(encode(text), dtype=torch.long)
print(x.shape)
print(x[0:30])

torch.Size([2270648])
tensor([50, 66, 68, 50, 72, 68,  0, 60, 72, 71, 76, 66, 61, 62, 75, 76,  0, 42,
        72, 71, 61, 72, 71,  0, 58, 71, 61,  0, 72, 77])


In [23]:
# Split train and validation data
split = int(len(x)*0.9)
x_train = x[:split]
x_valid = x[split:]
len(x_train), len(x_valid)

(2043583, 227065)

In [26]:
batch_size = 5
block_size = 10


def load_batch(x):
    index_rand = torch.randint(0,len(x),(block_size,))
    x_batch = torch.stack([x[i:i+block_size] for i in index_rand])
    y_batch  = torch.stack([x[i+1:i+block_size+1] for i in index_rand])
    return x_batch, y_batch 


x_batch, y_batch = load_batch(x_train)
x_batch, y_batch

(tensor([[73, 75, 62, 73, 58, 75, 58, 77, 72, 75],
         [62, 62, 68, 76,  0, 77, 72,  0, 72, 78],
         [76,  0, 61, 75, 66, 79, 66, 71, 64,  0],
         [29,  0, 31, 66, 75,  0, 33, 58, 71, 58],
         [58, 71, 61, 62, 70, 66, 60, 26,  0, 75],
         [76, 12,  0, 76, 65, 58, 75, 62,  0, 59],
         [69, 72, 75, 62, 76,  0, 72, 73, 77, 66],
         [58, 69, 69, 76,  0, 59, 62, 69, 72, 80],
         [65, 66, 77, 62,  0, 38, 72, 78, 76, 62],
         [62, 65, 66, 60, 69, 62, 76,  0, 66, 71]]),
 tensor([[75, 62, 73, 58, 75, 58, 77, 72, 75, 82],
         [62, 68, 76,  0, 77, 72,  0, 72, 78, 76],
         [ 0, 61, 75, 66, 79, 66, 71, 64,  0, 28],
         [ 0, 31, 66, 75,  0, 33, 58, 71, 58, 61],
         [71, 61, 62, 70, 66, 60, 26,  0, 75, 62],
         [12,  0, 76, 65, 58, 75, 62,  0, 59, 78],
         [72, 75, 62, 76,  0, 72, 73, 77, 66, 72],
         [69, 69, 76,  0, 59, 62, 69, 72, 80,  0],
         [66, 77, 62,  0, 38, 72, 78, 76, 62,  0],
         [65, 66, 60, 69, 62,