## Data Prep

In [2]:
import pandas as pd

In [3]:
# Load the raw tweets; the cleaning cells below read the tweet text from the 'text' column.
df = pd.read_csv('tweets.csv')

In [15]:
import re

# The patterns are compiled once here instead of on every call, because
# clean_tweet is applied to each row of the dataframe.
_MENTION_PATTERN = re.compile(r'@\w+')
# NOTE(review): the last range (U+24C2-U+1F251) is far wider than the emoji
# blocks listed before it; it also covers e.g. CJK ideographs (U+4E00-U+9FFF),
# so text in those scripts is stripped too. Left unchanged so the resulting
# corpus stays the same; narrow it if non-Latin text must survive.
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"
                            u"\U0001F300-\U0001F5FF"
                            u"\U0001F680-\U0001F6FF"
                            u"\U0001F700-\U0001F77F"
                            u"\U0001F780-\U0001F7FF"
                            u"\U0001F800-\U0001F8FF"
                            u"\U0001F900-\U0001F9FF"
                            u"\U0001FA00-\U0001FA6F"
                            u"\U0001FA70-\U0001FAFF"
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)
_LINK_PATTERN = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                           flags=re.UNICODE)


def clean_tweet(tweet):
    """Return ``tweet`` with emojis, http(s) links and @mentions removed.

    The removals run in that order. Whitespace around a removed item is left
    untouched, so callers that want tidy text should strip/normalise it
    themselves.

    Args:
        tweet: the tweet text as a ``str``.

    Returns:
        The cleaned text as a ``str``.
    """
    tweet = _EMOJI_PATTERN.sub('', tweet)
    tweet = _LINK_PATTERN.sub('', tweet)
    return _MENTION_PATTERN.sub('', tweet)

In [5]:
# Clean every tweet, trim the whitespace left at the edges, then spell out
# the HTML-escaped ampersand as a word.
df['text'] = (
    df['text']
    .apply(clean_tweet)
    .str.strip()
    .str.replace('&amp;', "and", regex=False)
)

In [6]:
# Build one training string: every tweet ends with a period and tweets are
# separated by a single space.
# Tweets that are empty after cleaning (e.g. a tweet that was only a link) are
# dropped first; indexing x[-1] on an empty string raised IndexError.
corpus = (
    df['text']
    .loc[lambda texts: texts.str.len() > 0]
    .apply(lambda x: x if x.endswith('.') else x + '.')
    .str.cat(sep=' ')
)

## Tokenization

Character-level tokenization is sufficient for a small dataset. First, some statistics about the corpus.

In [8]:
# Corpus statistics: total number of characters and the distinct characters,
# which form the (sorted) vocabulary.
corpus_len = len(corpus)
vocab = sorted(set(corpus))
vocab_size = len(vocab)
print("Corpus length:", corpus_len)
print("Vocab size:", vocab_size)
print("Vocab content:", vocab)

Corpus length: 293038
Vocab size: 101
Vocab content: ['\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '\xa0', 'à', 'é', 'ō', '\u200d', '–', '‘', '’', '“', '”', '…', '≠']


Create a char → index mapping. The index assigned to each character is also the row used to look up that character's token embedding.

In [9]:
# Two lookup tables built from the sorted vocabulary: character -> id and id -> character.
char2idx = {char: idx for idx, char in enumerate(vocab)}
idx2char = dict(enumerate(vocab))


def encode(x):
    """Return the list of integer ids for the characters of ``x``."""
    return [char2idx[char] for char in x]


def decode(idxs):
    """Return the string spelled by the integer ids in ``idxs``."""
    return "".join(idx2char[idx] for idx in idxs)


print("char to idx:", char2idx)
print("idx to char:", idx2char)
print("tokenizing/encoding 'Elon Musk': ", encode("Elon Musk"))
print("detokenizing/decoding it back: ", decode(encode("Elon Musk")))

char to idx: {'\n': 0, ' ': 1, '!': 2, '"': 3, '#': 4, '$': 5, '%': 6, '&': 7, "'": 8, '(': 9, ')': 10, '*': 11, '+': 12, ',': 13, '-': 14, '.': 15, '/': 16, '0': 17, '1': 18, '2': 19, '3': 20, '4': 21, '5': 22, '6': 23, '7': 24, '8': 25, '9': 26, ':': 27, ';': 28, '?': 29, 'A': 30, 'B': 31, 'C': 32, 'D': 33, 'E': 34, 'F': 35, 'G': 36, 'H': 37, 'I': 38, 'J': 39, 'K': 40, 'L': 41, 'M': 42, 'N': 43, 'O': 44, 'P': 45, 'Q': 46, 'R': 47, 'S': 48, 'T': 49, 'U': 50, 'V': 51, 'W': 52, 'X': 53, 'Y': 54, 'Z': 55, '[': 56, ']': 57, '_': 58, 'a': 59, 'b': 60, 'c': 61, 'd': 62, 'e': 63, 'f': 64, 'g': 65, 'h': 66, 'i': 67, 'j': 68, 'k': 69, 'l': 70, 'm': 71, 'n': 72, 'o': 73, 'p': 74, 'q': 75, 'r': 76, 's': 77, 't': 78, 'u': 79, 'v': 80, 'w': 81, 'x': 82, 'y': 83, 'z': 84, '{': 85, '|': 86, '}': 87, '~': 88, '\xa0': 89, 'à': 90, 'é': 91, 'ō': 92, '\u200d': 93, '–': 94, '‘': 95, '’': 96, '“': 97, '”': 98, '…': 99, '≠': 100}
idx to char: {0: '\n', 1: ' ', 2: '!', 3: '"', 4: '#', 5: '$', 6: '%', 7: '&'

Encoding the entire corpus as a torch tensor

In [10]:
import torch
# Turn the whole corpus into a single tensor of token ids.
encoded_corpus = torch.tensor(encode(corpus))
print(f"encoded corpus shape: {encoded_corpus.shape} dtype: {encoded_corpus.dtype}")
encoded_corpus

encoded corpus shape: torch.Size([293038]) dtype: torch.int64


tensor([38,  1, 64,  ..., 61, 83, 15])

Creating a training/validation split

In [11]:
# The first 80% of the token stream is the training set; the rest is held out for validation.
train_split = int(len(encoded_corpus)*0.8)
train_data, valid_data = encoded_corpus[:train_split], encoded_corpus[train_split:]
print(f"train data shape: {train_data.shape}")
print(f"valid data shape: {valid_data.shape}")

train data shape: torch.Size([234430])
valid data shape: torch.Size([58608])


Context length == max sequence length == block size  
The transformer is trained on every prefix of a sequence, up to the "context length": each prefix is an input and the token that follows it is the target.  
We are using a context length of 8, so one sequence yields 8 training examples (written as token positions): (0) → 1, (0, 1) → 2, (0, 1, 2) → 3, ..., (0, 1, 2, 3, 4, 5, 6, 7) → 8

In [12]:
context_length = 8
# Walk over the first context_length positions: each prefix x is paired with
# the single token y that follows it.
for i in range(context_length):
    x = train_data[:i+1]
    y = train_data[i+1]
    print(f"for index {i}: x: {decode(x.tolist())}\ty: {decode([y.item()])}")

for index 0: x: I	y:  
for index 1: x: I 	y: f
for index 2: x: I f	y: i
for index 3: x: I fi	y: n
for index 4: x: I fin	y: d
for index 5: x: I find	y:  
for index 6: x: I find 	y: t
for index 7: x: I find t	y: h


## Data Loader

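A function to retrieve a "batch" of data from either the training or the validation dataset. A batch is a stack of `batch_size` token sequences, each `context_length` tokens long, taken from random positions in the data. Every position in a sequence must have a target token (the token that comes next), so the targets are the inputs shifted by one position.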

In [13]:
# Seed torch's global RNG so the randomly sampled batches are repeatable.
TORCH_SEED = 2000
torch.manual_seed(TORCH_SEED)
# Number of sequences per batch; get_batch reads this global at call time.
batch_size = 4

In [14]:
def get_batch(data, batch_size=None, context_length=None):
    """Sample a random batch of input/target sequences from ``data``.

    Args:
        data: tensor of token ids, sliced along its first dimension.
        batch_size: number of sequences to sample. When omitted, the
            notebook-level ``batch_size`` variable is used (original behaviour).
        context_length: number of tokens per sequence. When omitted, the
            notebook-level ``context_length`` variable is used (original
            behaviour).

    Returns:
        ``(x, y)``: two tensors stacked from ``batch_size`` slices of length
        ``context_length``. ``y`` holds the same windows as ``x`` shifted one
        position to the right, i.e. the next token for every input position.

    Raises:
        ValueError: if ``data`` does not contain more than ``context_length``
            tokens, so no window with a following target token exists.
    """
    # The parameters shadow the notebook-level variables of the same name, so
    # the fallbacks have to be read through globals().
    if batch_size is None:
        batch_size = globals()["batch_size"]
    if context_length is None:
        context_length = globals()["context_length"]

    data_len = len(data)
    if data_len <= context_length:
        raise ValueError(
            f"data has {data_len} tokens; need more than context_length={context_length}"
        )

    # randint's upper bound is exclusive, so the largest start index still
    # leaves room for the one extra token needed by the shifted targets.
    start_idxs = torch.randint(high=data_len - context_length, size=(batch_size,)).tolist()
    x = torch.stack([data[i:i + context_length] for i in start_idxs])
    y = torch.stack([data[i + 1:i + context_length + 1] for i in start_idxs])
    return x, y

# Draw one batch from the training data and show the inputs next to their targets.
xb, yb = get_batch(train_data)
print('inputs:')
print(f'shape: {xb.shape}')
print(xb)
print('targets:')
print(f'shape: {yb.shape}')
print(yb)

inputs:
shape: torch.Size([4, 8])
tensor([[73, 79, 76,  1, 78, 81, 63, 63],
        [67, 72, 69,  1, 67, 78,  1, 81],
        [67, 76, 67, 72, 65,  1, 72, 63],
        [77,  1, 73, 79, 78,  1, 78, 66]])
targets:
shape: torch.Size([4, 8])
tensor([[79, 76,  1, 78, 81, 63, 63, 78],
        [72, 69,  1, 67, 78,  1, 81, 59],
        [76, 67, 72, 65,  1, 72, 63, 63],
        [ 1, 73, 79, 78,  1, 78, 66, 59]])


The target value is the token that comes after the input value in the encoded_corpus

## Bigram Model

Given a token, a bigram model predicts the probability distribution of the next token.

In [16]:
import torch.nn as nn
from torch.nn import functional as F

In [44]:
# Re-seed torch's global RNG. The model is instantiated in a later cell, so this
# only makes its random weight initialisation repeatable if no other random
# draws happen in between.
torch.manual_seed(TORCH_SEED)
class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token scores are a table lookup on the current token.

    The only parameter is an embedding table of shape (vocab_size, vocab_size);
    row ``i`` holds the unnormalised scores (logits) over the vocabulary for
    the token that follows token ``i``.
    """

    def __init__(self, vocab_size):
        """Create the (vocab_size x vocab_size) logit table."""
        super().__init__()
        self.vocab_size = vocab_size
        self.token_embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: integer tensor of token ids with shape (B, T).
            targets: optional integer tensor of token ids with B*T elements
                (normally shape (B, T)); the expected next token for every
                position of ``idx``.

        Returns:
            ``(logits, loss)``: ``logits`` has shape (B, T, C) with
            C == vocab_size; ``loss`` is the mean cross-entropy over all B*T
            positions, or ``None`` when ``targets`` is ``None``.
        """
        logits = self.token_embedding(idx)
        if targets is None:
            return logits, None

        B, T, C = logits.shape
        # cross_entropy wants (N, C) logits and (N,) class ids, so batch and
        # time are flattened together. reshape (unlike view) also accepts
        # non-contiguous tensors.
        loss = F.cross_entropy(logits.reshape(B * T, C), targets.reshape(B * T))
        return logits, loss

    @torch.no_grad()  # sampling needs no autograd graph
    def generate(self, idx, max_new_tokens):
        """Extend ``idx`` by sampling ``max_new_tokens`` tokens one at a time.

        Args:
            idx: integer tensor of token ids with shape (B, T) used as the
                starting context.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens).
        """
        for _ in range(max_new_tokens):
            logits, _loss = self(idx)
            logits_last_time_step = logits[:, -1]  # (B, C): scores after the last token
            probs = F.softmax(logits_last_time_step, dim=-1)  # (B, C)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat([idx, idx_next], dim=1)
        return idx

In [119]:
# Smoke-test the untrained model: one forward pass on a batch, then sample some text.
bigram_model = BigramLanguageModel(vocab_size)
# xb/yb are whatever batch get_batch produced most recently in the kernel.
# NOTE(review): the recorded output shows a batch dimension of 32, while the
# data-loader cell sets batch_size = 4 - so this cell was last run after the
# training cell (batch_size = 32). Re-run top to bottom to refresh the output.
out, loss = bigram_model(xb, yb)
print("out shape:", out.shape)
print("xb: ", xb.shape)
print("yb: ", yb.shape)
print("loss:", loss)
# Start generation from a single token with id 0 (the newline character in this vocabulary).
idx = torch.zeros((1, 1), dtype=torch.long)
print("100 generated tokens:", decode(bigram_model.generate(idx, 100)[0].tolist()))

out shape: torch.Size([32, 8, 101])
xb:  torch.Size([32, 8])
yb:  torch.Size([32, 8])
loss: tensor(5.1334, grad_fn=<NllLossBackward0>)
100 generated tokens: 
Fsc3nCIK!
68~{5l9AUag≠rX}…,u9 0sn‘0UairUM”h6–YN(N "Ggl’0ZC Pz’D,7KS/f?]“W…J_9f_y≠'.S!x-
J6_C4Vl:IQ?‘


Need an Optimizer for training

In [120]:
# Adam over all parameters of the bigram model, with a fixed learning rate of 1e-3.
optimizer = torch.optim.Adam(bigram_model.parameters(), lr=0.001)

Training Loop

In [121]:
# Larger batches for training; get_batch picks this up through the global.
batch_size = 32
for step in range(10000):
    # Clear gradients left over from the previous step before the new backward pass.
    optimizer.zero_grad()
    xb, yb = get_batch(train_data)
    logits, loss = bigram_model(xb, yb)
    loss.backward()
    optimizer.step()
    # Report the loss of the current batch every 1000 steps.
    if not step % 1000:
        print(f"Step {step}: loss {loss.item():.3f}")

Step 0: loss 4.974
Step 1000: loss 3.959
Step 2000: loss 3.255
Step 3000: loss 2.952
Step 4000: loss 2.687
Step 5000: loss 2.620
Step 6000: loss 2.516
Step 7000: loss 2.643
Step 8000: loss 2.650
Step 9000: loss 2.463


In [122]:
# Sample 30 new tokens from the trained model, starting from a single token with id 0.
print(decode(bigram_model.generate(torch.zeros((1,1), dtype=torch.long),max_new_tokens=30)[0].tolist()))


Havenco ncin r nderf ts, gorge


Let's use this gibberish as a baseline for now.

Estimate the validation loss by averaging over random batches sampled from the validation set

In [123]:
batch_size = 32
# Re-seed so the same validation batches are drawn every time this cell runs.
torch.manual_seed(TORCH_SEED)
# Draw roughly as many random batches as there are non-overlapping
# (batch_size x context_length) windows in the validation data. The window
# length comes from context_length rather than a hard-coded 8, so the count
# stays correct if the context length changes.
num_eval_batches = len(valid_data) // (batch_size * context_length)
losses = []
for steps in range(num_eval_batches):
    xb, yb = get_batch(valid_data)
    # Evaluation only: no gradients are needed.
    with torch.no_grad():
        logits, loss = bigram_model(xb, yb)
        losses.append(loss)

    if steps % 10 == 0:
        print(f"Step {steps}: loss {loss.item():.3f}")
print(f"Validation loss: {torch.stack(losses, dim=0).mean()}")

Step 0: loss 2.557
Step 10: loss 2.606
Step 20: loss 2.516
Step 30: loss 2.593
Step 40: loss 2.584
Step 50: loss 2.526
Step 60: loss 2.573
Step 70: loss 2.584
Step 80: loss 2.469
Step 90: loss 2.639
Step 100: loss 2.681
Step 110: loss 2.545
Step 120: loss 2.503
Step 130: loss 2.502
Step 140: loss 2.510
Step 150: loss 2.531
Step 160: loss 2.476
Step 170: loss 2.544
Step 180: loss 2.556
Step 190: loss 2.405
Step 200: loss 2.564
Step 210: loss 2.543
Step 220: loss 2.701
Validation loss: 2.538381576538086


## Self Attention