## Data Prep

In [110]:
import pandas as pd

In [111]:
df = pd.read_csv('tweets.csv')

In [112]:
import re

def clean_tweet(tweet):
    # Define a regular expression pattern to match mentions
    mention_pattern = r'@\w+'
    # Define regular expression patterns for emojis and links
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # Emoticons
                               u"\U0001F300-\U0001F5FF"  # Symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # Transport & map symbols
                               u"\U0001F700-\U0001F77F"  # Alphabetic presentation forms
                               u"\U0001F780-\U0001F7FF"  # Geometric shapes
                               u"\U0001F800-\U0001F8FF"  # Miscellaneous symbols
                               u"\U0001F900-\U0001F9FF"  # Supplemental symbols and pictographs
                               u"\U0001FA00-\U0001FA6F"  # Extended-A
                               u"\U0001FA70-\U0001FAFF"  # Extended-B
                               u"\U00002702-\U000027B0"  # Dingbats
                               u"\U000024C2-\U0001F251" 
                               "]+", flags=re.UNICODE)
    
    link_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', 
                              flags=re.UNICODE)
    
    # Remove emojis and links from the tweet
    tweet = emoji_pattern.sub('', tweet)
    tweet = link_pattern.sub('', tweet)
    

    # Use re.sub to replace mentions with an empty string
    clean_t = re.sub(mention_pattern, '', tweet)
    
    return clean_t

In [113]:
df['text'] = df['text'].apply(lambda x: clean_tweet(x).strip().replace('&amp;', "and"))

In [119]:
corpus = df['text'].apply(lambda x: x + '.' if x[-1]!='.' else x).str.cat(sep=' ')

## Tokenization

Character level tokenization is sufficient for a small dataset. First some stats for the dataset.

In [121]:
corpus_len = len(corpus)
print("Corpus length:", corpus_len)
vocab = sorted(list(set(corpus)))
vocab_size = len(vocab)
print("Vocab size:", vocab_size)
print("Vocab content:", vocab)

Corpus length: 293038
Vocab size: 101
Vocab content: ['\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '\xa0', 'à', 'é', 'ō', '\u200d', '–', '‘', '’', '“', '”', '…', '≠']


Creating a char:index mapping where the index will serve as the index for the token embedding of the char it maps to.

In [129]:
char2idx = {char:idx for idx,char in enumerate(vocab)}
idx2char = {idx:char for char,idx in char2idx.items()}
encode = lambda x: [char2idx[char] for char in x]
decode = lambda idxs: "".join([idx2char[idx] for idx in idxs])
print("char to idx:", char2idx)
print("idx to char:", idx2char)
print("tokenizing/encoding 'Elon Musk': ", encode("Elon Musk"))
print("detokenizing/decoding it back: ", decode(encode("Elon Musk")))

char to idx: {'\n': 0, ' ': 1, '!': 2, '"': 3, '#': 4, '$': 5, '%': 6, '&': 7, "'": 8, '(': 9, ')': 10, '*': 11, '+': 12, ',': 13, '-': 14, '.': 15, '/': 16, '0': 17, '1': 18, '2': 19, '3': 20, '4': 21, '5': 22, '6': 23, '7': 24, '8': 25, '9': 26, ':': 27, ';': 28, '?': 29, 'A': 30, 'B': 31, 'C': 32, 'D': 33, 'E': 34, 'F': 35, 'G': 36, 'H': 37, 'I': 38, 'J': 39, 'K': 40, 'L': 41, 'M': 42, 'N': 43, 'O': 44, 'P': 45, 'Q': 46, 'R': 47, 'S': 48, 'T': 49, 'U': 50, 'V': 51, 'W': 52, 'X': 53, 'Y': 54, 'Z': 55, '[': 56, ']': 57, '_': 58, 'a': 59, 'b': 60, 'c': 61, 'd': 62, 'e': 63, 'f': 64, 'g': 65, 'h': 66, 'i': 67, 'j': 68, 'k': 69, 'l': 70, 'm': 71, 'n': 72, 'o': 73, 'p': 74, 'q': 75, 'r': 76, 's': 77, 't': 78, 'u': 79, 'v': 80, 'w': 81, 'x': 82, 'y': 83, 'z': 84, '{': 85, '|': 86, '}': 87, '~': 88, '\xa0': 89, 'à': 90, 'é': 91, 'ō': 92, '\u200d': 93, '–': 94, '‘': 95, '’': 96, '“': 97, '”': 98, '…': 99, '≠': 100}
idx to char: {0: '\n', 1: ' ', 2: '!', 3: '"', 4: '#', 5: '$', 6: '%', 7: '&'

Encoding the entire corpus in torch tensor

In [130]:
import torch
encoded_corpus = torch.tensor(encode(corpus))
print("encoded corpus shape:", encoded_corpus.shape, "dtype:", encoded_corpus.dtype)
encoded_corpus

encoded corpus shape: torch.Size([293038]) dtype: torch.int64


tensor([38,  1, 64,  ..., 61, 83, 15])

Creating a training/validation split

In [133]:
train_split = int(len(encoded_corpus)*0.8)
train_data = encoded_corpus[:train_split]
valid_data = encoded_corpus[train_split:]
print("train data shape:", train_data.shape)
print("valid data shape:", valid_data.shape)

train data shape: torch.Size([234430])
valid data shape: torch.Size([58608])


Context length == max sequence length == block size  
The transformer is trained on each combination of tokens up to the "context length".  
We are using a context length of 8 so, training would include (0, 1), (0, 1, 2), (0, 1, 2, 3),...,(0, 1, 2, 3, 4, 5, 6, 7)

In [169]:
context_length = 8
for i in range(context_length):
    x, y = train_data[:i+1], train_data[i+1]
    print(f"for index {i}: x: {decode(x.tolist())}\ty: {decode(y[None].tolist())}")

for index 0: x: I	y:  
for index 1: x: I 	y: f
for index 2: x: I f	y: i
for index 3: x: I fi	y: n
for index 4: x: I fin	y: d
for index 5: x: I find	y:  
for index 6: x: I find 	y: t
for index 7: x: I find t	y: h


## Data Loader

A function to retrieve a "batch" of data from either training or validation datasets. A batch is a sequence of tokens. In a batch, each sub-sequence of tokens must have a target token (the token that comes next).

In [171]:
TORCH_SEED = 2000
torch.manual_seed(TORCH_SEED)
batch_size = 4

In [179]:
def get_batch(data):
    data_len = len(data)
    start_idxs = torch.randint(high=data_len - context_length, size=(batch_size,))
    x = torch.stack([data[i:i+context_length] for i in start_idxs])
    y = torch.stack([data[i+1: i+context_length+1] for i in start_idxs])
    return x, y

xb, yb = get_batch(train_data)
print('inputs:')
print('shape:',xb.shape)
print(xb)
print('targets:')
print('shape:',yb.shape)
print(yb)

inputs:
shape: torch.Size([4, 8])
tensor([[73, 77, 77, 67, 60, 67, 70, 67],
        [ 1, 63, 77, 74, 63, 99, 15,  1],
        [52, 63, 96, 76, 63,  1, 81, 73],
        [67, 63, 62, 15,  1, 33, 73, 72]])
targets:
shape: torch.Size([4, 8])
tensor([[77, 77, 67, 60, 67, 70, 67, 78],
        [63, 77, 74, 63, 99, 15,  1, 49],
        [63, 96, 76, 63,  1, 81, 73, 76],
        [63, 62, 15,  1, 33, 73, 72, 96]])


The target value is the token that comes after the input value in the encoded_corpus

## Bigram Model