# AI Lyrics Generator in Taylor Swift Style
In this project we're going to build a GPT-like, decoder-only transformer to generate lyrics similar to Taylor Swift songs.

# Part 0  Setup

In [None]:
import torch

# # Confirm that the GPU is detected
# assert torch.cuda.is_available()

# # Get the GPU device name.
# device_name = torch.cuda.get_device_name()
# n_gpu = torch.cuda.device_count()
# print(f"Found device: {device_name}, n_gpu: {n_gpu}")
# Select the compute device: CUDA when PyTorch reports a usable GPU, otherwise the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Part 1 Data Preparing
We're going to import a dataset with lyrics from all Taylor Swift songs and clean the data so that it is better suited as model input.

First, we download the dataset files and read the data.

In [None]:
import kagglehub

# Download the latest version of the dataset. kagglehub returns the local
# directory the files were extracted to (it is printed below).
path = kagglehub.dataset_download("PromptCloudHQ/taylor-swift-song-lyrics-from-all-the-albums")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/PromptCloudHQ/taylor-swift-song-lyrics-from-all-the-albums?dataset_version_number=1...


100%|██████████| 68.7k/68.7k [00:00<00:00, 33.1MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/PromptCloudHQ/taylor-swift-song-lyrics-from-all-the-albums/versions/1





In [None]:
from pathlib import Path

import pandas as pd

# Build the CSV location from the directory kagglehub reported in the download
# cell instead of hard-coding the absolute cache path, so the notebook keeps
# working on other machines and with other dataset versions.
file_path = Path(path) / "taylor_swift_lyrics.csv"

# NOTE(review): ISO-8859-1 is presumably needed because the file does not
# decode as UTF-8 - confirm. ('latin1' is an alias for the same codec.)
data = pd.read_csv(file_path, encoding='ISO-8859-1')
print(data.head())

         artist         album track_title  track_n  \
0  Taylor Swift  Taylor Swift  Tim McGraw        1   
1  Taylor Swift  Taylor Swift  Tim McGraw        1   
2  Taylor Swift  Taylor Swift  Tim McGraw        1   
3  Taylor Swift  Taylor Swift  Tim McGraw        1   
4  Taylor Swift  Taylor Swift  Tim McGraw        1   

                                         lyric  line  year  
0          He said the way my blue eyes shined     1  2006  
1  Put those Georgia stars to shame that night     2  2006  
2                       I said, "That's a lie"     3  2006  
3                  Just a boy in a Chevy truck     4  2006  
4         That had a tendency of gettin' stuck     5  2006  


Then, we combine all lyrics into a single string and build a character-level encoding for the lyrics.

In [None]:
# Flatten the per-line `lyric` column into one space-separated corpus string
lyric_column = data['lyric']
all_lyrics = ' '.join(lyric_column)

# Sanity check: show the opening 500 characters, then the total character count
preview = all_lyrics[:500]
print(preview)
print(len(all_lyrics))

He said the way my blue eyes shined Put those Georgia stars to shame that night I said, "That's a lie" Just a boy in a Chevy truck That had a tendency of gettin' stuck On backroads at night And I was right there beside him all summer long And then the time we woke up to find that summer gone But when you think Tim McGraw I hope you think my favorite song The one we danced to all night long The moon like a spotlight on the lake When you think happiness I hope you think that little black dress Thi
173603


In [None]:
pip install -q sentence-transformers==2.2.2 transformers==4.17.0

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.9/67.9 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m33.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


In [None]:
# Original self-implemented character-level encoding
# Vocabulary: every distinct character that occurs in the corpus, in sorted order
chars = sorted(set(all_lyrics))
vocab_size = len(chars)

# Lookup tables between characters and integer ids (id = position in `chars`)
itos = dict(enumerate(chars))
stoi = {ch: i for i, ch in itos.items()}


def encode(s):
    """Encoder: take a string, output the list of integer ids of its characters."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integer ids, output the corresponding string."""
    return ''.join(itos[i] for i in l)

# # Use GPT2 tokenizer to achieve subword-level tokenization
# from transformers import GPT2Tokenizer

# # Initialize the GPT2 tokenizer
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# # Define a maximum chunk size in characters
# chunk_size = 1000
# # Split the text into smaller chunks
# text_chunks = [all_lyrics[i:i + chunk_size] for i in range(0, len(all_lyrics), chunk_size)]

# # Tokenize each chunk and combine into a single list of token IDs
# encoded_chunks = [tokenizer.encode(chunk, add_special_tokens=True) for chunk in text_chunks]
# # Flatten the list of tokenized chunks into a single sequence
# encoded = [token for chunk in encoded_chunks for token in chunk]

# Part 2 Model Training

In this part, we're going to train a decoder-only transformer using the Taylor Swift lyrics we cleaned and encoded. The resulting model should be able to produce lyrics in a style similar to Taylor Swift's.

To start with, let's import torch and define the hyperparameter values for our model.

In [None]:
import torch.nn as nn
from torch.nn import functional as F


# hyperparameters
batch_size = 16  # number of sequences sampled per batch in get_batch
block_size = 32  # length of each sampled sequence; also sizes the causal mask and position table
max_iters = 1000  # number of iterations of the training loop
eval_interval = 100  # train/val loss is estimated every this many iterations
learning_rate = 1e-3  # learning rate passed to the AdamW optimizer
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # same expression as in the setup cell
eval_iters = 200  # number of batches averaged per split in estimate_loss
n_embd = 32  # width of the token and position embeddings
n_head = 4  # number of attention heads in each Block
n_layer = 4  # number of Blocks stacked in the model
dropout = 0.5  # probability passed to every nn.Dropout layer

# Seed PyTorch's random number generator so runs are repeatable
torch.manual_seed(1337)

<torch._C.Generator at 0x7e4ca4286ff0>

Next, let's split the data into training and validation sets.

In [None]:
#data = torch.tensor(encoded, dtype=torch.long)
# Encode the whole corpus into a tensor of integer character ids
data = torch.tensor(encode(all_lyrics), dtype=torch.long)

# The first 90% of the ids are used for training, the remaining tail for validation
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

print(f"Train data size: {len(train_data)} tokens")
print(f"Validation data size: {len(val_data)} tokens")

Train data size: 156242 tokens
Validation data size: 17361 tokens


Then, let's define the helper functions and the classes that make up the model.

In [None]:
def get_batch(split):
    """Sample a random batch of input/target windows from one data split.

    Args:
        split: 'train' selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        A pair ``(x, y)`` of tensors, each stacking ``batch_size`` windows of
        ``block_size`` ids and moved to ``device``. ``y`` holds the same
        windows as ``x`` shifted one position to the right.

    Raises:
        ValueError: If the selected split has ``block_size`` or fewer ids.
    """
    source = train_data if split == 'train' else val_data
    max_start = len(source) - block_size
    if max_start <= 0:
        raise ValueError("Block size is larger than the dataset length.")

    # One random start offset per sequence in the batch
    starts = torch.randint(max_start, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

@torch.no_grad()
def estimate_loss():
    """Return the mean loss over ``eval_iters`` random batches for each split.

    The global ``model`` is put in eval mode for the measurement and switched
    back to train mode before returning.

    Returns:
        Dict with keys 'train' and 'val', each mapping to the mean loss
        (a scalar tensor) over the sampled batches.
    """
    model.eval()
    averages = {}
    for split in ('train', 'val'):
        batch_losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            _, batch_loss = model(*get_batch(split))
            batch_losses[k] = batch_loss.item()
        averages[split] = batch_losses.mean()
    model.train()
    return averages

class Head(nn.Module):
    """Single head of causal self-attention.

    Projects the input to keys, queries and values of width ``head_size``,
    masks out attention to later positions, and returns the attention-weighted
    sum of the values.

    Args:
        head_size: Output width of the key, query and value projections.

    The input width (``n_embd``), the side length of the causal mask
    (``block_size``) and the dropout probability (``dropout``) are read from
    the module-level hyperparameters.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.dropout = nn.Dropout(dropout)
        # Lower-triangular matrix of ones, stored as a buffer (module state
        # that is not a trainable parameter).
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        """Apply masked self-attention to ``x``.

        Args:
            x: Tensor of shape (B, T, C); T must not exceed the mask size.

        Returns:
            Tensor of shape (B, T, head_size).
        """
        _, T, _ = x.shape
        k = self.key(x)
        q = self.query(x)
        v = self.value(x)

        # Compute attention weights. Scale by 1/sqrt(d_k), where d_k is the
        # key/query width (head_size), not the input embedding width C:
        # the dot products being normalised are sums over head_size terms.
        scale = k.shape[-1] ** -0.5
        wei = q @ k.transpose(-2, -1) * scale
        # Positions above the diagonal are "future" tokens: give them -inf so
        # they receive zero weight after the softmax.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)

        # Weighted aggregation of the values
        return wei @ v

class MultiHeadAttention(nn.Module):
    """Self-attention with multiple heads run side by side.

    Args:
        num_heads: Number of ``Head`` modules to create.
        head_size: Output width of each head.

    The outputs of all heads are concatenated along the last dimension,
    projected to ``n_embd`` features and passed through dropout.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The projection consumes the concatenated head outputs, whose width is
        # num_heads * head_size. That equals n_embd only when n_embd is
        # divisible by num_heads, so size the input explicitly instead of
        # assuming it.
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the projected concatenation of every head's output for ``x``."""
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """Small MLP: widen to 4x ``n_embd``, apply ReLU, project back, then dropout.

    Args:
        n_embd: Width of the input and output features.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the result."""
        out = self.net(x)
        return out

class Block(nn.Module):
    """Transformer block: self-attention then feed-forward, each with a residual.

    Layer normalisation is applied to the input of each sub-layer, and the
    sub-layer's output is added back onto its un-normalised input.

    Args:
        n_embd: Feature width of the block's input and output.
        n_head: Number of attention heads; each head gets ``n_embd // n_head``
            features.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual steps."""
        attended = self.sa(self.ln1(x))
        x = x + attended
        transformed = self.ffwd(self.ln2(x))
        return x + transformed

class BigramLanguageModel(nn.Module):
    """Decoder-only transformer language model over integer token ids.

    Despite the class name, predictions are not based on the previous token
    alone: token and position embeddings are summed, passed through
    ``n_layer`` transformer ``Block`` modules and a final LayerNorm, and
    projected to ``vocab_size`` logits per position. All sizes come from the
    module-level hyperparameters.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the training loss.

        Args:
            idx: Integer tensor of shape (B, T) holding token ids. T must not
                exceed the number of rows in the position embedding table.
            targets: Optional integer tensor with the same shape as ``idx``
                holding the expected next token for every position.

        Returns:
            ``(logits, loss)`` where ``logits`` has shape (B, T, vocab) and
            ``loss`` is the cross-entropy over all positions, or ``None``
            when ``targets`` is not given.
        """
        _, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        # Build the position ids on the same device as the input rather than
        # on the global `device`, so the model also works after being moved.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions)
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            # cross_entropy expects (N, classes) logits and (N,) targets
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Extend ``idx`` by sampling ``max_new_tokens`` tokens one at a time.

        Sampling runs in eval mode (dropout disabled) and without gradient
        tracking; the module's previous train/eval mode is restored afterwards.

        Args:
            idx: Integer tensor of shape (B, T) with the starting context.
            max_new_tokens: Number of tokens to append to every row.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens).
        """
        # The position table fixes the longest context the model can attend to.
        context_len = self.position_embedding_table.num_embeddings
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # Keep only the most recent tokens that fit in the context
                idx_cond = idx[:, -context_len:]
                logits, _ = self(idx_cond)
                # Only the prediction for the last position is needed
                logits = logits[:, -1, :]
                probs = F.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
                idx = torch.cat((idx, idx_next), dim=1)
        finally:
            self.train(was_training)
        return idx

Now we can train our model on all_lyrics and use it to generate Taylor Swift-style lyrics.

In [None]:
model = BigramLanguageModel()
m = model.to(device)
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# Early stopping state. Patience is counted in evaluations (one every
# eval_interval steps), not in individual training steps.
early_stopping_patience = 10
best_val_loss = float('inf')
patience_counter = 0

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

# `step` rather than `iter`, so the builtin iter() is not shadowed for the
# rest of the notebook.
for step in range(max_iters):

    # Periodically (and on the final step) measure train/val loss
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"Step {step}: Train Loss {losses['train']:.4f}, Val Loss {losses['val']:.4f}")

        # Early stopping: stop once the validation loss has failed to improve
        # for `early_stopping_patience` consecutive evaluations.
        val_loss = float(losses['val'])
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= early_stopping_patience:
                print("Early stopping triggered.")
                break

    # One optimisation step on a fresh random training batch
    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Generate from the trained model. estimate_loss() leaves the model in
# training mode, so switch to eval mode to disable dropout while sampling and
# turn off gradient tracking, which is not needed here.
model.eval()
context = torch.zeros((1, 1), dtype=torch.long, device=device)
with torch.no_grad():
    print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))

0.05685 M parameters
step 0: train loss 4.5160, val loss 4.5145
step 100: train loss 2.8053, val loss 2.8167
step 200: train loss 2.5908, val loss 2.6219
step 300: train loss 2.4840, val loss 2.5301
step 400: train loss 2.4603, val loss 2.4946
step 500: train loss 2.4180, val loss 2.4531
step 600: train loss 2.3966, val loss 2.4307
step 700: train loss 2.3754, val loss 2.4223
step 800: train loss 2.3577, val loss 2.3926
step 900: train loss 2.3432, val loss 2.3937
step 999: train loss 2.3237, val loss 2.3600
 ourald d theraud acMallevely al in on way u'tinn sthoout hle wakin jr e w Wemilhoh yot be Iou'thnd in? goeond ig he timelobmke p oveann's w s p, yous pedou I bay mece ghade ash waheme I'thae inain'r meyoupacan g mime'scay), ht s Wane thim outhaithe we gow pyouink yonett be-an onis wwhe An'ng tha the d folow g hoe githeesh eil yore- th ay I Thearak oulome, It ame mut sines Soss id lllind besthyo Thgr stea ameetsth od 4yohnelistilf hrror Angopls s ig " t I'l kt Tid mee Bure Ystheto