# distilGPT2 Fine-Tuning

In [186]:
import torch
from tqdm.notebook import tqdm
# Register tqdm with pandas so that `progress_apply` becomes available on
# Series/DataFrames. The previous bare `tqdm.pandas` only referenced the
# method without calling it, so nothing was ever registered.
tqdm.pandas()
import matplotlib.pyplot as plt
import numpy as np
plt.style.use('dark_background')

# Version suffix interpolated into the fine-tuning output directory name.
ver = 1

## Load Data

In [2]:
import pandas as pd

# Read the songs table from the CSV one directory above the notebook and
# show it as the cell output.
data = pd.read_csv(filepath_or_buffer='../out.csv')
data

Unnamed: 0.1,Unnamed: 0,lastfm_url,track,artist,seeds,number_of_emotion_tags,valence_tags,arousal_tags,dominance_tags,mbid,spotify_id,genre,Lyric,language
0,1,https://www.last.fm/music/metallica/_/st.%2banger,St. Anger,Metallica,['aggressive'],8,3.710000,5.833000,5.427250,727a2529-7ee8-4860-aef6-7959884895cb,3fOc9x06lKJBhz435mInlH,metal,Saint Anger 'round my neck\nSaint Anger 'round...,en
1,3,https://www.last.fm/music/m.i.a./_/bamboo%2bbanga,Bamboo Banga,M.I.A.,"['aggressive', 'fun', 'sexy', 'energetic']",13,6.555071,5.537214,5.691357,99dd2c8c-e7c1-413e-8ea4-4497a00ffa18,6tqFC1DIOphJkCwrjVzPmg,hip-hop,"Road runner, road runner\nGoing hundred mile p...",en
2,5,https://www.last.fm/music/drowning%2bpool/_/st...,Step Up,Drowning Pool,['aggressive'],9,2.971389,5.537500,4.726389,49e7b4d2-3772-4301-ba25-3cc46ceb342e,4Q1w4Ryyi8KNxxaFlOQClK,metal,Come!\n\nIf our own lives aren’t directly affe...,en
3,11,https://www.last.fm/music/kanye%2bwest/_/feedback,Feedback,Kanye West,['aggressive'],1,3.080000,5.870000,5.490000,,49fT6owWuknekShh9utsjv,hip-hop,"Ayy, y'all heard about the good news?\nY'all s...",en
4,13,https://www.last.fm/music/deftones/_/7%2bwords,7 Words,Deftones,"['aggressive', 'angry']",10,3.807121,5.473939,4.729091,1a826083-5585-445f-a708-415dc90aa050,6DoXuH326aAYEN8CnlLmhP,nu metal,"I'll never be the same, breaking decency\nDon'...",en
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16792,229432,https://www.last.fm/music/noblegases/_/xenon,Xenon,NobleGases,['noble'],2,6.160000,3.695000,6.130000,,1AePjgLLtzF0abbfcgYdLI,chill,You're floating out astray\nThis cold and life...,en
16793,229435,https://www.last.fm/music/kurt%2bvile/_/wild%2...,Wild Imagination,Kurt Vile,['transparent'],2,6.925000,4.975000,6.190000,,1Gn0oYQiQHp7KF4DcR2g4t,,I'm looking at you\nBut It's only a picture so...,en
16794,229436,https://www.last.fm/music/portugal.%2bthe%2bma...,Oh Lord,Portugal. The Man,['transparent'],1,5.370000,3.450000,5.330000,7ea228f9-16d0-474d-8c51-5a1a9810ddde,6YG8cjbrjhDhlYMiQnibUD,indie,\n\n\nWhere do I fit in\nI am waiting here for...,en
16795,229443,https://www.last.fm/music/porcelain%2band%2bth...,Transparent,Porcelain and The Tramps,['transparent'],3,6.613333,4.633333,5.773333,,,industrial,Wish I were transparent\nYou could see right t...,en


## Load Model & Tokenizer

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
# Reuse the end-of-sequence token as the padding token instead of adding a
# brand-new '[PAD]' entry. Adding a token grows the tokenizer vocabulary, but
# the model's embedding matrix was never resized to match, so the new id would
# have had no embedding row. The data-collator cell further down assigns
# pad_token = eos_token as well, so this keeps both cells consistent.
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained("distilgpt2")

## Preprocess Tags

In [9]:
import ast

# Each raw `seeds` cell is the string form of a Python list, e.g.
# "['aggressive', 'fun']". ast.literal_eval only parses literals, whereas
# eval() would execute any expression that happens to be stored in the CSV.
# The isinstance guard leaves already-parsed values untouched, so re-running
# this cell after the column has been converted does not raise.
data['seeds'] = data['seeds'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

In [10]:
# Average number of tags per song.
data['seeds'].map(len).mean()

1.4778234208489611

## Preprocess Lyrics

In [20]:
def text_repr(lyrics: str, tags: list) -> str:
    """Format one song as a single question/answer training example.

    Args:
        lyrics: The song lyrics, placed after the ``Q: `` prefix.
        tags: Tag strings; joined with ``", "`` after the ``A: `` prefix.

    Returns:
        One string of the form ``'Q: <lyrics>\\nA: <tag, tag> |EndOfText|\\n'``.
    """
    return f'Q: {lyrics}\nA: {", ".join(tags)} |EndOfText|\n'

This cell builds a single training string containing every song's lyrics followed by its labels.

In [73]:
prompts = data['Lyric']
tags = data['seeds']

# Walk lyrics and tag lists in lockstep (same positional order as indexing
# with .iloc), format each pair, and join the examples with newlines.
texts = '\n'.join(
    text_repr(lyric, song_tags)
    for lyric, song_tags in tqdm(zip(prompts, tags), total=len(prompts))
)

  0%|          | 0/16797 [00:00<?, ?it/s]

## Build Dataset

In [74]:
# Tokenize the whole concatenated corpus in one call; the token ids are read
# from the result's 'input_ids' entry when the chunks are built below.
tokenized_data = tokenizer(texts)

In [75]:
# Number of token ids per training chunk (used as the slice width below).
chunk_size = 128

In [85]:
# Cut the flat token-id sequence into consecutive windows of `chunk_size`
# ids, keyed 'chunk 0', 'chunk 1', ... in order. The final window may be
# shorter than `chunk_size`.
token_ids = tokenized_data['input_ids']
chunks = {}
for chunk_no, start in enumerate(range(0, len(token_ids), chunk_size)):
    chunks[f'chunk {chunk_no}'] = token_ids[start:start + chunk_size]

In [86]:
# Total number of token ids in the corpus, shown as a shape tuple.
np.shape(tokenized_data['input_ids'])

(5986757,)

In [32]:
# Per-field chunking: for every field returned by the tokenizer, split its
# value list into consecutive windows of `chunk_size` items.
#
# Two fixes compared with the earlier version of this cell:
#   * the range is bounded by len(values) -- it used len(tokenized_data),
#     which is the number of fields, so each field produced a single chunk;
#   * the result is stored under its own name instead of `chunks`, because the
#     drop-last-chunk and train/validation split cells expect `chunks` to be
#     the 'chunk N' -> token ids mapping and would break on Run All.
field_chunks = {
    field: [values[start:start + chunk_size] for start in range(0, len(values), chunk_size)]
    for field, values in tqdm(tokenized_data.items())
}

  0%|          | 0/2 [00:00<?, ?it/s]

In [97]:
# Drop the most recently inserted chunk when it holds fewer than `chunk_size`
# ids, so every remaining chunk has the same length.
if len(chunks[list(chunks)[-1]]) < chunk_size:
    chunks.popitem()  # dicts pop in LIFO order, i.e. the last chunk

In [104]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the chunks for validation. A fixed random_state makes the
# split (and therefore the reported validation loss) reproducible across
# kernel restarts; without it every run shuffles differently.
x_train, x_val = train_test_split(list(chunks.values()), test_size=.1, random_state=42)

In [122]:
# Sanity check: (number of training chunks, chunk length).
np.shape(x_train)

(42093, 128)

In [100]:
from transformers import DataCollatorForLanguageModeling

# Use the end-of-sequence token as the padding token for batching.
tokenizer.pad_token = tokenizer.eos_token
# mlm=False: no masked-language-model masking is requested from the collator.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [138]:
from transformers import Trainer, TrainingArguments

# Fine-tuning configuration. Validation runs once per epoch; for finer-grained
# validation switch to evaluation_strategy="steps" together with eval_steps=10.
training_args = TrainingArguments(
    output_dir=f"distilgpt2-finetuned-v{ver}",
    num_train_epochs=15,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_total_limit=10,
)

# Wire the model, the chunked train/validation splits and the collator together.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=x_train,
    eval_dataset=x_val,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,3.0064,3.010101
2,2.9514,2.995259
3,2.9102,2.992666
4,2.8829,2.980284
5,2.8927,2.972169
6,2.8452,2.967006
7,2.8496,2.962205
8,2.8276,2.959093
9,2.8121,2.952968
10,2.8003,2.954942


KeyboardInterrupt: 

In [139]:
from transformers import pipeline

# Wrap the model trained above and its tokenizer in a text-generation pipeline.
text_generator = pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
)

In [151]:
# Lyrics excerpt used as the prompt for the generation check below. The
# literal starts and ends with a newline, which becomes part of the prompt.
LYRICS = """
Fuck it all and no regrets
I hit the lights on these dark sets
I need a voice to let myself
To let myself go free
Fuck it all and fuckin' no regrets
I hit the lights on these dark sets
Medallion noose, I hang myself
Saint Anger 'round my neck

I feel my world shake
Like an earthquake
Hard to see clear
Is it me? Is it fear?
I'm madly in anger with you (x4)
"""

In [152]:
# Wrap the excerpt in the 'Q: ... A: ' framing, leaving the answer open for
# the model to complete.
processed_lyrics = f'Q: {LYRICS}\nA: '

In [185]:
# Allow the model 10 tokens beyond the length of the tokenized prompt.
prompt_token_count = len(tokenizer(processed_lyrics)['input_ids'])
generations = text_generator(processed_lyrics, max_length=prompt_token_count + 10)
print(generations[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Q: 
Fuck it all and no regrets
I hit the lights on these dark sets
I need a voice to let myself
To let myself go free
Fuck it all and fuckin' no regrets
I hit the lights on these dark sets
Medallion noose, I hang myself
Saint Anger 'round my neck

I feel my world shake
Like an earthquake
Hard to see clear
Is it me? Is it fear?
I'm madly in anger with you (x4)

A: Get down, brooding |EndOfText|


In [141]:
# Show the full lyrics of the first row for comparison with the prompt.
print(data['Lyric'].iat[0])

Saint Anger 'round my neck
Saint Anger 'round my neck
She never gets respect
Saint Anger 'round my neck
(You flush it out, you flush it out)
Saint Anger 'round my neck
(You flush it out, you flush it out)
He never gets respect
(You flush it out, you flush it out)
Saint Anger 'round my neck
(You flush it out, you flush it out)
She never gets respect

Fuck it all and no regrets
I hit the lights on these dark sets
I need a voice to let myself
To let myself go free
Fuck it all and fuckin' no regrets
I hit the lights on these dark sets
Medallion noose, I hang myself
Saint Anger 'round my neck

I feel my world shake
Like an earthquake
Hard to see clear
Is it me? Is it fear?
I'm madly in anger with you (x4)

Saint Anger 'round my neck
Saint Anger 'round my neck
She never gets respect
Saint Anger 'round my neck
(You flush it out, you flush it out)
Saint Anger 'round my neck
(You flush it out, you flush it out)
She never gets respect
(You flush it out, you flush it out)
Saint Anger 'round my ne

In [141]:
# Show the full lyrics of the first row of the dataset.
print(data['Lyric'].iat[0])

Saint Anger 'round my neck
Saint Anger 'round my neck
She never gets respect
Saint Anger 'round my neck
(You flush it out, you flush it out)
Saint Anger 'round my neck
(You flush it out, you flush it out)
He never gets respect
(You flush it out, you flush it out)
Saint Anger 'round my neck
(You flush it out, you flush it out)
She never gets respect

Fuck it all and no regrets
I hit the lights on these dark sets
I need a voice to let myself
To let myself go free
Fuck it all and fuckin' no regrets
I hit the lights on these dark sets
Medallion noose, I hang myself
Saint Anger 'round my neck

I feel my world shake
Like an earthquake
Hard to see clear
Is it me? Is it fear?
I'm madly in anger with you (x4)

Saint Anger 'round my neck
Saint Anger 'round my neck
She never gets respect
Saint Anger 'round my neck
(You flush it out, you flush it out)
Saint Anger 'round my neck
(You flush it out, you flush it out)
She never gets respect
(You flush it out, you flush it out)
Saint Anger 'round my ne