## Fine Tuning TinyLlama on Eminem Lyrics


Data - https://www.kaggle.com/datasets/aditya2803/eminem-lyrics

In [1]:
%%capture
%pip install accelerate peft bitsandbytes transformers trl

In [2]:
import torch
import glob
import pandas as pd
import numpy as np
import re
from peft import get_peft_model, PeftConfig, PeftModel, LoraConfig, prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, GenerationConfig
from trl import SFTTrainer
from datasets import Dataset
import glob

### Load all .csv files from a directory which contains the Taylor Swift Lyrics

In [3]:
import logging

def find_csv_files(path, file_extension="*.csv"):
    try:
        files = glob.glob(f"{path}/{file_extension}")
        if not files:
            logging.warning(f"No files found in {path} with extension {file_extension}")
        return files
    except Exception as e:
        logging.error(f"Error finding files in {path}: {e}")
        return []

def read_csv_files(file_paths, column_name='Lyrics'):
    df_list = []
    for file in file_paths:
        try:
            df = pd.read_csv(file)
            if column_name in df.columns:
                df_list.append(df)
            else:
                logging.warning(f"Column {column_name} not found in {file}")
        except Exception as e:
            logging.error(f"Error reading {file}: {e}")
    return pd.concat(df_list, ignore_index=True) if df_list else pd.DataFrame()

def concatenate_lyrics(df, column_name='Lyrics'):
    if column_name in df.columns:
        try:
            return '\n'.join(str(lyric) for lyric in df[column_name])
        except Exception as e:
            logging.error(f"Failed to concatenate lyrics: {e}")
            return ""
    else:
        logging.warning(f"Column {column_name} not found in DataFrame")
        return ""

def load_and_concatenate_lyrics(path, file_extension="*.csv", column_name='Lyrics'):
    files = find_csv_files(path, file_extension)
    if not files:
        return ""
    df = read_csv_files(files, column_name)
    if df.empty:
        return ""
    return concatenate_lyrics(df, column_name)
path = '/content'
lyrics = load_and_concatenate_lyrics(path)
print(lyrics[:200])

[Intro: Alfred Hitchcock]
Thus far, this album has provided musical accompaniment to make your passing pleasant
Our next number is designed to drown out the sound of shovels
Music to be buried by
[Cho


In [4]:
print(' '.join(sorted(set(lyrics))))


   ! " $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; ? A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ ] a b c d e f g h i j k l m n o p q r s t u v w x y z { } ß à á â ä ç è é í ï ó ö ú ü – — ‘ ’ ‚ “ ” …


In [5]:
import re

def replace_characters(text, replacement_dict):
    return text.translate(str.maketrans(replacement_dict))

def remove_patterns(text, pattern_list):
    for pattern in pattern_list:
        text = re.sub(pattern, '', text)
    return text

def clean_lyrics(lyrics):
    replace_with_space = ['\u2005', '\u200b', '\u205f', '\xa0', '-']
    replace_letters = {'í':'i', 'é':'e', 'ï':'i', 'ó':'o', ';':',', '‘':'\'', '’':'\'', ':':',', 'е':'e'}
    remove_list = ['\)', '\(', '–','"','”', '"', '\[.*\]', '.*\|.*', '—']
    lyrics = replace_characters(lyrics, replace_letters)
    for string in replace_with_space:
        lyrics = lyrics.replace(string, ' ')
    lyrics = remove_patterns(lyrics, remove_list)
    return lyrics
cleaned_lyrics = clean_lyrics(lyrics)

In [6]:
print(''.join(sorted(set(cleaned_lyrics))))


 !$%&'*+,./0123456789?ABCDEFGHIJKLMNOPQRSTUVWXYZ]abcdefghijklmnopqrstuvwxyz{}ßàáâäçèöúü‚“…


In [7]:
cleaned_lyrics



In [8]:
from datasets import Dataset

def create_train_test_datasets(cleaned_lyrics, train_ratio=0.95, segment_length=500):
    split_point = int(len(cleaned_lyrics) * train_ratio)
    train_data = cleaned_lyrics[:split_point]
    test_data = cleaned_lyrics[split_point:]
    train_data_segments = [train_data[i:i + segment_length]
                           for i in range(0, len(train_data), segment_length)]
    train_dataset = Dataset.from_dict({'text': train_data_segments})
    return train_dataset, test_data
train_dataset, test_data = create_train_test_datasets(cleaned_lyrics)

In [9]:
print(len(train_dataset))

2495


In [10]:
train_dataset

# Check the dataset structure
# Should output below
"""
Dataset({
    features: ['text'],
    num_rows: 557
})
"""

"\nDataset({\n    features: ['text'],\n    num_rows: 557\n})\n"

In [11]:
from pprint import pprint
pprint(train_dataset[0])

{'text': '\n'
         'Thus far, this album has provided musical accompaniment to make your '
         'passing pleasant\n'
         'Our next number is designed to drown out the sound of shovels\n'
         'Music to be buried by\n'
         '\n'
         'Black magic, night walker Yeah\n'
         'She haunts me like no other Feel like\n'
         "Nobody told me I don't know love is pain, oh I know we just met\n"
         "Black magic, dark water But it's like\n"
         "Surrounds me like no other It's like I know you better than anyone\n"
         "She's got my heart in chains\n"
         '\n'
         '\n'
         "We're volatile, I can't call it, though\n"
         "It's like too large"}


In [12]:
def load_quantized_model(model_identifier: str, compute_dtype: torch.dtype) -> AutoModelForCausalLM:
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=compute_dtype,
    )
    model =  AutoModelForCausalLM.from_pretrained(
        model_identifier,
        quantization_config=quantization_config,
        device_map="auto",
        trust_remote_code=True,
    )
    return model
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

model = load_quantized_model(model_name, torch.bfloat16)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [13]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

In [14]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def generate_lyrics(query, model):
    encoding = tokenizer(query, return_tensors="pt").to(device)
    generation_config = GenerationConfig(max_new_tokens=250, pad_token_id = tokenizer.eos_token_id,repetition_penalty=1.3, eos_token_id = tokenizer.eos_token_id)

    outputs = model.generate(input_ids=encoding.input_ids, generation_config=generation_config)

    text_output = tokenizer.decode(outputs[0],skip_special_tokens=True)
    print('INPUT\n', query, '\n\nOUTPUT\n', text_output[len(query):])

generate_lyrics(test_data[200:700], model)

INPUT
 is I draw visual, pictures when a nigga vents
So welcome to the art department


It's all psychologic, why am I so maniacal?
Am I a psycho or some kind of psychotic shtick?
Or am I truly psychotic? Or my molecules just diabolical?
'Cause biologically, I defy logic, by golly, haul me off on a trolley
Probably want a quack, or I'm back on that whacky tobacca
I'ma step away from crack, Obie smacked the shit out of me
I'm so slap happy, man, snap me back to reality
I'm a walkin' art gallery, a scien 

OUTPUT
 ziest with no name
But it ain't easy for this artist, who can be both an idiot and genius at once.

(Verse 2)
The world around us seems like a nightmarish place
A labyrinthine maze where we never seem to find our way
And yet, in spite of these obstacles, there are moments of beauty
Like the sunset over the city skyline, or the sound of rain falling softly
These fleeting glimpses into life make everything feel worthwhile
Even if they don't always lead you straight home


Therefo

In [15]:
model = prepare_model_for_kbit_training(model)
lora_alpha = 32
lora_dropout = 0.05
lora_rank = 32
lora_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_rank,
    bias="none",
    task_type="CAUSAL_LM")
peft_model = get_peft_model(model, lora_config)

In [17]:
output_dir = "anshchoudhary/tinylama_eminem"
per_device_train_batch_size = 3
gradient_accumulation_steps = 2
optim = "paged_adamw_32bit"
save_strategy="steps"
save_steps = 10
logging_steps = 10
learning_rate = 2e-3
max_grad_norm = 0.3
max_steps = 200
warmup_ratio = 0.03
lr_scheduler_type = "cosine"

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    lr_scheduler_type=lr_scheduler_type,
    report_to='none'
)

In [18]:
trainer = SFTTrainer(
    model=peft_model,
    train_dataset=train_dataset,
    peft_config=lora_config,
    max_seq_length=500,
    dataset_text_field='text',
    tokenizer=tokenizer,
    args=training_arguments
)
peft_model.config.use_cache = False

Map:   0%|          | 0/2495 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [19]:
trainer.train()



Step,Training Loss
10,3.0227
20,3.0203
30,2.8763
40,2.9768
50,2.9064
60,3.0395
70,2.7455
80,2.9089
90,2.9654
100,2.7514




TrainOutput(global_step=200, training_loss=2.885619812011719, metrics={'train_runtime': 688.7253, 'train_samples_per_second': 1.742, 'train_steps_per_second': 0.29, 'total_flos': 1290740870983680.0, 'train_loss': 2.885619812011719, 'epoch': 0.48})

In [25]:
train_dataset[1200]

{'text': " keeper, I'm sleepin'\nWhat the fuck you keep on fuckin' with me for?\nSlut, you need to leave me the fuck alone, I ain't playin'\nGo find you a white crayon and color a fuckin' zebra\n\nMy mom loved Valium and lots of drugs\nThat's why I am like I am 'cause I'm like her\nBecause my mom loved Valium and lots of drugs\nThat's why I'm on what I'm on 'cause I'm my mom\n\n\nWait a minute, this ain't dinner, this is paint thinner\nYou ate it yesterday, I ain't hear no complaints, did I?\nNow here's a plate full o"}

In [26]:
generate_lyrics(" keeper, I'm sleepin'\nWhat the fuck you keep on fuckin' with me for?\nSlut, you need to leave me the fuck alone, I ain't playin'\nGo find you a white crayon and color a fuckin' zebra\n\nMy mom loved Valium and lots of drugs\nThat's why I am like I am 'cause I'm like her\nBecause my mom loved Valium and lots of drugs\nThat's why I'm on what I'm on 'cause I'm my mom\n\n\nWait a minute, this ain't dinner, this is paint thinner\nYou ate it yesterday, I ain't hear no complaints, did I?\nNow here's a plate full o", model)



INPUT
  keeper, I'm sleepin'
What the fuck you keep on fuckin' with me for?
Slut, you need to leave me the fuck alone, I ain't playin'
Go find you a white crayon and color a fuckin' zebra

My mom loved Valium and lots of drugs
That's why I am like I am 'cause I'm like her
Because my mom loved Valium and lots of drugs
That's why I'm on what I'm on 'cause I'm my mom


Wait a minute, this ain't dinner, this is paint thinner
You ate it yesterday, I ain't hear no complaints, did I?
Now here's a plate full o 

OUTPUT
 ' shit that was just thrown at me
I gotta get rid of all these bitches who hate me
And they don't even know how much I love them
So let's go through every one of em, see if we can make an exception
For some of us, but not everyone
But hey, I guess I could use another drink
Just so long as she doesn't have any kids or grandkids
She better be careful though, because I might hit ya too hard
If there were more than two people in the room
Then I would probably end up hitting your as