In [1]:
import tensorflow as tf### models
import numpy as np### math computations
import matplotlib.pyplot as plt### plotting bar chart
import sklearn### machine learning library
import cv2## image processing
from sklearn.metrics import confusion_matrix, roc_curve### metrics
import seaborn as sns### visualizations
import datetime
import pathlib
import io
import os
import re
import string
import time
from numpy import random
import gensim.downloader as api
from PIL import Image
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import Dense,Flatten,InputLayer,BatchNormalization,Dropout,Input,LayerNormalization
from tensorflow.keras.losses import BinaryCrossentropy,CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.metrics import Accuracy,TopKCategoricalAccuracy, CategoricalAccuracy, SparseCategoricalAccuracy
from tensorflow.keras.optimizers import Adam
from datasets import load_dataset
from transformers import GPT2TokenizerFast,create_optimizer,DataCollatorForLanguageModeling,TFGPT2LMHeadModel




In [2]:
# Token length of every training chunk; passed to the tokenizer as max_length
# and used to keep only chunks that are exactly this long.
MAX_LENGTH = 256
# Number of chunks each song is padded up to in preprocess_function.
BATCH_SIZE = 6

## Dataset Preparation

In [3]:
# Load the lyrics CSV (path is relative to the working directory) with the
# `datasets` CSV loader.
filepath = 'drake_data.csv'
dataset = load_dataset('csv', data_files=filepath)

Generating train split: 0 examples [00:00, ? examples/s]

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['album', 'lyrics_title', 'lyrics_url', 'lyrics', 'track_views'],
        num_rows: 290
    })
})

In [5]:
dataset['train'][18]

{'album': 'Dark Lane Demo Tapes',
 'lyrics_title': 'Pain 1993 (Ft. Playboi Carti) Lyrics',
 'lyrics_url': 'https://genius.com/Drake-pain-1993-lyrics',
 'lyrics': "[Intro: Drake]\n(Yo, Pi'erre, yo, Pi’erre)\nAyy\n\n[Chorus: Drake & Playboi Carti]\nAyy\nNiggas ain't gotta respect\nNiggas just gotta accept\nI put that top left\nI love my crodie to death (Codeine)\nHype Williams, Lil' X\nIf you don’t say it direct\nCould give a fuck, ayy, yeah, yeah\n\n[Verse 1: Drake & Playboi Carti]\nI put some ice on her hand\nI let her take an advance\nShit wasn't goin' as planned\nI put that shit in the van\nShe gotta move with her friend\nHeard she went back to her man\nGive a fuck, ayy\nI just put a Wagen in the driveway, you know I did\nWhen I shoot my shot it's the Kawhi way, it's goin' in\nMe and lil' Sicko sittin' sideways, breakin’ tens\nUsed to be an antisocial nigga, now I’m makin' friends\nI just got a mansion out in Turks and it’s a beachfront (Okay)\nIf she's bringin' four friends, I know 

In [6]:
# Checkpoint name; the same id is used to load the model further down.
model_id = 'gpt2-medium'
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

In [9]:
# n_wasted = 0

# for i in range(len(dataset['train'])):
#     try:
#         outputs=tokenizer(
#             dataset['train'][i]['lyrics'],
#             truncation=True,
#             max_length=256,
#             return_overflowing_tokens=True,
#             return_length=True
#         )
#         print(i,outputs['length'])

#         for k in outputs['length']:
#             if k!=256:
#                 n_wasted+=k
#     except:
#         print(i)

In [10]:
def preprocess_function(example):
    """Split one song's lyrics into full-length token chunks.

    The lyrics are tokenized into windows of at most ``MAX_LENGTH`` tokens and
    only windows that are exactly ``MAX_LENGTH`` long are kept. When a song
    yields at least one but fewer than ``BATCH_SIZE`` full windows, the last
    full window is repeated until there are ``BATCH_SIZE`` of them. Songs that
    already have ``BATCH_SIZE`` or more windows are returned unchanged.

    Args:
        example: One dataset row (a mapping); only ``'lyrics'`` is read.

    Returns:
        ``{'input_ids': chunks}`` where ``chunks`` is a list of token-id
        lists. The list is empty when the row has no lyrics text or when no
        window reaches ``MAX_LENGTH`` tokens.
    """
    lyrics = example.get('lyrics')
    if not isinstance(lyrics, str):
        # Rows whose lyrics are missing (None) are reported and skipped
        # explicitly. The previous bare ``except`` also hid unrelated errors
        # and swallowed KeyboardInterrupt.
        print(example)
        return {'input_ids': []}

    outputs = tokenizer(
        lyrics,
        truncation=True,
        max_length=MAX_LENGTH,
        return_overflowing_tokens=True,
        return_length=True,
    )
    # Keep only windows that fill the whole context; the shorter tail window
    # is dropped so no padding tokens are needed.
    input_batch = [
        input_ids
        for length, input_ids in zip(outputs['length'], outputs['input_ids'])
        if length == MAX_LENGTH
    ]
    if input_batch:
        # Repeat the last full window to reach BATCH_SIZE; never shrink.
        missing = max(BATCH_SIZE - len(input_batch), 0)
        input_batch.extend([input_batch[-1]] * missing)
    return {'input_ids': input_batch}

In [11]:
# Tokenize every row. The original columns are removed so that only the
# 'input_ids' returned by preprocess_function remains.
tokenized_dataset = dataset.map(
    preprocess_function, remove_columns=dataset['train'].column_names
)

Map:   0%|          | 0/290 [00:00<?, ? examples/s]

{'album': 'Thank Me Later', 'lyrics_title': 'Thank Me Later [Booklet] Lyrics', 'lyrics_url': 'https://genius.com/Drake-thank-me-later-booklet-annotated', 'lyrics': None, 'track_views': '6.2K'}
{'album': 'Unreleased Songs', 'lyrics_title': 'Untitled DaBaby Collaboration* (Ft. DaBaby) Lyrics', 'lyrics_url': 'https://genius.com/Drake-untitled-dababy-collaboration-lyrics', 'lyrics': None, 'track_views': '(Unreleased)'}


In [12]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 290
    })
})

In [13]:
def filter_out(example):
    """Predicate for ``Dataset.filter``: keep rows that have at least one chunk.

    Args:
        example: A row with an ``'input_ids'`` list.

    Returns:
        ``True`` when ``example['input_ids']`` is non-empty, otherwise
        ``False``. An explicit boolean is returned instead of the row itself
        (or an implicit ``None``), which is what a filter predicate should
        yield; truthiness is unchanged.
    """
    return len(example['input_ids']) >= 1

In [14]:
# Drop the rows for which preprocessing produced no chunks.
tokenized_full_dataset = tokenized_dataset.filter(filter_out)
print(tokenized_full_dataset)

Filter:   0%|          | 0/290 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 270
    })
})


In [15]:
# Running maximum of the number of chunks held by any single song.
max_batch_len=0

In [16]:
# Find the largest number of chunks held by any song. The rows are taken from
# the dataset itself instead of a hard-coded count of 270, and the column is
# read once rather than re-indexing the dataset on every iteration.
for chunks in tokenized_full_dataset['train']['input_ids']:
    max_batch_len = max(max_batch_len, len(chunks))

In [17]:
print(max_batch_len)

6


In [18]:
# Reuse the end-of-sequence token as the padding token so the collator can pad.
tokenizer.pad_token = tokenizer.eos_token
# mlm=False: collate for causal language modeling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors='tf')

In [19]:
# Convert to a tf.data pipeline. With batch_size=1 each element is one song,
# i.e. a (1, BATCH_SIZE, MAX_LENGTH) block of chunks (see the sample printed
# in the next cell). 'attention_mask' and 'labels' are not dataset columns;
# they are listed because the collator adds them.
tf_train_dataset = tokenized_full_dataset['train'].to_tf_dataset(
    columns=['input_ids', 'attention_mask', 'labels'],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=1
)

In [20]:
# Peek at one collated element to check the tensor shapes.
for i in tf_train_dataset.take(1):
    print(i)

{'input_ids': <tf.Tensor: shape=(1, 6, 256), dtype=int64, numpy=
array([[[   58, 13414,   325, ..., 18869,   766,   502],
        [  198,  7556,  4386, ...,  5211,  1243,   618],
        [  198,  7556,  4386, ...,  5211,  1243,   618],
        [  198,  7556,  4386, ...,  5211,  1243,   618],
        [  198,  7556,  4386, ...,  5211,  1243,   618],
        [  198,  7556,  4386, ...,  5211,  1243,   618]]], dtype=int64)>, 'attention_mask': <tf.Tensor: shape=(1, 6), dtype=int64, numpy=array([[1, 1, 1, 1, 1, 1]], dtype=int64)>, 'labels': <tf.Tensor: shape=(1, 6, 256), dtype=int64, numpy=
array([[[   58, 13414,   325, ..., 18869,   766,   502],
        [  198,  7556,  4386, ...,  5211,  1243,   618],
        [  198,  7556,  4386, ...,  5211,  1243,   618],
        [  198,  7556,  4386, ...,  5211,  1243,   618],
        [  198,  7556,  4386, ...,  5211,  1243,   618],
        [  198,  7556,  4386, ...,  5211,  1243,   618]]], dtype=int64)>}


In [21]:
def adjust_attention_mask(input):
    """Swap the collated attention mask for an all-ones mask.

    Args:
        input: Mapping with ``'input_ids'`` and ``'labels'`` tensors; any
            ``'attention_mask'`` it carries is discarded.

    Returns:
        A dict with the original ``'input_ids'`` and ``'labels'`` and an
        ``'attention_mask'`` of ones with shape
        ``[1, BATCH_SIZE, MAX_LENGTH]``.
    """
    # Every kept chunk is exactly MAX_LENGTH tokens, so nothing is masked out.
    full_mask = tf.ones([1,BATCH_SIZE, MAX_LENGTH])
    adjusted = {}
    adjusted['input_ids'] = input['input_ids']
    adjusted['attention_mask'] = full_mask
    adjusted['labels'] = input['labels']
    return adjusted

In [22]:
# Apply the all-ones attention mask to every element.
train_dataset=tf_train_dataset.map(adjust_attention_mask)

In [23]:
# Check that the attention mask now has the same shape as input_ids.
for i in train_dataset.take(1):
    print(i)

{'input_ids': <tf.Tensor: shape=(1, 6, 256), dtype=int64, numpy=
array([[[   58,  5317,   305, ...,    11,   788,   340],
        [ 3011,   257,  1310, ...,    11,  1842,   198],
        [   47,  8023,  1263, ...,   262, 20041,   345],
        [ 1053,   587,  1708, ...,  2514,   766,   611],
        [ 1053,   587,  1708, ...,  2514,   766,   611],
        [ 1053,   587,  1708, ...,  2514,   766,   611]]], dtype=int64)>, 'attention_mask': <tf.Tensor: shape=(1, 6, 256), dtype=float32, numpy=
array([[[1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.]]], dtype=float32)>, 'labels': <tf.Tensor: shape=(1, 6, 256), dtype=int64, numpy=
array([[[   58,  5317,   305, ...,    11,   788,   340],
        [ 3011,   257,  1310, ...,    11,  1842,   198],
        [   47,  8023,  1263, ...,   262, 20041,   345],
        [ 1053,   58

In [24]:
# Strip the leading size-1 batch dimension: each element becomes one song's
# (BATCH_SIZE, MAX_LENGTH) block, which is what the model is fed per step.
unbatched_dataset = train_dataset.unbatch()

In [25]:
# Confirm the element shape after unbatching.
for i in unbatched_dataset.take(1):
    print(i)

{'input_ids': <tf.Tensor: shape=(6, 256), dtype=int64, numpy=
array([[   58,  5317,   305, ..., 13300,   314,   815],
       [   58,  5317,   305, ..., 13300,   314,   815],
       [   58,  5317,   305, ..., 13300,   314,   815],
       [   58,  5317,   305, ..., 13300,   314,   815],
       [   58,  5317,   305, ..., 13300,   314,   815],
       [   58,  5317,   305, ..., 13300,   314,   815]], dtype=int64)>, 'attention_mask': <tf.Tensor: shape=(6, 256), dtype=float32, numpy=
array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]], dtype=float32)>, 'labels': <tf.Tensor: shape=(6, 256), dtype=int64, numpy=
array([[   58,  5317,   305, ..., 13300,   314,   815],
       [   58,  5317,   305, ..., 13300,   314,   815],
       [   58,  5317,   305, ..., 13300,   314,   815],
       [   58,  5317,   305, ..., 13300,   314

## Modeling

In [26]:
# Load the pretrained checkpoint named by model_id as a TensorFlow LM-head model.
model = TFGPT2LMHeadModel.from_pretrained(model_id)
model.summary()

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]




All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Model: "tfgpt2lm_head_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 transformer (TFGPT2MainLay  multiple                  354823168 
 er)                                                             
                                                                 
Total params: 354823168 (1.32 GB)
Trainable params: 354823168 (1.32 GB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [28]:
# One optimizer step is taken per song (each unbatched element is a single
# (BATCH_SIZE, MAX_LENGTH) batch), so the step count comes from the filtered
# dataset. len() on the unbatched tf.data pipeline is avoided because its
# cardinality may be unknown after unbatch().
num_epochs = 2  # keep in sync with the `epochs` argument passed to model.fit
num_train_steps = len(tokenized_full_dataset['train']) * num_epochs
optimizer, schedule = create_optimizer(
    init_lr = 1e-5,
    # Warm-up has to be shorter than the whole run; with 1000 warm-up steps
    # and only a few hundred training steps the learning rate would never
    # reach init_lr and the decay phase would get a negative length.
    num_warmup_steps=min(1_000, num_train_steps // 10),
    num_train_steps=num_train_steps,
)
# Hand the optimizer to compile(); a bare compile() ignores the one created
# above and trains with the library's default optimizer instead.
model.compile(optimizer=optimizer)




In [29]:
# Fine-tune for two passes over the lyric chunks.
# NOTE(review): no loss is passed to compile(), so training presumably relies
# on the loss the model computes from 'labels' — confirm.
history=model.fit(unbatched_dataset, epochs=2)

Epoch 1/2



KeyboardInterrupt: 

In [None]:
# Persist the fine-tuned weights to a file in the working directory.
model.save_weights('drake_lyrics_generator.h5')

In [None]:
# Prompt used by all of the generation experiments below.
input_text = "Do you want me to"

In [None]:
# Encode the prompt as a TensorFlow tensor of token ids.
input_ids = tokenizer(input_text, return_tensors='tf')['input_ids']

In [None]:
# Greedy decoding (sampling disabled); prints the text and elapsed seconds.
init_time = time.time()
output_greedy = model.generate(input_ids, max_length=256, do_sample=False)
print(tokenizer.decode(output_greedy[0]))
print(time.time()-init_time)

In [None]:
# Beam search with 15 beams and sampling disabled; prints elapsed seconds.
init_time=time.time()
output_beam = model.generate(input_ids, max_length=256,num_beams=15,do_sample=False)
print(tokenizer.decode(output_beam[0]))
print(time.time()-init_time)

In [None]:
# Sampling at temperature 1.0; top_k=0 is passed to switch off top-k filtering.
init_time=time.time()
output_temp = model.generate(input_ids, max_length=256, do_sample=True,temperature=1.0, top_k=0)
print(tokenizer.decode(output_temp[0]))
print(time.time()-init_time)

In [None]:
# Sampling at a higher temperature (2.0), top-k filtering switched off.
init_time=time.time()
output_temp = model.generate(input_ids, max_length=256, do_sample=True,temperature=2.0, top_k=0)
print(tokenizer.decode(output_temp[0]))
print(time.time()-init_time)

In [None]:
# Sampling at a lower temperature (0.5), top-k filtering switched off.
init_time=time.time()
output_temp = model.generate(input_ids, max_length=256, do_sample=True,temperature=0.5, top_k=0)
print(tokenizer.decode(output_temp[0]))
print(time.time()-init_time)

In [None]:
# Top-k sampling restricted to the 50 most likely tokens per step.
init_time=time.time()
output_topk = model.generate(input_ids, max_length=256, do_sample=True,top_k=50)
print(tokenizer.decode(output_topk[0]))
print(time.time()-init_time)

In [None]:
# Top-k sampling (k=50) combined with temperature 2.0.
init_time=time.time()
output_topk = model.generate(input_ids, max_length=256, do_sample=True,temperature=2.0,top_k=50)
print(tokenizer.decode(output_topk[0]))
print(time.time()-init_time)

In [None]:
# Nucleus (top-p) sampling with p=0.90.
# NOTE(review): top_k is not set here, so the library's default top-k may also
# apply — confirm if pure top-p is intended.
init_time=time.time()
output_topp = model.generate(input_ids, max_length=256, do_sample=True,top_p=0.90)
print(tokenizer.decode(output_topp[0]))
print(time.time()-init_time)

In [None]:
# Re-encode the prompt and run greedy decoding again, this time without timing.
input_ids = tokenizer(input_text, return_tensors="tf")["input_ids"]
output_greedy = model.generate(input_ids,max_length=256,do_sample=False)
print(tokenizer.decode(output_greedy[0]))

In [None]:
# Beam search with 5 beams, sampling disabled.
output_beam = model.generate(input_ids, max_length=256,num_beams=5,do_sample=False)
print(tokenizer.decode(output_beam[0]))

In [None]:
# Sampling at temperature 2.0 with top-k filtering switched off.
output_temp = model.generate(input_ids, max_length=256, do_sample=True,temperature=2.0, top_k=0)
print(tokenizer.decode(output_temp[0]))

In [None]:
# Sampling at temperature 0.5 with top-k filtering switched off.
output_temp = model.generate(input_ids, max_length=256, do_sample=True,temperature=0.5, top_k=0)
print(tokenizer.decode(output_temp[0]))

In [None]:
# Top-k sampling with k=50.
output_topk = model.generate(input_ids, max_length=256, do_sample=True,top_k=50)
print(tokenizer.decode(output_topk[0]))

In [None]:
# Nucleus (top-p) sampling with p=0.90.
output_topp = model.generate(input_ids, max_length=256, do_sample=True,top_p=0.90)
print(tokenizer.decode(output_topp[0]))

In [None]:
# Wrap the fine-tuned model and tokenizer in a text-generation pipeline.
# NOTE(review): this import sits apart from the import cell at the top of the
# notebook; consider moving it there.
from transformers import pipeline

pipe = pipeline(
    "text-generation", model=model, tokenizer=tokenizer, max_length=256,
)

In [None]:
# Longer alternative prompt.
# NOTE(review): `txt` is never used below — the pipeline calls pass
# `input_text` instead. Confirm which prompt was intended.
txt="I put my knee on the floor, baby please open the door, it's getting rough on me, someone please come for me"

In [None]:
# Generate one continuation of `input_text` through the pipeline and print it.
print(pipe(input_text, num_return_sequences=1)[0]["generated_text"])

In [None]:
# Same call as the previous cell, run a second time.
# NOTE(review): possibly meant to use the `txt` prompt — confirm.
print(pipe(input_text, num_return_sequences=1)[0]["generated_text"])