# Imports

In [2]:
import tensorflow as tf
import transformers
import pandas as pd
import numpy as np
import os

2024-03-14 17:21:43.115348: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Optional workaround if training fails with an error: select the
# 'cuda_malloc_async' GPU allocator through the TF_GPU_ALLOCATOR variable.
# NOTE(review): presumably this has to be set before TensorFlow first
# initializes the GPU - confirm.
os.environ.update({'TF_GPU_ALLOCATOR': 'cuda_malloc_async'})

In [8]:
# Enable memory growth on every visible GPU. Looping over an empty list is a
# no-op, so no separate "are there any GPUs" check is needed.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    # Raised when the devices were already initialized (see the recorded
    # output of this cell); report it and carry on.
    print(err)

Physical devices cannot be modified after being initialized


# Model initialization

In [9]:
# Checkpoint name passed to every `from_pretrained` call below
# (config, model and tokenizer).
MODEL_NAME = 'gpt2'

In [10]:
# Load the GPT-2 configuration that belongs to the checkpoint.
# NOTE(review): `config` is not referenced again in the visible cells; the
# model below is loaded directly from MODEL_NAME.
config = transformers.GPT2Config.from_pretrained(MODEL_NAME)

In [11]:
# TensorFlow GPT-2 with a language-modeling head, plus the matching tokenizer,
# both loaded from the same checkpoint name.
model = transformers.TFGPT2LMHeadModel.from_pretrained(MODEL_NAME)
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [12]:
# Sanity check: tokenize one sample sentence and show the resulting token ids.
enc = tokenizer(
    ['some sentence'],
    add_special_tokens=True,
    return_tensors='np',
    truncation=True,
    max_length=512,
)
print('enc =', enc['input_ids'])

enc = [[11246  6827]]


Checking the EOS token:

In [19]:
# List every vocabulary entry whose id is 50256, the id that is appended to
# each lyric as the end-of-sequence marker.
[token for token, token_id in tokenizer.get_vocab().items() if token_id == 50256]

['<|endoftext|>']

# Dataset preparation

For this purpose I am using the Spotify Million Song Dataset from Kaggle (https://www.kaggle.com/datasets/notshrirang/spotify-million-song-dataset). It contains song names, artist names, a link to each song, and the lyrics.

In [18]:
# Load the exported dataset; the 'link' column is not needed and is dropped.
df = (
    pd.read_csv('Spotify Million Song Dataset_exported.csv')
    .drop(columns='link')
)

Creating a dataframe and filling it with data in the form [sentence, attention_mask]. In this case
I am using only the lyrics, but it is also possible to add the song name to the data.

In [21]:
# Empty frame that will receive the 'sentence' (token ids) and
# 'attention_mask' columns once the lyrics are tokenized.
tokenized_df = pd.DataFrame()

In [20]:
def encode_text(x):
    """Tokenize one text and terminate it with the end-of-sequence token.

    Relies on the notebook-level ``tokenizer``. The EOS id is taken from the
    tokenizer instead of being hard-coded, so the function stays correct if a
    different checkpoint is loaded.

    Args:
        x: Text to encode.

    Returns:
        Tuple ``(input_ids, attention_mask)`` of flat NumPy arrays of equal
        length. The last position holds the tokenizer's EOS id, with an
        attention-mask value of 1 so the model attends to it.
    """
    # tokenization (no special tokens: EOS is appended explicitly below)
    tokenized = tokenizer(x, add_special_tokens=False, return_tensors='np')

    # np.append without an axis flattens its inputs, so both results are 1-D.
    input_ids = np.append(tokenized['input_ids'], tokenizer.eos_token_id)
    attention_mask = np.append(tokenized['attention_mask'], 1)

    return input_ids, attention_mask

In [23]:
# Tokenize every lyric and split the (ids, mask) pairs into two columns.
# The frame is built in one go from the list of pairs; expanding each row with
# `.apply(pd.Series)` creates a Series object per row and is very slow on a
# dataset of this size. Nothing happens when there is no 'text' column.
if 'text' in df.columns:
    tokenized_df[['sentence', 'attention_mask']] = pd.DataFrame(
        df['text'].apply(encode_text).tolist(),
        index=df.index,
        columns=['sentence', 'attention_mask'],
    )

Using a TensorFlow dataset for training.

In [15]:
from tensorflow.data import Dataset
from tensorflow.keras.preprocessing.sequence import pad_sequences

Defining the dataset format.

In [16]:
def format_dataset(texts, attention_mask):
    """Turn a padded batch into the ``(inputs, labels)`` pair used for fitting.

    Padded positions are recognised by ``attention_mask == 0``. At those
    positions the token id is replaced by 0 (a valid vocabulary index, so the
    padding sentinel never reaches the embedding lookup) and the label is set
    to -100, the value the Hugging Face language-modeling loss ignores.
    Returning the raw padded ids as labels would make the padding sentinel a
    training target.

    Plain arithmetic is used so the function works on tensors inside
    ``Dataset.map`` as well as on NumPy arrays.

    Args:
        texts: Integer token ids padded to a fixed length.
        attention_mask: 1 for real tokens, 0 for padding. Assumed to have the
            same shape and integer dtype as ``texts``.

    Returns:
        ``(source, target)`` where ``source`` is a dict with ``"input_ids"``
        and ``"attention_mask"`` and ``target`` holds the labels.
    """
    is_padding = 1 - attention_mask

    # Real tokens keep their id; padded positions become 0.
    input_ids = texts * attention_mask
    # Real tokens are their own label; padded positions become -100.
    labels = input_ids - 100 * is_padding

    source = {"input_ids": input_ids,
              "attention_mask": attention_mask}

    return (source, labels)

In [24]:
# Shuffle the rows before the head/tail train-validation split. The fixed
# seed keeps the split the same every time the notebook is run from scratch.
tokenized_df = tokenized_df.sample(frac=1, random_state=42).reset_index(drop=True)

Padding sequences to length 512 (this model supports up to 1024 tokens). Token ids are padded with -1 and the attention mask with 0.

In [1]:
def make_dataset(df, batch_size=1, maxlen=512, shuffle=True):
    """Build a batched ``tf.data`` pipeline from tokenized rows.

    The pipeline no longer ends with ``cache()``: placed after ``shuffle()``
    it replays the first epoch's order on every later epoch, and it keeps a
    second in-memory copy of data that ``from_tensor_slices`` already holds.
    ``prefetch`` is now the last step so it overlaps with the consumer.

    Args:
        df: Frame with a ``'sentence'`` column (token ids) and an
            ``'attention_mask'`` column, one sequence per row.
        batch_size: Examples per batch. Defaults to 1 because of limited
            video memory.
        maxlen: Every sequence is padded or truncated at the end to this
            length.
        shuffle: Shuffle examples with a 2048-element buffer (the default).
            Pass ``False`` when the order should be kept.

    Returns:
        A ``tf.data.Dataset`` yielding what ``format_dataset`` returns for
        each batch.
    """
    # Token ids are padded with -1 and the mask with 0, both at the end.
    texts = tf.constant(pad_sequences(df['sentence'], maxlen=maxlen,
                                      truncating="post", padding="post", value=-1))
    mask = tf.constant(pad_sequences(df['attention_mask'], maxlen=maxlen,
                                     truncating="post", padding="post", value=0))

    dataset = tf.data.Dataset.from_tensor_slices((texts, mask))
    if shuffle:
        dataset = dataset.shuffle(2048)

    return (
        dataset
        .batch(batch_size)
        .map(format_dataset)
        .prefetch(tf.data.AUTOTUNE)
    )

In [19]:
# Train/validation split: the last `val_size` rows are held out and all
# remaining rows are used for training. Slicing both parts from one boundary
# keeps them disjoint and uses every row whatever the frame's length is;
# fixed head(55650)/tail(2000) counts only do that for exactly 57,650 rows.
val_size = 2000
train_ds = make_dataset(tokenized_df.iloc[:-val_size])
val_ds = make_dataset(tokenized_df.iloc[-val_size:])

# Training

In [20]:
from tensorflow.keras.optimizers import Adam

In [21]:
# Compile the model with Adam at a learning rate of 3e-5. No loss argument is
# passed, following the library documentation.
# NOTE(review): presumably the model then uses its built-in language-modeling
# loss - confirm against the Transformers docs.
model.compile(optimizer=Adam(3e-5))

I've fitted the model for 1 epoch. More epochs may lead to better generation, but training is time-consuming.

In [22]:
# Single pass over the training data, with validation at the end of the epoch.
model.fit(train_ds, validation_data=val_ds, epochs=1)

2024-02-15 10:38:13.748905: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f33d4025600 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-02-15 10:38:13.748967: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce GTX 1080, Compute Capability 6.1
2024-02-15 10:38:13.753728: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:255] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-02-15 10:38:13.895741: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8600
2024-02-15 10:38:14.008685: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.




<keras.src.callbacks.History at 0x7f34002ca3a0>

In [28]:
# Save the fine-tuned model and its tokenizer into the same directory so they
# can be loaded together later (the generation pipeline reads this path).
output_dir = 'fine_tuned_gpt2'
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('fine_tuned_gpt2/tokenizer_config.json',
 'fine_tuned_gpt2/special_tokens_map.json',
 'fine_tuned_gpt2/vocab.json',
 'fine_tuned_gpt2/merges.txt',
 'fine_tuned_gpt2/added_tokens.json',
 'fine_tuned_gpt2/tokenizer.json')

# Generation

In [3]:
# Text-generation pipeline backed by the fine-tuned model saved on disk.
pipe = transformers.pipeline(
    task='text-generation',
    model='fine_tuned_gpt2',
)

2024-03-14 17:21:51.950525: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:07:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-03-14 17:21:51.952260: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:07:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-03-14 17:21:51.952685: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:07:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-03-14 17:21:51.954311: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:07:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-03-14 17:21:51.954701: I tensorflow/compile

To generate lyrics, set `start_point` to the opening text (it may be empty).

In [None]:
start_point = ""

# Generation settings:
#   max_length           - length of the returned text; longer output takes
#                          more time to generate.
#   num_return_sequences - how many sequences are generated.
#   temperature          - how diverse ("crazy") the predictions are.
#   do_sample            - whether sampling (top-p/top-k) is used, which lets
#                          rarer tokens appear in the output.
generated = pipe(
    start_point,
    max_length=128,
    num_return_sequences=1,
    return_full_text=True,
    temperature=0.9,
    do_sample=True,
)

for sent in generated:
    print(sent['generated_text'])