In [None]:
# !pip install transformers

In [2]:
import csv
import os
import random

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm, trange
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


# GPT2 with Fine Tuning

### PIPELINE


In [3]:
# Load the raw lyrics table (appears to hold one row per lyric line, keyed by songID).
df = pd.read_csv('https://raw.githubusercontent.com/Mahmoud-Hesham99/Arabic-Lyrics-Generation/main/arabicLyrics.csv')

# Turn both spellings of the "unknown" placeholder into missing values.
# Lower-case np.nan is the canonical name; the upper-case aliases were dropped in NumPy 2.0.
df = df.replace("غيرمعروف", np.nan)
df = df.replace("غير معروف", np.nan)

# Columns not needed for generation.
df = df.drop(['SongTitle', 'SongWriter', 'Composer', 'SingerNationality'], axis=1)

# Collapse each song's rows into a single newline-separated lyric string.
grouped_df = df.groupby('songID')['Lyrics'].apply('\n'.join).reset_index()

# Re-attach the remaining per-song columns; the merge yields one row per original
# line, so duplicates are dropped to get back to one row per song.
temp = pd.merge(grouped_df, df.drop(["LyricsOrder", "Lyrics"], axis=1), on="songID")

# Replace the original songID with a fresh 0..n-1 id taken from the new row position.
temp = temp.drop_duplicates(keep="first").reset_index().drop(["songID", "index"], axis=1).reset_index()
temp = temp.rename({"index": "songID"}, axis="columns")
df = temp



In [4]:
import re
def remove_punctuation(text):
    """Return ``text`` with every non-word, non-whitespace character deleted.

    Characters matched by the regex word class (letters, digits, underscore)
    and by the whitespace class are kept; everything else is removed.
    """
    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub('', text)


In [None]:
# Define a custom dataset class for the song lyrics
class SongLyrics(Dataset):
    """Dataset of GPT-2 token-id tensors, one tensor per kept lyric.

    Each lyric is cut to its first ``max_chars`` characters and encoded with
    the GPT-2 tokenizer. An encoding is kept only if it is non-empty and has
    at most ``max_tokens`` ids; anything else is skipped (not truncated).
    """

    def __init__(self, input_df, gpt2_type="gpt2", max_chars=1024, max_tokens=1000):
        """Tokenize the lyrics and keep those that fit the token budget.

        Args:
            input_df: Iterable of lyric strings, e.g. a pandas Series. Note
                that iterating a whole DataFrame yields its column labels,
                not its rows.
            gpt2_type: Name or path passed to ``GPT2Tokenizer.from_pretrained``.
            max_chars: Number of leading characters of each lyric to encode.
            max_tokens: Largest encoding length (in token ids) that is kept.
        """
        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
        self.max_chars = max_chars
        self.max_tokens = max_tokens
        self.lyrics = []
        # How many inputs were dropped for being empty or over the token budget.
        self.skipped_count = 0

        for row in input_df:
            token_ids = self.tokenizer.encode(row[:max_chars])
            # An empty encoding would become a zero-length float tensor that
            # cannot be used as model input, so it is skipped as well.
            if 0 < len(token_ids) <= max_tokens:
                self.lyrics.append(torch.tensor(token_ids, dtype=torch.long))
            else:
                self.skipped_count += 1

        # Number of lyrics actually stored in the dataset.
        self.lyrics_count = len(self.lyrics)

    def __len__(self):
        """Return the number of kept lyrics."""
        return self.lyrics_count

    def __getitem__(self, item):
        """Return the 1-D token-id tensor stored at position ``item``."""
        return self.lyrics[item]

In [6]:
# GPT-2 is large, so training feeds one packed sequence at a time and accumulates gradients instead of using a real batch.
# Utility function to pack tensors based on a maximum sequence length
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    """Try to merge ``new_tensor`` into the sequence accumulated so far.

    Lengths are measured along dimension 1.

    Returns a ``(tensor, merged, leftover)`` triple:
      * nothing accumulated yet -> ``(new_tensor, True, None)``
      * combined length exceeds ``max_seq_len`` ->
        ``(packed_tensor, False, new_tensor)``; the accumulated tensor is
        returned unchanged and the new one is handed back as leftover
      * otherwise -> ``(new_tensor + packed_tensor[:, 1:], True, None)``;
        the new tensor goes first and the first column of the accumulated
        tensor is dropped
    """
    # First tensor of a pack: nothing to merge with.
    if packed_tensor is None:
        return new_tensor, True, None

    combined_len = new_tensor.shape[1] + packed_tensor.shape[1]
    if combined_len <= max_seq_len:
        merged = torch.cat((new_tensor, packed_tensor[:, 1:]), dim=1)
        return merged, True, None

    # Does not fit: give both tensors back untouched.
    return packed_tensor, False, new_tensor

In [None]:
# Function for training the model
def train(
    dataset, model, tokenizer,
    batch_size=16, epochs=20, lr=2e-5,
    max_seq_len=400, warmup_steps=200,
    gpt2_type="gpt2", output_dir=".", output_prefix="wreckgar",
    test_mode=False, save_model_on_epoch=False,
):
    """Fine-tune ``model`` on ``dataset`` with sequence packing and gradient accumulation.

    Entries are drawn one at a time in shuffled order, packed together with
    ``pack_tensor`` and fed to the model with the inputs as their own labels.
    Gradients are accumulated over ``batch_size`` packed sequences before each
    optimizer/scheduler step.

    Args:
        dataset: Dataset yielding 1-D token-id tensors.
        model: Model called as ``model(input_ids, labels=input_ids)`` whose
            first output is the loss.
        tokenizer: Unused; kept for interface compatibility.
        batch_size: Number of packed sequences per optimizer step.
        epochs: Number of passes over the dataset.
        lr: Peak learning rate for AdamW.
        max_seq_len: Unused. Packing keeps the fixed 768-token cap this
            function has always applied, so existing callers see no change.
            TODO: decide whether the cap should follow this argument.
        warmup_steps: Optimizer steps of linear learning-rate warm-up.
        gpt2_type: Unused; kept for interface compatibility.
        output_dir: Directory for per-epoch checkpoints.
        output_prefix: File-name prefix for per-epoch checkpoints.
        test_mode: Unused; kept for interface compatibility.
        save_model_on_epoch: When true, save ``state_dict`` after every epoch.

    Returns:
        The trained model (on the training device).
    """
    # Packing cap actually in effect (see the max_seq_len note above).
    pack_limit = 768

    # Use the GPU when there is one; otherwise fall back to CPU instead of failing.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.train()

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    num_entries = len(train_dataloader)

    # Upper bound on optimizer steps (packing can only reduce the real count).
    # A non-positive total would make the schedule's multiplier 0 as soon as
    # warm-up ends, freezing training for the rest of the run.
    steps_per_epoch = max(1, (num_entries + batch_size - 1) // batch_size)
    total_steps = steps_per_epoch * epochs

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
    )

    loss = 0
    accumulating_batch_count = 0

    # Iterate over epochs
    for epoch in range(epochs):
        print(f"Training epoch {epoch}")
        # Loss of the last sequence seen so far (0 before any training).
        print(loss)

        input_tensor = None

        for idx, entry in tqdm(enumerate(train_dataloader), total=num_entries):
            (input_tensor, carry_on, remainder) = pack_tensor(entry, input_tensor, pack_limit)
            is_last = idx == num_entries - 1

            # Keep packing until the sequence is full or the data runs out.
            if carry_on and not is_last:
                continue

            # The entry that did not fit must not be lost: it starts the next
            # pack, or is trained on right away when it is the final entry.
            pending = [input_tensor]
            if is_last:
                if remainder is not None:
                    pending.append(remainder)
                input_tensor = None
            else:
                input_tensor = remainder

            for packed in pending:
                packed = packed.to(device)
                outputs = model(packed, labels=packed)
                loss = outputs[0]
                loss.backward()

                # Count first, then test, so a step happens after every
                # batch_size sequences rather than after the very first one.
                accumulating_batch_count += 1
                if (accumulating_batch_count % batch_size) == 0:
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()

        # Save the model at each epoch if specified
        if save_model_on_epoch:
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
            )

    return model

In [None]:
# Train and save one fine-tuned GPT-2 model per song dialect.

# Number of songs held out per dialect for evaluation.
TEST_SET_SIZE = 10

# dropna(): a missing dialect never equals itself, so it would select no rows
# and the sampling below would fail.
for name in temp["SongDialect"].dropna().unique():
    # .copy() gives an independent frame, so the column assignment below does
    # not write into a slice of `temp`.
    df = temp[temp["SongDialect"] == name].copy()

    # A dialect needs more songs than the held-out set, otherwise sampling
    # raises or nothing is left to train on.
    if len(df) <= TEST_SET_SIZE:
        print(f"Skipping dialect {name!r}: {len(df)} songs, need more than {TEST_SET_SIZE}")
        continue

    # Preprocess the lyrics (remove punctuation)
    df['Lyrics'] = df['Lyrics'].apply(remove_punctuation)

    # Hold out a small, reproducible test set and train on the rest
    test_set = df.sample(n=TEST_SET_SIZE, random_state=32)
    df = df.loc[~df.index.isin(test_set.index)]

    # Reset the indexes (the old index is kept as an "index" column)
    test_set = test_set.reset_index()
    df = df.reset_index()

    # Split each test lyric into a prompt and its true last 20 words
    lyric_words = test_set['Lyrics'].str.split()
    test_set['True_end_lyrics'] = lyric_words.str[-20:].apply(' '.join)
    test_set['Lyrics'] = lyric_words.str[:-20].apply(' '.join)

    # Tokenize the training lyrics; reuse the dataset's tokenizer rather than
    # loading the same 'gpt2' tokenizer a second time
    dataset = SongLyrics(df['Lyrics'], gpt2_type="gpt2")
    tokenizer = dataset.tokenizer

    # Start every dialect from the pretrained GPT-2 weights
    model = GPT2LMHeadModel.from_pretrained('gpt2')

    # Fine-tune the GPT-2 model on the lyrics dataset
    model = train(dataset, model, tokenizer)

    # Save the whole trained model object for future use
    torch.save(model, f'/kaggle/working/model_{name}.pt')