In [32]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.normalizers import Lowercase
from tokenizers.pre_tokenizers import Whitespace
from transformers import AutoTokenizer

In [33]:
# import kagglehub
# path = kagglehub.dataset_download("mateibejan/multilingual-lyrics-for-genre-classification")
# print(path)

In [34]:
# !!! Run this cell to (re)load the dataset into `df`.
# NOTE(review): `path` is not assigned by any active code above this cell; the
# only assignment visible is in the commented-out kagglehub download cell, so
# on a fresh kernel this raises NameError unless `path` is defined first.

import pandas as pd

# Turn a Windows-style directory (leading "C:\", backslash separators) into a
# forward-slash path and point it at train.csv inside that directory.
modified_path = (
    path
    .replace("C:\\", "/")
    .replace("\\", "/")
    + "/train.csv"
)
df = pd.read_csv(modified_path)

In [35]:
# df = pd.read_csv("lyrics_genre.csv")
# df.head()
# print(len(df))
# print(df.Genre.unique())

In [36]:
# Keep only the English-language rows, then drop the Language column.
# NOTE: this cell rebinds `df`; re-running it without reloading the data fails
# because the 'Language' column no longer exists.
df = df[df['Language'] == 'en']
columns_to_drop = ['Language']
df = df.drop(columns=columns_to_drop)

# Collapse all rows that share (Artist, Song) into a single row.
df = df.groupby(['Artist', 'Song']).agg({
    'Lyrics': 'first',  # one lyrics value per group, via pandas' 'first' aggregation
    # Unique genres of the group as a list. Sorted so the list order is the
    # same on every run: iterating a set of strings follows hash order, which
    # varies between Python processes. key=str keeps the sort from raising if
    # a non-string value (e.g. NaN) is present.
    'Genre': lambda x: sorted(set(x), key=str),
}).reset_index()
print(len(df))

210010


In [37]:
# Select the rows whose Genre list contains more than one entry
multi_genre_songs = df.loc[df['Genre'].map(len).gt(1)]

# Preview the first few of them
print(multi_genre_songs.head())

       Artist          Song  \
122  2 chainz    100 joints   
169  2 chainz         kesha   
713   50 cent  all his love   
714   50 cent     all of me   
726   50 cent      bad news   

                                                Lyrics           Genre  
122  [Hook]\nNo matter what they say I smoke 100 jo...  [Hip-Hop, Pop]  
169  I am in love, with what we are, not what we sh...  [Hip-Hop, Pop]  
713  [Chorus: singing]\nHe gon' give you all his lo...  [Hip-Hop, Pop]  
714  Mary~\nnow if i give ya all of me what ya gon ...  [Hip-Hop, Pop]  
726  Lloyd Banks in the house, bad news\nTony Yayo ...  [Hip-Hop, Pop]  


In [38]:
# Remove the Artist and Song columns from the frame
columns_to_drop = ['Artist', "Song"]
df = df.drop(columns_to_drop, axis=1)


In [39]:
import nltk
# Download the 'punkt' and 'punkt_tab' NLTK packages before tokenizing.
# NOTE(review): presumably these are the data sent_tokenize needs; which of the
# two is required depends on the installed NLTK version — confirm.
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize

def split_into_sentences(lyrics):
    """Return `lyrics` with the sentences found by sent_tokenize joined by newlines."""
    return '\n'.join(sent_tokenize(lyrics))

# Rewrite every lyric so each detected sentence sits on its own line
df['Lyrics'] = df['Lyrics'].map(split_into_sentences)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [40]:
# Spot-check the lyric stored under row label 1
print(df.loc[1, 'Lyrics'])

Hazy Days With Lazy Ways You get less done but more Out of your days How can you work yet avoid Getting out of your lazy ways You'll never get up if you don't get up You'll always stay down if you sit around You'll never get up if you don't get up You'll always stay down if you sit around Hazy Days Lazy Ways We got less done but more Out of our days How can we ever recapture the feeling Of lazy ways You'll never get up if you don't get up You'll always stay down if you sit around Where nobody cares and nobody tries 'Cos a daydreams resting on the back of your eyes On the back of your eyes Taking five Bring love to me With your body Let me hold you Endless, endlessly You'll never get up if you don't get up You'll always stay down if you sit around Where nobody cares and nobody tries 'Cos a daydreams resting on the back of your eyes On the back of your eyes Hazy days Lazy ways You get less done but more Out of your days Crazy days Lazy ways


In [41]:
import re
# To reduce the vocabulary size, let's try to remove more unnecessary symbols and parts
# Define a cleaning function
def clean_lyrics(lyrics):
    """Normalise one lyrics string before tokenization.

    Steps, in order:
      1. delete bracketed tags such as [Intro] or [Verse 1];
      2. keep only the text before the first "---";
      3. delete every character that is not a word character, whitespace,
         or one of . ! ?  (so apostrophes, commas and hyphens disappear);
      4. collapse blank lines into a single newline and trim both ends;
      5. lowercase;
      6. replace runs of 2+ "!", 2+ "?" and 3+ "." with the space-padded
         placeholder words "exclamations", "questionMarks" and "ellipsis".

    The placeholders are inserted after lowercasing and trimming, so
    "questionMarks" keeps its capital M and the result can contain doubled
    or trailing spaces.

    Args:
        lyrics: the raw lyrics text (a str).

    Returns:
        The cleaned lyrics string.
    """
    # Bracketed section markers
    text = re.sub(r'\[.*?\]', '', lyrics)

    # Everything from the first "---" onwards is discarded
    text, _, _ = text.partition('---')

    # Character whitelist: word characters, whitespace and . ! ?
    text = re.sub(r"[^\w\s\n.!?]", '', text)

    # Blank-line collapse, trim, lowercase
    text = re.sub(r'\n\s*\n', '\n', text).strip().lower()

    # Repeated punctuation -> placeholder word
    placeholder_rules = (
        (r'(\!{2,})', ' exclamations '),
        (r'(\?{2,})', ' questionMarks '),
        (r'(\.{3,})', ' ellipsis '),
    )
    for pattern, placeholder in placeholder_rules:
        text = re.sub(pattern, placeholder, text)

    return text

# Clean every lyric, then spot-check the one stored under row label 1
df['Lyrics'] = df['Lyrics'].map(clean_lyrics)
print(df.loc[1, 'Lyrics'])

hazy days with lazy ways you get less done but more out of your days how can you work yet avoid getting out of your lazy ways youll never get up if you dont get up youll always stay down if you sit around youll never get up if you dont get up youll always stay down if you sit around hazy days lazy ways we got less done but more out of our days how can we ever recapture the feeling of lazy ways youll never get up if you dont get up youll always stay down if you sit around where nobody cares and nobody tries cos a daydreams resting on the back of your eyes on the back of your eyes taking five bring love to me with your body let me hold you endless endlessly youll never get up if you dont get up youll always stay down if you sit around where nobody cares and nobody tries cos a daydreams resting on the back of your eyes on the back of your eyes hazy days lazy ways you get less done but more out of your days crazy days lazy ways


In [42]:
# Spot-check the cleaned lyric stored under row label 12
print(df.loc[12, 'Lyrics'])

times are far between and few i bet we can look upon our lives without regret of all the things i have done you think im proud of everyone without exception til you make your peace with yesterday never build a future i swear by what i say whatever penance you do decide what its worth to you and then respect it however long it will take to weather your mistakes why not accept it?
my hands for now are tied im a body frozen im a will thats paralyzed when will you ever set aside your pain and misery?
no matter how i beg no matter how i wish or plead youll never be more than alive youll never do more than survive until you expect it do you want to build a world with our lives?
you better soon decide or you can forget it my hands for now are tied im a body frozen im a will thats paralyzed til you drop that heavy baggage youre dragging behind there wont be room for us to both go this ride


In [43]:
# def handle_negations(text):
#     sub_tokens = text.split()
#     neg = False
#     neg_tokens = []
#     punctuation = ['.', '?', '!', ';', 'exclamations', 'questionMarks', 'ellipsis']
#     negation_words = ["n't", "not", "no", "never"]

#     for token in sub_tokens:
#         if token in punctuation:
#             neg = False

#         neg_token = f"NOT_{token}" if neg else token

#         if '\'t' in token or token == 'not':
#             neg = True

#         if any(neg_word in token for neg_word in negation_words):
#             neg = True

#         neg_tokens.append(neg_token)


#     return ' '.join(neg_tokens)

# df['Lyrics'] = df['Lyrics'].apply(handle_negations)

# print(df['Lyrics'][2])

In [44]:
# Spot-check the cleaned lyric stored under row label 11
print(df.loc[11, 'Lyrics'])

well old dan tanengraph you can keep your cold dead stare your pretty ladys waiting out in the coal stained air and ive watched your hair turn grey throughout the coarse of one fateful day and she got shy water in her eyes and she got shy water in her eyes well on that creaky wooden porch you can think about what youve lost while some nurse you dont know brings you paper cups full of pills well your pretty ladys somewhere lonely and turning grey oh what a mistake you have made and she got shy water in her eyes and she got shy water in her eyes well you can stick to your guns if you please it dont bother me oh no you can stick to your guns if you need and you need well if theres a moral here thats to be learned its not to let a good thing pass you by well right now youre staring straight ahead beside some girl with shy water in her eyes and she got shy water in her eyes and she got shy water in her eyes well you can stick to your guns if you please it dont bother me oh no


In [46]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np

# Build the Keras tokenizer with "<UNK>" as its out-of-vocabulary token.
# NOTE(review): `Tokenizer` here is the Keras class imported at the top of this
# cell; that import rebinds the `Tokenizer` name first imported from the
# `tokenizers` package in the notebook's import cell.
# NOTE(review): the tokenizer runs with its default `filters`/`lower` settings —
# check whether the '.', '!' and '?' kept by clean_lyrics survive tokenization.
tokenizer = Tokenizer(oov_token="<UNK>")

# Fit the vocabulary on every lyric in the frame
tokenizer.fit_on_texts(df['Lyrics'])

# Encode each lyric as a sequence of integer word ids
sequences = tokenizer.texts_to_sequences(df['Lyrics'])

# Use the 95th percentile of the sequence lengths (not the maximum) as the
# common length
sequence_lengths = np.array(list(map(len, sequences)))
max_sequence_length = int(np.percentile(sequence_lengths, 95))
print(f"Dynamic max_sequence_length: {max_sequence_length}")

# Bring every sequence to that length, padding and truncating in 'post' mode
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_sequence_length,
    padding='post',
    truncating='post',
)

# Show the first 10 entries of the tokenizer's word index
print(f"Top 10 most frequent words in tokenizer vocabulary: {list(tokenizer.word_index.items())[:10]}")

Dynamic max_sequence_length: 529
Top 10 most frequent words in tokenizer vocabulary: [('<UNK>', 1), ('the', 2), ('you', 3), ('i', 4), ('and', 5), ('to', 6), ('a', 7), ('me', 8), ('my', 9), ('it', 10)]
