## This notebook is dedicated to the exploration of the Lyrics dataset


In [11]:
import pandas as pd


In [12]:
n_rows = 40000
lyrics = pd.read_csv('../song_lyrics.csv', nrows=n_rows)
# lyrics[:10, 1:]

In [13]:
#lyrics.iloc[:10, 2:8]
# print(lyrics.iat[9,6])

In [14]:
lyrics.iloc[:10 ]

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
0,Killa Cam,rap,Cam'ron,2004,173166,"{""Cam\\'ron"",""Opera Steve""}","[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...",1,en,en,en
1,Can I Live,rap,JAY-Z,1996,468624,{},"[Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,...",3,en,en,en
2,Forgive Me Father,rap,Fabolous,2003,4743,{},Maybe cause I'm eatin\nAnd these bastards fien...,4,en,en,en
3,Down and Out,rap,Cam'ron,2004,144404,"{""Cam\\'ron"",""Kanye West"",""Syleena Johnson""}",[Produced by Kanye West and Brian Miller]\n\n[...,5,en,en,en
4,Fly In,rap,Lil Wayne,2005,78271,{},"[Intro]\nSo they ask me\n""Young boy\nWhat you ...",6,en,en,en
5,Lollipop Remix,rap,Lil Wayne,2008,580832,"{""Kanye West"",""Static Major""}",[Intro: Lil Wayne]\nHaha\nUh-huh\nNo homo (You...,7,en,en,en
6,Im Not You,rap,Clipse,2002,28645,"{Jadakiss,""Styles P"",""Roscoe P. Coldchain""}","[Intro: Pusha T]\nNo, no, no!\nI told you, I l...",8,en,en,en
7,Family Ties,rap,Cam'ron,2004,41960,"{""Cam\\'ron"",""Lady Wray""}","[Verse 1: Cam'ron]\nKilla, Dipset\nMan I spit ...",9,en,en,en
8,Rockin and Rollin,rap,Cam'ron,1998,6399,"{""Cam\\'ron""}",[Verse 1]\nAy yo you wonder who I are\nI guzzl...,10,en,en,en
9,Lord You Know,rap,Cam'ron,2004,11882,"{""Cam\\'ron"",""Juelz Santana"",Jaheim}","[Chorus: Jaheim]\nNow Lord you know, just how ...",11,en,en,en


## Remove brackets
Here we remove the brackets from the lyrics, that they can be used by the classifiers


In [15]:
import re
def preprocess(song):
    output_text = re.sub(r'\[\s*.*?\s*\]\n', '', song)
    return output_text

In [16]:
# Retrieve the content of the cell
import string
cell_content = lyrics.iat[5000,6]  # Replace with the appropriate row and column labels
# print(str(cell_content))
# Specify the file path and name for the text file
file_path = '../files/lyrics.txt'

# Write the cell content to the text file
with open(file_path, 'w') as file:
    lines = str(cell_content).splitlines()
    for line in lines:
        if line.strip():  # Check if the line is non-blank
            # print(line)
            last_two_words = re.findall(r'\b(\w+\W*\w+)\W*$', line)[-1]
            last_two_words = re.sub(r'\d', '', last_two_words)  # Remove numbers
            last_two_words = last_two_words.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
            file.write(last_two_words + '\n')

Here we remove the lyrics of the other languages that are not english, we drop each row which contains such
and we reindex the table afterwards

In [17]:

for i, row in lyrics.iterrows():
    if row['language'] != 'en':
        lyrics.drop(i, inplace=True)
        # print('Hey it is not an english lyrics!')
    else:
        cleaned = preprocess(row['lyrics'])
        lyrics.at[i, 'lyrics'] = cleaned


lyrics.reset_index(drop=True, inplace=True)
# lyrics.iloc[:10]

In [18]:
print(lyrics.shape)

(39648, 11)


## Rhyme extraction
What could be the desired marked formats?
1. Most frequent scheme - ABAB, ABBA (underfitting)
2. The exact words that are being rhymed?
3. The whole rhyming scheme of the song - ABABCDDC-FFFF-ABABCDCD-FFFF-FFFF (overfitting)
4. Multisyllable rhymes - (AB)(AB)(CB)(CB)

In [19]:
# Specify the file path and name
file_path = '../cmudict.dict'

# Create an empty dictionary to store the word-phoneme mappings
word_phoneme_dict = {}

encodings = ['utf-8', 'latin-1', 'utf-16', 'cp1252']
for encoding in encodings:
    try:
        with open(file_path, 'r') as file:
            for line in file:
                line = line.strip()
                if line:
                    #print(line)
                    split = line.split()
                    word_phoneme_dict[split[0]] = ' '.join(split[1:])
            # break
    except UnicodeDecodeError:
        print('Error')

# Print the dictionary
#print(word_phoneme_dict)

In [20]:
print(word_phoneme_dict.get("ain't"))
print(len(word_phoneme_dict))
# print(word_phoneme_dict)

EY1 N T
135167


In [21]:
import psutil

# Get the current available memory in bytes
available_memory = psutil.virtual_memory().available

# Convert bytes to human-readable format
available_memory_gb = available_memory / (1024 ** 3)  # Convert bytes to gigabytes

# Print the available memory
print(f"Available Memory: {available_memory_gb:.2f} GB")


Available Memory: 6.94 GB


## Idea
We take each row, take the lyrics from it, split the lyrics by whitespace, remove the special characters, maybe remove some stuff that is in the round brackets (aka low voice stuff), then we take only the last two words in each line, and we annotate them with corresponding phonemes

In [22]:
def clean_for_rhymes(lyric):
    lyric = re.sub(r'[^A-Za-z0-9\s]', '', lyric) # remove special chars
    lyric = lyric.lower()
    lyric = lyric.split('\n') # split by new lines
    lyric = list(filter(lambda line: line != '', lyric)) # delete an empty line
    for i, line in enumerate(lyric):
        line = line.split()
        if len(line) >= 2:
            lyric[i] = line[-2:] # take the last two words of each line
        else:
            lyric[i] = line

    return lyric


In [23]:
# create a new column in the dataset and apply the cleaning function to it
lyrics['end words'] = lyrics['lyrics'].apply(clean_for_rhymes)

In [24]:
# lyrics['end words'][1000]
for i in range(lyrics.shape[1]):
    print(lyrics['end words'][i])


[['cam', 'cam'], ['killa', 'cam'], ['cam', 'cam'], ['cam', 'cam'], ['killa', 'cam'], ['killa', 'killa'], ['bases', 'loaded'], ['cam', 'uhhuh'], ['on', 'third'], ['at', 'bat'], ['killa', 'cam'], ['the', 'world'], ['cam', 'cam'], ['killa', 'cam'], ['cam', 'hahahaha'], ['cam', 'cam'], ['killa', 'cam'], ['shit', 'clap'], ['cam', 'cam'], ['killa', 'cam'], ['cam', 'cam'], ['cam', 'cam'], ['killa', 'killa'], ['cam', 'killa'], ['what', 'up'], ['since', 'kumbaya'], ['my', 'lord'], ['on', 'board'], ['tutor', 'me'], ['to', 'me'], ['or', 'dime'], ['hammer', 'time'], ['on', 'nines'], ['camll', 'shine'], ['red', 'red'], ['em', 'lemonheads'], ['like', 'winnipeg'], ['with', 'fred'], ['on', 'scrappy'], ['at', 'me'], ['laffy', 'taffy'], ['killa', 'cam'], ['cam', 'sing'], ['killa', 'cam'], ['me', 'clap'], ['killa', 'cam'], ['cam', 'cam'], ['cam', 'sing'], ['me', 'clap'], ['killa', 'cam'], ['cam', 'sing'], ['killa', 'cam'], ['its', 'me'], ['killa', 'cam'], ['cam', 'cam'], ['cam', 'clap'], ['about', 'this'

In [25]:
# next we make the new function that will use the phoneme dictionary to annotate the phonemes of the words

def translate_to_phonemes(end_words):
    end_words_optimized = []
    for end_word in end_words:
        if len(end_word) == 2:
            end_word_1 = word_phoneme_dict.get(end_word[0])
            end_word_2 = word_phoneme_dict.get(end_word[1])
            end_words_optimized.append([end_word_1, end_word_2])
        elif len(end_word) == 1:
            end_word_1 = word_phoneme_dict.get(end_word[0])
            end_words_optimized.append([end_word_1])
    return end_words_optimized





In [26]:
lyrics['end words phonemes'] = lyrics['end words'].apply(translate_to_phonemes)

In [27]:
lyrics['end words phonemes'][1000]

[['JH EH1 D IY2', 'M AY1 N D'],
 ['AH1 V', 'M AE1 N K AY1 N D'],
 ['IH0 N', 'HH EH1 L'],
 ['W IH1 DH', 'M IY1'],
 ['Y AO1 R', 'B OW1 N Z'],
 ['AE1 Z', 'S T OW1 N'],
 ['AE1 T', 'HH OW1 M'],
 ['K IH1 L ER0 Z', 'R OW1 M'],
 ['T R AE1 P', 'M IY1'],
 ['W IY1', 'IH2 R AE1 K IY0'],
 ['D AA1 R K N AH0 S', 'P AE1 S AH0 Z'],
 ['M AH1 D IY0', 'AE1 SH AH0 Z'],
 ['IH1 Z', 'S IH1 K'],
 ['G AA1 Z AH0', 'S T R IH1 P'],
 ['T UW1', 'EH1 M'],
 ['AH0 N D', 'AY0 R EY1 N IY0 AH0 N Z'],
 ['DH AH0', 'P AE1 D'],
 [None, 'SH AA1 B AA0 Z'],
 ['W IH1 DH', None],
 ['AH0', 'G R UW1 P'],
 ['Y UW1', 'B IY1'],
 ['Y UW1', 'S IY1'],
 ['G OW1', 'AW1 T'],
 ['Y UW1', 'B IY1'],
 ['G OW1', 'AW1 T'],
 ['Y UW1', 'S IY1'],
 ['Y AO1 R', 'TH AO1 T S'],
 ['AE1 N AH0 M AH0 L', 'K L AO1 TH'],
 [None, 'F AO1 R S'],
 ['P EH1 R AH0 B AH0 L Z', 'L AO1 S T'],
 ['L AH1 NG Z', 'AE1 T'],
 ['P AH1 N', 'B AE1 K'],
 [None, 'DH AE1 T'],
 ['G AH1 N', 'R AE1 P'],
 ['AY1', 'T R AH1 S T'],
 ['DH EY1', 'B AH1 S T'],
 ['R AE1 P', 'N AW1'],
 ['R AE1 P

## Questions:
1. Some words are not in the dictionary - what to do?
2. How to lable the scheme? (practically)

## Topic Modelling and Extraction


## Affect labeling
Idea here is to use NRC labels and use majority score


In [28]:
import numpy as np
# Specify the file path and name
file_path = '../NRC-VAD-Lexicon/BipolarScale/NRC-VAD-Lexicon.txt'

# Create an empty dictionary to store the word-phoneme mappings
affect_dictionary = {}

encodings = ['utf-8', 'latin-1', 'utf-16', 'cp1252']
for encoding in encodings:
    try:
        with open(file_path, 'r') as file:
            for line in file:
                line = line.strip()
                if line:
                    #print(line)
                    split = line.split()
                    if len(split) == 4:
                        affect_dictionary[split[0]] = np.array([float(split[1]), float(split[2]), float(split[3])])
                    elif len(split) == 5:
                        affect_dictionary[split[0]+" "+split[1]] = np.array([float(split[2]), float(split[3]), float(split[4])])
                    else:
                        affect_dictionary[split[0]+ " "+split[1]+ " "+split[2]] = np.array([float(split[3]), float(split[4]), float(split[5])])
            # break
    except UnicodeDecodeError:
        print('Error')


In [29]:
print(affect_dictionary.get('love'))
print(len(affect_dictionary))

[1.    0.038 0.346]
19971


Here the algorithm is easy, first we take the lyrics, clean it from the newline characters, split it into words, then we iterate through the words and calculate the majority score. If the word is present in the affect dictionary - we take it, if not- ciao bambino!

In [34]:
import numpy as np
def compute_affect(lyric):
    lyric = re.sub(r'[^A-Za-z0-9\s]', '', lyric) # remove special chars
    lyric = lyric.lower()
    lyric = lyric.split('\n') # split by new lines
    lyric = list(filter(lambda line: line != '', lyric)) # delete an empty line
    majority_score = np.zeros(3)
    count_of_words = 0
    for i, line in enumerate(lyric):
        line = line.split()
        for word in line:
            if word in affect_dictionary.keys():
                majority_score += affect_dictionary.get(word)
                count_of_words+= 1
    print(count_of_words)
    return majority_score / count_of_words



In [35]:
lyrics['affect majority score'] = lyrics['lyrics'].apply(compute_affect)


302
202
162
215
143
257
270
208
173
284
249
129
259
205
313
249
275
357
266
243
269
173
330
199
280
203
180
262
163
280
126
177
110
197
272
175
163
271
192
186
188
126
278
181
262
221
162
232
247
188
273
286
152
217
241
191
213
248
182
134
170
248
271
114
222
284
232
188
180
330
259
274
270
215
543
283
199
290
281
187
211
175
245
303
225
227
288
185
220
240
192
151
297
141
529
198
233
292
272
183
245
180
210
199
384
279
244
155
204
213
102
209
259
193
285
247
265
205
194
213
111
112
209
253
222
115
178
236
255
233
244
193
249
276
219
284
205
290
285
190
332
328
229
381
301
188
179
204
194
226
309
209
219
181
272
234
285
178
397
359
35
192
129
229
301
282
205
345
193
208
280
189
324
173
350
193
191
225
187
216
186
392
275
290
386
271
424
184
161
241
224
195
217
264
181
178
227
268
210
178
173
356
265
198
266
186
233
276
24
175
251
156
212
155
273
207
194
182
352
350
85
143
222
160
627
455
331
131
327
331
203
256
291
154
233
142
193
243
404
196
208
212
205
231
180
162
280
230
243
275
193

In [39]:
lyrics['affect majority score'][9]

array([ 0.14428873, -0.09799296, -0.01533099])

In [40]:
lyrics

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language,end words,end words phonemes,affect majority score
0,Killa Cam,rap,Cam'ron,2004,173166,"{""Cam\\'ron"",""Opera Steve""}","Killa Cam, Killa Cam, Cam\nKilla Cam, Killa Ca...",1,en,en,en,"[[cam, cam], [killa, cam], [cam, cam], [cam, c...","[[K AE1 M, K AE1 M], [None, K AE1 M], [K AE1 M...","[0.03259602649006631, -0.12926490066225205, -0..."
1,Can I Live,rap,JAY-Z,1996,468624,{},"\nYeah, hah, yeah, Roc-A-Fella\nWe invite you ...",3,en,en,en,"[[yeah, rocafella], [you, know], [of, hopeless...","[[Y AE1, None], [Y UW1, N OW1], [AH1 V, HH OW1...","[0.16737623762376239, -0.040633663366336614, 0..."
2,Forgive Me Father,rap,Fabolous,2003,4743,{},Maybe cause I'm eatin\nAnd these bastards fien...,4,en,en,en,"[[im, eatin], [they, grub], [these, scrubs], [...","[[IH1 M, None], [DH EY1, G R AH1 B], [DH IY1 Z...","[0.15901234567901226, -0.11640740740740743, -0..."
3,Down and Out,rap,Cam'ron,2004,144404,"{""Cam\\'ron"",""Kanye West"",""Syleena Johnson""}","\nUgh, Killa!\nBaby!\nKanye, this that 1970s H...",5,en,en,en,"[[ugh, killa], [baby], [flow, huh], [it, up], ...","[[AH1 G, None], [B EY1 B IY0], [F L OW1, HH AH...","[0.06357209302325578, -0.10053953488372093, -0..."
4,Fly In,rap,Lil Wayne,2005,78271,{},"So they ask me\n""Young boy\nWhat you gon' do t...",6,en,en,en,"[[ask, me], [young, boy], [time, around], [com...","[[AE1 S K, M IY1], [Y AH1 NG, B OY1], [T AY1 M...","[0.11767832167832168, -0.07072727272727276, 0...."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39643,Real Bad Boys,rap,Shyheim,1996,7472,{},"It's real, did you ever see that type of nigga...",42185,en,en,en,"[[of, nigga], [be, nobody], [aint, squat], [ev...","[[AH1 V, None], [B IY1, N OW1 B AA2 D IY2], [N...","[0.0430955414012739, -0.023630573248407657, 0...."
39644,What Makes The World Go Round,rap,Shyheim,1996,1149,"{Rubbabandz,""Trigger Tha Gambler"",""Smoothe Da ...","There a toker, under the numb, scumb looker\nG...",42186,en,en,en,"[[scumb, looker], [and, booker], [smart, nigga...","[[None, L UH1 K ER0], [AH0 N D, B UH1 K ER0], ...","[0.049661016949152426, -0.05488135593220334, 0..."
39645,Dear God,rap,Shyheim,1996,7085,"{""June Luva"",""Pop Da Brown Hornet"",""Nikki Will...","Dear God, I wonder can you save me?\nI'm buggi...",42187,en,en,en,"[[save, me], [made, g], [and, robberies], [for...","[[S EY1 V, M IY1], [M EY1 D, JH IY1], [AH0 N D...","[0.15194262295081964, -0.06367213114754097, 0...."
39646,I Declare War,rap,Shyheim,1999,698,{},With my x-ray vision\nSee through you lames fo...,42196,en,en,en,"[[xray, vision], [for, days], [bullet, counts]...","[[None, V IH1 ZH AH0 N], [F AO1 R, D EY1 Z], [...","[0.09196296296296293, -0.029111111111111105, 0..."
