# PREPROCESSING ON THE SAMPLED_DATASET

In [1]:
import pandas as pd
df = pd.read_csv("sampled_dataset.csv")
df.head()

Unnamed: 0,title,artist,year,views,features,lyrics,is_country,is_pop,is_rap,is_rb,is_rock
0,Something in the Water,Pokey LaFarge,2015,10902,{''},"She got a broke down El Camino, in the front y...",True,False,False,False,False
1,The Last Event,Matthew Ryan,2019,34,{''},Later on\nWhen we've all come down\nAnd the st...,True,False,False,False,False
2,Friends With The Enemy,Poor Man's Poison,2011,5611,"{""Poor Man\\\\'s Poison""}",[Verse 1]\nLonging for something more\nWill I ...,True,False,False,False,False
3,Sleepin With the Radio On,Charly McClain,1981,100,{''},My heart beats with the lonely rain\nWishin' I...,True,False,False,False,False
4,Riders in the Sky,Michael Martin Murphey,1993,108,{''},An old cowpoke went ridin' out one dark and wi...,True,False,False,False,False


In [2]:
df.tail()

Unnamed: 0,title,artist,year,views,features,lyrics,is_country,is_pop,is_rap,is_rb,is_rock
161655,From Dust to Dust,Horcrvx,2018,23,{''},Six feet beneath the creek bed; my soul does f...,False,False,False,False,True
161656,​cutting emptiness,Sadness (USA),2021,12,{''},Cutting emptiness as though it will fill me\nA...,False,False,False,False,True
161657,Coasting,Ten F,2019,867,{'Ten Fé'},[Verse 1]\nThe night is warmer now\nI want to ...,False,False,False,False,True
161658,Plunderberg,Lagerstein,2012,3,{''},"We loot, we steal, we Plunderberg\nSailing thr...",False,False,False,False,True
161659,The Thin Gruel,Buck Gooter,2012,28,{''},[Refrain]\nThe thin gruel for millions of peop...,False,False,False,False,True


### PULIZIA LYRICS

In [None]:
#function to clean the lyrics strings from words inside square brackets that are not keywords for the stanza splitting

import re
keep_pattern = r"\[(Chorus|Verse|Bridge|Intro|Outro|Hook|Prehook|Posthook|Introduction|Interlude|Coda|Conclusion|Refrain).\]*"


def clean_lyrics(lyrics):
    cleaned_lyrics = re.sub(r"\[.*?\]", lambda match: match.group(0) if re.match(keep_pattern, match.group(0)) else "", lyrics)
    return cleaned_lyrics


df['cleaned_lyrics'] = df['lyrics'].apply(clean_lyrics)

In [4]:
#function to split the stanzas according to the various formats used to denote stanza breaks (keywords with brackets, without brackets, with round brackets, only \n\n)    

def process_lyric(ly):
    sections = [ly.strip()]
    
    # First split by the pattern
    pattern = r"\[(Introduction|Interlude|Coda|Hook|Prehook|Posthook|Conclusion|Refrain|Verse|Intro|Outro|Chorus|Bridge).\]*"
    if re.search(pattern, ly, re.IGNORECASE):
        sections = [segment for section in sections for segment in section.split("\n[")]

    # Further split by "Verse"
    if re.search(r"Verse", ly, re.IGNORECASE):
        sections = [segment for section in sections for segment in section.split("Verse")]

    # Further split by double newlines
    if "\n\n" in ly:
        sections = [segment for section in sections for segment in section.split("\n\n")]
    
    # Further split by "(Chorus)"
    if re.search(r"\(Chorus\)", ly, re.IGNORECASE):
        sections = [segment for section in sections for segment in section.split("(")]

    # Clean up whitespace for each section and remove empty sections
    return [section.strip() for section in sections if section.strip()]


df["processed_lyrics"] = df["cleaned_lyrics"].apply(process_lyric)

In [5]:
#function to delete strings there are uninformative (only the keyword, empty strings, strings that are too short)


def clean_list(lyrics_list): 
    if not isinstance(lyrics_list, list):
        return lyrics_list
    tag_pattern = r"^\s*(Hook|Chorus|Bridge|Verse|Outro|Intro|Refrain|Prehook|Posthook|Coda|Interlude|Conclusion).*?\]\s*$|^(.{,20})\s*$"
    #tag_pattern = r"^\s*(Hook|Chorus|Bridge|Verse|Outro|Intro|Refrain|Prehook|Posthook|Coda|Interlude|Conclusion)\s*.*?\]\s*$"
    #tag_pattern = r"^\s*(Hook|Chorus|Bridge|Verse|Outro|Intro|Refrain|Prehook|Posthook|Coda|Interlude|Conclusion)\s*.*?\]?\s*$" questo mette la parentesi in fondo opzionale
    
    return [line for line in lyrics_list if line and not re.match(tag_pattern, line, re.IGNORECASE)]



df['cleaned_lists'] = df["processed_lyrics"].apply(clean_list)

In [6]:
df.head()

Unnamed: 0,title,artist,year,views,features,lyrics,is_country,is_pop,is_rap,is_rb,is_rock,cleaned_lyrics,processed_lyrics,cleaned_lists
0,Something in the Water,Pokey LaFarge,2015,10902,{''},"She got a broke down El Camino, in the front y...",True,False,False,False,False,"She got a broke down El Camino, in the front y...","[She got a broke down El Camino, in the front ...","[She got a broke down El Camino, in the front ..."
1,The Last Event,Matthew Ryan,2019,34,{''},Later on\nWhen we've all come down\nAnd the st...,True,False,False,False,False,Later on\nWhen we've all come down\nAnd the st...,[Later on\nWhen we've all come down\nAnd the s...,[Later on\nWhen we've all come down\nAnd the s...
2,Friends With The Enemy,Poor Man's Poison,2011,5611,"{""Poor Man\\\\'s Poison""}",[Verse 1]\nLonging for something more\nWill I ...,True,False,False,False,False,[Verse 1]\nLonging for something more\nWill I ...,"[[, 1]\nLonging for something more\nWill I eve...",[1]\nLonging for something more\nWill I ever b...
3,Sleepin With the Radio On,Charly McClain,1981,100,{''},My heart beats with the lonely rain\nWishin' I...,True,False,False,False,False,My heart beats with the lonely rain\nWishin' I...,[My heart beats with the lonely rain\nWishin' ...,[My heart beats with the lonely rain\nWishin' ...
4,Riders in the Sky,Michael Martin Murphey,1993,108,{''},An old cowpoke went ridin' out one dark and wi...,True,False,False,False,False,An old cowpoke went ridin' out one dark and wi...,[An old cowpoke went ridin' out one dark and w...,[An old cowpoke went ridin' out one dark and w...


### CONTROLLO SU PULIZIA LYRICS

In [7]:
x = df.loc[0, "processed_lyrics"]
x

['She got a broke down El Camino, in the front yard up on blocks\nHer mom walks around in a pink nightgown, sandals and white socks\nShe don’t mind a baseball game in the middle of the lightning and the rain\nShe’s a pain in my brain, drives me insane\nBut I love her just the same, boys, love her just the same',
 'Chorus]\nSomething in the water\n(something in the water)\nSomething in the water\n(something in the water)\nOoh-ooh-ooh-ooh\nWhat makes her crazy I don’t know\nNever seen anything like her before\nThere must be something in the water',
 'She does her makeup and hair, to cook fried chicken in her underwear\nShe drinks malt liquor for lunch and dinner and sends me running scared\nShe yells, she screams and she beats me\nBut I don’t mind the way she treats me\nShe’ll someday lead to my death I know\nBut I’ll stay with her just the same, boys, stay with her just the same',
 'Chorus]\nSomething in the water\n(something in the water)\nSomething in the water\n(something in the wate

In [8]:
x = df.loc[1, ["title", "artist"]]
x

title     The Last Event
artist      Matthew Ryan
Name: 1, dtype: object

In [9]:
x = df.loc[1, "cleaned_lists"]
x

["Later on\nWhen we've all come down\nAnd the streets are just a funeral\nAnd the last event\nIs\u2005falling\u2005ash and embers\nWe'll\u2005drift and waltz and twist into\u2005nothing\nWe weren't too bright my love, but\nWeren't we something?",
 'Heroes waved\nFrom\u205frented\u205fblack\u205fsedans\nMonsters crept behind\u205ftall buildings\nAnd we\u205fsmiled and cheered\nLike nothing ever ended\nWe were so beautiful we forgot we were human\nIn dusk and fear amen\nWe found our undoing',
 "Don't say it comes as any shock\nThings just go and go and go until they stop\nI'll always remember you as a song with no end\nHere until...\nEarly May\nShe was standing in the front yard\nThe sun was perfect, the clouds gone missing\nAnd I couldn't move\nThat's when she started crying\nIt felt so quick and yet somehow in slow motion\nWe're all just boats that sink while falling in love\nWith the ocean",
 "Don't say it comes as any shock\nThings just go and go and go until they stop\nI'll always r

### EXPLODE SAMPLED_DATA

In [7]:
#removing redundant variables
df = df.drop(['cleaned_lyrics', 'processed_lyrics'], axis=1)

In [8]:
#create n records for the n stanzas of each song
exploded_df = df.explode('cleaned_lists', ignore_index=False)
exploded_df.rename(columns={'cleaned_lists': 'stanzas'}, inplace=True)

#numbers the stanzas based on their order
exploded_df['stanza_number'] = exploded_df.groupby(exploded_df.index).cumcount()


exploded_df

Unnamed: 0,title,artist,year,views,features,lyrics,is_country,is_pop,is_rap,is_rb,is_rock,stanzas,stanza_number
0,Something in the Water,Pokey LaFarge,2015,10902,{''},"She got a broke down El Camino, in the front y...",True,False,False,False,False,"She got a broke down El Camino, in the front y...",0
0,Something in the Water,Pokey LaFarge,2015,10902,{''},"She got a broke down El Camino, in the front y...",True,False,False,False,False,Chorus]\nSomething in the water\n(something in...,1
0,Something in the Water,Pokey LaFarge,2015,10902,{''},"She got a broke down El Camino, in the front y...",True,False,False,False,False,"She does her makeup and hair, to cook fried ch...",2
0,Something in the Water,Pokey LaFarge,2015,10902,{''},"She got a broke down El Camino, in the front y...",True,False,False,False,False,Chorus]\nSomething in the water\n(something in...,3
0,Something in the Water,Pokey LaFarge,2015,10902,{''},"She got a broke down El Camino, in the front y...",True,False,False,False,False,"My hoosier girl is so fine, shake the watermel...",4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
161659,The Thin Gruel,Buck Gooter,2012,28,{''},[Refrain]\nThe thin gruel for millions of peop...,False,False,False,False,True,[Refrain]\nThe thin gruel for millions of people,0
161659,The Thin Gruel,Buck Gooter,2012,28,{''},[Refrain]\nThe thin gruel for millions of peop...,False,False,False,False,True,1]\nNothing is left but a mountain of shoes\nD...,1
161659,The Thin Gruel,Buck Gooter,2012,28,{''},[Refrain]\nThe thin gruel for millions of peop...,False,False,False,False,True,Refrain]\nThe thin gruel for millions of people,2
161659,The Thin Gruel,Buck Gooter,2012,28,{''},[Refrain]\nThe thin gruel for millions of peop...,False,False,False,False,True,2]\nSometimes you live\nAnd sometimes you die\...,3


In [13]:
#number of records
len(exploded_df)

966423

In [None]:
#checking the type of "stanza" values

for v in exploded_df["stanzas"]:
    print(type(v))
    break

<class 'str'>


In [10]:
#da rivedere questo controllo
#nella tail del explode_df si vede come ci siano tre record uguali ma con valori diversi in is_chorus


def is_chorus(stanza):
    if isinstance(stanza, str):
        pattern = r"^\s*(Chorus|Refrain)\s*(\]|\))"
        return bool(re.match(pattern, stanza, re.IGNORECASE))

#!TOCHECK
# this checks for is_chorus and whether a stanza is repeated
# for the same song (index)
exploded_df["is_chorus"] = (
    exploded_df["stanzas"].apply(is_chorus) | 
    exploded_df.groupby(exploded_df.index)["stanzas"].transform(
        lambda x: x.duplicated(keep=False)
    )
)

In [11]:
exploded_df.head()

Unnamed: 0,title,artist,year,views,features,lyrics,is_country,is_pop,is_rap,is_rb,is_rock,stanzas,stanza_number,is_chorus
0,Something in the Water,Pokey LaFarge,2015,10902,{''},"She got a broke down El Camino, in the front y...",True,False,False,False,False,"She got a broke down El Camino, in the front y...",0,False
0,Something in the Water,Pokey LaFarge,2015,10902,{''},"She got a broke down El Camino, in the front y...",True,False,False,False,False,Chorus]\nSomething in the water\n(something in...,1,True
0,Something in the Water,Pokey LaFarge,2015,10902,{''},"She got a broke down El Camino, in the front y...",True,False,False,False,False,"She does her makeup and hair, to cook fried ch...",2,False
0,Something in the Water,Pokey LaFarge,2015,10902,{''},"She got a broke down El Camino, in the front y...",True,False,False,False,False,Chorus]\nSomething in the water\n(something in...,3,True
0,Something in the Water,Pokey LaFarge,2015,10902,{''},"She got a broke down El Camino, in the front y...",True,False,False,False,False,"My hoosier girl is so fine, shake the watermel...",4,False


In [12]:
exploded_df.tail()

Unnamed: 0,title,artist,year,views,features,lyrics,is_country,is_pop,is_rap,is_rb,is_rock,stanzas,stanza_number,is_chorus
161659,The Thin Gruel,Buck Gooter,2012,28,{''},[Refrain]\nThe thin gruel for millions of peop...,False,False,False,False,True,[Refrain]\nThe thin gruel for millions of people,0,False
161659,The Thin Gruel,Buck Gooter,2012,28,{''},[Refrain]\nThe thin gruel for millions of peop...,False,False,False,False,True,1]\nNothing is left but a mountain of shoes\nD...,1,False
161659,The Thin Gruel,Buck Gooter,2012,28,{''},[Refrain]\nThe thin gruel for millions of peop...,False,False,False,False,True,Refrain]\nThe thin gruel for millions of people,2,True
161659,The Thin Gruel,Buck Gooter,2012,28,{''},[Refrain]\nThe thin gruel for millions of peop...,False,False,False,False,True,2]\nSometimes you live\nAnd sometimes you die\...,3,False
161659,The Thin Gruel,Buck Gooter,2012,28,{''},[Refrain]\nThe thin gruel for millions of peop...,False,False,False,False,True,Refrain]\nThe thin gruel for millions of people,4,True


In [20]:
x = exploded_df.loc[2, "stanzas"]
x

2    1]\nLonging for something more\nWill I ever be...
2    2]\nWorkin’ on a better way\nWill I ever be fr...
2    Chorus]\nAnd them hard times keep comin’ at th...
2    3]\nNever more will you see my face\nWill I ev...
2    Chorus]\nAnd them hard times keep comin’ at th...
2    Outro]\nAnd them hard times keep comin’ at the...
Name: stanzas, dtype: object

In [None]:
# function that cleans the string header from the keywords in order to get only clean strings (e.g. eliminating "chorus]", "refrain]") and removes \n between single lines

def pulire_stringhe(stanza):
    pattern = r"^.*?\]|\)\n*"
    
    result1 = re.sub(pattern, "", stanza)
    result2 = re.sub(r"\n", " ", result1)
    
    return result2


exploded_df["stanzas"] = exploded_df["stanzas"].apply(pulire_stringhe)


In [24]:
exploded_df.head()

Unnamed: 0,title,artist,year,views,features,lyrics,is_country,is_pop,is_rap,is_rb,is_rock,stanzas,stanza_number,is_chorus
0,Something in the Water,Pokey LaFarge,2015,10902,{''},"She got a broke down El Camino, in the front y...",True,False,False,False,False,"She got a broke down El Camino, in the front y...",0,False
0,Something in the Water,Pokey LaFarge,2015,10902,{''},"She got a broke down El Camino, in the front y...",True,False,False,False,False,Something in the water (something in the wate...,1,True
0,Something in the Water,Pokey LaFarge,2015,10902,{''},"She got a broke down El Camino, in the front y...",True,False,False,False,False,"She does her makeup and hair, to cook fried ch...",2,False
0,Something in the Water,Pokey LaFarge,2015,10902,{''},"She got a broke down El Camino, in the front y...",True,False,False,False,False,Something in the water (something in the wate...,3,True
0,Something in the Water,Pokey LaFarge,2015,10902,{''},"She got a broke down El Camino, in the front y...",True,False,False,False,False,"My hoosier girl is so fine, shake the watermel...",4,False


## LEMMATIZATION

In [None]:
#we disabled some packages to reduce the running time since they are not useful for this task
import spacy
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser", "textcat"])

In [26]:
def tokenize(stanza):
    if isinstance(stanza, str):
        stanza = stanza.lower()
        doc = nlp(stanza)
        tokens = [token.lemma_ for token in doc if not token.is_punct and not token.is_space ]
        return tokens

  
exploded_df["lemmatized_stanzas"] = exploded_df["stanzas"].apply(tokenize)
exploded_df.head()

Unnamed: 0,title,artist,year,views,features,lyrics,is_country,is_pop,is_rap,is_rb,is_rock,stanzas,stanza_number,is_chorus,lemmatized_stanzas
0,Something in the Water,Pokey LaFarge,2015,10902,{''},"She got a broke down El Camino, in the front y...",True,False,False,False,False,"She got a broke down El Camino, in the front y...",0,False,"[she, get, a, broke, down, el, camino, in, the..."
0,Something in the Water,Pokey LaFarge,2015,10902,{''},"She got a broke down El Camino, in the front y...",True,False,False,False,False,Something in the water (something in the wate...,1,True,"[something, in, the, water, something, in, the..."
0,Something in the Water,Pokey LaFarge,2015,10902,{''},"She got a broke down El Camino, in the front y...",True,False,False,False,False,"She does her makeup and hair, to cook fried ch...",2,False,"[she, do, her, makeup, and, hair, to, cook, fr..."
0,Something in the Water,Pokey LaFarge,2015,10902,{''},"She got a broke down El Camino, in the front y...",True,False,False,False,False,Something in the water (something in the wate...,3,True,"[something, in, the, water, something, in, the..."
0,Something in the Water,Pokey LaFarge,2015,10902,{''},"She got a broke down El Camino, in the front y...",True,False,False,False,False,"My hoosier girl is so fine, shake the watermel...",4,False,"[my, hoosi, girl, be, so, fine, shake, the, wa..."


In [None]:
#checking the result of the lemmatization
x = exploded_df.loc[0, "lemmatized_stanzas"]
x

0    [she, get, a, broke, down, el, camino, in, the...
0    [something, in, the, water, something, in, the...
0    [she, do, her, makeup, and, hair, to, cook, fr...
0    [something, in, the, water, something, in, the...
0    [my, hoosi, girl, be, so, fine, shake, the, wa...
0    [something, in, the, water, something, in, the...
Name: lemmatized_stanzas, dtype: object

In [None]:
#look at the stanzas 0, 2 and 4: same tokenized elements but different results in is_chorus
#that's because they were originally refrains
#the regex in the is_chorus function has been changed but it didn't bring the expected result
#bcs at this point we would have expected True for stanza 0 as well
exploded_df.tail()

Unnamed: 0,title,artist,year,views,features,lyrics,is_country,is_pop,is_rap,is_rb,is_rock,stanzas,stanza_number,is_chorus,lemmatized_stanzas
161659,The Thin Gruel,Buck Gooter,2012,28,{''},[Refrain]\nThe thin gruel for millions of peop...,False,False,False,False,True,The thin gruel for millions of people,0,False,"[the, thin, gruel, for, million, of, people]"
161659,The Thin Gruel,Buck Gooter,2012,28,{''},[Refrain]\nThe thin gruel for millions of peop...,False,False,False,False,True,Nothing is left but a mountain of shoes Dig y...,1,False,"[nothing, be, leave, but, a, mountain, of, sho..."
161659,The Thin Gruel,Buck Gooter,2012,28,{''},[Refrain]\nThe thin gruel for millions of peop...,False,False,False,False,True,The thin gruel for millions of people,2,True,"[the, thin, gruel, for, million, of, people]"
161659,The Thin Gruel,Buck Gooter,2012,28,{''},[Refrain]\nThe thin gruel for millions of peop...,False,False,False,False,True,Sometimes you live And sometimes you die But ...,3,False,"[sometimes, you, live, and, sometimes, you, di..."
161659,The Thin Gruel,Buck Gooter,2012,28,{''},[Refrain]\nThe thin gruel for millions of peop...,False,False,False,False,True,The thin gruel for millions of people,4,True,"[the, thin, gruel, for, million, of, people]"


## DA GESTIRE QUESTIONE DUPLICATI CHE SAREBBERO TALI MA CON VALORI DIVERSI IN "IS_CHORUS"
#### in ogni caso drop.duplicates() non funzionerebbe perchè fa fatica con le liste

In [None]:
#dropping redundant variables
exploded_df = exploded_df.drop(["lyrics", "stanzas"], axis=1)


In [33]:
exploded_df.head()

Unnamed: 0,title,artist,year,views,features,is_country,is_pop,is_rap,is_rb,is_rock,stanza_number,is_chorus,lemmatized_stanzas
0,Something in the Water,Pokey LaFarge,2015,10902,{''},True,False,False,False,False,0,False,"[she, get, a, broke, down, el, camino, in, the..."
0,Something in the Water,Pokey LaFarge,2015,10902,{''},True,False,False,False,False,1,True,"[something, in, the, water, something, in, the..."
0,Something in the Water,Pokey LaFarge,2015,10902,{''},True,False,False,False,False,2,False,"[she, do, her, makeup, and, hair, to, cook, fr..."
0,Something in the Water,Pokey LaFarge,2015,10902,{''},True,False,False,False,False,3,True,"[something, in, the, water, something, in, the..."
0,Something in the Water,Pokey LaFarge,2015,10902,{''},True,False,False,False,False,4,False,"[my, hoosi, girl, be, so, fine, shake, the, wa..."


In [None]:
#download the lemmatized_df 
#exploded_df.to_csv('lemmatized_df.csv', index=False)