In [2]:
import pandas as pd

### Load the sampled version of the dataset

In [9]:
df = pd.read_csv("./data/song_lyrics_sampled.csv")
print('Data rows number: ', len(df))
df.head()

Data rows number:  10000


Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
0,Etap,rap,Der Plot,2014,124,{},"[Part I - Conny:]\nGuten Morgen fremdes Bett, ...",383522,de,de,de
1,Toothpick,pop,Biting Elbows,2012,8873,{},Some folks got the patience of the angels\nNot...,1166787,en,en,en
2,6 Feet Under,pop,Ana Johnsson,2004,60,{},You just left me 6 feet under ground I'm burni...,803057,en,en,en
3,Ir Al Baile,pop,Onda Vaga,2015,731,{},Cuando a los doce llevé la bandera en el hombr...,905848,es,es,es
4,Prudenza mai,pop,Ivan Graziani,1989,35,{},"Prudenza mai, mai...\nMai neanche da bambino\n...",1304379,it,it,it


### Drop useless columns and keep only english songs

In [10]:
# consider only english songs
df = df[df.language == 'en']
# drop rows containing NaN values
df = df.dropna()
# drop useless columns
df = df.drop(['language_cld3', 'language_ft','language','features','views'], axis=1)
print('Data rows number: ', len(df))
df.head()

Data rows number:  7810


Unnamed: 0,title,tag,artist,year,lyrics,id
1,Toothpick,pop,Biting Elbows,2012,Some folks got the patience of the angels\nNot...,1166787
2,6 Feet Under,pop,Ana Johnsson,2004,You just left me 6 feet under ground I'm burni...,803057
5,The Poetaster Act 4. Scene 2,misc,Ben Jonson,1601,"A Room in Lupus's House.\n\nEnter Lupus, HISTR...",674438
6,Hes Gone,pop,Phil Lesh & Friends,2015,"Rat in a drain ditch, caught on a limb, you kn...",961823
7,Ill Never Say,pop,Helen Ward,2015,"I'll never say ""never again"" again\nCause here...",1163619


### Dataset preprocessing

In [11]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


def preprocess_text(text):
    # remove \n
    text = text.replace('\n', ' ')
    # remove punctuation
    text = re.sub(r'[,\.!?]', '', text)
    #removing text in square braquet
    text = re.sub(r'\[.*?\]', ' ', text)
    #removing numbers
    text = re.sub(r'\w*\d\w*',' ', text)
    #removing bracket
    text = re.sub(r'[()]', ' ', text)
    # convert all words in lower case
    text = text.lower()
    # tokenize
    tokens = word_tokenize(text)
    # remove stop words
    stop_words = stopwords.words('english')
    new_stop_words = ['ooh','yeah','hey','whoa','woah', 'ohh', 'was', 'mmm', 'oooh','yah','yeh','mmm', 'hmm','deh','doh','jah','wa']
    stop_words.extend(new_stop_words)
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # lemmatize
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    #remove tokens with lenght < 3
    final_tokens = [token for token in lemmatized_tokens if len(token) > 2 and not token.isnumeric()]

    return final_tokens

cleaned_text = df["lyrics"].apply(preprocess_text)
df["lyrics_proc"] = cleaned_text
df.head()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/alessandro/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alessandro/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/alessandro/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,title,tag,artist,year,lyrics,id,lyrics_proc
1,Toothpick,pop,Biting Elbows,2012,Some folks got the patience of the angels\nNot...,1166787,"[folk, got, patience, angel, heart, well, year..."
2,6 Feet Under,pop,Ana Johnsson,2004,You just left me 6 feet under ground I'm burni...,803057,"[left, foot, ground, burning, sight, light, fo..."
5,The Poetaster Act 4. Scene 2,misc,Ben Jonson,1601,"A Room in Lupus's House.\n\nEnter Lupus, HISTR...",674438,"[room, lupus, house, enter, lupus, histrio, li..."
6,Hes Gone,pop,Phil Lesh & Friends,2015,"Rat in a drain ditch, caught on a limb, you kn...",961823,"[rat, drain, ditch, caught, limb, know, better..."
7,Ill Never Say,pop,Helen Ward,2015,"I'll never say ""never again"" again\nCause here...",1163619,"['ll, never, say, never, cause, love, head, he..."


In [13]:
# first 10 tokens of the first song
df.iloc[0]['lyrics_proc'][:10]

['folk',
 'got',
 'patience',
 'angel',
 'heart',
 'well',
 'yearns',
 'vengeance',
 'leave',
 'place']