In [2]:
# Data Manipulation
#-----------------------------
import pandas as pd


# Natural Laguage Processing
#-----------------------------
import nltk

# Stopwords
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')

# Punctuation
from nltk.tokenize import wordpunct_tokenize

# Stemmers and Lemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer, SnowballStemmer


# Other packages
#-----------------------------

import string
import re
import random
from random import sample

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rigau\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rigau\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Import Dataset

In [3]:
artist = pd.read_csv('./Data set/artists-data.csv')
lyrics = pd.read_csv('./Dataset/lyrics-data.csv')

FileNotFoundError: [Errno 2] No such file or directory: './Data set/artists-data.csv'

## Artist dataset 

In [3]:
artist.head()

Unnamed: 0,Artist,Songs,Popularity,Link,Genre,Genres
0,10000 Maniacs,110,0.3,/10000-maniacs/,Rock,Rock; Pop; Electronica; Dance; J-Pop/J-Rock; G...
1,12 Stones,75,0.3,/12-stones/,Rock,Rock; Gospel/Religioso; Hard Rock; Grunge; Roc...
2,311,196,0.5,/311/,Rock,Rock; Surf Music; Reggae; Ska; Pop/Rock; Rock ...
3,4 Non Blondes,15,7.5,/4-non-blondes/,Rock,Rock; Pop/Rock; Rock Alternativo; Grunge; Blue...
4,A Cruz Está Vazia,13,0.0,/a-cruz-esta-vazia/,Rock,Rock


In [4]:
print(f'The artist dataframe has {artist.shape[0]} rows and {artist.shape[1]} columns.')

The artist dataframe has 3242 rows and 6 columns.


In [5]:
artist.Genre.value_counts()

Rock            797
Pop             796
Sertanejo       617
Hip Hop         537
Funk Carioca    302
Samba           193
Name: Genre, dtype: int64

We only keep the Rock, Hip Hop and Pop songs, as most of them have lyrics in English.
The other genres are most likely made of songs in Portuguese.

In [6]:
artist = artist[artist.Genre.isin(['Rock','Hip Hop','Pop'])]

artist.shape

(2130, 6)

We still have 65% of the original dataframe.

We now check if there are any null values.

In [7]:
artist.isnull().sum()

Artist        0
Songs         0
Popularity    0
Link          0
Genre         0
Genres        4
dtype: int64

The null values in Genres are not an issue, as they indicate if artists belong to multiple genre, outside of their main one.
We will not use this data for our models.

In [8]:
genre_per_artist = artist.groupby(['Artist'])[['Genre']].count()
genre_per_artist.value_counts()

Genre
1        1841
2         143
3           1
dtype: int64

Some artists belong two 2 or more different (main) music genre. We will not keep them, to avoid any ambiguity when trying to classify the lyrics.

In [9]:
artist_1_genre = genre_per_artist[genre_per_artist.Genre == 1].index.tolist()

## Lyrics dataset

In [10]:
lyrics.head()

Unnamed: 0,ALink,SName,SLink,Lyric,Idiom
0,/10000-maniacs/,More Than This,/10000-maniacs/more-than-this.html,I could feel at the time. There was no way of ...,ENGLISH
1,/10000-maniacs/,Because The Night,/10000-maniacs/because-the-night.html,"Take me now, baby, here as I am. Hold me close...",ENGLISH
2,/10000-maniacs/,These Are Days,/10000-maniacs/these-are-days.html,These are. These are days you'll remember. Nev...,ENGLISH
3,/10000-maniacs/,A Campfire Song,/10000-maniacs/a-campfire-song.html,"A lie to say, ""O my mountain has coal veins an...",ENGLISH
4,/10000-maniacs/,Everyday Is Like Sunday,/10000-maniacs/everyday-is-like-sunday.html,Trudging slowly over wet sand. Back to the ben...,ENGLISH


In [11]:
print(f'The lyrics dataframe has {lyrics.shape[0]} rows and {lyrics.shape[1]} columns.')

The lyrics dataframe has 209522 rows and 5 columns.


We keep the songs in English only.

In [12]:
lyrics = lyrics[lyrics.Idiom == 'ENGLISH']

While we lose about 50% of the dataset by doing this, we still have 114,723 songs in the dataset, which will be sufficiant for modelling.

We now check if there are any null values.

In [13]:
lyrics.isnull().sum()

ALink    0
SName    0
SLink    0
Lyric    0
Idiom    0
dtype: int64

## Merging the datasets

In [14]:
df_lyrics = artist.merge(lyrics, left_on='Link', right_on='ALink', how = 'inner')

We now drop the duplicate rows.

In [15]:
df_lyrics = df_lyrics.drop_duplicates()

We keep the songs where the artists only belong to 1 principal music genre.

In [16]:
df_lyrics = df_lyrics[df_lyrics.Artist.isin(artist_1_genre)]
df_lyrics = df_lyrics.reset_index(drop=True)

In [17]:
print(f'There are {len(df_lyrics.Artist.unique())} different artists.')

There are 1023 different artists.


In [18]:
Genre_dict = {'Rock':0, 'Hip Hop':1, 'Pop':2}
df_lyrics['Genre'] = df_lyrics['Genre'].map(Genre_dict)

In [19]:
df_lyrics.head(2)

Unnamed: 0,Artist,Songs,Popularity,Link,Genre,Genres,ALink,SName,SLink,Lyric,Idiom
0,12 Stones,75,0.3,/12-stones/,0,Rock; Gospel/Religioso; Hard Rock; Grunge; Roc...,/12-stones/,World So Cold,/12-stones/world-so-cold.html,"It starts with pain, followed by hate. Fueled ...",ENGLISH
1,12 Stones,75,0.3,/12-stones/,0,Rock; Gospel/Religioso; Hard Rock; Grunge; Roc...,/12-stones/,Broken,/12-stones/broken.html,Freedom!. Alone again again alone. Patiently w...,ENGLISH


For convenience and readability we will only keep a few columns, as we will not be using the others ones.

In [20]:
df_lyrics = df_lyrics[['SName', 'Lyric','Artist', 'Genre']].copy()

In [21]:
print(f'We now have a dataframe made of {df_lyrics.shape[0]} different songs, ready for cleaning.')

We now have a dataframe made of 79452 different songs, ready for cleaning.


We now export the dataframe, to save a copy of it.

In [22]:
df_lyrics.to_csv('./Dataset/music_1_genre.csv', index = False)

#df_lyrics = pd.read_csv('./Dataset/music_1_genre.csv')

## Radom lyrics sample

We have a look at a few songs.

In [23]:
random.seed(10)
dict = {0:'Rock', 1:'Hip Hop', 2:'Pop'}

for item in sample(range(len(df_lyrics)), 3):
    print(df_lyrics['Lyric'][item], "\n")
    print("Genre: {}".format(dict.get(df_lyrics['Genre'][item])))
    print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")

(feat. Mannie Fresh). [Intro: Baby]. Uh huh uh huh. Cut me up in my head, Cut me up nigga fuck. Ay ay Fresh, its all gravy baby. Its my turn nigga, I'm under the burn biotch. If you ain't getting money, you's a crazy motherfucker (got to be a crazy motherfucker). I ain't getting money, I'm probably a crazy motherfucker (lord help me). This is a biotch, biotch. [Baby]. Come on nigga. Its them pimpin, ballin, gangster macks. Theys old school caddies with them bows and racks. The new school bens with them bows to match. But its the Birdman daddy got them O's of crack. A nigga off the diet cause a nigga gettin fat. Them throwback jerseys with them throwback hats. Burberry timbs with the fence to match. And I'm so so fly, and mommy like that. Ounces of that purple and we do it by the sac. Rag-top bens with them rag-top lacs. Nothing to a gangsta nigga choak your strap. We live for money, hoes cooking that crap. Them 20" rims and them tires are flat. There ain't no question, that stunna is a

# Data Preparation

## Stemming

Before cleaning the data, we explore what kind of stemmers are available, that is:
 - Porter Stemmer
 - Lancaster Stemmer
 - Snowball Stemmer
 
Because of the computational requirement for lemmatizing, we will only stem the data.

In [24]:
p_stemmer = PorterStemmer()
l_stemmer = LancasterStemmer()
s_stemmer = SnowballStemmer(language='english')

lemma = nltk.wordnet.WordNetLemmatizer()

In [25]:
my_str = ['apparatus', 'applicable', 'apply', 'apple', 'appletown', 
          'AppleTown', 'apples', 'apply', 'app', 'application', 'applied', 'applies', ]

In [26]:
def StemTest(my_str=my_str):
    ''' Testing various words on a selection of stemmers and lemmatizers '''
    
    assert(type(my_str)==list), "Please input your selection of words as a LIST!"
    my_str_1 = [p_stemmer.stem(x) for x in my_str] ## Use the Porter stemmer
    my_str_2 = [l_stemmer.stem(x) for x in my_str] ## Use the Lancaster stemmer
    my_str_3 = [s_stemmer.stem(x) for x in my_str] ## Use the Snowball stemmer
    my_str_4 = [lemma.lemmatize(x) for x in my_str]
    quick_dict = {1:'Porter: ', 2:'Lancaster: ', 3:"Snowball: ", 4:'Lemmatizer:'}

    ## Output our results from all stemmers, together with the original string
    print("Original: ", my_str, "\n")
    for i in range(1,5):
        print(quick_dict[i], locals()['my_str_{}'.format(i)], sep=" ")

We play aroung with the different stemmers/lemmatizer.

In [27]:
StemTest()

In [28]:
StemTest(['try', 'tryy', 'tryyy', 'tryyyy', 'tryin\''])

Original:  ['try', 'tryy', 'tryyy', 'tryyyy', "tryin'"] 

Porter:  ['tri', 'tryy', 'tryyi', 'tryyyy', "tryin'"]
Lancaster:  ['try', 'tryy', 'tryyy', 'tryyyy', "tryin'"]
Snowball:  ['tri', 'tryy', 'tryyi', 'tryyyy', 'tryin']
Lemmatizer: ['try', 'tryy', 'tryyy', 'tryyyy', "tryin'"]


None of the stemmers or lemmatizer seem to be dealing well with words ending by multiple instances of the same letter (commonly found in song lyrics).

In [29]:
StemTest(['yeeeaaaahhhhh'])

Original:  ['yeeeaaaahhhhh'] 

Porter:  ['yeeeaaaahhhhh']
Lancaster:  ['yeeeaaaahhhhh']
Snowball:  ['yeeeaaaahhhhh']
Lemmatizer: ['yeeeaaaahhhhh']


Similar observation when the repetition of the letter is within a word.

## Data cleaning using Regex

To solve the letter repetition within words issue, we will use regular expression.

In [30]:
def regex_clean(txt, regex, sub=' '):
    
    '''This function takes in a string and a regular expression.
    It will clean the string by removing any match from the regular expression.
    
    The output is the cleaned string'''
        
    return " ".join(re.sub(regex, sub, txt).split()) ## Substitute the desired regex with nothing,
                                                    ## then bring the sentence back together

### Repeated instances of letters

This snipet will remove letters that repeat themselves 3 times or more, and keep only a single instance.

In [31]:
test_items = ['ooooooh', 'yeeeaaaaahhhhh', 'good', 'hello', 'yassss!!!']
test_clean = [regex_clean(word, r'(\w)\1{2,}', r'\1') for word in test_items]

test_clean

['oh', 'yeah', 'good', 'hello', 'yas!!!']

### Instances of trailing '

For example: Trying -> tryin'
This kind of spelling is quite common within song lyrics.

In [32]:
test_items = ['tryin\' ', 'handlin\' ', 'handlin\'.', 'handling\'']
test_clean = [regex_clean(word, r"'\s|'\.", r'g') for word in test_items]

test_clean

['trying', 'handling', 'handling', "handling'"]

### Instances of square brackets

We need to remove square brackets from the lyrics, as well as what is inside the brackets.

For example: \[Chorus 1\]

In [33]:
test_items = ['I once ate an elephant [It was delicious!]', '[Chorus 2] I will live on!', 
             'There [mouse] will be [rat] justice.']
test_clean = [regex_clean(word, r'(\[.*?\])', r'') for word in test_items]

test_clean

['I once ate an elephant', 'I will live on!', 'There will be justice.']

### Tracking

We will now keep track of how many instance of trailing ' and letter repetition there are in every song.
We already suspect that there should be a correlation between the number of ' and the Hip Hop genre.

In [34]:
# Cleaning

lyric_multiple_letter = [regex_clean(word, r'(\w)\1{2,}', r'\1') for word in df_lyrics.Lyric] # Removing all trailing '
lyric_clean_trail = [regex_clean(word, r"'\s|'\.", r'g') for word in df_lyrics.Lyric] # Removing triple letters or more. 

# Tokenising
lyric = [nltk.wordpunct_tokenize(sentence) for sentence in df_lyrics.Lyric] # We will use this list for comparison

lyric_multiple_letter = [nltk.wordpunct_tokenize(sentence) for sentence in lyric_multiple_letter]
lyric_clean_trail = [nltk.wordpunct_tokenize(sentence) for sentence in lyric_clean_trail]

#### Repeated instances of letters

To count how many time letters were repeated three times or more, we use the Hamming distance.

In [35]:
from scipy.spatial.distance import hamming

Example of how the Hamming distance works. 
The more similar the words are, the closer to 0 the score is.

In [36]:
hamming(['apple', 'apple', 'appl'], ['apple', 'apple', 'apple'])

0.3333333333333333

In [37]:
hamming(['apple', 'apple', 'apple'], ['apple', 'apple', 'apple'])

0.0

We now define a function to calculate the Hamming distance.

In [38]:
def hamham(my_lst):
    '''This function calculates the Hamming distance between the lyrics and their tokenized version.
    Input = list with cleaned lyrics
    
    Output = list containing the hamming distance between the original lyrics and the cleaned lyrics'''
    
    new_lst = []
    j=0
    for item in my_lst:
        new_lst.append(hamming(item,lyric[j]))
        j+=1
    
    return new_lst

We now store the results in the main dataframe.

In [39]:
df_lyrics['multiple_letter'] = hamham(lyric_multiple_letter)

We look at the obtained values.

In [40]:
df_lyrics['multiple_letter'].describe()

count    79452.000000
mean         0.000690
std          0.004534
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          0.276190
Name: multiple_letter, dtype: float64

We notice that the maximum is 0.27, which is not very high and the mean is very close to zero.
We can assume that this feature will not have too much impact on the modelling.

#### Instances of letters

In [41]:
Trails = [len(lyric[i])-len(lyric_clean_trail[i]) for i in range(len(lyric))]

In [42]:
df_lyrics['Trails'] = Trails

In [43]:
df_lyrics.Trails.value_counts()

 0      56526
 2       5556
 4       3101
 6       1869
 8       1425
        ...  
 94         1
 167        1
 176        1
 100        1
-6          1
Name: Trails, Length: 127, dtype: int64

There are some negative values, we set them to 0.

In [44]:
df_lyrics.loc[df_lyrics['Trails'] < 0, 'Trails'] = 0

# Data Preparation

In [45]:
stpwrds = stopwords.words('english') # Import the list of stopwords.
string.punctuation # Import the list of punctuation.
stpwrds.extend(string.punctuation) # Merge the 2 lists together.

#  Add chorus and verse to the list, as they are not part of the lyrics
stpwrds.extend(['chorus', 'verse', 'verses', 'choruses'])
                                        

### Function Declaration

In [46]:
def prep_data(song, stem='s'):
    ''' This function takes in a string, clean it using regular expressions, tokenize it, remove
    digits and any word shorter than 2 characters.
    
    It returned the clean and tokenized version of the string'''
    assert(stem=='s' or stem=='l' or stem=='p'), '''Input a correct stemming parameter and try again.
    The only accepted types are s for Snowball, l for Lancaster or p for Porter. Default is Snowball.'''
    
    song = song.lower() ## Transform the sentence into lowercase

    song = regex_clean(song, r'(\w)\1{2,}', r'\1') # Removing all trailing '
    song = regex_clean(song, r"'\s|'\.", r'g') # Removing triple letters or more.
    song = regex_clean(song, r'(\[.*?\])', r'') # Removing any characeters inside brackets (including brackets)
    song = regex_clean(song, r'(\W){2,}', r'\1') # Removing trailing white space
    
    ## Tokenize 
    song = nltk.wordpunct_tokenize(song) ## tokenize the string

    ## Post Token Cleaning - Stuff that applies to a list 
        
    song = [word for word in song if word not in stpwrds] # Eliminate all extended stopwords from among our tokens
    song = [globals()['{}_stemmer'.format(stem)].stem(word) for word in song] # Apply the chosen stemmer
    song = [word for word in song if not word.isdigit()] # Remove all digits
    song = [word for word in song if len(word)>2] # Remove words shorter than 2 characters, to avoid 'de, tg, ll' etc.


    return song

Function test:

In [47]:
prep_data("I am gonna go so hardn't I won't back down dwag 555!.?  ??any word", 'p')

['gonna', 'hardn', 'back', 'dwag', 'word']

### Cleaning

We now clean the lyrics and add them to the main dataframe.

In [48]:
lyrics = [" ".join(prep_data(x)) for x in df_lyrics['Lyric']]

df_lyrics['lyrics_clean'] = lyrics

Null values verification:

In [49]:
df_lyrics.isnull().sum()

SName              0
Lyric              0
Artist             0
Genre              0
multiple_letter    0
Trails             0
lyrics_clean       0
dtype: int64

### Extra features

We create a list containing the 10 most common words for each genre, after removing the 200 most popular words for the other genres.

In [50]:
Hop = set(['nigga', 'shit', 'bitch', 'fuck', 'hit', 'gon', 'niggaz', 'with', 'bout', 'hoe', 'cuz'])
Rock = set(['blue', 'dead', 'line', 'woman', 'child', 'cold', 'town', 'year',
       'help', 'alive', 'behind'])
Pop = set(['kiss', 'touch', 'christmas', 'hurt', 'beautiful', 'matter', 'somebody', 'trying', 'else', 'knew', 'forget'])

We will use set manipulation to check if one of these words appear in a song.

In [51]:
set_lyric = [set(song) for song in lyric] # Set of the tokenized lyrics

# These three lists will track if a word from the 10 most common words for each genre is present in the lyrics.
Rock_lst = [1 if len(song&Rock)>0 else 0 for song in set_lyric]
Pop_lst = [1 if len(song&Pop)>0 else 0 for song in set_lyric]
Hop_lst = [1 if len(song&Hop)>0 else 0 for song in set_lyric]

We now add these lists to the main dataframe, in 3 separate columns.

In [52]:
df_lyrics = pd.concat([df_lyrics, pd.Series(Rock_lst), pd.Series(Pop_lst), pd.Series(Hop_lst)], axis=1)
df_lyrics.rename(columns={0: 'Rock', 1:'Pop', 2:'Hip_hop'}, inplace=True)
df_lyrics.head()

Unnamed: 0,SName,Lyric,Artist,Genre,multiple_letter,Trails,lyrics_clean,Rock,Pop,Hip_hop
0,World So Cold,"It starts with pain, followed by hate. Fueled ...",12 Stones,0,0.0,0,start pain follow hate fuel endless question o...,1,0,1
1,Broken,Freedom!. Alone again again alone. Patiently w...,12 Stones,0,0.0,0,freedom alon alon patient wait phone hope call...,1,1,0
2,3 Leaf Loser,"Biting the hand that feeds you, lying to the v...",12 Stones,0,0.0,0,bite hand feed lie voic insid reach beg someth...,1,1,0
3,Anthem For The Underdog,You say you know just who I am. But you can't ...,12 Stones,0,0.0,2,say know imagin wait across line thought still...,1,0,0
4,Adrenaline,My heart is beating faster can't control these...,12 Stones,0,0.007042,0,heart beat faster control feel anymor wait lon...,0,0,0


In [53]:
df_lyrics.isnull().sum()

SName              0
Lyric              0
Artist             0
Genre              0
multiple_letter    0
Trails             0
lyrics_clean       0
Rock               0
Pop                0
Hip_hop            0
dtype: int64

In [54]:
df_lyrics.iloc[7509]

SName                                           L'espirit D'escalier
Lyric              All you have to do. All you have to do. Ahhhhh...
Artist                                                           Jet
Genre                                                              0
multiple_letter                                             0.142857
Trails                                                             0
lyrics_clean                                                        
Rock                                                               0
Pop                                                                0
Hip_hop                                                            0
Name: 7509, dtype: object

Now that the dataframe is cleaned and ready for modelling, we export it.

In [55]:
df_lyrics.to_csv('./Dataset/clean_lyrics.csv', index = False)
