## data collection and preprocessing



In [4]:
% matplotlib inline

# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# We do this to ignore several specific Pandas warnings
import warnings
import requests
import json
warnings.filterwarnings("ignore")

### Extract data from Million Song Subset which is 1.8G in size

In [79]:
import os, sys

dir_tree = 'C:\\Users\\10465\\Downloads\\millionsongsubset_full\\MillionSongSubset\\'

for dir_path, dir_names, file_names in os.walk(dir_tree):
    for file_name in file_names:
        try:
            os.rename(os.path.join(dir_path, file_name), os.path.join(dir_tree, file_name))
        except OSError:
            print ("Could not move %s " % os.join(dir_path, file_name))

### Build an artist table with file,title, artist columns

In [80]:
def make_artist_table(base):

# Get file names

    files = [os.path.join(base,fn) for fn in os.listdir(base) if fn.endswith('.h5')]
    data = {'file':[], 'artist':[], 'title':[]}

    # Add artist and title data to dictionary
    for f in files:
        store = pd.HDFStore(f)
        title = store.root.metadata.songs.cols.title[0]
        artist = store.root.metadata.songs.cols.artist_name[0]
        data['file'].append(os.path.basename(f))
        data['title'].append(title.decode("utf-8"))
        data['artist'].append(artist.decode("utf-8"))
        store.close()
    
    # Convert dictionary to pandas DataFrame
    df = pd.DataFrame.from_dict(data, orient='columns')
    df = df[['file', 'artist', 'title']]
    return df

In [81]:
base = 'C:\\Users\\10465\\Downloads\\millionsongsubset_full\\MillionSongSubset\\'
df = make_artist_table(base)

df.tail()

Unnamed: 0,file,artist,title
9996,TRBIJMU12903CF892B.h5,Moonspell,The Hanged Man
9997,TRBIJNF128F14815A7.h5,Danny Williams,The Wonderful World Of The Young
9998,TRBIJNK128F93093EC.h5,Winston Reedy,Sentimental Man
9999,TRBIJRN128F425F3DD.h5,"Myrick ""Freeze"" Guillory",Zydeco In D-Minor
10000,TRBIJYB128F14AE326.h5,Seventh Day Slumber,Shattered Life


### Add the lyrics column

In [83]:
df['lyrics'] = pd.Series('', index=df.index)
df.tail()

Unnamed: 0,file,artist,title,lyrics
9996,TRBIJMU12903CF892B.h5,Moonspell,The Hanged Man,
9997,TRBIJNF128F14815A7.h5,Danny Williams,The Wonderful World Of The Young,
9998,TRBIJNK128F93093EC.h5,Winston Reedy,Sentimental Man,
9999,TRBIJRN128F425F3DD.h5,"Myrick ""Freeze"" Guillory",Zydeco In D-Minor,
10000,TRBIJYB128F14AE326.h5,Seventh Day Slumber,Shattered Life,


### download the PyLyrics package to download lyrics from the website

In [98]:
# !pip install PyLyrics

Collecting PyLyrics
  Downloading PyLyrics-1.1.0.zip
Building wheels for collected packages: PyLyrics
  Running setup.py bdist_wheel for PyLyrics: started
  Running setup.py bdist_wheel for PyLyrics: finished with status 'done'
  Stored in directory: C:\Users\10465\AppData\Local\pip\Cache\wheels\3e\1a\25\a4217f5896313da289878107ba12324d3044413d466b65f243
Successfully built PyLyrics
Installing collected packages: PyLyrics
Successfully installed PyLyrics-1.1.0


You are using pip version 8.1.2, however version 9.0.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [99]:
from PyLyrics import *

In [100]:
# from PyLyrics import *
## test this function
print(PyLyrics.getLyrics('Taylor Swift','Blank Space')) #Print the lyrics directly

Nice to meet you, where you been?
I could show you incredible things
Magic, madness, heaven, sins
Saw you there and I thought

"Oh my God, look at that face!"
You look like my next mistake
Love's a game, wanna play?

New money, suit and tie
I can read you like a magazine
Ain't it funny, rumors fly
And I know you heard about me

So hey, let's be friends
I'm dying to see how this one ends
Grab your passport and my hand
I can make the bad guys good for a weekend

So it's gonna be forever
Or it's gonna go down in flames
You can tell me when it's over (mmm)
If the high was worth the pain

Got a long list of ex-lovers
They'll tell you I'm insane
'Cause you know I love the players
And you love the game

'Cause we're young and we're reckless
We'll take this way too far
It'll leave you breathless (mmm)
Or with a nasty scar

Got a long list of ex-lovers
They'll tell you I'm insane
But I got a blank space, baby
And I'll write your name

Cherry lips, crystal skies
I could show you incredible thing

In [90]:
import pyprind

### download lyrics with the arguments of artist and track name

In [102]:
pbar = pyprind.ProgBar(df.shape[0])
for row_id in df.index:
    try:
        lyr = PyLyrics.getLyrics(df.loc[row_id]['artist'],df.loc[row_id]['title'])    
        df.loc[row_id,'lyrics'] = lyr
        pbar.update()
    except: #ignore erro when API returns no lyrics 
        continue


0% [############################# ] 100% | ETA: 00:00:09

In [103]:
print('downloaded Lyrics for %s songs' %sum(df.lyrics!=''))
df.head()

downloaded Lyrics for 1048 songs


Unnamed: 0,file,artist,title,lyrics
0,subset_msd_summary_file.h5,Mastodon,Deep Sea Creature,"Knowing right, learning wrong\nWhat you're fee..."
1,TRAAAAW128F429D538.h5,Casual,I Didn't Mean To,Verse One:\n\nAlright I might\nHave had a litt...
2,TRAAABD128F429CF47.h5,The Box Tops,Soul Deep,"Darling, I don't know much\nBut I know I love ..."
4,TRAAAEF128F4273421.h5,Adam Ant,Something Girls,Adam Ant/Marco Pirroni\nEvery girl is a someth...
5,TRAAAFD128F92F423A.h5,Gob,Face the Ashes,"I've just erased it's been a while, I've got a..."


In [104]:
df.to_csv('df_lyr_backup.csv')

### drop rows that has no lyrics

In [105]:
df = df[df.lyrics!='']

### remove songs that is not English song, cited from https://github.com/rasbt/musicmood

In [109]:
import nltk

def eng_ratio(text):
    ''' Returns the ratio of non-English to English words from a text '''

    english_vocab = set(w.lower() for w in nltk.corpus.words.words()) 
    text_vocab = set(w.lower() for w in text.split() if w.lower().isalpha()) 
    unusual = text_vocab.difference(english_vocab)
    diff = len(unusual)/len(text_vocab)
    return diff

In [110]:
before = df.shape[0]
for row_id in df.index:
    text = df.loc[row_id]['lyrics']
    diff = eng_ratio(text)
    if diff >= 0.5:
        df = df[df.index != row_id]
after = df.shape[0]
rem = before - after
print('%s have been removed.' %rem)
print('%s songs remain in the dataset.' %after)

131 have been removed.
917 songs remain in the dataset.


### Till now, we got the songs with lyrics, but we have to tag each song with mood. Here I download the tags from Last.fm and classified each some with happy mood or sad mood

In [136]:
def getSongTags(artist,track):
    url = "http://ws.audioscrobbler.com/2.0/?method=track.getTopTags&api_key=0f6916aff634cb3e768baa9d5ee89341&artist="+artist+"&track="+track+"&format=json"
#     print(url)
    results = requests.get(url).json()
#     print(results)
    tagList = []
    if 'toptags' in results:
        toptags = results['toptags']
        if 'tag' in toptags:
            taglistss = toptags['tag']           
            for tagItem in taglistss:
                tagList.append(tagItem['name']) 
    return tagList

In [138]:
df['tags'] = ''
pbar = pyprind.ProgBar(df.shape[0])
for row_id in df.index:
    tags = getSongTags(df.loc[row_id]['artist'],df.loc[row_id]['title'])    
    df.loc[row_id,'tags'] = tags
    pbar.update()
    


0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:02:49


In [139]:
getSongTags("Mastodon","Deep Sea Creature")

['Progressive metal',
 'metal',
 'Sludge',
 'metalcore',
 'rock',
 'alternative metal',
 'sludge metal',
 'progressive stoner metal',
 'heavy metal',
 'hard rock',
 'thrash metal',
 'death metal',
 'post-hardcore',
 'wwe',
 'Megadeth',
 'Testament',
 'exodus',
 'Math core',
 'attacker']

In [141]:
df['tags']
# df.head()

0       [Progressive metal, metal, Sludge, metalcore, ...
1       [Bay Area, hieroglyiphics, Hip-Hop, classic, h...
2       [60s, soul, pop, rock, oldies, 1969, classic r...
4       [new wave, 80s, girls, freedom, strangeromanti...
5                                       [punk rock, punk]
9       [Progressive rock, classic rock, rock, hard ro...
15      [80s, all in a day, Lionel Richie, slowhand, v...
16      [classic rock, country, singer-songwriter, blu...
19      [hard rock, rock, chillout, Love, Favorite, ch...
24      [mathcore, hardcore, metal, experimental, meta...
25      [oldies, 60s girls, 60s, female vocalist, midb...
26          [metal, seen live, lie, the favouritest song]
29      [latin, latin jazz, jazz, Soundtrack, Latin-ja...
32                                [favourite, fave songs]
33                                                     []
34                                              [country]
42                                                     []
43            

In [159]:
for row_id in df.index:     
    if len(df.loc[row_id,'tags'])==0:
        df = df.drop(row_id)
    

In [None]:
df['year'] = pd.Series('', index=df.index)

base = 'C:\\Users\\10465\\Downloads\\millionsongsubset_full\\MillionSongSubset\\'
files = [os.path.join(base,fn) for fn in os.listdir(base) if fn.endswith('.h5')]
for row_id in df.index:
    filename = df.loc[row_id]['file']
    filepath = os.path.join(base,filename)
    store = pd.HDFStore(filepath)
    year = store.root.musicbrainz.songs.cols.year[0]
    df.loc[row_id]['year'] = year

<img src="image.png">

In [160]:
df.shape[0]

719

## Happy or Sad
### Group id	Tags	num. of tags	num. of songs
#### sad tags:

G15	sad, sadness, unhappy, melancholic, melancholy, feeling sad, mood: sad - slightly, sad song	8	1,178

G16	depressed, blue, dark, depressive, dreary, gloom, darkness, depress, depression, depressing, gloomy	11	471

G28	anger, angry, choleric, fury, outraged, rage, angry music	7	254

G17	grief, heartbreak, mournful, sorrow, sorry, doleful, heartache, heartbreaking, heartsick, lachrymose, mourning, plaintive, regret, sorrowful	14	183

#### happy tags:
G6	cheerful, cheer up, festive, jolly, jovial, merry, cheer, cheering, cheery, get happy, rejoice, songs that are cheerful, sunny	13	142

G5	happy, happiness, happy songs, happy music, glad, mood: happy	6	749

G2	upbeat, gleeful, high spirits, zest, enthusiastic, buoyancy, elation, mood: upbeat	8	543

G1	excitement, exciting, exhilarating, thrill, ardor, stimulating, thrilling, titillating	8	30
TOTAL		135	6,490

### This tag summary comes from the last.fm website which were group into different categories. Here, I choose group 15,16,28,17 as sad tag and group 5,6,2,1 as happy songs

In [166]:
happyTags = "cheerful, cheer up, festive, jolly, jovial, merry, cheer, cheering,\
cheery, get happy, rejoice, songs that are cheerful, sunny,happy, happiness, happy songs, happy music, glad, mood: happy,\
upbeat, gleeful, high spirits, zest, enthusiastic, buoyancy, elation, mood: upbeat,excitement, exciting, exhilarating, thrill,\
ardor, stimulating, thrilling, titillating"
happyTags = happyTags.replace(" ","").split(",")

sagTags = "sad, sadness, unhappy, melancholic, melancholy, feeling sad, mood: sad - slightly, sad song,\
depressed, blue, dark, depressive, dreary, gloom, darkness, depress, depression, depressing, gloomy,\
anger, angry, choleric, fury, outraged, rage, angry music,grief, heartbreak, mournful, sorrow, sorry, doleful, heartache, heartbreaking, heartsick, lachrymose, mourning,\
plaintive, regret, sorrowful"
sagTags = sagTags.replace(" ","").split(",")

In [165]:
happyTags

['cheerful',
 'cheerup',
 'festive',
 'jolly',
 'jovial',
 'merry',
 'cheer',
 'cheering',
 'cheery',
 'gethappy',
 'rejoice',
 'songsthatarecheerful',
 'sunny',
 'happy',
 'happiness',
 'happysongs',
 'happymusic',
 'glad',
 'mood:happyupbeat',
 'gleeful',
 'highspirits',
 'zest',
 'enthusiastic',
 'buoyancy',
 'elation',
 'mood:upbeat',
 'excitement',
 'exciting',
 'exhilarating',
 'thrill',
 'ardor',
 'stimulating',
 'thrilling',
 'titillating']

In [167]:
sagTags

['sad',
 'sadness',
 'unhappy',
 'melancholic',
 'melancholy',
 'feelingsad',
 'mood:sad-slightly',
 'sadsong',
 'depressed',
 'blue',
 'dark',
 'depressive',
 'dreary',
 'gloom',
 'darkness',
 'depress',
 'depression',
 'depressing',
 'gloomy',
 'anger',
 'angry',
 'choleric',
 'fury',
 'outraged',
 'rage',
 'angrymusic',
 'grief',
 'heartbreak',
 'mournful',
 'sorrow',
 'sorry',
 'doleful',
 'heartache',
 'heartbreaking',
 'heartsick',
 'lachrymose',
 'mourning',
 'plaintive',
 'regret',
 'sorrowful']

### Based on the tag numbers from sad group or the happy group, we can assign a mood value 1(happy) or 0(sad) to the mood column 

In [169]:
df['mood']=""
pbar = pyprind.ProgBar(df.shape[0])
for row_id in df.index:
    tags = df.loc[row_id,'tags']    
    sad_tags = np.intersect1d(tags,sagTags) 
    happy_tags = np.intersect1d(tags,happyTags)
    if len(sad_tags)>0 or len(happy_tags)>0:# having mood tag
        if len(sad_tags)>len(happy_tags):
            df.loc[row_id,'mood'] = 0
        else:
            df.loc[row_id,'mood'] = 1
    else:
        df = df.drop(row_id)# remove songs that does not have tag
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:01


In [171]:
df.shape[0]

166

In [172]:
df['mood']

24      1
29      1
49      1
56      1
75      1
76      0
79      0
87      0
88      0
116     0
139     0
167     1
180     0
189     1
198     0
205     1
214     0
226     1
232     1
236     1
242     0
259     0
281     1
321     1
352     1
372     1
410     0
411     0
443     1
449     0
       ..
2627    0
2641    0
2655    0
2665    1
2672    0
2683    0
2730    0
2751    0
2804    1
2811    0
2837    1
2840    0
2858    1
2913    1
2920    1
2963    0
2971    0
2974    0
3050    0
3051    0
3067    1
3081    1
3085    1
3116    0
3128    0
3132    1
3147    0
3159    1
3171    1
3214    1
Name: mood, dtype: object

In [173]:
df

Unnamed: 0,file,artist,title,lyrics,tags,mood
24,TRAACER128F4290F96.h5,The Dillinger Escape Plan,Setting Fire to Sleeping Giants,First off let me say you look so tired...\nRes...,"[mathcore, hardcore, metal, experimental, meta...",1
29,TRAACNS128F14A2DF5.h5,Willie Bobo,Spanish Grease,Ain't no love\nAin't no love\nAin't no love\nA...,"[latin, latin jazz, jazz, Soundtrack, Latin-ja...",1
49,TRAADQX128F422B4CF.h5,Casiotone For The Painfully Alone,Nashville Parthenon,If you ever make it back to Nashville\nRemembe...,"[Lo-Fi, indie, indietronica, electronic, 8-bit...",1
56,TRAAEEH128E0795DFE.h5,Chris Rea,Driving Home For Christmas,"I'm driving home for Christmas\nOh, I can't wa...","[christmas, xmas, Christmas Songs, Chris Rea, ...",1
75,TRAAGCZ128F93210FD.h5,Black Eyed Peas,Let's Get It Started,"Let's get it started, in here...\n\nAnd the ba...","[Hip-Hop, dance, pop, black eyed peas, hip hop...",1
76,TRAAGEC128E079252C.h5,Nine Inch Nails,Metal,We're in the building where they make us grow\...,"[industrial, cover, electronic, industrial roc...",0
79,TRAAGJV128F1464090.h5,Lesley Gore,It's My Party,It's my party and I'll cry if I want to\nCry i...,"[60s, oldies, pop, female vocalists, rock n ro...",0
87,TRAAGPH128F4273C8D.h5,Savage Garden,You Can Still Be Free,Cool breeze and autumn leaves\nSlow motion day...,"[pop, savage garden, australian, amazing, Guil...",0
88,TRAAGPJ128F428CD1B.h5,Queens Of The Stone Age,I Never Came,"When you say it's dead and gone\nYes, I know y...","[Stoner Rock, rock, alternative rock, alternat...",0
116,TRAAIHL128F92E6DDA.h5,Boyz II Men,On Bended Knee,"And I....\n\nDarlin' I, I can't explain\nWhere...","[rnb, slow jams, 90s, love songs, soul, Love, ...",0


In [174]:
df['year'] = pd.Series('', index=df.index)

base = 'C:\\Users\\10465\\Downloads\\millionsongsubset_full\\MillionSongSubset\\'
files = [os.path.join(base,fn) for fn in os.listdir(base) if fn.endswith('.h5')]
for row_id in df.index:
    filename = df.loc[row_id]['file']
    filepath = os.path.join(base,filename)
    store = pd.HDFStore(filepath)
    year = store.root.musicbrainz.songs.cols.year[0]
    df.loc[row_id]['year'] = year

In [175]:
df

Unnamed: 0,file,artist,title,lyrics,tags,mood,year
24,TRAACER128F4290F96.h5,The Dillinger Escape Plan,Setting Fire to Sleeping Giants,First off let me say you look so tired...\nRes...,"[mathcore, hardcore, metal, experimental, meta...",1,2004
29,TRAACNS128F14A2DF5.h5,Willie Bobo,Spanish Grease,Ain't no love\nAin't no love\nAin't no love\nA...,"[latin, latin jazz, jazz, Soundtrack, Latin-ja...",1,1997
49,TRAADQX128F422B4CF.h5,Casiotone For The Painfully Alone,Nashville Parthenon,If you ever make it back to Nashville\nRemembe...,"[Lo-Fi, indie, indietronica, electronic, 8-bit...",1,2006
56,TRAAEEH128E0795DFE.h5,Chris Rea,Driving Home For Christmas,"I'm driving home for Christmas\nOh, I can't wa...","[christmas, xmas, Christmas Songs, Chris Rea, ...",1,1986
75,TRAAGCZ128F93210FD.h5,Black Eyed Peas,Let's Get It Started,"Let's get it started, in here...\n\nAnd the ba...","[Hip-Hop, dance, pop, black eyed peas, hip hop...",1,2004
76,TRAAGEC128E079252C.h5,Nine Inch Nails,Metal,We're in the building where they make us grow\...,"[industrial, cover, electronic, industrial roc...",0,2000
79,TRAAGJV128F1464090.h5,Lesley Gore,It's My Party,It's my party and I'll cry if I want to\nCry i...,"[60s, oldies, pop, female vocalists, rock n ro...",0,1963
87,TRAAGPH128F4273C8D.h5,Savage Garden,You Can Still Be Free,Cool breeze and autumn leaves\nSlow motion day...,"[pop, savage garden, australian, amazing, Guil...",0,1999
88,TRAAGPJ128F428CD1B.h5,Queens Of The Stone Age,I Never Came,"When you say it's dead and gone\nYes, I know y...","[Stoner Rock, rock, alternative rock, alternat...",0,2005
116,TRAAIHL128F92E6DDA.h5,Boyz II Men,On Bended Knee,"And I....\n\nDarlin' I, I can't explain\nWhere...","[rnb, slow jams, 90s, love songs, soul, Love, ...",0,1994


In [178]:
# df.to_csv('lyrics_166.csv', index=False,encoding='utf-8')

In [179]:
df = df.drop("tags",axis=1)

In [181]:
len(df[df.mood==1])/166


0.3795180722891566

In [5]:
# save songs with mood tag into csv file for training and keywords extraction
df.to_csv('lyrics_166.csv', index=False,encoding='utf-8')