In [104]:
import re
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from functools import reduce

## Cleaning and preparing the dataset

In [337]:
df = pd.read_pickle('Data/ug_dataset.pkl')

In [289]:
wrong_lyrics = df.Lyrics.apply(lambda l : len(",".join(map(str,l)).split(' ')))<40
print("NUMBER OF WRONG LYRICS =", wrong_lyrics.sum())
df[wrong_lyrics].Lyrics

NUMBER OF WRONG LYRICS = 120


182     [central lead thing, then you can add little t...
184     [Save us from the ball and chain, Save us from...
339                                                    []
583     [Your mouth is a revolver firing bullets in th...
733           [, , , , They got Catfish on the table, , ]
1019    [, My heart is drenched in wine,, My heart is ...
1391                                                   []
1527                                                   []
1550                                                   []
1683                                                   []
1703                              [, , Enjoy the silence]
1757    [when the world has turned, paralyzed and wron...
1764    [Stay alive, stay alive, for me, You will die,...
1772                 [You have to play this all the time]
1799    [Party rock is in the house tonight, Everybody...
1817                                 [Love will never do]
1960    [Moon river, Wider than a mile, I'm crossing y...
1964    [You t

In [191]:
wrong_chords = df.Chords.apply(len) < 4
print("NUMBER OF WRONG CHORDS =", wrong_chords.sum())
df[wrong_chords].Chords

NUMBER OF WRONG CHORDS = 22


1391                []
1883      [G#, Cm, Fm]
2525                []
2637                []
2650                []
2677                []
2726                []
3687      [G#, Cm, Fm]
3711                []
3896          [G7, E7]
3966    [F#min, G, Em]
3982     [F#m, A, F#m]
4134           [Em, D]
4195               [D]
4254                []
4351                []
4370                []
4496                []
4628                []
4871         [E, A, B]
4992                []
5171                []
Name: Chords, dtype: object

In [192]:
wrong_entries = wrong_chords | wrong_lyrics

In [193]:
df[wrong_entries]

Unnamed: 0,Title,Artist,Genre,URL,Hits,Chords,Lyrics
339,Time,Pink Floyd,Rock,https://tabs.ultimate-guitar.com/tab/pink_floy...,956813,"[F#m, Dmaj7, Amaj7, C#m7, Bm7, Bm7, F#m, A, E,...",[]
583,Bonfire Heart,James Blunt,Folk,https://tabs.ultimate-guitar.com/tab/james_blu...,693513,"[C, C, Em, C, Em, F, Am, C, G, C, C, Am, F, Am...",[Your mouth is a revolver firing bullets in th...
733,Walking In Memphis,Marc Cohn,Folk,https://tabs.ultimate-guitar.com/tab/marc_cohn...,416908,"[F, G, C, Am, F, G, C, Am, F, G, C, Am, F, G, ...","[, , , , They got Catfish on the table, , ]"
1391,As Shes Walking Away,Zac Brown Band,Country,https://tabs.ultimate-guitar.com/tab/zac_brown...,307146,[],[]
1527,The Truth,Jason Aldean,Country,https://tabs.ultimate-guitar.com/tab/jason_ald...,188560,"[G, C, G, C, G, C, G, C, D, C, G, C, D, G, C, ...",[]
1550,Let It Be Me,The Everly Brothers,Country,https://tabs.ultimate-guitar.com/tab/the_everl...,176998,"[G, D, Em, Bm, C, G, C, G, G, D, Em, Bm, C, G,...",[]
1683,Moon River,Henry Mancini,Electronic,https://tabs.ultimate-guitar.com/tab/henry_man...,404406,"[Bb9-5, F#m7-5, F7, C, Am, F, C, F, C, Dm7, E7...",[]
1703,Enjoy The Silence,Depeche Mode,Electronic,https://tabs.ultimate-guitar.com/tab/depeche_m...,355711,"[Bsus2, Bbsus2, Asus2, Dm, G, Bsus2, Dm, Bsus2...","[, , Enjoy the silence]"
1772,Good Feeling,Flo Rida,Electronic,https://tabs.ultimate-guitar.com/tab/flo_rida/...,218867,"[C#m, E, B, A, C#m, E, B, A, C#m, E, B, A]",[You have to play this all the time]
1817,Its Not Unusual,Tom Jones,Electronic,https://tabs.ultimate-guitar.com/tab/tom_jones...,164945,"[C, Dm, C, Dm, C, Dm, C, Dm, Em, Dm, G, C, G, ...",[Love will never do]


In [194]:
df[wrong_entries].Genre.value_counts()#.plot(kind="pie")
#plt.show()

Reggae              12
Jazz                12
Blues                8
Hip Hop              7
Metal                7
Electronic           5
Country              3
Rhythm And Blues     2
Contemporary R&b     2
Folk                 2
Religious Music      1
Rock                 1
Name: Genre, dtype: int64

Upon inspection, one entry has some empty lists in its chord sequence :

In [338]:
df.iloc[3999].Chords

['G',
 'C',
 'Em',
 'C',
 'G',
 'C',
 'Em',
 'C',
 'G',
 'C',
 'Em',
 'C',
 'G',
 'C',
 'Em',
 'C',
 'G',
 'C',
 'Em',
 'C',
 'G',
 'C',
 'Em',
 'C',
 'G',
 'C',
 'Em',
 'C',
 'G',
 'C',
 'Em',
 'C',
 'G',
 'C',
 'Em',
 'C',
 'G',
 'C',
 'Em',
 'C',
 'G',
 'C',
 'Em',
 'C',
 'G',
 'C',
 'Em',
 'C',
 'G',
 'C',
 'Em',
 'C',
 'G',
 'C',
 'Em',
 'C',
 'G',
 'C',
 'Em',
 'C',
 'G',
 'C',
 'Em',
 'C',
 'G',
 'C',
 'Em',
 'C',
 'G',
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 []]

We remove those empty lists with the following command :

In [348]:
df.Chords = df.Chords.apply(lambda l : list(filter(None, l)))
df.Chords = df.Chords.apply(lambda l : list(filter(lambda c : "[" not in c and "]" not in c, l)))

In [344]:
filtered_df = df[~wrong_entries]
filtered_df

Unnamed: 0,Title,Artist,Genre,URL,Hits,Chords,Lyrics
0,Hallelujah,Jeff Buckley,Rock,https://tabs.ultimate-guitar.com/tab/jeff_buck...,23157554,"[C, Am, C, Am, C, Am, C, Am, F, G, C, G, C, F,...","[I heard there was a secret chord, That David ..."
1,Im Yours,Jason Mraz,Rock,https://tabs.ultimate-guitar.com/tab/jason_mra...,16413451,"[G, D, Em, C, G, D, Em, C, G, D, Em, C, G, D, ...","[Well you done done me and you bet I felt it,,..."
2,Wonderwall,Oasis,Rock,https://tabs.ultimate-guitar.com/tab/oasis/won...,12091872,"[Em7, G, Dsus4, A7sus4, Em7, G, Dsus4, A7sus4,...","[Today is gonna be the day, That they're gonna..."
3,Wish You Were Here,Pink Floyd,Rock,https://tabs.ultimate-guitar.com/tab/pink_floy...,10529033,"[Em7, G, Em7, G, Em7, A7sus4, Em7, A7sus4, G, ...","[So, so you think you can tell,, Heaven from H..."
4,Hey Soul Sister,Train,Rock,https://tabs.ultimate-guitar.com/tab/train/hey...,10423751,"[C, G, Am, F, C, G, Am, F, C, G, Am, F, G, C, ...",[Your lipstick stains on the front lobe of my ...
5,Wonderwall,Oasis,Rock,https://tabs.ultimate-guitar.com/tab/oasis/won...,9452644,"[F#m, A, Esus4, B, F#m, A, Esus4, B, F#m, A, E...","[Today is gonna be the day, That they're gonna..."
6,The Only Exception,Paramore,Rock,https://tabs.ultimate-guitar.com/tab/paramore/...,9420989,"[G, Dm, Cmaj7, G, Dm, Cmaj7, G, Dm, Cmaj7, G, ...",[When I was younger I saw my daddy cry and cur...
7,Love Story,Taylor Swift,Rock,https://tabs.ultimate-guitar.com/tab/taylor_sw...,8964458,"[C, G, Am, F, C, F, Am, F, C, F, Am, G, F, G, ...","[We were both young when I first saw you, I cl..."
8,Iris,Goo Goo Dolls,Rock,https://tabs.ultimate-guitar.com/tab/goo_goo_d...,8841766,"[Bm, Bsus2, G, D, Em, G, Bm, A, G, D, Em, G, B...","[And I'd give up forever to touch you, 'Cause ..."
9,Let It Be,The Beatles,Rock,https://tabs.ultimate-guitar.com/tab/the_beatl...,8786451,"[C, G, Am, Fmaj7, F6, C, G, F, C/E, Dm, C, C, ...","[When I find myself in times of trouble, Mothe..."


# Measuring harmonic complexity

In [345]:
chord_set_by_song = filtered_df['Chords'].apply(set)
all_chords = reduce(set.union, chord_set_by_song)
print("Total number of different chords found in our dataset :", len(all_chords), "chords.")

Total number of different chords found in our dataset : 1077 chords.


In [366]:
#Helper method to show the set of chords in our dataset that contains a certain "component" string
def show_set_of_chords(component):
    return set([ c for c in all_chords if component in c ])

show_set_of_chords("sus")

{'A7(sus)',
 'A7sus',
 'A7sus4',
 'Absus',
 'Absus4',
 'Asus',
 'Asus2',
 'Asus2/A',
 'Asus2/E',
 'Asus2/F#',
 'Asus2/G',
 'Asus2/G#',
 'Asus4',
 'Asus4/E',
 'Asus7',
 'B7sus',
 'B7sus2',
 'B7sus4',
 'B9sus4',
 'Bb6sus2',
 'Bb7sus4',
 'Bbsus',
 'Bbsus2',
 'Bbsus4',
 'Bbsus4/Eb',
 'Bsus',
 'Bsus/F#',
 'Bsus2',
 'Bsus4',
 'C#7sus4',
 'C#sus2',
 'C7sus',
 'C7sus4',
 'Csus',
 'Csus2',
 'Csus4',
 'Csus4/F',
 'D#sus',
 'D(sus2)',
 'D7sus',
 'D7sus2',
 'D7sus2/F#',
 'D7sus4',
 'D7sus4/G',
 'D9sus',
 'Db7sus',
 'Db7sus4',
 'Dbsus2',
 'Dbsus4',
 'Dbsus4/Ab',
 'Dsus',
 'Dsus/C',
 'Dsus/F#',
 'Dsus/G',
 'Dsus2',
 'Dsus2/Ab',
 'Dsus2/Bb',
 'Dsus2/C#',
 'Dsus2/E',
 'Dsus2/F#',
 'Dsus4',
 'Dsus4/A',
 'Dsus4/F#',
 'E7(sus)',
 'E7sus',
 'E7sus4',
 'E7sus4/B',
 'E7sus4/C#',
 'E7sus4/D',
 'E9sus4',
 'Eb7sus',
 'Ebsus2',
 'Ebsus4',
 'Ebsus4/F',
 'Esus',
 'Esus2',
 'Esus4',
 'Esus4/B',
 'F#7b9sus4',
 'F#7sus',
 'F#7sus4',
 'F#sus',
 'F#sus/G#',
 'F#sus2',
 'F#sus4',
 'F7sus',
 'F7sus4',
 'Fsus',
 'Fsus2',

For each song, we create a dataframe from the chord sequence that contains one row per chord with all the features of the chord decomposed into columns.

In [349]:
def chord_seq_to_df(chord_seq):
    s = pd.Series(chord_seq)
    return pd.concat([s, s.apply(lambda c : c[0:2] if len(c)>1 and (c[1]=="b" or c[1]=="#") else c[0]), \
                    s.apply(lambda c : "m" in c or "min" in c), \
                    s.apply(lambda c : not ("m" in c or "min" in c)), \
                    s.apply(lambda c : "7" in c), \
                    s.apply(lambda c : "maj7" in c), \
                    s.apply(lambda c : "aug" in c), \
                    s.apply(lambda c : "add" in c), \
                    s.apply(lambda c : "dim" in c), \
                    s.apply(lambda c : "sus" in c), \
                    s.apply(lambda c : "5" in c), \
                    s.apply(lambda c : "6" in c), \
                    s.apply(lambda c : "9" in c), \
                    s.apply(lambda c : "11" in c), \
                    s.apply(lambda c : "13" in c), \
                    s.apply(lambda c : c.split('/')[1] if "/" in c else None) ], 
              keys=['chord', 'root', 'minor', 'major', '7th', 'major 7th', 'augmented', 'add', 'diminished', 'suspended', 'power chord', '6th', '9th', '11th', '13th', 'specified bass note'],
              axis=1)

In [350]:
chords_df = filtered_df.Chords.apply(chord_seq_to_df)

In [351]:
chords_df[3487]

Unnamed: 0,chord,root,minor,major,7th,major 7th,augmented,add,diminished,suspended,power chord,6th,9th,11th,13th,specified bass note
0,D,D,False,True,False,False,False,False,False,False,False,False,False,False,False,
1,D/A,D,False,True,False,False,False,False,False,False,False,False,False,False,False,A
2,Asus4,A,False,True,False,False,False,False,False,True,False,False,False,False,False,
3,G,G,False,True,False,False,False,False,False,False,False,False,False,False,False,
4,G/B,G,False,True,False,False,False,False,False,False,False,False,False,False,False,B
5,D/F#,D,False,True,False,False,False,False,False,False,False,False,False,False,False,F#
6,Bm7,B,True,False,True,False,False,False,False,False,False,False,False,False,False,
7,Em7,E,True,False,True,False,False,False,False,False,False,False,False,False,False,
8,D,D,False,True,False,False,False,False,False,False,False,False,False,False,False,
9,Asus4,A,False,True,False,False,False,False,False,True,False,False,False,False,False,


In [359]:
filtered_df["Chords_df"] = chords_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [363]:
filtered_df.iloc[3473].Chords

['C',
 'G',
 'Am',
 'G',
 'F',
 'C',
 'F',
 'C',
 'Am',
 'Dm',
 'G',
 'C',
 'G',
 'Am',
 'G',
 'F',
 'C',
 'F',
 'C',
 'Am',
 'Dm',
 'G',
 'C',
 'Am',
 'F',
 'G',
 'C',
 'Am',
 'F',
 'G',
 'Am',
 'F',
 'G',
 'Am',
 'G/G7',
 'C',
 'Am',
 'F',
 'G',
 'C',
 'Am',
 'F',
 'G',
 'Am',
 'F',
 'G',
 'C']

In [364]:
filtered_df.iloc[3473].Chords_df

Unnamed: 0,chord,root,minor,major,7th,major 7th,augmented,add,diminished,suspended,power chord,6th,9th,11th,13th,specified bass note
0,C,C,False,True,False,False,False,False,False,False,False,False,False,False,False,
1,G,G,False,True,False,False,False,False,False,False,False,False,False,False,False,
2,Am,A,True,False,False,False,False,False,False,False,False,False,False,False,False,
3,G,G,False,True,False,False,False,False,False,False,False,False,False,False,False,
4,F,F,False,True,False,False,False,False,False,False,False,False,False,False,False,
5,C,C,False,True,False,False,False,False,False,False,False,False,False,False,False,
6,F,F,False,True,False,False,False,False,False,False,False,False,False,False,False,
7,C,C,False,True,False,False,False,False,False,False,False,False,False,False,False,
8,Am,A,True,False,False,False,False,False,False,False,False,False,False,False,False,
9,Dm,D,True,False,False,False,False,False,False,False,False,False,False,False,False,


In [365]:
filtered_df.to_pickle("Data/dataset_milestone3.pkl")