In [294]:
#Imports
import numpy as np
import pandas as pd
import re
import nltk
import string
import scipy
import pylab as pl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from statistics import mean
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
from operator import itemgetter

### Pre-processing
Using TF-IDF is allowing to have a complexity score for each document, however it requires some pre-processing. The algorithm is using so data-cleaning on its own, such as taking into account only alphanumerical characters and only words which are longer than 2 characters. However we must get rid of english stop words. Those words only have a syntactic role and bring nothing to the complexity computation. Indeed they are of no good to our analysis, thus why we must remove them.

### Analysis
Once the cleaning is done, the score can be computed

In [269]:
#Loading the data
df = pd.read_pickle('Data/ug_dataset.pkl')

#Turning the lyrics into a string, putting in lowercase to process stopwords
df['Lyrics']  = df['Lyrics'].apply(lambda x: ' '.join(map(str, x))).str.lower()


#Adding the stopword list 
stops = set(stopwords.words("english"))

#Removing the stop words
df['TokenizedLyrics']  =  df.Lyrics.str.split(' ').apply(lambda l: [w for w in l if w not in stops])
#Putting together the lyrics of each song again
df 

#Removing the lyrics for which there are less than 40 words
df = df[(df.TokenizedLyrics.apply(len) > 40) & (~df.Lyrics.str.contains(r"[\+\_\|]"))]
display(df.shape)

(4677, 8)

In [272]:
from sklearn.feature_extraction.text import TfidfVectorizer

#Applying the Sklearn implementation of TF-IDF
tfidf_vec = TfidfVectorizer()
transformed = tfidf_vec.fit_transform(df.TokenizedLyrics.apply(lambda x: ' '.join(map(str, x))))
index_value={i[1]:i[0] for i in tfidf_vec.vocabulary_.items()}

#Getting the full index
fully_indexed = []
for row in transformed:
    fully_indexed.append({index_value[column]:value for (column,value) in zip(row.indices,row.data)})


#Putting the output in the DataFrame
df['tf_idf'] = fully_indexed



In [274]:
display(df.head())

Unnamed: 0,Title,Artist,Genre,URL,Hits,Chords,Lyrics,TokenizedLyrics,tf_idf
0,Hallelujah,Jeff Buckley,Rock,https://tabs.ultimate-guitar.com/tab/jeff_buck...,23157554,"[C, Am, C, Am, C, Am, C, Am, F, G, C, G, C, F,...",i heard there was a secret chord that david pl...,"[heard, secret, chord, david, played, pleased,...","{'heard': 0.06692382833490777, 'secret': 0.089..."
1,Im Yours,Jason Mraz,Rock,https://tabs.ultimate-guitar.com/tab/jason_mra...,16413451,"[G, D, Em, C, G, D, Em, C, G, D, Em, C, G, D, ...","well you done done me and you bet i felt it, i...","[well, done, done, bet, felt, it,, tried, chil...","{'music': 0.05415600350071205, 'well': 0.10985..."
2,Wonderwall,Oasis,Rock,https://tabs.ultimate-guitar.com/tab/oasis/won...,12091872,"[Em7, G, Dsus4, A7sus4, Em7, G, Dsus4, A7sus4,...",today is gonna be the day that they're gonna t...,"[today, gonna, day, they're, gonna, throw, bac...","{'heard': 0.04846009883982555, 'really': 0.040..."
3,Wish You Were Here,Pink Floyd,Rock,https://tabs.ultimate-guitar.com/tab/pink_floy...,10529033,"[Em7, G, Em7, G, Em7, A7sus4, Em7, A7sus4, G, ...","so, so you think you can tell, heaven from hel...","[so,, think, tell,, heaven, hell,, blue, skies...","{'cold': 0.17722646708151818, 'hot': 0.2167689..."
4,Hey Soul Sister,Train,Rock,https://tabs.ultimate-guitar.com/tab/train/hey...,10423751,"[C, G, Am, F, C, G, Am, F, C, G, Am, F, G, C, ...",your lipstick stains on the front lobe of my l...,"[lipstick, stains, front, lobe, left, side, br...","{'you': 0.02891145667903827, 'cut': 0.05934857..."


In [275]:
#Computing the average score of each song

df['average_score'] = df['tf_idf'].apply(lambda d: mean(d.values()))

In [276]:
display(df.sort_values('average_score', ascending=False))

Unnamed: 0,Title,Artist,Genre,URL,Hits,Chords,Lyrics,TokenizedLyrics,tf_idf,average_score
2176,Hound Dog,Elvis Presley,Rhythm And Blues,https://tabs.ultimate-guitar.com/tab/elvis_pre...,255523,"[C, F7, C, G7, F7, C, C, F7, C, G7, F7, C, C, ...",you ain't nothin' but a hound dog cryin' all t...,"[ain't, nothin', hound, dog, cryin', time, ain...","{'well': 0.2891762738064728, 'time': 0.1068324...",0.258399
4890,Since Ive Laid My Burden Down,Mississippi John Hurt,Blues,https://tabs.ultimate-guitar.com/tab/mississip...,21310,"[C, F, C, C, C, G, C, C, F, C, C, C, G, C, C, ...",glory glory hallelujah since i've laid my burd...,"[glory, glory, hallelujah, since, i've, laid, ...","{'hallelujah': 0.22250689716191074, 've': 0.18...",0.232169
570,White Blank Page,Mumford & Sons,Folk,https://tabs.ultimate-guitar.com/tab/mumford_s...,732452,"[Em, C, G, C, G, D/F#, Em, C, G, C, G, D/F#, E...",as well as your folly and can you kneel before...,"[well, folly, kneel, king, say, ‘i’m, tell, fa...","{'well': 0.04043119861030134, 'king': 0.064840...",0.217358
4824,Cant Find My Way Home,Blind Faith,Blues,https://tabs.ultimate-guitar.com/tab/blind_fai...,74992,"[D/C, G/B, Gm, Bb, D/A, F, G, D, D/C, G/B, Gm,...",you are the reason i've been waiting so long w...,"[reason, i've, waiting, long, well,, i'm, near...","{'well': 0.15606198924486153, 've': 0.12465935...",0.213904
4528,When The Saints Go Marching In,Louis Armstrong,Jazz,https://tabs.ultimate-guitar.com/tab/louis_arm...,51468,"[Em, B7, Em, A, B7, Em, Em, B7, Em, A, B7, Em,...","oh, when the saints, go marching in, oh when t...","[oh,, saints,, go, marching, in,, oh, saints, ...","{'lord': 0.2721394328267872, 'throne': 0.13730...",0.209913
4791,Love Me Like A River Does,Melody Gardot,Jazz,https://tabs.ultimate-guitar.com/tab/melody_ga...,7681,"[Am, Dm, Am, Dm, Am, Dm, F, E, Am, Am, Dm, Am,...",love me like a river does cross the sea love m...,"[love, like, river, cross, sea, love, like, ri...","{'like': 0.2853514918646245, 'baby': 0.1329117...",0.207637
2164,Hound Dog,Elvis Presley,Rhythm And Blues,https://tabs.ultimate-guitar.com/tab/elvis_pre...,278934,"[C, F, C, G, F, C, C, F, C, G, F, C, C, F, C, ...",you ain't nothin' but a hound dog cryin' all t...,"[ain't, nothin', hound, dog, cryin', time, ain...","{'well': 0.25743127200417487, 'time': 0.095104...",0.207534
3500,Hosanna,Hillsong United,Religious Music,https://tabs.ultimate-guitar.com/tab/hillsong_...,45306,"[G, D, Em7, C, D, G, D, Em7, C, D, C, D, G, C,...","hosanna, hosanna hosanna in the highest! hosan...","[hosanna,, hosanna, hosanna, highest!, hosanna...","{'lord': 0.19922376374892703, 'lift': 0.127383...",0.205095
3228,Let It Rain,Jesus Culture,Religious Music,https://tabs.ultimate-guitar.com/tab/jesus_cul...,397827,"[C#m7, A, E, B, A2, C#m7, E, B, C#m7, A, E, B,...","let it rain, let it rain open the flood gates ...","[let, rain,, let, rain, open, flood, gates, he...","{'you': 0.09279014476066891, 'love': 0.0732504...",0.202687
3477,Take My Life,Third Day,Religious Music,https://tabs.ultimate-guitar.com/tab/third_day...,48756,"[G, C, D, G, G, C, D, G, C, G, D, G, C, D, G, ...",how many times have i turned away the number i...,"[many, times, turned, away, number, sand, shor...","{'back': 0.10372280411761801, 'sky': 0.0729240...",0.202567


In [277]:
df[(df.Title == 'Whiskey In The Jar') & (df.Hits == 254627) ]

Unnamed: 0,Title,Artist,Genre,URL,Hits,Chords,Lyrics,TokenizedLyrics,tf_idf,average_score
2459,Whiskey In The Jar,Metallica,Metal,https://tabs.ultimate-guitar.com/tab/metallica...,254627,"[G5, F#5, Em, G5, F#5, Em, C5, G5, Em, C5, G5,...",as i was goin' over the cork and kerry mounta...,"[, goin', cork, kerry, mountains, saw, captain...","{'you': 0.0323138152291752, 'like': 0.10320415...",0.102852


In [285]:
df.tf_idf[3939].tf_idf

AttributeError: 'dict' object has no attribute 'sort'

In [300]:
sorted(df.tf_idf[123].items(), key=itemgetter(1), reverse=True)

[('11', 0.46760451593449914),
 ('cuddles', 0.29394212174251283),
 ('argumentative', 0.29394212174251283),
 ('12', 0.2570675330954131),
 ('remember', 0.23636506725615153),
 ('mardy', 0.19596141449500853),
 ('kitchen', 0.18150360589881606),
 ('yeah', 0.14997735794160208),
 ('oh', 0.1456197859036844),
 ('bum', 0.14558546303061057),
 ('face', 0.1374140223506184),
 ('ground', 0.13710003753645883),
 ('joke', 0.13191547751415675),
 ('10', 0.1278357656342259),
 ('laugh', 0.10798105701171935),
 ('hard', 0.1053559301268833),
 ('disappointment', 0.09798070724750427),
 ('arsed', 0.09798070724750427),
 ('reoccurs', 0.09798070724750427),
 ('things', 0.09619086863253302),
 ('around', 0.09429812927407187),
 ('today', 0.0911918703632577),
 ('a5', 0.09022561553811645),
 ('g5', 0.08772903363843644),
 ('pleasant', 0.08568917769847102),
 ('jokes', 0.08568917769847102),
 ('on', 0.0845226800854623),
 ('right', 0.08255066261527948),
 ('barrel', 0.0824705238287286),
 ('away', 0.08154684063041756),
 ('debate', 