# Analyse processed dataset full of _da_ tunes

In [1]:
from platform import python_version
python_version()

'3.7.1'

In [2]:
import spacy
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# source: "Crowdsourcing a Word-Emotion Association Lexicon", Saif Mohammad
# and Peter Turney, Computational Intelligence, 29 (3), 436-465, 2013
from nrclex import NRCLex

### Functions

In [113]:
def convert_timestamp_to_seconds(x):
    """Convert annotated timestamp (ss or mm'ss) to number of seconds."""
    if isinstance(x, int):
        return x
    elif "'" in x:
        return int(x[0])*60 + int(x[2:])  # '0x' is correctly converted to x
    
def get_dtm(texts):
    """Create a document-term matrix."""
    cv = CountVectorizer(analyzer="word") 
    cv_matrix = cv.fit_transform(texts)
    
    df_dtm = pd.DataFrame(cv_matrix.toarray(),
                          columns=cv.get_feature_names())
    
    return cv, df_dtm

def extract_pos(lyrics):
    """Return a dictionary of nouns, proper nouns, noun phrases, verbs, and entities in the lyrics."""
    doc = nlp(lyrics)
        
    nouns = [token.lemma_ for token in doc if token.pos_ == "NOUN"]  
    pnouns = [token.lemma_ for token in doc if token.pos_ == "PROPN"]
    noun_phrases = [chunk.text for chunk in doc.noun_chunks]
    verbs = [token.lemma_ for token in doc if token.pos_ == "VERB"]
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    
    out = {"nouns": nouns,
           "pnouns": pnouns,
           "noun_phrases": noun_phrases,
           "verbs": verbs,
           "entities": entities
          }    
        
    return out

def compute_emotions(lyrics, frequencies=True):
    """Compute the frequences of all emotions (affect) present in the lyrics, based on the NRC dictionary."""
    nrc = NRCLex(lyrics)
    
    if frequencies is True:
        return nrc.affect_frequencies
    else:
        return nrc.raw_emotion_scores

def get_section_rel_by_key(data, key="album_key"):
    """Returns song_section vs. {key} pivot table with relative nbr. of seconds."""
    sums = data.groupby([key, "song_section"])["section_duration"].sum()
    totals = sums.groupby([key]).sum()
    rel = sums.div(totals).reset_index()
    df_piv = pd.pivot_table(rel, values="section_duration", columns=key, index="song_section")
    
    return df_piv

def compute_herfindahl_hirschman_index(x):
    """Computes HHI for the emotion frequencies. Higher values indicate more 'emotional cohesiveness'."""
    return np.square(x).sum()

def group_chorus(df):
    """Adds pre- and post-chorus to chorus, then deletes these two song sections."""
    df["CHORUS"] = df.filter(regex="CHORUS").sum(axis=1)
    df = df.drop(columns=["PRE-CHORUS", "POST-CHORUS"])
    return df

### Read data

In [5]:
data = pd.read_excel("../data/data_lyrics_arctic_monkeys_full.xlsx")

In [6]:
print(data.shape)
data.head()

(574, 7)


Unnamed: 0,album,album_key,song_nr,song_title,song_section,timestamp_end,song_section_lyrics
0,"Whatever People Say I Am, That’s What I’m Not",1. WPSIATWIN,1,The View from the Afternoon,INTRO,22,
1,"Whatever People Say I Am, That’s What I’m Not",1. WPSIATWIN,1,The View from the Afternoon,VERSE,1'02,Anticipation has the habit to set you up For d...
2,"Whatever People Say I Am, That’s What I’m Not",1. WPSIATWIN,1,The View from the Afternoon,VERSE,1'28,Anticipation has the habit to set you up For d...
3,"Whatever People Say I Am, That’s What I’m Not",1. WPSIATWIN,1,The View from the Afternoon,CHORUS,1'49,And she won't be surprised and she won't be sh...
4,"Whatever People Say I Am, That’s What I’m Not",1. WPSIATWIN,1,The View from the Afternoon,POST-CHORUS,2'15,And you can pour your heart out but her reason...


In [7]:
song_sections_all = data["song_section"].unique()
song_sections_all

array(['INTRO', 'VERSE', 'CHORUS', 'POST-CHORUS', 'INSTRUMENTAL',
       'PRE-CHORUS', 'BRIDGE', 'OUTRO'], dtype=object)

### Enrich data

In [8]:
data["timestamp_end_sec"] = data["timestamp_end"].apply(convert_timestamp_to_seconds)

In [9]:
data["section_duration"] = data.groupby("song_title")["timestamp_end_sec"].transform(np.diff, prepend=0)

In [10]:
data["song_duration"] = data.groupby("song_title")["section_duration"].transform(sum)

In [11]:
data["section_duration_rel"] = data["section_duration"] / data["song_duration"] 

In [12]:
data["album_duration"] = data.groupby("album_key")["song_duration"].transform(sum)

In [13]:
data.fillna({"song_section_lyrics": ""}, inplace=True)
data["n_chars"] = data["song_section_lyrics"].apply(len)

In [14]:
data.drop(columns=["timestamp_end"], inplace=True)

In [15]:
data.head()

Unnamed: 0,album,album_key,song_nr,song_title,song_section,song_section_lyrics,timestamp_end_sec,section_duration,song_duration,section_duration_rel,album_duration,n_chars
0,"Whatever People Say I Am, That’s What I’m Not",1. WPSIATWIN,1,The View from the Afternoon,INTRO,,22,22,223,0.098655,19268,0
1,"Whatever People Say I Am, That’s What I’m Not",1. WPSIATWIN,1,The View from the Afternoon,VERSE,Anticipation has the habit to set you up For d...,62,40,223,0.179372,19268,399
2,"Whatever People Say I Am, That’s What I’m Not",1. WPSIATWIN,1,The View from the Afternoon,VERSE,Anticipation has the habit to set you up For d...,88,26,223,0.116592,19268,385
3,"Whatever People Say I Am, That’s What I’m Not",1. WPSIATWIN,1,The View from the Afternoon,CHORUS,And she won't be surprised and she won't be sh...,109,21,223,0.09417,19268,321
4,"Whatever People Say I Am, That’s What I’m Not",1. WPSIATWIN,1,The View from the Afternoon,POST-CHORUS,And you can pour your heart out but her reason...,135,26,223,0.116592,19268,97


In [16]:
cv, dtm = get_dtm(data["song_section_lyrics"])

In [17]:
# cv.vocabulary_

In [18]:
# TODO: document typical issues (e.g. bigrams such as New York are separated)
# TODO: check what is done with ',', '-', etc.
dtm.head()

Unnamed: 0,09,100,19,1984,2000,2019,24,505,70s,aaaaah,...,you,young,younger,your,yours,yourself,youth,youyou,zeros,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,4,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,3,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,2,0,0,1,0,0,0,0,0,0


In [19]:
data["section_n_words"] = dtm.sum(axis=1)
data["song_n_words"] = data.groupby("song_title")["section_n_words"].transform(sum)

In [20]:
data["section_n_words_unique"] = (dtm >= 1).sum(axis=1)
data["song_n_words_unique"] = data.groupby("song_title")["section_n_words_unique"].transform(sum)

In [21]:
data["section_lexical_diversity"] = data["section_n_words_unique"] / data["section_n_words"]  # biased for small sections
data["song_lexical_diversity"] = data["song_n_words_unique"] / data["song_n_words"]

In [22]:
data.head()

Unnamed: 0,album,album_key,song_nr,song_title,song_section,song_section_lyrics,timestamp_end_sec,section_duration,song_duration,section_duration_rel,album_duration,n_chars,section_n_words,song_n_words,section_n_words_unique,song_n_words_unique,section_lexical_diversity,song_lexical_diversity
0,"Whatever People Say I Am, That’s What I’m Not",1. WPSIATWIN,1,The View from the Afternoon,INTRO,,22,22,223,0.098655,19268,0,0,342,0,240,,0.701754
1,"Whatever People Say I Am, That’s What I’m Not",1. WPSIATWIN,1,The View from the Afternoon,VERSE,Anticipation has the habit to set you up For d...,62,40,223,0.179372,19268,399,74,342,58,240,0.783784,0.701754
2,"Whatever People Say I Am, That’s What I’m Not",1. WPSIATWIN,1,The View from the Afternoon,VERSE,Anticipation has the habit to set you up For d...,88,26,223,0.116592,19268,385,72,342,53,240,0.736111,0.701754
3,"Whatever People Say I Am, That’s What I’m Not",1. WPSIATWIN,1,The View from the Afternoon,CHORUS,And she won't be surprised and she won't be sh...,109,21,223,0.09417,19268,321,63,342,33,240,0.52381,0.701754
4,"Whatever People Say I Am, That’s What I’m Not",1. WPSIATWIN,1,The View from the Afternoon,POST-CHORUS,And you can pour your heart out but her reason...,135,26,223,0.116592,19268,97,19,342,17,240,0.894737,0.701754


### Store final dataset

In [23]:
data.to_excel("../data/_data_lyrics_arctic_monkeys.xlsx", index=False)

## Analysis

In [24]:
# data = pd.read_excel("../data/_data_lyrics_arctic_monkeys.xlsx")

### Lyrical content

In [25]:
dtm.head()

Unnamed: 0,09,100,19,1984,2000,2019,24,505,70s,aaaaah,...,you,young,younger,your,yours,yourself,youth,youyou,zeros,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,4,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,3,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,2,0,0,1,0,0,0,0,0,0


In [26]:
nlp = spacy.load("en_core_web_md")  # python -m spacy download en_core_web_md (_sm doesn't have vectors)

In [27]:
full_songs = data[["album_key", "song_title", "song_section_lyrics"]].copy()
# full_songs["song_section_lyrics"]  # TODO: lowercase first words of sentence, might improve POS tagging

In [28]:
full_songs = full_songs.groupby(["album_key", "song_title"])["song_section_lyrics"].apply(lambda x: " ".join(x))
full_songs = full_songs.reset_index().rename(columns={"song_section_lyrics": "song_lyrics"})
full_songs["song_lyrics"] = full_songs["song_lyrics"].str.strip()

In [29]:
full_songs.head()

Unnamed: 0,album_key,song_title,song_lyrics
0,1. WPSIATWIN,A Certain Romance,"""Shall I keep rolling?"" Well, oh, they might w..."
1,1. WPSIATWIN,Dancing Shoes,Get on your dancing shoes There's one thing on...
2,1. WPSIATWIN,Fake Tales of San Francisco,Fake tales of San Francisco Echo through the r...
3,1. WPSIATWIN,From the Ritz to the Rubble,"Well, last night these two bouncers And one of..."
4,1. WPSIATWIN,I Bet You Look Good on the Dancefloor,Stop making the eyes at me I'll stop making th...


In [30]:
dict_pos = {}
for i, row in full_songs.iterrows():
    lyrics = row["song_lyrics"]
    
    pos = extract_pos(lyrics)
    dict_pos[i] = pos

In [31]:
full_songs = pd.concat([full_songs, pd.DataFrame.from_dict(dict_pos, orient="index")], axis=1)

In [32]:
full_songs.head()

Unnamed: 0,album_key,song_title,song_lyrics,nouns,pnouns,noun_phrases,verbs,entities
0,1. WPSIATWIN,A Certain Romance,"""Shall I keep rolling?"" Well, oh, they might w...","[bottom, sock, point, point, romance, truth, p...","[Reeboks, Converse, Sherlock, Holmes]","[I, they, classic Reeboks, knackered Converse,...","[keep, roll, wear, tuck, in't, see, like, thro...","[(Converse, ORG), (tonight, TIME), (tonight, T..."
1,1. WPSIATWIN,Dancing Shoes,Get on your dancing shoes There's one thing on...,"[dancing, shoe, thing, mind, shit, shock, horr...",[],"[your dancing shoes, one thing, your mind, the...","[hope, look, rummage, see, say, wait, wait, co...","[(one, CARDINAL), (first, ORDINAL), (tonight, ..."
2,1. WPSIATWIN,Fake Tales of San Francisco,Fake tales of San Francisco Echo through the r...,"[tale, room, point, wedding, disco, bride, gro...","[San, Francisco, Echo, San, Francisco, Echo, S...","[Fake tales, San Francisco Echo, the room, a w...","[practice, want, hear, kick, kick, want, hear,...","[(San Francisco Echo, ORG), (weekend, DATE), (..."
3,1. WPSIATWIN,From the Ritz to the Rubble,"Well, last night these two bouncers And one of...","[night, bouncer, one, scary'un, way, way, time...","[duff, Sunday, Sunday, Da, da, da, da, da, Da,...","[Well, last night these two bouncers, 'em, a s...","[get, look, breathe, want, step, make, say, go...","[(last night, TIME), (two, CARDINAL), (one, CA..."
4,1. WPSIATWIN,I Bet You Look Good on the Dancefloor,Stop making the eyes at me I'll stop making th...,"[eye, eye, shoulder, night, explosion, name, s...","[Cold, Rio, Cold, Rio, Montagues, Capulets, DJ...","[the eyes, me, I, the eyes, you, What, it, me,...","[stop, make, stop, make, surprise, want, dynam...","[(Rio, ORG), (1984, DATE), (1984, DATE), (Rio,..."


In [33]:
list_df_ents = []
for i, row in full_songs[["song_title", "entities"]].iterrows():
    song_title, ents = row["song_title"], row["entities"]
    if len(ents) > 0:
        df_ents = pd.DataFrame(ents, columns=["entity", "entity_type"])
        df_ents["song_title"] = song_title
        list_df_ents.append(df_ents)
        
df_ents_full = pd.concat(list_df_ents)[["song_title", "entity", "entity_type"]].reset_index(drop=True)

In [34]:
df_ents_full.head()

Unnamed: 0,song_title,entity,entity_type
0,A Certain Romance,Converse,ORG
1,A Certain Romance,tonight,TIME
2,A Certain Romance,tonight,TIME
3,A Certain Romance,Said,PERSON
4,Dancing Shoes,one,CARDINAL


In [35]:
ENTS_TO_KEEP = ["LOC", "GPE", "TIME", "DATE"]
df_ents_full[df_ents_full["entity_type"].isin(ENTS_TO_KEEP)].head()

Unnamed: 0,song_title,entity,entity_type
1,A Certain Romance,tonight,TIME
2,A Certain Romance,tonight,TIME
6,Dancing Shoes,tonight,TIME
9,Fake Tales of San Francisco,weekend,DATE
11,Fake Tales of San Francisco,San Francisco,GPE


In [36]:
all_nouns = full_songs.set_index("song_title")["nouns"].explode()

In [37]:
all_nouns.reset_index()

Unnamed: 0,song_title,nouns
0,A Certain Romance,bottom
1,A Certain Romance,sock
2,A Certain Romance,point
3,A Certain Romance,point
4,A Certain Romance,romance
...,...,...
3102,There’d Better Be a Mirrorball,moment
3103,There’d Better Be a Mirrorball,car
3104,There’d Better Be a Mirrorball,heart
3105,There’d Better Be a Mirrorball,mirrorball


In [38]:
all_nouns = all_nouns.reset_index().merge(full_songs[["album_key", "song_title"]], on="song_title", how="left")

In [39]:
all_nouns = all_nouns.groupby("album_key")["nouns"].value_counts()

In [40]:
all_nouns.name = "n"
all_nouns = all_nouns.reset_index()

In [41]:
# TODO: prepare for a lexical dispersion plot
all_nouns[all_nouns["n"] > 5]

Unnamed: 0,album_key,nouns,n
0,1. WPSIATWIN,thing,13
1,1. WPSIATWIN,tonight,13
2,1. WPSIATWIN,eye,11
3,1. WPSIATWIN,mind,11
4,1. WPSIATWIN,way,11
...,...,...,...
1227,6. TBH & Casino,eye,6
1519,7. The Car,time,8
1520,7. The Car,paint,7
1521,7. The Car,arm,6


In [42]:
dict_emotions = {}
for i, row in full_songs.iterrows():
    lyrics = row["song_lyrics"]
    
    emotions = compute_emotions(lyrics, frequencies=True)
    dict_emotions[i] = emotions

In [43]:
df_emotions = pd.concat([full_songs[["album_key", "song_title"]],
                         pd.DataFrame.from_dict(dict_emotions, orient="index")], axis=1)
if "anticip" in df_emotions.columns:
    df_emotions.drop(columns="anticip", inplace=True)

df_emotions.fillna(0, inplace=True)

In [44]:
df_emotions.tail()

Unnamed: 0,album_key,song_title,fear,anger,trust,surprise,positive,negative,sadness,disgust,joy,anticipation
75,7. The Car,Mr Schwartz,0.0,0.111111,0.166667,0.111111,0.166667,0.111111,0.0,0.0,0.111111,0.222222
76,7. The Car,Perfect Sense,0.0,0.071429,0.142857,0.071429,0.285714,0.0,0.0,0.0,0.214286,0.214286
77,7. The Car,Sculptures of Anything Goes,0.054054,0.027027,0.162162,0.081081,0.216216,0.081081,0.054054,0.027027,0.135135,0.162162
78,7. The Car,The Car,0.052632,0.052632,0.052632,0.0,0.210526,0.157895,0.052632,0.0,0.210526,0.210526
79,7. The Car,There’d Better Be a Mirrorball,0.153846,0.0,0.076923,0.0,0.153846,0.230769,0.153846,0.076923,0.076923,0.076923


In [45]:
emotion_categories = df_emotions.columns[2:]
df_emotions["hhi"] = df_emotions[emotion_categories].apply(compute_herfindahl_hirschman_index, axis=1)

In [46]:
(df_emotions
 .sort_values(["album_key", "hhi"], ascending=[True, False])
 .groupby("album_key")
 .head(3)
)

Unnamed: 0,album_key,song_title,fear,anger,trust,surprise,positive,negative,sadness,disgust,joy,anticipation,hhi
9,1. WPSIATWIN,Still Take You Home,0.066667,0.0,0.066667,0.033333,0.333333,0.133333,0.066667,0.0,0.166667,0.133333,0.188889
3,1. WPSIATWIN,From the Ritz to the Rubble,0.061224,0.020408,0.081633,0.061224,0.285714,0.122449,0.020408,0.020408,0.122449,0.204082,0.16868
6,1. WPSIATWIN,Perhaps Vampires Is a Bit Strong But...,0.051282,0.102564,0.153846,0.025641,0.179487,0.25641,0.0,0.025641,0.128205,0.076923,0.158448
18,2. Favourite WN,Fluorescent Adolescent,0.033333,0.033333,0.05,0.016667,0.116667,0.35,0.066667,0.133333,0.1,0.1,0.183333
16,2. Favourite WN,D is for Dangerous,0.117647,0.0,0.176471,0.0,0.205882,0.147059,0.0,0.029412,0.147059,0.176471,0.16263
14,2. Favourite WN,Balaclava,0.054795,0.054795,0.082192,0.068493,0.219178,0.191781,0.027397,0.027397,0.150685,0.123288,0.141678
31,3. Humbug,Potion Approaching,0.206897,0.034483,0.0,0.068966,0.068966,0.275862,0.137931,0.0,0.0,0.206897,0.191439
25,3. Humbug,Cornerstone,0.2,0.04,0.0,0.08,0.08,0.2,0.08,0.0,0.04,0.28,0.1808
28,3. Humbug,Dangerous Animals,0.054054,0.027027,0.108108,0.027027,0.135135,0.27027,0.216216,0.027027,0.081081,0.054054,0.164354
35,4. Suck It and See,All My Own Stunts,0.1875,0.0,0.125,0.0,0.0,0.3125,0.3125,0.0,0.0,0.0625,0.25


In [47]:
df_emotions.groupby("album_key").mean()

Unnamed: 0_level_0,fear,anger,trust,surprise,positive,negative,sadness,disgust,joy,anticipation,hhi
album_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1. WPSIATWIN,0.068106,0.065463,0.111584,0.080097,0.191392,0.169191,0.059085,0.038504,0.115918,0.100659,0.140913
2. Favourite WN,0.074137,0.063107,0.089233,0.070428,0.157243,0.204359,0.072276,0.063654,0.097792,0.10777,0.135372
3. Humbug,0.13853,0.051555,0.078319,0.062691,0.125157,0.20932,0.093426,0.061073,0.060223,0.119707,0.149677
4. Suck It and See,0.128384,0.079693,0.0844,0.05812,0.145282,0.17894,0.115468,0.051323,0.086262,0.072128,0.160652
5. AM,0.098083,0.037761,0.10835,0.052998,0.165404,0.155978,0.109758,0.020505,0.111132,0.140031,0.162808
6. TBH & Casino,0.037431,0.039274,0.133355,0.056254,0.258169,0.090008,0.045872,0.022141,0.158933,0.158563,0.183222
7. The Car,0.059989,0.054155,0.13555,0.061209,0.211822,0.106766,0.048428,0.019366,0.131901,0.170813,0.157813


### Song arrangement

In [48]:
sections_by_album = data.groupby("album_key")["song_section"].value_counts()
sections_by_album.name = "n"
sections_by_album = sections_by_album.reset_index()

In [49]:
pd.pivot_table(sections_by_album, values="n", columns="album_key", index="song_section")

album_key,1. WPSIATWIN,2. Favourite WN,3. Humbug,4. Suck It and See,5. AM,6. TBH & Casino,7. The Car
song_section,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
BRIDGE,5.0,6.0,8.0,5.0,10.0,8.0,3.0
CHORUS,25.0,21.0,21.0,27.0,32.0,24.0,22.0
INSTRUMENTAL,15.0,15.0,7.0,12.0,5.0,4.0,5.0
INTRO,9.0,5.0,6.0,5.0,5.0,2.0,2.0
OUTRO,6.0,9.0,7.0,8.0,8.0,5.0,5.0
POST-CHORUS,4.0,2.0,,,3.0,,
PRE-CHORUS,2.0,,,,2.0,4.0,3.0
VERSE,33.0,31.0,22.0,28.0,25.0,27.0,26.0


#### Song section importance by album

In [114]:
dw_song_sections_by_album = get_section_rel_by_key(data, key="album_key")  # dw = datawrapper

dw_song_sections_by_album = (dw_song_sections_by_album
                             .fillna(0)
                             .transpose()
                             [["INTRO", "VERSE", "PRE-CHORUS", "CHORUS", "POST-CHORUS", "BRIDGE", "INSTRUMENTAL", "OUTRO"]]
                            )
dw_song_sections_by_album = group_chorus(dw_song_sections_by_album)
dw_song_sections_by_album.rename(columns={"INSTRUMENTAL": "INSTR."}, inplace=True)

dw_song_sections_by_album.to_csv("../viz/dw_song_sections_by_album.csv")

#### Song section importance by song per album

In [125]:
dw_song_sections_by_song = (data[["album_key", "song_title"]]
                            .drop_duplicates()
                            .merge(get_section_rel_by_key(data, key="song_title").transpose(),
                                   on="song_title")
                            .fillna(0)                            
                            .rename(columns={"INSTRUMENTAL": "INSTR."})
                            .set_index("album_key")
                            [["song_title",
                              "INTRO", "VERSE", "PRE-CHORUS", "CHORUS", "POST-CHORUS", "BRIDGE", "INSTR.", "OUTRO"]]
                           )

dw_song_sections_by_song_humbug = dw_song_sections_by_song.loc["3. Humbug"].copy()
dw_song_sections_by_song_humbug = group_chorus(dw_song_sections_by_song_humbug)

dw_song_sections_by_song_humbug.to_csv("../viz/dw_song_sections_by_song_humbug.csv", index=False)

### Song length

In [51]:
pd.concat([data.groupby("album_key")["n_chars"].sum(),
           data.groupby("album_key")["section_n_words"].sum(),
           data.groupby("album_key")["section_n_words_unique"].sum()
          ],
          axis=1
         )

Unnamed: 0_level_0,n_chars,section_n_words,section_n_words_unique
album_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1. WPSIATWIN,18607,3585,2499
2. Favourite WN,14850,2769,1983
3. Humbug,13098,2364,1701
4. Suck It and See,10255,1870,1496
5. AM,17103,3287,2159
6. TBH & Casino,13395,2421,1898
7. The Car,8994,1631,1415


In [52]:
np.corrcoef(data.song_n_words, data.song_lexical_diversity)[0][1]

-0.4498762623638724

In [53]:
np.corrcoef(data.song_duration, data.song_n_words)[0][1]

0.3559275053758195