In [1]:
# Classics
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Base
from collections import Counter
import re
import os

# Plotting
import squarify
import matplotlib.pyplot as plt
import seaborn as sns

# NLP Libraries
import spacy
from spacy.tokenizer import Tokenizer

nlp = spacy.load("en_core_web_lg")
tokenizer = Tokenizer(nlp.vocab)

# Vector Representations
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA

from tensorflow.keras.callbacks import LambdaCallback
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.optimizers import RMSprop

In [2]:
df = pd.read_csv('./lyrics.csv')
df.head()

Unnamed: 0,artist,album,track_title,track_n,lyric,line,year
0,Taylor Swift,Taylor Swift,Tim McGraw,1,He said the way my blue eyes shined,1,2006
1,Taylor Swift,Taylor Swift,Tim McGraw,1,Put those Georgia stars to shame that night,2,2006
2,Taylor Swift,Taylor Swift,Tim McGraw,1,"I said, ""That's a lie""",3,2006
3,Taylor Swift,Taylor Swift,Tim McGraw,1,Just a boy in a Chevy truck,4,2006
4,Taylor Swift,Taylor Swift,Tim McGraw,1,That had a tendency of gettin' stuck,5,2006


In [3]:
df = df.drop(['artist', 'track_n'], axis = 1)

In [4]:
df.head()

Unnamed: 0,album,track_title,lyric,line,year
0,Taylor Swift,Tim McGraw,He said the way my blue eyes shined,1,2006
1,Taylor Swift,Tim McGraw,Put those Georgia stars to shame that night,2,2006
2,Taylor Swift,Tim McGraw,"I said, ""That's a lie""",3,2006
3,Taylor Swift,Tim McGraw,Just a boy in a Chevy truck,4,2006
4,Taylor Swift,Tim McGraw,That had a tendency of gettin' stuck,5,2006


In [5]:
df['album'].value_counts()

reputation      1006
1989             953
Speak Now        930
Red              845
Taylor Swift     567
Fearless         561
Name: album, dtype: int64

In [6]:
rep = df[df['album'] == 'reputation']
rep['track_title'].unique()

array(['...Ready for It?', 'End Game (Ft.\xa0Ed\xa0Sheeran & Future)',
       'I Did Something Bad', "Don't Blame Me", 'Delicate',
       'Look What You Made Me Do', 'So It Goes...', 'Gorgeous',
       'Getaway Car', 'King of My Heart', 'Dancing With Our Hands Tied',
       'Dress', "This Is Why We Can't Have Nice Things",
       'Call It What You Want', "New Year's Day"], dtype=object)

In [7]:
df = df.replace('&', 'and')
df = df.replace('22', 'Twenty-Two')

## Titles

In [8]:
# Eliminate repetition of titles

titles = pd.DataFrame(df['track_title'])
titles.columns = (['title'])
titles.head()

Unnamed: 0,title
0,Tim McGraw
1,Tim McGraw
2,Tim McGraw
3,Tim McGraw
4,Tim McGraw


In [9]:
def get_lemmas(text):

    lemmas = []
    
    doc = nlp(text)
    
    for token in doc: 
        if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_!= 'PRON'):
            lemmas.append(token.lemma_)
    
    return lemmas

df['title_lemmas'] = titles['title'].apply(get_lemmas)

In [10]:
# Store NaN-free version in x

x = df['title_lemmas'].dropna()

In [11]:
# Check the average song title length

lengths = []

for item in x:
    number = len(item)
    lengths.append(number)
    
total = sum(lengths)
mean = total / len(x)
mean

1.6624845742492802

In [12]:
def count(docs):

        word_counts = Counter()
        appears_in = Counter()
        
        total_docs = len(docs)

        for doc in docs:
            word_counts.update(doc)
            appears_in.update(set(doc))

        temp = zip(word_counts.keys(), word_counts.values())
        
        wc = pd.DataFrame(temp, columns = ['word', 'count'])

        wc['rank'] = wc['count'].rank(method='first', ascending=False)
        total = wc['count'].sum()

        wc['pct_total'] = wc['count'].apply(lambda x: x / total)
        
        wc = wc.sort_values(by='rank')
        wc['cul_pct_total'] = wc['pct_total'].cumsum()

        t2 = zip(appears_in.keys(), appears_in.values())
        ac = pd.DataFrame(t2, columns=['word', 'appears_in'])
        wc = ac.merge(wc, on='word')

        wc['appears_in_pct'] = wc['appears_in'].apply(lambda x: x / total_docs)
        
        return wc.sort_values(by='rank')

In [13]:
wc = count(x)
wc_75 = wc[wc['appears_in'] > 75]
wc_top20 = wc[wc['rank'] <= 21]

wc_75

Unnamed: 0,word,appears_in,count,rank,pct_total,cul_pct_total,appears_in_pct
35,,261,522,1.0,0.06458,0.06458,0.053682
31,Ft,261,261,2.0,0.03229,0.09687,0.053682
12,stay,163,239,3.0,0.029568,0.126438,0.033525
65,know,164,164,4.0,0.020289,0.146728,0.033731
76,Ed,158,158,5.0,0.019547,0.166275,0.032497
77,Sheeran,158,158,6.0,0.019547,0.185822,0.032497
94,bad,125,125,7.0,0.015465,0.201287,0.02571
38,love,121,121,8.0,0.01497,0.216256,0.024887
11,tie,117,117,9.0,0.014475,0.230731,0.024064
86,New,112,112,10.0,0.013856,0.244587,0.023036


In [14]:
# Randomly generate a title based on most common words

import random

words = ['stay', 'know', 'bad', 'love', 'tie', 'new', 'story', 'end', 
        'game', 'future', 'girl', 'change', 'blank', 'space', 'wish',
        'heart', 'enchant', 'delicate', 'beautiful']

def title():
    one = random.choice(words)
    two = random.choice(words)
    
    one = one.title()
    two = two.title()
    
    x = random.randint(0, 10)
    if x > 6:
        print(f'{one} {two}')
    else:
        print(one)
    
title()

Know


## "Tim McGraw" Lyrics

In [15]:
# Let's isolate choruses and verses for each song

first = df[df['track_title'] == 'Tim McGraw']
#first['lyric'].value_counts()

In [16]:
def chorus_verse(df):
    
    chorus = ['When you think Tim McGraw',
          'I hope you think my favorite song',                     
          'I hope you think that little black dress',                 
          'When you think happiness',                                 
          'I hope you think of me',                                   
          'And my old faded blue jeans',                              
          'Think of my head on your chest',                           
          "I said, 'That's a lie'",                                  
          'The moon like a spotlight on the lake',                    
          'Put those Georgia stars to shame that night',              
          'The one we danced to all night long']
    
    values = []
    
    for item in df['lyric']:
        if item in chorus:
            value = 'chorus'
            values.append(value)
        else:
            value = 'verse'
            values.append(value)
            
    return pd.DataFrame(values, columns = ['lyric_type'])

In [17]:
lt = chorus_verse(first)

In [18]:
first = pd.concat([first, lt], axis = 1)
first

Unnamed: 0,album,track_title,lyric,line,year,title_lemmas,lyric_type
0,Taylor Swift,Tim McGraw,He said the way my blue eyes shined,1,2006,"[Tim, McGraw]",verse
1,Taylor Swift,Tim McGraw,Put those Georgia stars to shame that night,2,2006,"[Tim, McGraw]",chorus
2,Taylor Swift,Tim McGraw,"I said, ""That's a lie""",3,2006,"[Tim, McGraw]",verse
3,Taylor Swift,Tim McGraw,Just a boy in a Chevy truck,4,2006,"[Tim, McGraw]",verse
4,Taylor Swift,Tim McGraw,That had a tendency of gettin' stuck,5,2006,"[Tim, McGraw]",verse
5,Taylor Swift,Tim McGraw,On backroads at night,6,2006,"[Tim, McGraw]",verse
6,Taylor Swift,Tim McGraw,And I was right there beside him all summer long,7,2006,"[Tim, McGraw]",verse
7,Taylor Swift,Tim McGraw,And then the time we woke up to find that summ...,8,2006,"[Tim, McGraw]",verse
8,Taylor Swift,Tim McGraw,But when you think Tim McGraw,9,2006,"[Tim, McGraw]",verse
9,Taylor Swift,Tim McGraw,I hope you think my favorite song,10,2006,"[Tim, McGraw]",chorus


## Let's do that in the whole dataframe

In [19]:
# Identify all songs

songs = []

stuff = df['track_title'].unique()
for item in stuff:
    songs.append(item)

In [20]:
# Sort out which lyrics appear more than once (chorus)

chorus = []

for item in songs:
    temp = df[df['track_title'] == item]
    for val, count in temp['lyric'].value_counts().iteritems():
        if count > 1:
            chorus.append(val)
        else:
            pass

In [21]:
# Label lyrics and either belonging to a verse or chorus

values = []

for item in df['lyric']:
    if item in chorus:
        value = 'chorus'
        values.append(value)
    else:
        value = 'verse'
        values.append(value)

In [22]:
# Add the lyric_type column to our dataframe

lt = pd.DataFrame(values, columns = ['lyric_type'])
final = pd.concat([df, lt], axis = 1)
final

Unnamed: 0,album,track_title,lyric,line,year,title_lemmas,lyric_type
0,Taylor Swift,Tim McGraw,He said the way my blue eyes shined,1,2006,"[Tim, McGraw]",verse
1,Taylor Swift,Tim McGraw,Put those Georgia stars to shame that night,2,2006,"[Tim, McGraw]",chorus
2,Taylor Swift,Tim McGraw,"I said, ""That's a lie""",3,2006,"[Tim, McGraw]",chorus
3,Taylor Swift,Tim McGraw,Just a boy in a Chevy truck,4,2006,"[Tim, McGraw]",verse
4,Taylor Swift,Tim McGraw,That had a tendency of gettin' stuck,5,2006,"[Tim, McGraw]",verse
...,...,...,...,...,...,...,...
4857,reputation,New Year's Day,"(Hold on to the memories, they will hold on to...",43,2017,"[New, Year, Day]",chorus
4858,reputation,New Year's Day,Please don't ever become a stranger,44,2017,"[New, Year, Day]",chorus
4859,reputation,New Year's Day,"(Hold on to the memories, they will hold on to...",45,2017,"[New, Year, Day]",chorus
4860,reputation,New Year's Day,Whose laugh I could recognize anywhere,46,2017,"[New, Year, Day]",chorus


In [23]:
chorus = final[final['lyric_type'] == 'chorus']
verse = final[final['lyric_type'] == 'verse']

## Let's work with the chorus lyrics

In [24]:
len(chorus['lyric'].unique())

777

In [25]:
len(chorus['track_title'].unique())

94

In [26]:
chorus_length = len(chorus['lyric'].unique()) / len(chorus['track_title'].unique())
chorus_length

8.26595744680851

So it looks like the average chorus has 8 unique lines

## How about the verses?

In [27]:
verse_length = len(verse['lyric'].unique()) / len(verse['track_title'].unique())
verse_length

25.28723404255319

There's a average total of 25 unique verse lyrics. If we go off of songwriting trends, we can assume that this means there are 3 verses, each 8 lines long, with a chorus or partial chorus following each of them

## Machine Learning (!!!)

In [30]:
verse_text = ' '.join(verse['lyric'])

In [31]:
chars = sorted(list(set(verse_text)))

char_to_int = dict((c, i) for i, c in enumerate(chars))
int_to_char = dict((i, c) for i, c in enumerate(chars))

In [33]:
maxlen = 40
steps = 10

sentences = []
next_chars = []

for i in range(0, len(verse_text) - maxlen, steps):
    sentences.append(verse_text[i: i + maxlen])
    next_chars.append(verse_text[i + maxlen])
    
print('There are', len(sentences), 'sentences')

There are 8896 sentences


In [38]:
x = np.zeros((len(sentences), maxlen, len(chars)), dtype = np.bool)
y = np.zeros((len(sentences), len(chars)), dtype = np.bool)

for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_to_int[char]] = 1
        
    y[i, char_to_int[next_chars[i]]] = 1

In [39]:
x.shape

(8896, 40, 81)

In [42]:
model = Sequential()
model.add(LSTM(128, input_shape = (maxlen, len(chars))))
model.add(Dense(len(chars), activation = 'relu'))
model.add(Dense(len(chars), activation = 'relu'))
model.add(Dense(len(chars), activation = 'relu'))
model.add(Dense(len(chars), activation = 'softmax'))

optimizer = RMSprop()
model.compile(loss = 'categorical_crossentropy', optimizer = optimizer)

In [51]:
import sys

def sample(preds, temperature = 1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)
    
def on_epoch_end(epoch, _):
    print()
    print('----- Generating text after Epoch: %d' % epoch)
    
    start_index = random.randint(0, len(verse_text) - maxlen - 1)
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)
        
        generated = ''
        sentence = verse_text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)
        
        for i in range(400):
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_to_int[char]] = 1
                
            preds = model.predict(x_pred, verbose = 0)[0]
            next_index = sample(preds, diversity)
            next_char = int_to_char[next_index]
            
            sentence = sentence[1:] + next_char
            
            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()
        
print_callback = LambdaCallback(on_epoch_end = on_epoch_end)

In [None]:
model.fit(x, y,
         batch_size = 128,
         epochs = 10,
         callbacks = [print_callback])

Train on 8896 samples
Epoch 1/10
----- Generating text after Epoch: 0
----- diversity: 0.2
----- Generating with seed: "r name goes up in lights, like diamonds "
r name goes up in lights, like diamonds at tay tou to to to the to tat thas tas tan tas tan to tas at tat tat tas yo to an to yo tou to was was hat was tat fas to tas to to to we tat to in tas to at to to yo the to to t tas the tas you to yo tat to tas the tas tat to you tat tos tan tas tas toas tas taat to sat to to te tas to tas at tat te tan the tas tas tat tas to tat tas tat to yo to yo tas to yo to you tas at toe re you to at tas t
----- diversity: 0.5
----- Generating with seed: "r name goes up in lights, like diamonds "
r name goes up in lights, like diamonds tater I to youu to I mad t ake sang hhe batind cat tas anre te an t yo youn mab tis an te wad atas toas acle ye lap an th nal se re yhat yous no toe I lat at yo lat las at cad you tis bas wat tis s As tas dos ues lat An ye yol taw ar you ais hat te wamas te fe att 

----- Generating text after Epoch: 2
----- diversity: 0.2
----- Generating with seed: "And I wrote down our song Friday night b"
And I wrote down our song Friday night be the the co the the the the wo the the the the the the the the the the se the whe the the the to the the the the the the the be the the the the the the the whe the the the the the the the you the the you the the the the the the you the the the the the no the the the we the me the the the the me the the an the the the the the the the the sn me the the the the the be the the the the the you no the 
----- diversity: 0.5
----- Generating with seed: "And I wrote down our song Friday night b"
And I wrote down our song Friday night be cin in co the no the the war ol gon ros yo the n to we yo s an fre Ang ue me the se bege the chen you Le in no y din yo u wo An fin the to me he me fle nhe keve th who od ang me you to le g the yo wo th lo n me gas an the se I se the s the d than the lo nl mos le the fe the th we the the the on 

----- Generating text after Epoch: 4
----- diversity: 0.2
----- Generating with seed: "ave given me no choice but to But I thin"
ave given me no choice but to But I thing the the the the the the the the the the the the the the the the the the the you then the the ther ther ther you ther the the ther the the ther the the the the we ther the ther the then I wat then the ther the the you the the thet the the the the ther the the the the you the the the the we the the then the the the wast then sor the the the the the I we the the the the the ther the the wand the th
----- diversity: 0.5
----- Generating with seed: "ave given me no choice but to But I thin"
ave given me no choice but to But I thing cous you won you the me An you then the an'r tae youl wher yaus taind whatl ther kans surs tor thend woln wet then ge'r mevel alt you sn't thing in litt the to deveng wat you the tols you mart you dien be ling th and I nos I let fonle we he we sy bet ag, what on thand blit's beret thend you cave

----- Generating text after Epoch: 6
----- diversity: 0.2
----- Generating with seed: "ot down And I'm only me Who I wanna be W"
ot down And I'm only me Who I wanna be Wo the thand and you thang I wat that and in the that the thand and and and I thang and the to ho whe thang I and and you the lot the I the to bas thing the the thang I mas I thing thang I the the I thang I was and bath that thing that and I wake and and and I't the thing the thing the you to thon and thing I't and and and I that thing whe that I was to bat I the thong thang I the to wat the you ho
----- diversity: 0.5
----- Generating with seed: "ot down And I'm only me Who I wanna be W"
ot down And I'm only me Who I wanna be Who wat sasthap touth oud pous to to the bout and I aboth bat I saaf maslis nout I masand I thing you I hat as tithe and aryou bor whove bish you lot has thar I't you masthet I toan I kalle you in ulle to whing and lot you ther thas pare and I you doun wous awath to and and you and and and the Wous

----- Generating text after Epoch: 8
----- diversity: 0.2
----- Generating with seed: "twist of fate, when it all broke down Ne"
twist of fate, when it all broke down Net the thing the the the the thing hou the the the the the the the the me the the the ther the the the the the the the she the the the me the the the thang the the doung the the the the the ther whe the what the the the ther the the the the the whing the thang thing the ther the thing the cout I whe the thing I the thing the the the the the the the the the the the the the thing the the the the the 
----- diversity: 0.5
----- Generating with seed: "twist of fate, when it all broke down Ne"
twist of fate, when it all broke down Ne dout and af falle thing whe beang shen the thing thing I puote I't ithing I me shisce shes mar minol you chaven thing you sorle you she whe thing of wher aching and and buthe I uthing she thing then you she the bing whalt in the thisge I't shen hat ding I't tharnd alinh I thor whith an shel leas 