##### Caretaking*

In [1]:
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
%config Completer.use_jedi = False

##### Imports

In [3]:
import os
import math
import re
import nltk

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [4]:
#nltk.download('punkt')

In [5]:
from tqdm import notebook

from fuzzywuzzy import fuzz, process
from itertools import combinations
from statistics import mode

from langdetect import detect, detect_langs
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
#from kneed import KneeLocator
from sklearn.metrics import silhouette_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

from nltk.util import ngrams
from nltk.metrics import jaccard_distance
from nltk.sentiment import SentimentIntensityAnalyzer

In [6]:
import cufflinks as cf
cf.go_offline()

#### Read Data

In [7]:
lyrics_path = './lyrics/'
new_lyrics_path = './new_lyrics/'

In [8]:
# Get folder contents first of all...
lyrics_ls = [f for f in os.listdir(lyrics_path) if f.endswith('.txt')]
new_lyrics_ls = [f for f in os.listdir(new_lyrics_path) if f.endswith('.txt')]

In [9]:
# ...and then read contents to two different lists.
lyrics = [[file.replace(".txt",""),open(os.path.join(lyrics_path, file), 'r').read()] for file in lyrics_ls]
new_lyrics = [[file.replace(".txt",""),open(os.path.join(new_lyrics_path, file), 'r').read()] for file in new_lyrics_ls]

In [10]:
combined_lyrics = pd.DataFrame(lyrics,columns=['Song','Lyrics'])
combined_new_lyrics = pd.DataFrame(new_lyrics,columns=['Song','Lyrics'])

## Objective 1: Account for Duplicates and Anomalies

- Account for duplicates in data files:
	- example: "a-day-in-the-life" and "a-day-in-the-life-live-in-amsterdam"
- "weird" or "missing" data.
- Remove anything that looks "amiss"
- Essentially data cleaning

###### Cleansing

In [11]:
def simple_content_check(x):
    
    assess = (x != '\n' and "instrumental" not in x.lower())
    
    return assess

In [12]:
def clean_lyrics(lys):
    
    rgx_pattern = r'[^a-zA-Z0-9\s]'
    
    sentences_ls = lys.split('\n')
    
    std_ls = [re.sub(rgx_pattern,'',x.strip()).lower() for x in sentences_ls if len(x) > 0]
    
    return std_ls

In [13]:
combined_lyrics_adj = combined_lyrics[combined_lyrics['Lyrics'].apply(simple_content_check)] # adjusted version to exclude based on function above

In [14]:
# Just to check if any adjustments or additions to function above
print(f"{np.round((len(lyrics) - len(combined_lyrics_adj))/len(lyrics) * 100,2)} % reduction")

21.86 % reduction


In [15]:
# Drop all duplicate instances on the lyrics column
# Keep first instance by default
combined_lyrics_adj.drop_duplicates(subset=['Lyrics'],inplace=True)

In [16]:
combined_lyrics_adj.reset_index(drop=True,inplace=True)
combined_lyrics_adj.head()

Unnamed: 0,Song,Lyrics
0,a-day-in-the-life-live-in-amsterdam,"I read the news today oh, boy\nAbout a lucky m..."
1,a-hard-days-night,It's been a hard day's night\nAnd I've been wo...
2,a-taste-of-honey,A taste of honey\ntasting much sweeter than wi...
3,across-the-universe,Words are flowing out like\nendless rain into ...
4,act-naturally,They're gonna put me in the movies\nthey're go...


###### Language Differences

In [17]:
# Account for any language differences
combined_lyrics_adj['Languages'] = combined_lyrics_adj['Lyrics'].apply(lambda x: detect(x))

In [18]:
combined_lyrics_adj['Languages'].value_counts()

Languages
en    212
de      2
so      1
Name: count, dtype: int64

In [19]:
# fault of the detection module. Funny one though!
combined_lyrics_adj.loc[combined_lyrics_adj['Languages'] == 'so']

Unnamed: 0,Song,Lyrics,Languages
64,hello-goodbye,"You say yes, I say no\nYou say stop and I say ...",so


In [20]:
# Interesting. As above ^
detect_langs(combined_lyrics_adj['Lyrics'][64])

[so:0.5714276182148893, en:0.4285707927518186]

In [21]:
# Two known instances of German versions of Beatles songs being released.
# These pertain to "I Want To Hold Your Hand" and "She Loves You"
# Will omit as the English lang versions of both are already included.
combined_lyrics_adj.loc[combined_lyrics_adj['Languages'] == 'de']

Unnamed: 0,Song,Lyrics,Languages
104,komm-gib-mir-deine-hand,"O komm doch, komm zu mir,\ndu nimmst mir den V...",de
165,sie-liebt-dich,Sie liebt dich yeah yeah yeah.\nSie liebt dich...,de


In [22]:
combined_lyrics_adj = combined_lyrics_adj.loc[combined_lyrics_adj['Languages'] != 'de']

In [23]:
combined_lyrics_adj.reset_index(drop=True,inplace=True)

###### Cosine Similarity of song lyrics

In [24]:
tfidf_vectorizer = TfidfVectorizer()

In [25]:
tfidf_matrix = tfidf_vectorizer.fit_transform(combined_lyrics_adj['Lyrics'])

In [26]:
comparisons_cs = [] # List for cosine similarity
rng_max = len(combined_lyrics_adj)
for i in notebook.tqdm(range(rng_max)):
    for j in range(i+1,rng_max): # Avoid self-matching
        
        # Lyrics for each song pairing
        lyrics_i = tfidf_matrix[i]
        lyrics_j = tfidf_matrix[j]
        
        # Compute cosine similarity score
        sim_score = cosine_similarity(lyrics_i,lyrics_j)[0][0]
        
        comparisons_cs.append({
            'Song1': combined_lyrics_adj['Song'][i],
            'Song2': combined_lyrics_adj['Song'][j],
            'Similarity': sim_score
        })

  0%|          | 0/213 [00:00<?, ?it/s]

In [27]:
comp_cs_df = pd.DataFrame(comparisons_cs)

In [28]:
# Rationale: the general and short nature of the song "the-end" is causing it to compare...
# ...favourably to many songs. Omitting to inspect further
comp_cs_df_redux = comp_cs_df[~comp_cs_df['Song1'].str.contains('the-end', case=False) & ~comp_cs_df['Song2'].str.contains('the-end', case=False)]

In [29]:
# Comfortable to take anything at .7 and above as actually being the same
# And can inspect by eye from here.
# Revolution has a copy with additional lyrics
# Sgt Peppers Lonely Hearts Club Band has a reprise
comp_cs_df_redux.sort_values('Similarity',ascending=False).head(5)

Unnamed: 0,Song1,Song2,Similarity
20498,revolution-1,revolution,0.929282
21038,sgt-peppers-lonely-hearts-club-band-reprise,sgt-peppers-lonely-hearts-club-band,0.868211
12507,hey-jude,run-for-your-life,0.501395
17353,love-me-do,p-s-I-love-you,0.498712
14188,i-want-to-hold-your-hand,you-really-got-a-hold-on-me,0.492032


In [30]:
omit_list = list(comp_cs_df_redux.loc[comp_cs_df_redux['Similarity']>=0.7]['Song1'].values)
omit_list

['revolution-1', 'sgt-peppers-lonely-hearts-club-band-reprise']

In [31]:
lyrics_clean = combined_lyrics_adj[combined_lyrics_adj['Song'].apply(lambda x: x not in omit_list)]
lyrics_clean.reset_index(drop=True,inplace=True)

# Objective 2: Answer 2 of 5 Available Questions of the Data

Question 1: Which song has the largest amount of repetition?

Question 2: How many of the songs feature the song name (found in the file name) in the song lyrics?

Bonus (from above data cleaning): Which songs are the most similar

### Which song has the largest amount of repetition?

- Taking the definition of repetition as all the unique lines in  a song that occur more than once.

###### Basic Approach

In [32]:
def repetition_counter(lys):
    
    standardised_ls = clean_lyrics(lys)
    
    standardised_srs = pd.Series(standardised_ls)
    instance_count = standardised_srs.value_counts()
    
    # Terminology may be off here, but count the length of all instances greater than 1
    # Call that number of instances
    # Then the frequency of repetitions in a set of lyrics is the sum of the count of instances
    
    repetition_instances = len(instance_count[instance_count > 1]) # How many lines repeat
    repetition_frequency = instance_count[instance_count > 1].sum() # What is the sum of those repetitions
    
    return [repetition_instances,repetition_frequency]

In [33]:
lyrics_clean['Results'] = lyrics_clean['Lyrics'].apply(repetition_counter)
lyrics_clean[['Repetition Instances','Repetition Frequency']] = lyrics_clean['Results'].apply(pd.Series)
lyrics_clean.drop('Results',axis=1,inplace=True)

In [34]:
lyrics_clean.head()

Unnamed: 0,Song,Lyrics,Languages,Repetition Instances,Repetition Frequency
0,a-day-in-the-life-live-in-amsterdam,"I read the news today oh, boy\nAbout a lucky m...",en,2,4
1,a-hard-days-night,It's been a hard day's night\nAnd I've been wo...,en,13,34
2,a-taste-of-honey,A taste of honey\ntasting much sweeter than wi...,en,3,11
3,across-the-universe,Words are flowing out like\nendless rain into ...,en,3,17
4,act-naturally,They're gonna put me in the movies\nthey're go...,en,8,17


###### N-gram Similarity - More Robust

In [35]:
def n_gram_sim(lys,jcrd_thresh=1.0,inspect=False):
    
    standardised_ls = clean_lyrics(lys)
    
    # Tokenize lyrics and run calculation
    token_lyrics = [nltk.word_tokenize(x.lower()) for x in standardised_ls]
    n_grams = [list(ngrams(y,2)) for y in token_lyrics]
    
    #return n_grams
    jcrd_similarity = []
    for i in range(len(standardised_ls)):
        for j in range(i + 1, len(standardised_ls)): # Again, avoid self-comparison
            try:
                similarity = 1 - jaccard_distance(set(n_grams[i]), set(n_grams[j]))
                if similarity >= jcrd_thresh: # If they match based on the set threshold then they can be included
                    jcrd_similarity.append([n_grams[i],n_grams[j],similarity])
            except:
                pass
                
    # For inspection and testing purposes...or not.
    if inspect:
        return jcrd_similarity
    else:
        return len(jcrd_similarity)

In [36]:
lyrics_clean['N_Gram_Similarity'] = lyrics_clean['Lyrics'].apply(n_gram_sim)

In [37]:
# Rankings as seen below. Preliminary check of the top few confirms.
lyrics_clean.sort_values('N_Gram_Similarity',ascending=False).head(10)

Unnamed: 0,Song,Lyrics,Languages,Repetition Instances,Repetition Frequency,N_Gram_Similarity
9,all-together-now,One two three four\nCan I have a little fore\n...,en,7,53,788
84,i-want-you-shes-so-heavy,I want you\nI want you so bad\nI want you\nI w...,en,6,50,293
81,i-wanna-be-your-man,I wanna be your lover baby\nI wanna be your ma...,en,6,37,200
171,the-continuing-story-of-bungalow-bill,"Hey, Bungalow Bill\nWhat did you kill, Bungalo...",en,3,31,185
105,let-it-be,When I find myself in times of trouble\nMother...,en,6,40,185
58,good-day-sunshine,Good day sunshine\nGood day sunshine\nGood day...,en,1,17,136
41,dont-let-me-down,"Don't let me down\nHey, don't let me down\nDon...",en,7,31,135
39,doctor-robert,"Ring, my friend I said you'd call\nDoctor Robe...",en,7,28,126
26,blue-jay-way,There's a fog upon L.A.\nAnd my friends have l...,en,4,28,105
208,your-mother-should-know,Let's all get up and dance to a song\nthat was...,en,4,23,102


### How many of the songs feature the song name (found in the file name) in the song lyrics?

In [38]:
def song_in_lyrics(row):
    
    clean_song = row['Song'].split('-live')[0].replace("-"," ")
    
    standardised_ls = clean_lyrics(row['Lyrics'])
    
    return clean_song in standardised_ls

In [39]:
lyrics_clean['Name_In_Lyrics'] = lyrics_clean[['Song','Lyrics']].apply(song_in_lyrics, axis=1)
lyrics_clean['Name_In_Lyrics'].value_counts()

Name_In_Lyrics
False    120
True      91
Name: count, dtype: int64

# Objective 3: Derive a single piece of insight from the data that you find interesting.

In [40]:
btls_context_data = pd.read_csv('./Context Data/The Beatles songs dataset, v1, no NAs.csv')

In [41]:
# Create a corresponding "Song" column for this df to match the main df.
hyphen_pattern = r'[^\w\s-]'
btls_context_data['Song'] = btls_context_data['Title'].apply(lambda x: re.sub(hyphen_pattern,'',x).replace(' ','-').lower())

In [42]:
# Difference list. Of a negligible enough number that I'm happy to continue.
diff_ls = list(set(lyrics_clean['Song'].apply(lambda x: x.lower())) - set(btls_context_data['Song']))

In [43]:
diff_df = lyrics_clean.loc[~lyrics_clean['Song'].isin(diff_ls)]

In [44]:
diff_df['Song'] = diff_df['Song'].apply(lambda x: x.lower())

In [45]:
context_df = pd.merge(diff_df,btls_context_data[['Song',
                                                 'Year',
                                                 'Duration',
                                                 'Genre',
                                                 'Songwriter',
                                                 'Lead.vocal',
                                                 'Top.50.Billboard']],on='Song')

In [46]:
# Minor negative correlation (aka, the higher the level of repetition, the higher it was on the charts)
subset_1 = context_df.copy().loc[context_df["Top.50.Billboard"] != -1] # only instances where it came top 50 billboard charts
subset_1['N_Gram_Similarity'].corr(subset_1['Top.50.Billboard']) # simple pearson corr

-0.3833327379254955

In [47]:
# By injecting context, we can see what the average n_gram_similarity is by lead singer
# With the n_gram_similarity (bigram if we look back at the parameters set when the value was calculated)...
# ... being our measure for repetition.
context_col = "Lead.vocal"
context_df[[context_col,'N_Gram_Similarity']].groupby(context_col).mean()[['N_Gram_Similarity']].sort_values('N_Gram_Similarity',ascending=False).iplot(kind='bar',title=f"Average Repetition by {context_col}")

In [48]:
def calculate_mode(column):
    try:
        return mode(column)
    except StatisticsError:
        return None

In [49]:
# Same as above, but only looking at instances where they made it to the billboard top 50
# Features average chart position.

aggregations = {
    'N_Gram_Similarity': 'mean',  # Sum of column A
    'Top.50.Billboard': calculate_mode,  # Mean of column B
}

context_col = "Lead.vocal"
subset_1[[context_col,'N_Gram_Similarity','Top.50.Billboard']].groupby(context_col).agg(aggregations)[['N_Gram_Similarity',"Top.50.Billboard"]].sort_values('N_Gram_Similarity',ascending=False).iplot(kind='bar',title=f"Average Repetition by {context_col}")

In [50]:
col_a = "Duration"
col_b = "N_Gram_Similarity"
correlation = context_df[col_a].corr(context_df[col_b])
context_df[['Year','Duration','N_Gram_Similarity']].groupby('Year').mean().iplot(title=f'Average {col_a} and {col_b} per Year - correlation: {np.round(correlation,3)}',
                                                                                 xTitle='Year',
                                                                                 yTitle='Average X')

# Objective 4: Put Songs into Clusters

- Value of K is up to debate, but justify

###### K-Means Clustering

In [51]:
# Create new matrix based on instance of tfidf_vectorizer above.
lyrics_clean_matrix = tfidf_vectorizer.fit_transform(lyrics_clean['Lyrics'])

In [52]:
# Silhouette scores method

# never met an elbow that didn't present before about half the length of the cluster population, so 1/4 should be fine
#range_upper_bound = int(len(lyrics_clean['Lyrics'].unique())/4)
range_upper_bound = 10
range_n_clusters = list(range(2,range_upper_bound))

silhouette_scores  = []
for n_clusters in notebook.tqdm(range_n_clusters):
    
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    cluster_labels = kmeans.fit_predict(lyrics_clean_matrix)
    
    silhouette_avg = silhouette_score(lyrics_clean_matrix, cluster_labels)
    
    silhouette_scores.append(silhouette_avg)

  0%|          | 0/8 [00:00<?, ?it/s]

In [54]:
optimal_cluster_num = silhouette_scores.index(max(silhouette_scores))
optimal_cluster_num

5

In [103]:
pd.Series(silhouette_scores).iplot()

In [55]:
kmeans = KMeans(n_clusters=optimal_cluster_num, init='k-means++', max_iter=300, n_init=10, random_state=0)
cluster_labels = kmeans.fit_predict(lyrics_clean_matrix)

In [56]:
lyrics_clean['Cluster_Labels_KMEANS'] = cluster_labels

In [101]:
lyrics_clean['Cluster_Labels_KMEANS'].value_counts()

Cluster_Labels_KMEANS
2    80
3    67
1    29
4    25
0    10
Name: count, dtype: int64

In [57]:
cluster_meanings = {}
for cluster in range(optimal_cluster_num):
    cluster_songs = lyrics_clean[lyrics_clean['Cluster_Labels_KMEANS'] == cluster]['Song'].tolist()
    cluster_meanings[f'Cluster {cluster+1}'] = cluster_songs

In [58]:
for cluster, songs in cluster_meanings.items():
    print(f'{cluster}:')
    for song in songs:
        print(f'- {song}')
    print('\n')

Cluster 1:
- cry-baby-cry
- from-me-to-you
- happiness-is-a-warm-gun
- i-wanna-be-your-man
- ive-just-seen-a-face
- mother-natures-son
- ob-la-di-ob-la-da
- rocky-raccoon
- she-said-she-said
- your-mother-should-know


Cluster 2:
- and-i-love-her
- babys-in-black
- day-tripper
- devil-in-her-heart
- every-little-thing
- for-no-one
- girl
- good-day-sunshine
- hallelujah-i-love-her-so
- here-there-and-everywhere
- i-dont-want-to-spoil-the-party
- i-feel-fine
- I-saw-her-standing-there
- lovely-rita
- maggie-mae
- mailman-bring-me-no-more-blues
- maxwells-silver-hammer
- mean-mr-mustard
- misery
- norwegian-wood-this-bird-has-flown
- polythene-pam
- searchin
- she-came-in-through-the-bathroom-window
- she-loves-you
- shes-a-woman
- shes-leaving-home
- something
- ticket-to-ride
- yesterday


Cluster 3:
- a-day-in-the-life-live-in-amsterdam
- a-taste-of-honey
- across-the-universe
- act-naturally
- all-my-loving
- all-things-must-pass
- all-together-now
- bad-boy
- because
- being-for-the

###### Latent Dirichlet Allocation (LDA)

In [59]:
lyrics_clean["Tokens"] = lyrics_clean['Lyrics'].apply(nltk.word_tokenize)

In [60]:
count_vectorizer = CountVectorizer(max_features=1000, stop_words='english')

In [61]:
count_matrix = count_vectorizer.fit_transform(lyrics_clean['Tokens'].apply(lambda tokens: ' '.join(tokens)))

In [62]:
lda = LatentDirichletAllocation(n_components=5, random_state=0)
lyrics_clean['Topic'] = lda.fit_transform(count_matrix).argmax(axis=1)

In [63]:
feat_names = count_vectorizer.get_feature_names_out()

In [64]:
for topic_idx, topic in enumerate(lda.components_):
    top_words_indices = topic.argsort()[-10:][::-1]  # Get index position of top 10 words, then reverse to display in desc order
    top_words = [feat_names[i] for i in top_words_indices]
    print(f"Topic {topic_idx + 1} Top Words: {', '.join(top_words)}")

Topic 1 Top Words: yeah, oh, ll, say, got, baby, ve, day, come, said
Topic 2 Top Words: na, let, gon, come, ah, know, hey, got, long, hi
Topic 3 Top Words: love, know, oh, girl, ll, need, want, yeah, just, hold
Topic 4 Top Words: know, ll, ve, going, home, like, time, night, mind, way
Topic 5 Top Words: want, la, girl, honey, baby, know, got, little, na, da


In [65]:
topic_meanings = {}
for topic in range(optimal_cluster_num):
    topic_songs = lyrics_clean[lyrics_clean['Topic'] == topic]['Song'].tolist()
    topic_meanings[f'Topic {topic+1}'] = topic_songs

for topic, songs in topic_meanings.items():
    print(f'{topic}:')
    for song in songs:
        print(f'- {song}')
    print('\n')

Topic 1:
- a-day-in-the-life-live-in-amsterdam
- all-together-now
- and-your-bird-can-sing
- baby-its-you
- baby-youre-a-rich-man
- babys-in-black
- come-together
- day-tripper
- dear-prudence
- devil-in-her-heart
- dizzy-miss-lizzy
- doctor-robert
- drive-my-car
- everybodys-trying-to-be-my-baby
- fixing-a-hole
- hello-goodbye
- helter-skelter
- i-am-the-walrus
- i-me-mine
- ill-be-back
- ill-follow-the-sun
- ill-get-you
- im-down
- ive-got-a-feeling
- lovely-rita
- mean-mr-mustard
- mother-natures-son
- my-bonnie
- norwegian-wood-this-bird-has-flown
- not-guilty
- one-after-909
- paperback-writer
- please-mister-postman
- savoy-truffle
- taxman
- teddy-boy
- tell-me-what-you-see
- thatll-be-the-day
- the-long-and-winding-road
- the-word
- this-boy
- three-cool-cats
- till-there-was-you
- we-can-work-it-out
- when-im-sixty-four
- you-cant-do-that


Topic 2:
- across-the-universe
- act-naturally
- back-in-the-ussr
- because
- birthday
- blue-jay-way
- boys
- carry-that-weight
- chains


###### Sentiment Analysis of Topics and Clusters

In [84]:
test_lyric = lyrics_clean['Lyrics'][10] # One set of lyrics as a sample

In [85]:
sia = SentimentIntensityAnalyzer() # Init SIA

In [86]:
sia.polarity_scores(test_lyric)['compound'] # Quick test!

0.9997

In [87]:
def compound_assignment(score):
    # Custom sentiment categories and threshold ranges
    # Covering regular intervals from 1 to -1
    sentiment_categories = {
        "Strongly Positive": (0.5, 1.0),
        "Moderately Positive": (0.2, 0.5),
        "Slightly Positive": (0.05, 0.2),
        "Neutral": (-0.05, 0.05),
        "Slightly Negative": (-0.2, -0.05),
        "Moderately Negative": (-0.5, -0.2),
        "Strongly Negative": (-1.0, -0.5),
    }

    # Sentiment category based on the custom thresholds
    for category, (lower_threshold, upper_threshold) in sentiment_categories.items():
        if lower_threshold <= score <= upper_threshold:
            return category
    
    return "Undefined"  # Default category if the score is outside of defined thresholds

In [88]:
compound_assignment(1) # test

'Strongly Positive'

In [89]:
lyrics_clean['Compound Sentiment Score'] = lyrics_clean['Lyrics'].apply(lambda x: sia.polarity_scores(x)['compound'])
lyrics_clean['Compound Sentiment Assignment'] = lyrics_clean['Compound Sentiment Score'].apply(lambda x: compound_assignment(x))

##### Uncomment the following to see results as grouped by Cluster labels and Topics

# Objective 5: Cluster the Remaining 5 "Songs" Into Groups.

In [None]:
'Song', 'Lyrics', 'Cluster_Labels_KMEANS', 'Topic'

In [99]:
lyrics_clean.columns

Index(['Song', 'Lyrics', 'Languages', 'Repetition Instances',
       'Repetition Frequency', 'N_Gram_Similarity', 'Name_In_Lyrics',
       'Cluster_Labels_KMEANS', 'Tokens', 'Topic', 'Compound Sentiment Score',
       'Compound Sentiment Assignment'],
      dtype='object')

### Sandbox