##### Caretaking*

In [1]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and chained-assignment warnings from the libraries used below.
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
# Turn off Jedi for the IPython tab completer.
%config Completer.use_jedi = False

##### Imports

In [3]:
import os
import math
import re
import nltk

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [5]:
from tqdm import notebook
from collections import Counter

from fuzzywuzzy import fuzz, process
from itertools import combinations
from statistics import mode

from langdetect import detect, detect_langs
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from kneed import KneeLocator
from sklearn.metrics import silhouette_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import StandardScaler

from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.metrics import jaccard_distance
from nltk.sentiment import SentimentIntensityAnalyzer

ModuleNotFoundError: No module named 'langdetect'

In [None]:
# cufflinks is presumably what provides the .iplot() calls used on pandas objects
# further down — TODO confirm. go_offline() is called once, before any plotting.
import cufflinks as cf
cf.go_offline()

In [None]:
# Extra directory for NLTK to search for its data files.
# The default below is specific to one machine; set the NLTK_DATA_DIR environment
# variable to point somewhere else without editing the notebook.
nltk_data_dir = os.environ.get("NLTK_DATA_DIR", "C:\\Users\\ernes\\anaconda3\\Lib\\site-packages\\nltk")
nltk.data.path.append(nltk_data_dir)

#### Read Data

In [None]:
# Folders (relative to the notebook) holding one .txt file per song.
# Songs from the first folder are tagged "old" and those from the second "new" further down.
lyrics_path = './lyrics/'
new_lyrics_path = './new_lyrics/'

In [None]:
# Get folder contents first of all...
# os.listdir returns names in arbitrary order; sorting makes the row order (and
# therefore which copy survives the later drop_duplicates) reproducible.
lyrics_ls = sorted(f for f in os.listdir(lyrics_path) if f.endswith('.txt'))
new_lyrics_ls = sorted(f for f in os.listdir(new_lyrics_path) if f.endswith('.txt'))

In [None]:
# ...and then read contents to two different lists.
def _read_lyrics_folder(folder, file_names):
    """Return [[song_name, lyrics_text], ...] for the given files in `folder`.

    The song name is the file name without its extension. Each file is opened
    in a `with` block so the handle is closed as soon as it has been read.
    """
    songs = []
    for file_name in file_names:
        with open(os.path.join(folder, file_name), 'r') as handle:
            songs.append([os.path.splitext(file_name)[0], handle.read()])
    return songs


lyrics = _read_lyrics_folder(lyrics_path, lyrics_ls)
new_lyrics = _read_lyrics_folder(new_lyrics_path, new_lyrics_ls)

In [None]:
# One frame per source folder, each tagged with where its songs came from.
song_columns = ['Song', 'Lyrics']

combined_lyrics = pd.DataFrame(lyrics, columns=song_columns).assign(**{'New/Old': "old"})
combined_new_lyrics = pd.DataFrame(new_lyrics, columns=song_columns).assign(**{'New/Old': "new"})

In [None]:
# Stack old songs on top of new songs. No ignore_index is passed, so the row
# labels of both inputs are kept as they are until a later reset_index.
total_lyrics = pd.concat([combined_lyrics,combined_new_lyrics])

In [None]:
# Number of songs loaded from each folder.
total_lyrics['New/Old'].value_counts()

## Objective 1: Account for Duplicates and Anomalies

- Account for duplicates in data files:
	- example: "a-day-in-the-life" and "a-day-in-the-life-live-in-amsterdam"
- "weird" or "missing" data.
- Remove anything that looks "amiss"
- Essentially data cleaning

###### Cleansing

In [None]:
def simple_content_check(x):
    """Return True when `x` looks like real lyrics.

    A text is rejected when it is not a string, when it is empty or consists
    only of whitespace/newlines, or when it contains the word "instrumental"
    (case-insensitive).

    Parameters
    ----------
    x : str
        Raw contents of one lyrics file.

    Returns
    -------
    bool
    """
    if not isinstance(x, str):
        return False

    # strip() covers '', '\n', '\n\n', '  \n ' ... not just a single newline
    text = x.strip()

    return bool(text) and "instrumental" not in text.lower()

In [None]:
def clean_lyrics(lys):
    """Split lyrics into normalised, non-empty lines.

    Each line has every character other than ASCII letters, digits and
    whitespace removed, is stripped of surrounding whitespace and lower-cased.
    Lines that are empty *after* this cleaning (blank lines, whitespace-only
    lines, punctuation-only lines) are dropped so they cannot be counted as
    repeated lines downstream.

    Parameters
    ----------
    lys : str
        Raw lyrics text with lines separated by '\\n'.

    Returns
    -------
    list of str
        Cleaned lines in their original order.
    """
    rgx_pattern = r'[^a-zA-Z0-9\s]'

    # strip AFTER removing punctuation so "- yeah -" and "yeah" normalise identically
    cleaned_lines = (re.sub(rgx_pattern, '', line).strip().lower() for line in lys.split('\n'))

    return [line for line in cleaned_lines if line]

In [None]:
# Adjusted version: keep only the rows whose lyrics pass simple_content_check.
# .copy() gives an independent frame, so the later in-place edits and column
# assignments do not operate on a slice of total_lyrics.
combined_lyrics_adj = total_lyrics[total_lyrics['Lyrics'].apply(simple_content_check)].copy()

In [None]:
# Songs per source folder that remain after the content check.
combined_lyrics_adj['New/Old'].value_counts()

In [None]:
# Just to check if any adjustments or additions to function above.
# The filtered frame was derived from total_lyrics (old + new songs), so that is
# the baseline to compare against — not the old-only `lyrics` list.
removed_pct = (len(total_lyrics) - len(combined_lyrics_adj)) / len(total_lyrics) * 100
print(f"{np.round(removed_pct,2)} % reduction")

In [None]:
# Remove rows whose lyrics text is an exact copy of an earlier row;
# the first occurrence is the one that stays.
combined_lyrics_adj = combined_lyrics_adj.drop_duplicates(subset=['Lyrics'], keep='first')

In [None]:
# Renumber the rows 0..n-1 (old labels discarded) and preview the result.
combined_lyrics_adj = combined_lyrics_adj.reset_index(drop=True)
combined_lyrics_adj.head()

###### Language Differences

In [None]:
# Account for any language differences.
# langdetect is randomised by default, so the same text can be labelled differently
# between runs; fixing the seed makes the 'Languages' column reproducible.
from langdetect import DetectorFactory
DetectorFactory.seed = 0

combined_lyrics_adj['Languages'] = combined_lyrics_adj['Lyrics'].apply(detect)

In [None]:
# How many songs were assigned to each detected language code.
combined_lyrics_adj['Languages'].value_counts()

In [None]:
# fault of the detection module. Funny one though!
#combined_lyrics_adj.loc[combined_lyrics_adj['Languages'] == 'so']

In [None]:
#odd_idx = combined_lyrics_adj.loc[combined_lyrics_adj['Languages'] == 'so'].index[0]

In [None]:
# Interesting. As above ^
#detect_langs(combined_lyrics_adj['Lyrics'][odd_idx])

In [None]:
# There are two known German-language releases of Beatles songs:
# "I Want To Hold Your Hand" and "She Loves You".
# They are omitted below because the English versions of both are already included.
# This cell only displays the rows detected as German ('de').
combined_lyrics_adj.loc[combined_lyrics_adj['Languages'] == 'de']

In [None]:
# Keep every row whose detected language is anything other than German.
not_german = combined_lyrics_adj['Languages'].ne('de')
combined_lyrics_adj = combined_lyrics_adj[not_german]

In [None]:
# Renumber the rows 0..n-1 again now that the German rows are gone.
combined_lyrics_adj = combined_lyrics_adj.reset_index(drop=True)

###### Cosine Similarity of song lyrics

In [None]:
# TF-IDF vectoriser with default settings. The same instance is re-fitted by
# several later cells, so its vocabulary always reflects the most recent fit.
tfidf_vectorizer = TfidfVectorizer()

In [None]:
# Fit on the lyrics of every remaining song; row k of tfidf_matrix belongs to
# the k-th row of combined_lyrics_adj.
tfidf_matrix = tfidf_vectorizer.fit_transform(combined_lyrics_adj['Lyrics'])

In [None]:
# Cosine similarity between every pair of "old" songs.
old_mask = (combined_lyrics_adj['New/Old'] == "old").to_numpy()
old_df = combined_lyrics_adj.loc[old_mask]
rng_max = len(old_df)

# tfidf_matrix was fitted on all rows of combined_lyrics_adj, so look the old
# songs up by their row *position* there instead of assuming they are the first
# rng_max rows with labels 0..rng_max-1.
old_positions = np.flatnonzero(old_mask)
old_songs = old_df['Song'].to_numpy()

# One vectorised call gives the full (rng_max x rng_max) similarity matrix,
# replacing one cosine_similarity call per pair.
old_similarity = cosine_similarity(tfidf_matrix[old_positions])

# combinations() yields each unordered pair once and never pairs a song with itself.
comparisons_cs = [
    {
        'Song1': old_songs[i],
        'Song2': old_songs[j],
        'Similarity': old_similarity[i, j]
    }
    for i, j in combinations(range(rng_max), 2)
]

In [None]:
# One row per song pair with columns Song1, Song2 and Similarity.
comp_cs_df = pd.DataFrame(comparisons_cs)

In [None]:
# "the-end" is short and generic, which makes it score highly against many songs.
# Leave out every pair it takes part in before inspecting the rest.
involves_the_end = (
    comp_cs_df['Song1'].str.contains('the-end', case=False)
    | comp_cs_df['Song2'].str.contains('the-end', case=False)
)
comp_cs_df_redux = comp_cs_df[~involves_the_end]

In [None]:
# Pairs scoring 0.7 or above are treated as the same song; the top of the list
# can be checked by eye from here.
# Known cases: "Revolution" has a copy with additional lyrics, and
# "Sgt Peppers Lonely Hearts Club Band" has a reprise.
comp_cs_df_redux.sort_values('Similarity',ascending=False).head(5)

In [None]:
# For every pair at or above the 0.7 threshold, mark the Song1 side for removal.
is_duplicate_pair = comp_cs_df_redux['Similarity'] >= 0.7
omit_list = comp_cs_df_redux.loc[is_duplicate_pair, 'Song1'].tolist()
omit_list

In [None]:
# Final cleaned set: everything not on the omit list, renumbered 0..n-1.
keep_song = ~combined_lyrics_adj['Song'].isin(omit_list)
lyrics_clean = combined_lyrics_adj[keep_song].reset_index(drop=True)

In [None]:
# Display the cleaned frame for a visual check.
lyrics_clean

# Objective 2: Answer 2 of 5 Available Questions of the Data

Question 1: Which song has the largest amount of repetition?

Question 2: How many of the songs feature the song name (found in the file name) in the song lyrics?

Bonus (from the data cleaning above): Which songs are the most similar?

### Which song has the largest amount of repetition?

- Repetition is defined here as the set of unique lines in a song that occur more than once.

###### Basic Approach

In [None]:
def repetition_counter(lys):
    """Measure exact line repetition in one song.

    The lyrics are normalised with clean_lyrics and the occurrences of each
    distinct line are counted.

    Returns
    -------
    list
        [number of distinct lines occurring more than once,
         total number of occurrences of those lines]
    """
    line_counts = pd.Series(clean_lyrics(lys)).value_counts()

    # Only lines that show up at least twice count as repetition
    repeated = line_counts[line_counts > 1]

    return [len(repeated), repeated.sum()]

In [None]:
# Run repetition_counter on every song and spread its two-element result over
# two new columns, without a temporary 'Results' column.
repetition_stats = lyrics_clean['Lyrics'].apply(repetition_counter)
lyrics_clean[['Repetition Instances','Repetition Frequency']] = pd.DataFrame(
    repetition_stats.tolist(), index=lyrics_clean.index
)

In [None]:
# Preview the first rows with the two repetition columns added.
lyrics_clean.head()#.sort_values('Repetition Frequency',ascending=False)

###### N-gram Similarity - More Robust

In [None]:
def n_gram_sim(lys,jcrd_thresh=1.0,inspect=False):
    """Count pairs of lyric lines whose bigram sets are similar enough.

    The lyrics are normalised with clean_lyrics, each line is tokenised and
    turned into its list of word bigrams, and every unordered pair of lines is
    compared with Jaccard similarity (1 - jaccard_distance) on the bigram sets.

    Parameters
    ----------
    lys : str
        Raw lyrics of one song.
    jcrd_thresh : float, default 1.0
        Minimum similarity for a pair to count; 1.0 means identical bigram sets.
    inspect : bool, default False
        When True, return the matching pairs instead of their number.

    Returns
    -------
    int or list
        Number of matching pairs, or (inspect=True) a list of
        [bigrams_of_line_i, bigrams_of_line_j, similarity] entries.

    Notes
    -----
    Pairs where both lines have no bigrams at all (e.g. two one-word lines) are
    skipped: their Jaccard distance is undefined, and this is presumably the
    error the previous blanket `except: pass` was hiding. Any other exception
    now propagates instead of being swallowed.
    """
    standardised_ls = clean_lyrics(lys)

    # clean_lyrics already lower-cases each line, so tokenise directly
    n_grams = [list(ngrams(nltk.word_tokenize(line), 2)) for line in standardised_ls]

    # Build each set once rather than once per comparison
    n_gram_sets = [set(line_grams) for line_grams in n_grams]

    jcrd_similarity = []
    # combinations() walks the pairs in the same (i, j>i) order as a nested loop
    for i, j in combinations(range(len(n_grams)), 2):
        set_i, set_j = n_gram_sets[i], n_gram_sets[j]

        if not set_i and not set_j:
            continue  # nothing to compare

        similarity = 1 - jaccard_distance(set_i, set_j)
        if similarity >= jcrd_thresh:
            jcrd_similarity.append([n_grams[i], n_grams[j], similarity])

    # For inspection and testing purposes...or not.
    if inspect:
        return jcrd_similarity
    return len(jcrd_similarity)

In [None]:
# Per-song count of line pairs with matching bigram sets (n_gram_sim with its default threshold of 1.0).
lyrics_clean['N_Gram_Similarity'] = lyrics_clean['Lyrics'].apply(n_gram_sim)

In [None]:
# Ten songs with the highest N_Gram_Similarity, i.e. the most repetition by this measure.
# A preliminary check of the top few confirms the ranking.
lyrics_clean.sort_values('N_Gram_Similarity',ascending=False)[['Song','N_Gram_Similarity']].head(10)

### How many of the songs feature the song name (found in the file name) in the song lyrics?

In [None]:
def song_in_lyrics(row):
    """Return True when the song title occurs somewhere in the song's lyrics.

    The title is taken from row['Song']: anything from '-live' onwards is cut
    off, hyphens become spaces and the result is lower-cased. The lyrics are
    normalised with clean_lyrics and joined into one string, and the title is
    searched for as a whole-word phrase. (Previously the title had to be equal
    to an entire lyric line, which misses titles sung as part of a longer line.)

    Parameters
    ----------
    row : mapping with 'Song' and 'Lyrics' entries (e.g. a DataFrame row).

    Returns
    -------
    bool
    """
    title_words = row['Song'].split('-live')[0].replace("-", " ").lower().split()
    if not title_words:
        return False
    clean_song = " ".join(title_words)

    # Re-split and re-join so every run of whitespace becomes a single space
    lyric_text = " ".join(" ".join(clean_lyrics(row['Lyrics'])).split())

    # Padding both sides with spaces makes this a whole-word phrase match
    return f" {clean_song} " in f" {lyric_text} "

In [None]:
# Flag each song with song_in_lyrics (row-wise), then count the True/False flags.
lyrics_clean['Name_In_Lyrics'] = lyrics_clean[['Song','Lyrics']].apply(song_in_lyrics, axis=1)
lyrics_clean['Name_In_Lyrics'].value_counts()

# Objective 3: Derive a single piece of insight from the data that you find interesting.

In [None]:
# External per-song context data; the columns used below are Title, Year, Duration,
# Genre, Songwriter, Lead.vocal and Top.50.Billboard.
btls_context_data = pd.read_csv('./Context Data/The Beatles songs dataset, v1, no NAs.csv')

In [None]:
# Build a "Song" key in the same shape as the lyric file names: drop every
# character that is not a word character, whitespace or hyphen, turn spaces
# into hyphens, and lower-case the result.
hyphen_pattern = r'[^\w\s-]'
btls_context_data['Song'] = (
    btls_context_data['Title']
    .str.replace(hyphen_pattern, '', regex=True)
    .str.replace(' ', '-', regex=False)
    .str.lower()
)

In [None]:
# Songs we have lyrics for that have no counterpart in the context data.
# The number is small enough to carry on without them.
lyric_song_names = set(lyrics_clean['Song'].str.lower())
context_song_names = set(btls_context_data['Song'])
diff_ls = list(lyric_song_names.difference(context_song_names))

In [None]:
# Keep the songs that DO have context data (i.e. are not in diff_ls).
# diff_ls holds lower-cased names, so compare against lower-cased song names;
# .copy() makes diff_df independent of lyrics_clean before its 'Song' column is rewritten.
diff_df = lyrics_clean.loc[~lyrics_clean['Song'].str.lower().isin(diff_ls)].copy()

In [None]:
# Lower-case the song names so they line up with the context data's 'Song' key.
diff_df['Song'] = diff_df['Song'].str.lower()

In [None]:
# Attach the context columns to the lyric-derived features, matching on 'Song'.
context_columns = [
    'Song',
    'Year',
    'Duration',
    'Genre',
    'Songwriter',
    'Lead.vocal',
    'Top.50.Billboard',
]
context_df = diff_df.merge(btls_context_data[context_columns], on='Song')

In [None]:
# Restrict to songs with a Top 50 Billboard entry (the column is -1 otherwise)
# and take the Pearson correlation between repetition and chart position.
# Observed: a minor negative correlation. Assuming a smaller number means a
# better chart position, more repetition goes with placing higher on the chart.
made_top_50 = context_df["Top.50.Billboard"] != -1
subset_1 = context_df.loc[made_top_50].copy()
subset_1['N_Gram_Similarity'].corr(subset_1['Top.50.Billboard'])

In [None]:
# With the context data attached, compare the average repetition per lead singer.
# Repetition here is N_Gram_Similarity, which was computed on bigrams.
context_col = "Lead.vocal"
avg_repetition_by_context = (
    context_df[[context_col, 'N_Gram_Similarity']]
    .groupby(context_col)
    .mean()[['N_Gram_Similarity']]
    .sort_values('N_Gram_Similarity', ascending=False)
)
avg_repetition_by_context.iplot(kind='bar', title=f"Average Repetition by {context_col}")

In [None]:
def calculate_mode(column):
    """Return the most common value in `column`, or None when it has no mode.

    Thin wrapper around statistics.mode for use as a groupby aggregation.
    statistics.mode raises StatisticsError for empty input (and, on older
    Python versions, when there is no single most common value); that case is
    reported as None.
    """
    # Imported here because only `mode` is imported at the top of the notebook;
    # without it the except clause itself failed with NameError.
    from statistics import StatisticsError

    try:
        return mode(column)
    except StatisticsError:
        return None

In [None]:
# Same as above, but only for songs that made the Billboard Top 50 (subset_1).
# Shows, per lead vocalist, the mean repetition and the most common chart position.

aggregations = {
    'N_Gram_Similarity': 'mean',  # average repetition score per group
    'Top.50.Billboard': calculate_mode,  # most common chart position per group (None if undefined)
}

context_col = "Lead.vocal"
subset_1[[context_col,'N_Gram_Similarity','Top.50.Billboard']].groupby(context_col).agg(aggregations)[['N_Gram_Similarity',"Top.50.Billboard"]].sort_values('N_Gram_Similarity',ascending=False).iplot(kind='bar',title=f"Average Repetition by {context_col}")

In [None]:
# Yearly averages of song duration and repetition, with their overall Pearson
# correlation (across all songs) shown in the plot title.
col_a = "Duration"
col_b = "N_Gram_Similarity"
correlation = context_df[col_a].corr(context_df[col_b])

yearly_means = context_df[['Year', col_a, col_b]].groupby('Year').mean()
yearly_means.iplot(
    title=f'Average {col_a} and {col_b} per Year - correlation: {np.round(correlation,3)}',
    xTitle='Year',
    yTitle='Average X',
)

# Objective 4&5: Put Songs into Clusters - Cluster Remaining 5

- First adding useful context before clustering

###### Sentiment Analysis of Topics and Clusters

In [None]:
# One set of lyrics as a sample: the row labelled 10 in lyrics_clean.
test_lyric = lyrics_clean['Lyrics'][10] # One set of lyrics as a sample

In [None]:
# Single analyser instance, reused for every song below.
sia = SentimentIntensityAnalyzer() # Init SIA

In [None]:
# Quick test: the 'compound' entry of the polarity scores for the sample lyric.
sia.polarity_scores(test_lyric)['compound'] # Quick test!

In [None]:
def compound_assignment(score):
    """Translate a compound sentiment score into a named category.

    Bands, from high to low (a score sitting exactly on a shared boundary
    goes to the more positive band):

        [0.5, 1.0]     Strongly Positive
        [0.2, 0.5)     Moderately Positive
        [0.05, 0.2)    Slightly Positive
        [-0.05, 0.05)  Neutral
        [-0.2, -0.05)  Slightly Negative
        [-0.5, -0.2)   Moderately Negative
        [-1.0, -0.5)   Strongly Negative

    Anything outside [-1.0, 1.0] (including NaN) yields "Undefined".
    """
    # Guard first: out-of-range and NaN scores fit no band
    if not (-1.0 <= score <= 1.0):
        return "Undefined"

    # Lower edge of each band, highest first; the first edge reached wins
    band_floors = (
        (0.5, "Strongly Positive"),
        (0.2, "Moderately Positive"),
        (0.05, "Slightly Positive"),
        (-0.05, "Neutral"),
        (-0.2, "Slightly Negative"),
        (-0.5, "Moderately Negative"),
    )
    for floor, category in band_floors:
        if score >= floor:
            return category

    return "Strongly Negative"

In [None]:
# Smoke test at the upper end of the range.
compound_assignment(1) # test

In [None]:
# Compound sentiment score per song, plus the category it falls into.
compound_scores = lyrics_clean['Lyrics'].map(lambda lyric: sia.polarity_scores(lyric)['compound'])
lyrics_clean['Compound Sentiment Score'] = compound_scores
lyrics_clean['Compound Sentiment Assignment'] = compound_scores.map(compound_assignment)

In [None]:
# Re-fit the shared vectoriser on the cleaned songs only.
# NOTE(review): tfidf_matrix is not read again in the visible cells (lyrics_tfidf is built
# separately below) — this cell looks redundant; confirm before removing.
tfidf_matrix = tfidf_vectorizer.fit_transform(lyrics_clean['Lyrics'])

In [None]:
# Columns that feed the clustering: the raw lyrics (vectorised below) plus the
# numeric repetition and sentiment measures computed earlier.
feature_cols = ['Lyrics','N_Gram_Similarity', 
                'Compound Sentiment Score',
                "Repetition Instances", 
                "Repetition Frequency"]

In [None]:
# Column subset of lyrics_clean used as clustering input.
feature_df = lyrics_clean[feature_cols]

In [None]:
# TF-IDF representation of the lyrics; this re-fits the shared vectoriser.
lyrics_tfidf = tfidf_vectorizer.fit_transform(feature_df['Lyrics'])

In [None]:
# Dense frame with one column per vocabulary term, so it can be joined with the
# numeric feature columns. Its index is the default 0..n-1.
lyrics_tfidf_df = pd.DataFrame(lyrics_tfidf.toarray(),
                               columns=tfidf_vectorizer.get_feature_names_out())

In [None]:
# Combine the TF-IDF columns with the numeric features.
# KMeans works on Euclidean distances, so the raw count-based features (which can
# be orders of magnitude larger than TF-IDF weights) would dominate the clustering;
# standardise them to zero mean / unit variance first.
numeric_features = feature_df.drop('Lyrics', axis=1)
numeric_scaled = pd.DataFrame(
    StandardScaler().fit_transform(numeric_features),
    columns=numeric_features.columns,
    index=lyrics_tfidf_df.index,  # join row-by-row, independent of feature_df's labels
)
feature_matrix = pd.concat([lyrics_tfidf_df, numeric_scaled], axis=1)

In [None]:
# Using Elbow Method: fit KMeans for 1 .. (unique songs / 4) - 1 clusters and
# record the inertia (within-cluster sum of squared distances) of each fit.
range_upper_bound = len(lyrics_clean['Lyrics'].unique()) // 4
range_n_clusters = list(range(1, range_upper_bound))

sum_squares = []
for n_clusters in notebook.tqdm(range_n_clusters):
    # n_init is pinned to match the final fits below; its default differs
    # between scikit-learn versions, which would make the curve version-dependent.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
    kmeans.fit(feature_matrix)
    sum_squares.append(kmeans.inertia_)

In [None]:
# Index by the actual cluster counts so the x-axis reads 1, 2, 3, ... instead of
# the 0-based list position (which is one less than the number of clusters).
pd.Series(sum_squares, index=range_n_clusters).iplot(title='Elbow Plot',
                                                     xTitle='Clusters',
                                                     yTitle='SSDs') # I call it about 4?

###### Kneedle Locate value of K

In [None]:
# Inertia values as a one-column frame (column 0). The index is the 0-based position
# in sum_squares, so index i belongs to the fit with i + 1 clusters
# (range_n_clusters starts at 1).
elbow_df = pd.DataFrame(sum_squares)

In [None]:
# Locate the knee of the inertia curve, skipping the 1-cluster fit.
# x must be the real cluster counts: using the 0-based position in sum_squares
# instead makes the reported knee one less than the cluster count it refers to.
kneedle = KneeLocator(range_n_clusters[1:], sum_squares[1:], curve="convex", direction="decreasing")

In [None]:
# Visual check of the knee picked by KneeLocator.
kneedle.plot_knee()

In [None]:
# The x value at which the knee was found.
kneedle.knee

In [None]:
# Number of clusters used by both final KMeans fits below.
# NOTE(review): presumably kneedle.knee is None when no knee is detected — confirm and guard if so.
optimal_cluster_num = kneedle.knee

In [None]:
# Final clustering on the combined feature matrix (TF-IDF + numeric features),
# with a fixed random_state so the labels are repeatable.
kmeans = KMeans(n_clusters=optimal_cluster_num, init='k-means++', max_iter=300, n_init=10, random_state=0)
cluster_labels = kmeans.fit_predict(feature_matrix)

In [None]:
# Number of songs assigned to each cluster.
pd.Series(cluster_labels).value_counts()

In [None]:
# cluster_labels is ordered like the rows of the matrix that was clustered, so
# assign it positionally. Wrapping it in pd.Series would align on index labels
# and silently produce NaN wherever lyrics_clean's index is not exactly 0..n-1.
lyrics_clean['Cluster Labels'] = cluster_labels

# Obj 4&5 pt.2 - Clustering on JUST Lyrics

In [None]:
# New TF-IDF matrix for the lyrics-only clustering, produced by re-fitting the
# tfidf_vectorizer instance created earlier on the cleaned lyrics.
lyrics_clean_matrix = tfidf_vectorizer.fit_transform(lyrics_clean['Lyrics'])

In [None]:
# Same KMeans settings and cluster count as before, but fitted on the TF-IDF
# matrix alone. This rebinds both kmeans and cluster_labels.
kmeans = KMeans(n_clusters=optimal_cluster_num, init='k-means++', max_iter=300, n_init=10, random_state=0)
cluster_labels = kmeans.fit_predict(lyrics_clean_matrix)

In [None]:
# Positional assignment: label k belongs to row k of lyrics_clean. Going through
# pd.Series would align on index labels and could silently introduce NaN.
lyrics_clean['Cluster Labels LYRICS_ONLY'] = cluster_labels

### Meaning

In [None]:
def preprocess_txt(text):
    """Tokenise `text` and return its lower-cased content words.

    Tokens that are not purely alphabetic are discarded, the rest are
    lower-cased, and English stop words are removed. Token order is kept.
    """
    raw_tokens = nltk.word_tokenize(text)

    # Alphabetic tokens only, normalised to lower case
    alpha_tokens = [token.lower() for token in raw_tokens if token.isalpha()]

    english_stop_words = set(stopwords.words("english"))

    return [token for token in alpha_tokens if token not in english_stop_words]

In [None]:
# For every lyrics-only cluster, list its most frequent content words.
top_x = 20  # how many words to keep per cluster
grouped_lyrs = lyrics_clean.groupby('Cluster Labels LYRICS_ONLY')

theme_ls = []
for cluster_label, group in grouped_lyrs:

    # All lyrics of the cluster as one text, reduced to content words
    cluster_tokens = preprocess_txt(" ".join(group['Lyrics']))

    # The top_x most frequent words with their counts
    top_words = Counter(cluster_tokens).most_common(top_x)

    # One frame per cluster, tagged with the cluster it describes
    theme_ls.append(
        pd.DataFrame(top_words, columns=['Word', 'Frequency'])
        .assign(**{'Cluster Label': cluster_label})
    )

In [None]:
# Display the per-cluster word-frequency frames.
theme_ls

# Sandbox and Outputs

In [1]:
from collections import Counter

import nltk
from nltk.corpus import wordnet as wn

# WordNet data must be available locally before any synset lookup
nltk.download('wordnet')
nltk.download('omw-1.4')  # For extended synsets

words = ["anchor", "captain", "deck", "rudder"]

# Every synset of every word
synsets = [wn.synsets(word) for word in words]

# Direct hypernyms (more general concepts) of all those synsets, flattened
hypernyms = [
    hypernym
    for synset_list in synsets
    for synset in synset_list
    for hypernym in synset.hypernyms()
]

# How often each hypernym occurs across the word list
hypernym_counts = Counter(hypernyms)

# Print the five most frequent hypernyms with their definitions
most_common_hypernyms = hypernym_counts.most_common(5)
for hypernym, _count in most_common_hypernyms:
    print(hypernym, hypernym.definition())


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...


Synset('fasten.v.01') cause to be firmly attached
Synset('hook.n.04') a mechanical device that is curved or bent to suspend or hold or pull something
Synset('support.n.03') something providing immaterial assistance to a person or cause or interest
Synset('television_reporter.n.01') someone who reports news stories via television
Synset('commissioned_military_officer.n.01') a commissioned officer in the Army or Air Force or Marine Corps
