In [1]:
# basic
import pandas as pd
import numpy as np
import re
from datasketch import MinHash

# Gensim for LDA topic grouping
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy
from spacy.tokens import Span

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt

# nltk
import nltk
from nltk.corpus import stopwords

# warnings
import warnings
warnings.filterwarnings("ignore")

# nlpsp

nlp = spacy.load("en_core_web_sm")
import pytextrank
nlp.add_pipe("textrank")
from nlp_rake import Rake

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/michael/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
stopwords_all = []
for language in stopwords.fileids():
    stopwords_all += stopwords.words(language)
stopwords_all.extend(
    ['feat', 'verse', 'chorus', 'prechorus', 'artist', 'spotify', 'error', 'issue', 'unknown']
)

In [3]:
kaggle_df = pd.read_csv('../data/etl_integration/kaggle_spotify_data_with_id.csv', encoding='latin-1')
lyrics_firstattempt_df = pd.read_csv('../data/etl_integration/lyrics_FIRSTATTEMPT.csv')
lyrics_36_df = pd.read_csv('../data/etl_integration/lyrics_A2_37new.csv')
kaggle_lyrics_df = pd.concat([lyrics_firstattempt_df, lyrics_36_df])
kaggle_lyrics_df['lyrics'] = kaggle_lyrics_df['lyrics'].apply(
    lambda x: re.sub(r'[\n ]+', ' ', re.sub(r'[^a-zA-Z\d\']+', ' ', re.sub(r'(.*Lyrics)|(\[.*\])', '', x)))
)
kaggle_lyrics_df.head()

Unnamed: 0,id,title,full_title,title_with_featured,artist,lyrics_state,lyrics,csv_title,csv_artist
0,9329251,Jung Kook (정국) - Seven Ft. Latto (Explicit Ver...,Jung Kook (정국) - Seven Ft. Latto (Explicit Ver...,Jung Kook (정국) - Seven Ft. Latto (Explicit Ver...,Genius Traducciones al Español,complete,Cargas con todo el peso del mundo en tus homb...,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook"
1,8902658,LALA,LALA by Myke Towers,LALA,Myke Towers,complete,La la la la la la Full Harmony La la la la la...,LALA,Myke Towers
2,9228106,​vampire,​vampire by Olivia Rodrigo,​vampire,Olivia Rodrigo,complete,Hate to give the satisfaction asking how you'...,vampire,Olivia Rodrigo
3,4712978,Cruel Summer,Cruel Summer by Taylor Swift,Cruel Summer,Taylor Swift,complete,Yeah yeah yeah yeah Fever dream high in the q...,Cruel Summer,Taylor Swift
4,9123229,WHERE SHE GOES,WHERE SHE GOES by Bad Bunny,WHERE SHE GOES,Bad Bunny,complete,Baby dime la verdad Si te olvidaste de m Yo s...,WHERE SHE GOES,Bad Bunny


## Merge with our kaggle dataset by jaccard distance of track name and artist name

In [4]:
kaggle_lyrics_df

Unnamed: 0,id,title,full_title,title_with_featured,artist,lyrics_state,lyrics,csv_title,csv_artist
0,9329251,Jung Kook (정국) - Seven Ft. Latto (Explicit Ver...,Jung Kook (정국) - Seven Ft. Latto (Explicit Ver...,Jung Kook (정국) - Seven Ft. Latto (Explicit Ver...,Genius Traducciones al Español,complete,Cargas con todo el peso del mundo en tus homb...,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook"
1,8902658,LALA,LALA by Myke Towers,LALA,Myke Towers,complete,La la la la la la Full Harmony La la la la la...,LALA,Myke Towers
2,9228106,​vampire,​vampire by Olivia Rodrigo,​vampire,Olivia Rodrigo,complete,Hate to give the satisfaction asking how you'...,vampire,Olivia Rodrigo
3,4712978,Cruel Summer,Cruel Summer by Taylor Swift,Cruel Summer,Taylor Swift,complete,Yeah yeah yeah yeah Fever dream high in the q...,Cruel Summer,Taylor Swift
4,9123229,WHERE SHE GOES,WHERE SHE GOES by Bad Bunny,WHERE SHE GOES,Bad Bunny,complete,Baby dime la verdad Si te olvidaste de m Yo s...,WHERE SHE GOES,Bad Bunny
...,...,...,...,...,...,...,...,...,...
32,436984,Wreck Room,"Wreck Room by Canibus (Ft. Crooked I, Flawless...","Wreck Room (Ft. Crooked I, Flawless the MC & N...",Canibus,complete,Look how many beats I gotta put in the casket...,THE LONELIEST,Mï¿½ï¿½ne
33,8404234,Bamba,Bamba by Luciano & Aitch (Ft. BIA),Bamba (Ft. BIA),Luciano & Aitch,complete,Geenaro So I'm goin' to Ghana Babe come on va...,Bamba (feat. Aitch & BIA),"Luciano, Aitch, Bï¿½"
34,9229938,LAGUNAS,LAGUNAS by Peso Pluma & Jasiel Nuñez,LAGUNAS,Peso Pluma & Jasiel Nuñez,complete,Esta noche hay luna llena cargo energ as buen...,LAGUNAS,"Jasiel Nuï¿½ï¿½ez, Peso P"
35,9103525,VAGABUNDO,"VAGABUNDO by Sebastián Yatra, Manuel Turizo & ...",VAGABUNDO,"Sebastián Yatra, Manuel Turizo & Beéle",complete,Puedes salir con cualquiera na na na na na Pa...,VAGABUNDO,"Sebastian Yatra, Manuel Turizo, Beï¿½ï"


In [5]:
def get_string_jaccard(s1, s2):
    m1, m2 = MinHash(), MinHash()
    for d in s1.split():
        m1.update(d.encode('utf8'))
    for d in s2.split():
        m2.update(d.encode('utf8'))
    return m1.jaccard(m2)
spotify_ids = []
spotify_id_confidence = []
# Find the song in the kaggle dataset with the best jaccard match
for i, lyrics_row in kaggle_lyrics_df.reset_index(drop=True).iterrows():
    print(i, end='\r')
    closest_match = -1
    closest_value = -1
    all_close_vals = []
    for j, kaggle_row in kaggle_df.iterrows():
        current_value = get_string_jaccard(kaggle_row['track_name'], lyrics_row['title']) * \
            get_string_jaccard(kaggle_row['artist(s)_name'], lyrics_row['artist'])
        if current_value > closest_match:
            closest_match = j
            closest_value = current_value
        if current_value > 0.25:
            all_close_vals.append((current_value, j))
    if closest_value < 0.25:
        spotify_ids.append(np.nan)
        spotify_id_confidence.append(np.nan)
    else:
        spotify_ids.append(kaggle_df.iloc[closest_match]['track_id'])
        spotify_id_confidence.append(closest_value)

917

In [6]:
kaggle_lyrics_df.insert(0, 'sid', spotify_ids)
kaggle_lyrics_df['sid_confidence'] = spotify_id_confidence

In [7]:
kaggle_lyrics_df = kaggle_lyrics_df.dropna(subset='sid')

## Merge kaggle with playlist tracks

In [8]:
playlist_lyrics_df = pd.read_csv('../data/lyrics_nlp/playlist_lyrics.csv')
playlist_lyrics_df['lyrics'][0]

'50 ContributorsTranslationsPortuguêsEspañolDeutschFrançaisItaliano\u200bgreedy Lyrics[Intro]\n(Woo)\n\n[Verse 1]\nHe said, "Are you serious? I\'ve tried, but I can\'t figure out\nI\'ve been next to you all night and still don\'t know what you\'re about\nYou keep ta- (Ta-ta-), talkin\', but not much comin\' out your mouth\nCan\'t you tell that I want you?", I say, yeah\n[Chorus]\nI would want myself\nBaby, please believe me\nI\'ll put you through hell\nJust to know me, yeah, yeah\nSo sure of yourself\nBaby, don\'t get greedy\nThat shit won\'t end well\n(No, it won\'t) End well\n\n[Post-Chorus]\n(Uh-uh, uh-uh-uh, uh-uh, woo)\n\n[Verse 2]\nI see you eyein\' me down, but you\'ll never know much past my name\nOr how I\'m runnin\' this room around and that I\'m still half your age\nYeah, you\'re loo- (Loo-loo-), lookin\' at me like I\'m some sweet escape\nObvious that you want me, but I said\nSee Tate McRae LiveGet tickets as low as $11You might also like[Chorus]\nI would want myself\nBaby,

In [9]:
kaggle_lyrics_df['dataset'] = 'kaggle'

In [10]:
playlist_lyrics_df = playlist_lyrics_df.rename({'g_title': 'title', 'g_artist': 'artist'}, axis=1)
playlist_lyrics_df['dataset'] = 'spotify_playlists'
playlist_lyrics_df['lyrics'] = playlist_lyrics_df['lyrics'].apply(
    lambda x: re.sub(r'[\n ]+', ' ', re.sub(r'[^a-zA-Z\d\']+', ' ', re.sub(r'(.*Lyrics)|(\[.*\])', '', x)))
)
print(len(playlist_lyrics_df))
playlist_tracks_df = pd.read_csv('../data/etl_integration/playlist_tracks.csv').rename({'name': 'spotify_track_name', 'artists': 'spotify_artists'}, axis=1)
playlist_lyrics_df = playlist_lyrics_df.merge(playlist_tracks_df, left_on='sid', right_on='spotify_id')
print(len(playlist_lyrics_df))

1184
1184


## Get genius lyrics for playlist tracks with jaccard score >= 0.25

In [11]:
spotify_id_confidence = []
playlist_lyrics_df['spotify_artists'] = playlist_lyrics_df['spotify_artists'].apply(lambda x: ', '.join(eval(x).values()))
for i, playlist_lyrics_row in playlist_lyrics_df.iterrows():
    confidence = get_string_jaccard(playlist_lyrics_row['title'], playlist_lyrics_row['spotify_track_name']) * \
                 get_string_jaccard(playlist_lyrics_row['artist'], playlist_lyrics_row['spotify_artists'])
    spotify_id_confidence.append(confidence)
playlist_lyrics_df['sid_confidence'] = spotify_id_confidence
playlist_lyrics_df = playlist_lyrics_df[playlist_lyrics_df['sid_confidence'] >= 0.25].copy()

In [12]:
lyrics_df = pd.concat([kaggle_lyrics_df[['sid', 'title', 'artist', 'lyrics', 'sid_confidence', 'dataset']].reset_index(drop=True),
                       playlist_lyrics_df[['sid', 'title', 'artist', 'lyrics', 'sid_confidence', 'dataset']].reset_index(drop=True)])\
                       .reset_index(drop=True)
lyrics_df

Unnamed: 0,sid,title,artist,lyrics,sid_confidence,dataset
0,5u5rY87oaj6Tk8DL4HxQqy,LALA,Myke Towers,La la la la la la Full Harmony La la la la la...,1.000000,kaggle
1,1BxfuPKGuaTgP7aM0Bbdwr,Cruel Summer,Taylor Swift,Yeah yeah yeah yeah Fever dream high in the q...,1.000000,kaggle
2,2FDTHlrBguDzQkp7PVj16Q,Sprinter,Dave & Central Cee,The mandem too inconsiderate five star hotel ...,0.382812,kaggle
3,3qQbCzHBycnDpGskqOWY0E,Ella Baila Sola,Eslabon Armado & Peso Pluma,Compa qu le parece esa morra La que anda bail...,0.546875,kaggle
4,6XbtvPmIpyCbjuT0e8cQtp,Columbia,Quevedo,Eh oh oh oh Oh oh Volvi de estudiar en Columb...,1.000000,kaggle
...,...,...,...,...,...,...
1249,4h3KlpOEXS6FxIpab6EKlf,INCOMING,MC ORSEN,MC ORSEN Like This Project Pat don't give a f...,1.000000,spotify_playlists
1250,0SzDVyiiUDpuhzrphnoejr,North Memphis,Pharmacist,Pharmacist motherfucker Project Pat don't giv...,1.000000,spotify_playlists
1251,1vyg1TIfJK409mLL6LaxeG,AUTOMOTIVO ANGELICAL V4,DJ ZK3,o DJ ZK3 o brabo da putaria Vai DJ do baile Q...,1.000000,spotify_playlists
1252,4oMyggIzClkOcCTvotFLkP,Why Not,Ghostface Playa,Oh my dog Whassup Ha ha ha ha Ah shit You mig...,1.000000,spotify_playlists


## TextRank

In [13]:
# from pytextrank import top_keywords_sentences
# for song_lyrics in lyrics_df['lyrics']:
#     sentence, keywords, graph, ranks, norm_rank_list, kernel = top_keywords_sentences(song_lyrics,
#                 stopwords=stopwords_eng, phrase_limit=15)
#     print(keywords)
#     print(ranks)
#     break
# DOESNT WORK: top_keywords_sentences is someone's random code they forked and added and I couldn't import it

In [14]:
all_phrases = []
for song_lyrics in lyrics_df['lyrics']:
    doc = nlp(song_lyrics)
    top_15_phrases = doc._.phrases[:15] if len(doc._.phrases) >= 15 else doc._.phrases
    top_phrases = [(phrase.text, phrase.rank) for phrase in doc._.phrases[:15]]
    all_phrases.append(top_phrases)
lyrics_df['phrases_textrank'] = all_phrases
lyrics_df['phrases_textrank']

0       [(La la la la la la, 0.3602452716030151), (La ...
1       [(secrets, 0.08582506418705997), (summer, 0.07...
2       [(broke bitches man book, 0.054744594951693315...
3       [(un vato que tiene, 0.0957735242566446), (y l...
4       [(la USA y, 0.09031657360992865), (En llamada ...
                              ...                        
1249    [(Project Pat Project Project Project Pat, 0.4...
1250    [(North Memphis North Memphis, 0.2206389726612...
1251    [(Machuca machuca Machuca machuca, 0.166450333...
1252    [(Whassup Ha, 0.13156729122949154), ( Oh my do...
1253    [(Bi bi bi bi bi bi bi bi bi bi bi bi bi bi bi...
Name: phrases_textrank, Length: 1254, dtype: object

## RAKE

In [15]:
rake = Rake()

In [16]:
all_phrases = []
for song_lyrics in lyrics_df['lyrics']:
    keywords = rake.apply(song_lyrics)
    top_phrases = keywords[:15]
    all_phrases.append(top_phrases)
lyrics_df['phrases_rake'] = all_phrases
lyrics_df['phrases_rake']

0       [(e' inevitable beb, 9.0), (t quieres mami, 8....
1       [(baby comin' home, 9.0), (dice angels roll, 8...
2       [(tokyo drift 'cause, 9.0), (tap dance bap, 9....
3       [(anda bailando sola, 9.0), (verbo tomamos tra...
4       [(mirada sonrisita nerviosa, 9.0), (amo haci n...
                              ...                        
1249    [(fuck project patyou, 8.928571428571429), (fu...
1250    [(fu project pat, 9.0), (sum' gonna smoke, 9.0...
1251            [(vem me comer, 9.0), (quatro bota, 4.0)]
1252                                                   []
1253                                                   []
Name: phrases_rake, Length: 1254, dtype: object

## LDA

In [17]:
# Convert to list
data = lyrics_df.lyrics.values.tolist()

In [18]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence)))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

In [19]:
# Define functions for stopwords and lemmatization
def remove_stopwords(texts):
    return [[word for word in doc if word not in stopwords_all] for doc in texts]


def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [20]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [21]:
lyrics_df['keywords'] = pd.Series(data_lemmatized).apply(lambda x: list(zip(pd.Series(x).value_counts().index, pd.Series(x).value_counts()))[:15])

In [22]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Frequency list   
corpus = [id2word.doc2bow(text) for text in texts]

In [23]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [24]:
document_topics = []
for row in corpus:
    document_topics.append(lda_model.get_document_topics(row))

In [25]:
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word)

In [26]:
vis

In [27]:
lyrics_df['topics'] = document_topics

In [28]:
lyrics_df[lyrics_df['dataset'] == 'spotify_playlists'].to_csv('../data/lyrics_nlp/playlist_lyrics_keywords_phrases_topics.csv')

In [29]:
lyrics_df[lyrics_df['dataset'] == 'kaggle'].to_csv('../data/lyrics_nlp/kaggle_lyrics_keywords_phrases_topics.csv')

In [30]:
topic_terms = lda_model.show_topics(num_topics=20, num_words=15, formatted=False)
topic_terms_arr = []
for row in topic_terms:
    row_to_add = []
    row_to_add.append(row[0])
    for term in row[1]:
        row_to_add.append(term[0])
        row_to_add.append(term[1])
    topic_terms_arr.append(row_to_add)
topic_terms_cols = ['topic_id']
for i in range(1, 16):
    topic_terms_cols.append(f'term_{i}_word')
    topic_terms_cols.append(f'term_{i}_probability')
topic_terms_df = pd.DataFrame(topic_terms_arr, columns=topic_terms_cols)

In [31]:
topic_terms_df.to_csv('../data/lyrics_nlp/lyrics_topics_terms.csv')

### node_keywords

In [32]:
all_keywords = set()
for i in range(1, 16):
    all_keywords.update(set(list(topic_terms_df[f'term_{i}_word'].unique())))
node_keywords_df = pd.Series(list(all_keywords))
node_keywords_df.to_csv('../data/lyrics_nlp/node_keywords.csv')

### edge_track_keyword

In [33]:
track_keywords = []
for i, row in lyrics_df.drop_duplicates(subset='sid').iterrows():
    for keyword, count in row['keywords']:
        track_keywords.append([row['sid'], keyword, count])
track_keywords_df = pd.DataFrame(track_keywords)
track_keywords_df.to_csv('../data/lyrics_nlp/track_keyword_occurences.csv')

In [34]:
topic_terms_df = topic_terms_df.replace({'bitch': 'b****', 'fuck': 'f***', 'nigga': 'n****', 'pussy': 'p****', 'niggas': 'n*****',
                                         'damn': 'd***', 'shit': 's***'})

## Note that these topic names are not accurate anymore because I reran the notebook and the topics changed

In [35]:
topic_terms_df.insert(0, 'Topics', ['Explicit', 'Romantic Country', 'Spanish/time of day?', 'Heartbreak', 'Aggressive Romance', 'Crime',
                                    'General verbs', 'Movement', 'Alcohol', 'Breakup/Leaving', 'Magical/Fantasy', 'Sensual Spanish',
                                    'Mad at boy', 'Nostalgia', 'Physical Appearance', 'Hip Hop Romance', 'Wistful Romance',
                                    'Upbeat Dance', 'Rebellious', 'Weather'])

In [36]:
topic_terms_df['Percentage_documents'] = [
    4.7, 2.9, 2.2, 2.6, 7.2, 1.5, 26.1, 5.3, 1.0, 9.2, 1.0, 0.7, 1.7, 16.4, 3.3, 2.9, 7.5, 1.5, 1.2, 1.3
]

In [37]:
topic_terms_df[['Topics'] + [f'term_{i}_word' for i in range(1, 11)] + ['Percentage_documents']].sort_values(by='Percentage_documents', ascending=False)

Unnamed: 0,Topics,term_1_word,term_2_word,term_3_word,term_4_word,term_5_word,term_6_word,term_7_word,term_8_word,term_9_word,term_10_word,Percentage_documents
6,General verbs,game,mean,burn,watch,set,money,learn,deep,world,memory,26.1
13,Nostalgia,away,wish,walk,run,drive,fly,heart,remember,know,leave,16.4
9,Breakup/Leaving,ride,feel,road,free,hour,today,moon,high,pocket,pass,9.2
16,Wistful Romance,love,baby,night,light,bring,crazy,sweet,put,feel,back,7.5
4,Aggressive Romance,back,give,shake,big,put,play,break,lookin,hell,truck,7.2
7,Movement,lonely,quick,question,wide,force,awake,meet,linger,wonderin,conversation,5.3
0,Explicit,b****,f***,n****,s***,hoe,gon,p****,n*****,money,bout,4.7
14,Physical Appearance,girl,get,boy,real,s***,hit,pull,high,d***,slow,3.3
15,Hip Hop Romance,make,dance,jump,body,pop,water,summer,moment,wave,fight,2.9
1,Romantic Country,comin,different,easy,safe,tonight,paro,numb,frente,nunca,barrio,2.9


## Utility function to look at the top songs for each topic to help decide on topic names

In [38]:
def check_for_topic(x, topic_check):
    for topic, prob in x:
        if topic == topic_check:
            return prob
    return np.nan
for topic in range(20):
    topic_x_probs = lyrics_df['topics'].apply(check_for_topic, args=(topic,))
    topic_x_probs_top5 = topic_x_probs[topic_x_probs.isin(topic_x_probs.sort_values(ascending=False).iloc[:5])]
    print(topic)
    print(lyrics_df.loc[topic_x_probs_top5.index, ['title', 'artist']])

0
          title    artist
24     AMARGURA   KAROL G
503         FAN    Offset
659   Mil Veces    Anitta
1076   AMARGURA   KAROL G
1249   INCOMING  MC ORSEN
1
                 title        artist
0                 LALA   Myke Towers
178    About Damn Time         Lizzo
354    About Damn Time         Lizzo
472               LALA   Myke Towers
727  PALM OF YOUR HAND  Brandon Lake
2
                 title                      artist
13        MOJABI GHOST           Tainy & Bad Bunny
29          La Bachata               Manuel Turizo
40         El Merengue  Marshmello & Manuel Turizo
324             Yo Voy               Zion & Lennox
651  MI EX TENÍA RAZÓN                     KAROL G
3
                                  title                    artist
17                            Anti-Hero              Taylor Swift
561                           Anti-Hero              Taylor Swift
688                            Dreaming  Marshmello, P!nk & Sting
791   You Spin Me Round (Like a Record)     