This notebook aims at going further in the exploratory data analysis. We'll use more complex models, implemented in the gensim library to perform topic modelling and convert songs to vector embeddings, allowing us to look for similarities between songs and between artists.

In [61]:
import pandas as pd
import numpy as np
import gensim.models.word2vec as w2v
import multiprocessing
import os
import re
import pprint
import time
import sklearn.manifold
import matplotlib.pyplot as plt
import nltk
import plotly.graph_objs as go
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from ast import literal_eval

#nltk.download('stopwords')

In [None]:
repository = 'data/'
songs_df = pd.read_csv(repository + "genius_song_lyrics_tokenized.csv", header=0)
songs_df['tokens']=songs_df['tokens'].apply(literal_eval)
songs_df=songs_df.reset_index()
artist_df = pd.read_csv(repository + "genius_artists.csv", header=0)
artist_df['tokens']=artist_df['tokens'].apply(literal_eval)

# I - LATENT DIRICHLET ALLOCATION

In this section we will, perform a latent dirichlet allocation to extract the main themes from the genres and the artists lyrics.

In [63]:
def perform_lda_by_genre(num_topics=5, passes=10):
    """Performs LDA topic modeling for each genre in the DataFrame.

    Args:
        num_topics (int, optional): Number of topics to extract per genre. Defaults to 5.
        passes (int, optional): Number of passes through the corpus during training. Defaults to 50.

    Returns:
        dict: A dictionary where keys are genres and values are the corresponding LDA models.
    """

    genre_models = {}  # Initialize an empty dictionary to store the models

    # Iterate through each unique genre in the 'tag' column
    for genre in songs_df['tag'].unique():
        # Filter the DataFrame to include only songs of the current genre
        genre_df = songs_df[songs_df['tag'] == genre]

        # Create a dictionary of unique words in the genre's lyrics
        dictionary = Dictionary(genre_df['tokens'])

        # Create a corpus (bag-of-words representation) for LDA
        corpus = [dictionary.doc2bow(text) for text in genre_df['tokens']]

        # Train the LDA model for the current genre
        lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=passes)

        # Store the trained model in the dictionary
        genre_models[genre] = lda_model

        # Print the topics discovered for the genre
        print(f"Topics for genre: {genre}")
        topics = lda_model.print_topics(num_words=10)  # Get the top 10 words for each topic
        for topic in topics:
            print(topic)
        print("-" * 20)  # Separator

    return genre_models  # Return the dictionary of genre models

def perform_lda_by_artist(artist: str, num_topics=5):
    """Performs LDA topic modeling for a specific artist.

    Args:
        artist (str): The name of the artist.
        num_topics (int, optional): Number of topics to extract. Defaults to 5.

    Returns:
        gensim.models.ldamodel.LdaModel: The trained LDA model for the artist.
    """

    # Filter the DataFrame to include only songs by the specified artist
    artist_df = songs_df[songs_df['artist'] == artist]

    # Create a dictionary of unique words in the artist's lyrics
    dictionary = Dictionary(artist_df['tokens'])

    # Create a corpus (bag-of-words representation) for LDA
    corpus = [dictionary.doc2bow(text) for text in artist_df['tokens']]

    # Train the LDA model for the artist
    lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=50)

    # Print the topics discovered for the artist
    print(f"Topics for artist: {artist}")
    topics = lda_model.print_topics(num_words=10)  # Get the top 10 words for each topic
    for topic in topics:
        print(topic)
    print("-" * 20)  # Separator

    return lda_model  # Return the trained LDA model

In [66]:
songs_df

Unnamed: 0,index,title,tag,artist,year,lyrics,views,rank,tokens,valence
0,0,Shape of You,pop,Ed Sheeran,2017,[Verse 1]\nThe club isn't the best place to fi...,14569727,1,"[club, best, place, find, lover, bar, go, frie...",0.9993
1,1,​thank u next,pop,Ariana Grande,2018,[Verse 1]\nThought I'd end up with Sean\nBut h...,9072131,2,"[thought, end, sean, wa, match, wrote, song, r...",0.9996
2,2,Work,pop,Rihanna,2016,"[Chorus: Rihanna]\nWork, work, work, work, wor...",7871555,3,"[work, work, work, work, work, work, said, haf...",-0.9978
3,3,Too Good at Goodbyes,pop,Sam Smith,2017,[Verse 1]\nYou must think that I'm stupid\nYou...,7784454,4,"[must, think, stupid, must, think, fool, must,...",-0.9770
4,4,7 rings,pop,Ariana Grande,2019,"[Verse 1]\nYeah, breakfast at Tiffany's and bo...",7387794,5,"[yeah, breakfast, tiffani, bottl, bubbl, girl,...",0.9972
...,...,...,...,...,...,...,...,...,...,...
995,995,The Other,pop,Lauv,2015,[Verse 1]\nLike a spotlight the water hits me\...,464603,996,"[like, spotlight, water, hit, ran, extra, cold...",-0.9000
996,996,Remind Me to Forget,pop,Kygo,2018,"[Verse 1]\nIt never fades away, it's staying\n...",464450,997,"[never, fade, away, stay, kiss, like, broken, ...",-0.9518
997,997,IU - 에잇 eight ft. SUGA English Translation,pop,Genius English Translations,2020,[Verse 1: IU]\nSo are you happy now?\nFinally ...,464424,998,"[happi, final, happi, yeah, well, feel, like, ...",0.9861
998,998,Ive Been Waiting,pop,Lil Peep & iLoveMakonnen,2019,"[Intro: Lil Peep]\nWhoa, yeah\n\n[Verse 1: iLo...",464078,999,"[whoa, yeah, wait, wait, wait, 'caus, ca, get,...",-0.9969


In [67]:
artist_models = perform_lda_by_artist('Ed Sheeran')
genre_models = perform_lda_by_genre()

Topics for artist: Ed Sheeran
(0, '0.027*"know" + 0.017*"love" + 0.015*"wa" + 0.015*"look" + 0.014*"could" + 0.012*"babi" + 0.012*"girl" + 0.012*"\'\'" + 0.011*"beauti" + 0.009*"na"')
(1, '0.049*"come" + 0.040*"love" + 0.028*"oh" + 0.019*"bodi" + 0.018*"babi" + 0.013*"wa" + 0.011*"like" + 0.008*"day" + 0.008*"everi" + 0.008*"know"')
(2, '0.023*"love" + 0.020*"ooh" + 0.016*"know" + 0.011*"way" + 0.011*"habit" + 0.011*"bad" + 0.010*"one" + 0.010*"lead" + 0.009*"take" + 0.009*"find"')
(3, '0.025*"barcelona" + 0.019*"come" + 0.019*"border" + 0.016*"south" + 0.014*"like" + 0.013*"free" + 0.011*"water" + 0.011*"jump" + 0.010*"got" + 0.010*"happier"')
(4, '0.051*"need" + 0.032*"man" + 0.017*"know" + 0.013*"burn" + 0.012*"see" + 0.012*"na" + 0.011*"wan" + 0.010*"fire" + 0.009*"come" + 0.008*"back"')
--------------------
Topics for genre: pop
(0, '0.048*"yeah" + 0.031*"na" + 0.022*"like" + 0.022*"got" + 0.020*"babi" + 0.015*"wan" + 0.014*"know" + 0.012*"oh" + 0.011*"ai" + 0.011*"girl"')
(1, '0.

# II - SONGS TO VECTORS

We will now try to generate meaningful dense embeddings thanks to the word2vec model, that we will train. We'll start by tokenizing our lyrics, with a different and simpler function that in the previous notebook, since this time we want word conjugations, declinations ... and we don't want to get rid of any stop word, we want the largest possible vocabulary.

### II.1 - CREATING THE MODEL

In [68]:
def tokenize(lyrics: str) -> list[str]:
    """Tokenizes lyrics by removing section headers, separating symbols, and converting to lowercase.

    Args:
        lyrics (str): The input song lyrics.

    Returns:
        list[str]: A list of tokens (words and separated symbols).
    """
    # Remove section headers like [Verse 1], [Chorus], etc. using regular expressions
    lyrics = re.sub(r'\[.*?\]', '', lyrics)

    # Define a list of symbols to be separated
    symbols = [
    ".", ",", "!", "?", ";", ":",'"', "(", ")", "[", "]", "{", "}",
    "-", "–", "—", "_", "/", "\\",
    "&", "*", "@", "#", "%", "^", "°",
    "+", "=", "<", ">", "|", "~", "`",
    ]

    # Separate symbols by adding spaces around them. This ensures that 2 tokens
    # ("really" and "?") will be generated from "really?".
    for sym in symbols:
        lyrics = lyrics.replace(sym, f" {sym} ")

    # Convert lyrics to lowercase and split into tokens by whitespace
    return lyrics.lower().split()

# Apply the 'tokenize' function to the 'lyrics' column of the 'songs_df' DataFrame
# and store the resulting tokens in a new column named 'tokens'
songs_df['tokens'] = songs_df['lyrics'].apply(tokenize)
songs_df # Display the DataFrame with the new 'tokens' column

Unnamed: 0,index,title,tag,artist,year,lyrics,views,rank,tokens,valence
0,0,Shape of You,pop,Ed Sheeran,2017,[Verse 1]\nThe club isn't the best place to fi...,14569727,1,"[the, club, isn't, the, best, place, to, find,...",0.9993
1,1,​thank u next,pop,Ariana Grande,2018,[Verse 1]\nThought I'd end up with Sean\nBut h...,9072131,2,"[thought, i'd, end, up, with, sean, but, he, w...",0.9996
2,2,Work,pop,Rihanna,2016,"[Chorus: Rihanna]\nWork, work, work, work, wor...",7871555,3,"[work, ,, work, ,, work, ,, work, ,, work, ,, ...",-0.9978
3,3,Too Good at Goodbyes,pop,Sam Smith,2017,[Verse 1]\nYou must think that I'm stupid\nYou...,7784454,4,"[you, must, think, that, i'm, stupid, you, mus...",-0.9770
4,4,7 rings,pop,Ariana Grande,2019,"[Verse 1]\nYeah, breakfast at Tiffany's and bo...",7387794,5,"[yeah, ,, breakfast, at, tiffany's, and, bottl...",0.9972
...,...,...,...,...,...,...,...,...,...,...
995,995,The Other,pop,Lauv,2015,[Verse 1]\nLike a spotlight the water hits me\...,464603,996,"[like, a, spotlight, the, water, hits, me, ran...",-0.9000
996,996,Remind Me to Forget,pop,Kygo,2018,"[Verse 1]\nIt never fades away, it's staying\n...",464450,997,"[it, never, fades, away, ,, it's, staying, you...",-0.9518
997,997,IU - 에잇 eight ft. SUGA English Translation,pop,Genius English Translations,2020,[Verse 1: IU]\nSo are you happy now?\nFinally ...,464424,998,"[so, are, you, happy, now, ?, finally, happy, ...",0.9861
998,998,Ive Been Waiting,pop,Lil Peep & iLoveMakonnen,2019,"[Intro: Lil Peep]\nWhoa, yeah\n\n[Verse 1: iLo...",464078,999,"[whoa, ,, yeah, i've, been, waiting, (, i've, ...",-0.9969


In [69]:
# Initialize the Word2Vec model with specified parameters.
text_corpus = songs_df[songs_df['rank'] <= 120000]['tokens'].tolist()

min_word_count = 1
num_workers = multiprocessing.cpu_count()
context_size = 7
downsampling = 1e-1
seed = 1
num_features = 50

songs2vec = w2v.Word2Vec(
    sg=1,
    seed=seed,
    workers=num_workers,
    min_count=min_word_count,
    window=context_size,
    vector_size=num_features,
    sample=downsampling
)

songs2vec.build_vocab(text_corpus)


In [70]:
# Train the model on our dataset
start_time = time.time()

songs2vec.train(text_corpus, total_examples=songs2vec.corpus_count, epochs=1)

if not os.path.exists("trained"):
    os.makedirs("trained")

songs2vec.save(os.path.join("trained", "songs2vectors.w2v"))

print("--- %s seconds ---" % (time.time() - start_time))

--- 0.721052885055542 seconds ---


In [71]:
songs2vec = w2v.Word2Vec.load(os.path.join("trained", "songs2vectors.w2v"))

We have now trained our model, built a vocabulary and setting a vector embedding for each token in the vocaburay. Let's take a look a these embeddings to make sure they make sense.

In [72]:
songs2vec.wv.most_similar("love")

[('touch', 0.8919926285743713),
 ('tired', 0.8919310569763184),
 ('knows', 0.8898550868034363),
 ('songs', 0.8883442878723145),
 ('worth', 0.8877366185188293),
 ('lie', 0.8858993053436279),
 ('after', 0.8832060694694519),
 ('lies', 0.8821540474891663),
 ('trust', 0.8810770511627197),
 ('learn', 0.8806354403495789)]

In [73]:
songs2vec.wv.most_similar("bird")

[('dog', 0.9857643246650696),
 ('closet', 0.9834131002426147),
 ('flying', 0.9823722243309021),
 ('nicki', 0.982118546962738),
 ('thirsty', 0.9819675087928772),
 ('slightly', 0.9817320704460144),
 ('bond', 0.9814490675926208),
 ('handsome', 0.9811965823173523),
 ('plastic', 0.9809665083885193),
 ('crowded', 0.9809281826019287)]

As we can see, the embeddings seem rather reliable, according to the previous test. Our dataset is so large that even with trained with only 1 epoch, the models achieves satisfying performances.

### II.2 - COMPUTING SONGS EMBEDDINGS

Now that we have embeddings for words and know they work well, let's compute the song embeddings.

In [74]:
def songVector(song):
    """
    Computes a song embedding by averaging the word embeddings of its lyrics.

    Args:
        song (pd.Series): A row from the songs DataFrame representing a song.

    Returns:
        np.ndarray: The normalized song embedding vector.
    """
    
    # Get the tokenized lyrics of the song
    words = song['tokens']
    # Initialize an empty vector to store the song embedding
    vector_sum = np.zeros(songs2vec.vector_size)

    # Iterate through each word in the lyrics
    for word in words:
        vector_sum = vector_sum + songs2vec.wv[word]

    if len(words) > 0 :
      vector_sum = vector_sum / len(words)  # Normalize the vector sum

    # Reshape the vector sum to a 2D array
    vector_sum = vector_sum.reshape(1, -1)
    # Normalize the vector sum to unit length
    normalised_vector_sum = sklearn.preprocessing.normalize(vector_sum)
    # Return the normalized song embedding
    return normalised_vector_sum

# Apply the songVector function to each song in the DataFrame to create song embeddings
songs_df['song_vector'] = songs_df.apply(songVector, axis=1)
# Stack the song embeddings into a matrix
song_embeddings_matrix = np.stack(songs_df['song_vector'].apply(np.ravel).values)

# Create a temporary DataFrame with title, artist, and song vector
temp_df = songs_df[['title', 'artist', 'song_vector']]
# Flatten the song vectors in the temporary DataFrame
temp_df['song_vector'] = temp_df['song_vector'].apply(np.ravel)

# Group by artist and compute average embedding for each artist
artist_embeddings = (
    temp_df.groupby('artist')['song_vector']
    .apply(lambda vectors: np.mean(np.vstack(vectors), axis=0))
    .reset_index(name='artist_vector')
)

# Merge the artist embeddings with the artist DataFrame
artist_df = pd.merge(artist_df, artist_embeddings, on='artist', how='inner')
# Stack all artist vectors into a matrix
artist_embeddings_matrix = np.vstack(artist_df['artist_vector'].values)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [75]:
def most_similar_songs(song_title, top_n=5):
    """
    Given a song title, print the top N most similar songs based on embedding vectors.

    Parameters:
    - song_title: Title of the song to compare against.
    - top_n: Number of similar songs to return.

    Returns:
    - Nothing.
    """

    # Get the target song's embedding
    if song_title not in songs_df['title'].values:
        raise ValueError(f"Song '{song_title}' not found in the DataFrame.")

    target_embedding = songs_df[songs_df['title'] == song_title]['song_vector'].values[0].reshape(1, -1)

    # Compute cosine similarities
    similarities = cosine_similarity(target_embedding, song_embeddings_matrix)[0]
    final_df = songs_df.drop(columns=['tokens','song_vector'])
    # Add similarity scores to the DataFrame
    final_df['similarity'] = similarities

    # Exclude the target song itself and get the top N similar songs
    similar_songs = final_df[final_df['title'] != song_title].sort_values(by='similarity', ascending=False).head(top_n)
    print(f"Top {top_n} most similar songs to '{song_title}':")
    print("=" * 60)
    for i, row in similar_songs.iterrows():
        title = row['title']
        similarity = row['similarity']
        year = row['year']
        artist = row['artist']
        rank = row['rank']
        valence = row['valence']
        lyrics_snippet = row['lyrics'][:100] + '...'  # Show first 100 characters
        print(f"🎵 {title}")
        print(f"   Artist: {artist}")
        print(f"   Year: {year}")
        print(f"   Similarity: {similarity:.4f}")
        print(f"   Valence: {valence}")
        print(f"   Lyrics: {lyrics_snippet}")
        print("-" * 60)
    return

def most_similar_artists(artist_name, top_n=5):
    """
    Given an artist name, print the top N most similar artists based on averaged song embeddings.

    Parameters:
    - artist_name: Name of the artist to compare
    - top_n: Number of similar artists to return

    Returns:
    - Nothing
    """

    # Check if the artist exists
    if artist_name not in artist_df['artist'].values:
        raise ValueError(f"Artist '{artist_name}' not found.")

    # Get target vector
    target_vector = artist_df.loc[artist_df['artist'] == artist_name, 'artist_vector'].values[0]
    target_vector = target_vector.reshape(1, -1)

    # Compute cosine similarities
    similarities = cosine_similarity(target_vector, artist_embeddings_matrix)[0]
    df=artist_df.copy()
    # Attach similarities
    df['similarity'] = similarities

    # Remove the target artist and sort
    df = df[df['artist'] != artist_name] \
                .sort_values(by='similarity', ascending=False) \
                .head(top_n)
                
    print(f"Top {top_n} most similar artists to '{artist_name}':")
    print("=" * 60)
    for i, row in df.iterrows():
        print(f"   Artist: {row['artist']}")
        print(f"   Similarity: {row['similarity']:.4f}")
        print(f"   Valence: {row['valence']}")
        print(f"   Total views: {row['total_views']}")
        print("-" * 60)

    return


Let's play and find what are the closest songs/artists to other songs/artists according to their lyrics !


In [None]:
most_similar_songs("Love Again", top_n=5)

Top 5 most similar songs to 'Shape of You':
🎵 ​​breathin
   Artist: Ariana Grande
   Year: 2018
   Similarity: 0.9966
   Valence: 0.9224
   Lyrics: [Intro]
​​lacigam gnihtemos od oT
​​thgin laiceps ruoy s'thginot tuB

[Verse 1]
Some days, things ju...
------------------------------------------------------------
🎵 Luis Fonsi  Daddy Yankee - Despacito Remix ft. Justin Bieber English Translation
   Artist: Genius English Translations
   Year: 2017
   Similarity: 0.9966
   Valence: 0.9902
   Lyrics: [Intro: Justin Bieber]
Come on over in my direction
So thankful for that, it's such a blessin', yeah...
------------------------------------------------------------
🎵 Moonlight
   Artist: Ariana Grande
   Year: 2016
   Similarity: 0.9965
   Valence: 0.9488
   Lyrics: [Verse 1]
The sun is setting and you're right here by my side
And the movie is playing, but we won't...
------------------------------------------------------------
🎵 West Coast
   Artist: Lana Del Rey
   Year: 2014
   Similarity: 

In [None]:
most_similar_artists("The Doors", top_n=5)

Top 5 most similar artists to 'Ed Sheeran':
   Artist: Taylor Swift
   Similarity: 0.9993
   Valence: 0.4766114457831326
   Total views: 77642721
------------------------------------------------------------
   Artist: One Direction
   Similarity: 0.9993
   Valence: 0.4559825
   Total views: 15999395
------------------------------------------------------------
   Artist: Halsey
   Similarity: 0.9991
   Valence: 0.1561876288659793
   Total views: 21476837
------------------------------------------------------------
   Artist: Ed Sheeran & Beyonc
   Similarity: 0.9990
   Valence: 0.9991
   Total views: 1949672
------------------------------------------------------------
   Artist: Kanye West
   Similarity: 0.9989
   Valence: 0.1738341463414634
   Total views: 159846342
------------------------------------------------------------


### II.3 REDUCING DIMENSIONALITY
Reducing dimensionality is crucial for making sense of complex data like song embeddings. t-SNE works really well for this because it keeps similar songs close together when mapping everything into a simpler, low-dimensional space. This makes it easier to visualize data, spot patterns, find outliers, and group songs based on lyrics, which is great for things like music recommendations or building playlists. Let's dive into it!

In [79]:
# Stack all song vectors (song embeddings) vertically into a matrix 'X'
X = np.vstack(songs_df['song_vector'].values)
# X = song_embeddings_matrix  # Alternative: Use the previously created song embeddings matrix

# Record the start time to measure the execution time of t-SNE
start_time = time.time()

# Initialize t-SNE with specified parameters:
# - n_components: Reduce dimensionality to 2 components (for visualization)
# - n_iter: Maximum number of iterations for optimization
# - random_state: Seed for reproducibility
# - verbose: Control the verbosity of the output
tsne = sklearn.manifold.TSNE(n_components=2, n_iter=250, random_state=0, verbose=2)

# Apply t-SNE to reduce the dimensionality of the song embeddings to 2D
# and store the results in 'all_word_vectors_matrix_2d'
all_word_vectors_matrix_2d = tsne.fit_transform(X)

# Print the time taken for t-SNE to complete
print("--- %s seconds ---" % (time.time() - start_time))

# Create a DataFrame 'wv_df' from the 2D song embeddings, with columns 'X' and 'Y'
wv_df = pd.DataFrame(all_word_vectors_matrix_2d, columns=['X', 'Y'])
# Reset the index of 'wv_df' and drop the old index
wv_df.reset_index(drop=True, inplace=True)

# Concatenate the original 'songs_df' with the 2D embeddings ('wv_df')
# along the columns (axis=1) to create a new DataFrame 'two_dimensional_songs'
two_dimensional_songs = pd.concat([songs_df.reset_index(), wv_df], axis=1)
# Group by artist and calculate the average X and Y coordinates
artist_coordinates = two_dimensional_songs.groupby('artist')[['X', 'Y']].mean().reset_index()
artist_df=pd.merge(artist_df, artist_coordinates, on='artist', how='inner')


# Display the 'two_dimensional_songs' DataFrame
two_dimensional_songs

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 1000 samples in 0.005s...
[t-SNE] Computed neighbors for 1000 samples in 0.101s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1000
[t-SNE] Mean sigma: 0.034771
[t-SNE] Computed conditional probabilities in 0.056s



'n_iter' was renamed to 'max_iter' in version 1.5 and will be removed in 1.7.



[t-SNE] Iteration 50: error = 66.3671417, gradient norm = 0.0114263 (50 iterations in 0.215s)
[t-SNE] Iteration 100: error = 66.1180115, gradient norm = 0.0012524 (50 iterations in 0.190s)
[t-SNE] Iteration 150: error = 66.1148834, gradient norm = 0.0014106 (50 iterations in 0.182s)
[t-SNE] Iteration 200: error = 66.1146164, gradient norm = 0.0015127 (50 iterations in 0.194s)
[t-SNE] Iteration 250: error = 66.1159439, gradient norm = 0.0070701 (50 iterations in 0.191s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 66.115944
[t-SNE] KL divergence after 251 iterations: 179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.000000
--- 1.2213099002838135 seconds ---


Unnamed: 0,level_0,index,title,tag,artist,year,lyrics,views,rank,tokens,valence,song_vector,X,Y
0,0,0,Shape of You,pop,Ed Sheeran,2017,[Verse 1]\nThe club isn't the best place to fi...,14569727,1,"[the, club, isn't, the, best, place, to, find,...",0.9993,"[[-0.09835523530811562, -0.017306410342092945,...",0.038348,0.000199
1,1,1,​thank u next,pop,Ariana Grande,2018,[Verse 1]\nThought I'd end up with Sean\nBut h...,9072131,2,"[thought, i'd, end, up, with, sean, but, he, w...",0.9996,"[[-0.10671016169451913, 0.010760123867523366, ...",-0.147135,-0.000143
2,2,2,Work,pop,Rihanna,2016,"[Chorus: Rihanna]\nWork, work, work, work, wor...",7871555,3,"[work, ,, work, ,, work, ,, work, ,, work, ,, ...",-0.9978,"[[-0.10575750227304223, 0.02552068677646066, 0...",-0.197395,-0.000298
3,3,3,Too Good at Goodbyes,pop,Sam Smith,2017,[Verse 1]\nYou must think that I'm stupid\nYou...,7784454,4,"[you, must, think, that, i'm, stupid, you, mus...",-0.9770,"[[-0.10453970273780919, 0.012245111703351899, ...",0.195204,0.000384
4,4,4,7 rings,pop,Ariana Grande,2019,"[Verse 1]\nYeah, breakfast at Tiffany's and bo...",7387794,5,"[yeah, ,, breakfast, at, tiffany's, and, bottl...",0.9972,"[[-0.10839095524756127, 0.01623228222604399, 0...",-0.037486,-0.000702
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,995,The Other,pop,Lauv,2015,[Verse 1]\nLike a spotlight the water hits me\...,464603,996,"[like, a, spotlight, the, water, hits, me, ran...",-0.9000,"[[-0.08581113506816736, 0.0043553994740125725,...",0.289683,0.000577
996,996,996,Remind Me to Forget,pop,Kygo,2018,"[Verse 1]\nIt never fades away, it's staying\n...",464450,997,"[it, never, fades, away, ,, it's, staying, you...",-0.9518,"[[-0.09686662686743817, 0.0037551854024499326,...",0.143470,0.000372
997,997,997,IU - 에잇 eight ft. SUGA English Translation,pop,Genius English Translations,2020,[Verse 1: IU]\nSo are you happy now?\nFinally ...,464424,998,"[so, are, you, happy, now, ?, finally, happy, ...",0.9861,"[[-0.07639461211044563, -0.004941656075038006,...",-0.077258,-0.000065
998,998,998,Ive Been Waiting,pop,Lil Peep & iLoveMakonnen,2019,"[Intro: Lil Peep]\nWhoa, yeah\n\n[Verse 1: iLo...",464078,999,"[whoa, ,, yeah, i've, been, waiting, (, i've, ...",-0.9969,"[[-0.11244546656724476, 0.00023399112343966015...",0.233445,0.000447


In [80]:
# Optional: seed for consistent colors
np.random.seed(42)

# Normalize views to a reasonable range for marker sizes
views = two_dimensional_songs['views']
min_size = 5
max_size = 30

# Normalize view counts to marker sizes between min_size and max_size
normalized_sizes = min_size + (views - views.min()) / (views.max() - views.min()) * (max_size - min_size)

fig = go.Figure(data=go.Scatter(
    x = two_dimensional_songs['X'],
    y = two_dimensional_songs['Y'],
    text = two_dimensional_songs['artist'] + " - " + two_dimensional_songs['title'] + "<br>Views: " + views.astype(str),
    mode='markers',
    marker=dict(
        size=normalized_sizes,
        color=np.random.randn(len(two_dimensional_songs)),  # random color for now
        colorscale='Viridis',
        showscale=True
    )
))

fig.update_layout(
    title='2D map of songs (Point size = Views)',
    xaxis_title='X',
    yaxis_title='Y',
    template='plotly_white'
)

fig.show()

In [81]:
fig = px.scatter(two_dimensional_songs, x='X', y='Y',color='tag', title='2D map of songs by genre', hover_data=['title'])
fig.show()

In [82]:
#Plotting the songs of top 10 artists
top_artists = artist_df.head(10)['artist'].to_list()
filtered_df = two_dimensional_songs[two_dimensional_songs['artist'].isin(top_artists)]

fig = px.scatter(filtered_df, x='X', y='Y', color='artist',
                 title='2D map of songs by artists',hover_data=['title'])

fig.show()


In [83]:
# Optional: seed for consistent colors
np.random.seed(42)

# Normalize views to a reasonable range for marker sizes
views = artist_df['total_views']
min_size = 5
max_size = 30

# Normalize view counts to marker sizes between min_size and max_size
normalized_sizes = min_size + (views - views.min()) / (views.max() - views.min()) * (max_size - min_size)

fig = go.Figure(data=go.Scatter(
    x = two_dimensional_songs['X'],
    y = two_dimensional_songs['Y'],
    text = two_dimensional_songs['artist'] + " - " + "<br>Views: " + views.astype(str)+ "<br>Valence: " + two_dimensional_songs['valence'].astype(str),
    mode='markers',
    marker=dict(
        size=normalized_sizes,
        color=np.random.randn(len(two_dimensional_songs)),  # random color for now
        colorscale='Viridis',
        showscale=True
    )
))

fig.update_layout(
    title='2D map of artists (Point size = Views)',
    xaxis_title='X',
    yaxis_title='Y',
    template='plotly_white'
)

fig.show()