In [1]:
pip install fasttext



In [2]:
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import fasttext
import fasttext.util

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
#fasttext.util.download_model('en', if_exists='ignore')  # It takes really long time
#ft = fasttext.load_model(cc.en.300.bin)
!gunzip /content/drive/MyDrive/cc.en.300.bin.gz
model_path = '/content/drive/MyDrive/cc.en.300.bin'
ft = fasttext.load_model(model_path)

gzip: /content/drive/MyDrive/cc.en.300.bin.gz: No such file or directory




In [5]:
df = pd.read_csv("./song_lyrics_sampled_proc.csv")
print('Data rows number: ', len(df))
df.head()

Data rows number:  3919


Unnamed: 0,title,tag,artist,year,lyrics,id,lyrics_proc,lyrics_num_tks
0,Toothpick,pop,Biting Elbows,2012,Some folks got the patience of the angels\nNot...,1166787,folk got patience angel heart well yearns veng...,169
1,6 Feet Under,pop,Ana Johnsson,2004,You just left me 6 feet under ground I'm burni...,803057,left foot ground burning sight light foot buri...,100
2,The Poetaster Act 4. Scene 2,misc,Ben Jonson,1601,"A Room in Lupus's House.\n\nEnter Lupus, HISTR...",674438,room house enter lupus histrio lictors tuc sta...,170
3,Hes Gone,pop,Phil Lesh & Friends,2015,"Rat in a drain ditch, caught on a limb, you kn...",961823,rat drain ditch caught limb know better know l...,118
4,Ill Never Say,pop,Helen Ward,2015,"I'll never say ""never again"" again\nCause here...",1163619,never say never cause love head heel love neve...,35


In [6]:
def embed_lyrics_batch(lyrics_batch, model):
    embeddings = []
    for text in lyrics_batch:
        embedding = model.get_sentence_vector(text)
        embeddings.append(embedding)
    return np.array(embeddings)

# Load the preprocessed lyrics from the CSV file
df = pd.read_csv('./song_lyrics_sampled_proc.csv')
lyrics_list = df['lyrics_proc'].tolist()

batch_size = 1000 # have to do it in batches or it exceeds the resources available on colab
all_embeddings = []

for i in range(0, len(lyrics_list), batch_size):
    lyrics_batch = lyrics_list[i:i+batch_size]
    batch_embeddings = embed_lyrics_batch(lyrics_batch, ft)
    all_embeddings.extend(batch_embeddings)

# Convert the list of embeddings to a single array
embeddings_array = np.array(all_embeddings)

print(embeddings_array[:10])
print(f"Embeddings Array Shape: {embeddings_array.shape}")



[[-0.01416508  0.01029913  0.02259598 ...  0.0665962   0.00952111
  -0.01773393]
 [-0.01170727  0.00986252  0.03055934 ...  0.07118364  0.03680818
   0.00289707]
 [ 0.00380389 -0.00256328  0.01504422 ...  0.06606562  0.00535535
  -0.01890685]
 ...
 [-0.0086156   0.02556102  0.01028561 ...  0.06015038  0.00014415
  -0.00983291]
 [-0.00287815  0.03230715  0.02522252 ...  0.06652657 -0.01404283
   0.01453253]
 [ 0.00650915  0.00795878  0.00393557 ...  0.06554093 -0.01665118
  -0.02651232]]
Embeddings Array Shape: (3919, 300)


In [7]:
# Add embeddings to the original DataFrame as a single column
df['fastText_embeddings'] = embeddings_array.tolist()
df.head()

Unnamed: 0,title,tag,artist,year,lyrics,id,lyrics_proc,lyrics_num_tks,fastText_embeddings
0,Toothpick,pop,Biting Elbows,2012,Some folks got the patience of the angels\nNot...,1166787,folk got patience angel heart well yearns veng...,169,"[-0.014165075495839119, 0.010299132205545902, ..."
1,6 Feet Under,pop,Ana Johnsson,2004,You just left me 6 feet under ground I'm burni...,803057,left foot ground burning sight light foot buri...,100,"[-0.011707272380590439, 0.009862515144050121, ..."
2,The Poetaster Act 4. Scene 2,misc,Ben Jonson,1601,"A Room in Lupus's House.\n\nEnter Lupus, HISTR...",674438,room house enter lupus histrio lictors tuc sta...,170,"[0.003803892759606242, -0.002563277492299676, ..."
3,Hes Gone,pop,Phil Lesh & Friends,2015,"Rat in a drain ditch, caught on a limb, you kn...",961823,rat drain ditch caught limb know better know l...,118,"[0.002516944194212556, 0.018001064658164978, 0..."
4,Ill Never Say,pop,Helen Ward,2015,"I'll never say ""never again"" again\nCause here...",1163619,never say never cause love head heel love neve...,35,"[0.036130391061306, 0.02793777920305729, 0.020..."


In [8]:
def find_optimal_pca_dimensions(df, embeddings_column, dimensions, n_clusters=5):

    # Extract the embeddings from the DataFrame
    embeddings = np.stack(df[embeddings_column].values)

    # Apply Standard Scaler
    scaler = StandardScaler()
    scaled_embeddings = scaler.fit_transform(embeddings)

    results = []
    for dim in dimensions:
        # Apply PCA
        pca = PCA(n_components=dim)
        reduced_embeddings = pca.fit_transform(scaled_embeddings)

        # Cluster the reduced embeddings
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        cluster_labels = kmeans.fit_predict(reduced_embeddings)

        # Compute the Silhouette score
        silhouette_avg = silhouette_score(reduced_embeddings, cluster_labels)

        # Calculate variance preserved
        variance_preserved = np.sum(pca.explained_variance_ratio_)

        # Append results
        results.append({
            'Dimensions': dim,
            'Silhouette Score': silhouette_avg,
            'Variance Preserved': variance_preserved
        })

    return pd.DataFrame(results)

dimensions = [2,3,5,10,15,20,35,50]

table = find_optimal_pca_dimensions(df,'fastText_embeddings',dimensions)
table


Unnamed: 0,Dimensions,Silhouette Score,Variance Preserved
0,2,0.342504,0.19275
1,3,0.263345,0.238242
2,5,0.19275,0.310114
3,10,0.136945,0.413049
4,15,0.115349,0.478027
5,20,0.103483,0.526772
6,35,0.086135,0.630767
7,50,0.078454,0.700414


In [9]:
def find_best_tradeoff(results_df):

    # Calculate the absolute difference between the normalized metrics
    results_df['Difference'] = np.abs(results_df['Silhouette Score'] - results_df['Variance Preserved'])

    # Find the dimension with the minimum difference
    best_dimension = results_df.loc[results_df['Difference'].idxmin(), 'Dimensions']

    return best_dimension, results_df

best_dim, _ = find_best_tradeoff(table)
print("The best dimension is: ", best_dim)

# Extract the embeddings from the DataFrame
embeddings = np.stack(df['fastText_embeddings'].values)

# Apply Standard Scaler
scaler = StandardScaler()
scaled_embeddings = scaler.fit_transform(embeddings)

# Apply PCA
pca = PCA(n_components=best_dim)
reduced_embeddings = pca.fit_transform(scaled_embeddings)

# Add reduced embedding as new column
df['reduced_embeddings'] = list(reduced_embeddings)
df

The best dimension is:  3


Unnamed: 0,title,tag,artist,year,lyrics,id,lyrics_proc,lyrics_num_tks,fastText_embeddings,reduced_embeddings
0,Toothpick,pop,Biting Elbows,2012,Some folks got the patience of the angels\nNot...,1166787,folk got patience angel heart well yearns veng...,169,"[-0.014165075495839119, 0.010299132205545902, ...","[-0.4237052606380154, -0.16960401615530912, -3..."
1,6 Feet Under,pop,Ana Johnsson,2004,You just left me 6 feet under ground I'm burni...,803057,left foot ground burning sight light foot buri...,100,"[-0.011707272380590439, 0.009862515144050121, ...","[-4.663875573183138, -9.001501984756297, -9.33..."
2,The Poetaster Act 4. Scene 2,misc,Ben Jonson,1601,"A Room in Lupus's House.\n\nEnter Lupus, HISTR...",674438,room house enter lupus histrio lictors tuc sta...,170,"[0.003803892759606242, -0.002563277492299676, ...","[-9.429142453813633, 0.8906473712519875, 5.976..."
3,Hes Gone,pop,Phil Lesh & Friends,2015,"Rat in a drain ditch, caught on a limb, you kn...",961823,rat drain ditch caught limb know better know l...,118,"[0.002516944194212556, 0.018001064658164978, 0...","[-0.3746409127368931, 2.7309582239840204, -7.9..."
4,Ill Never Say,pop,Helen Ward,2015,"I'll never say ""never again"" again\nCause here...",1163619,never say never cause love head heel love neve...,35,"[0.036130391061306, 0.02793777920305729, 0.020...","[7.851380379311003, -0.8563798233857219, 5.518..."
...,...,...,...,...,...,...,...,...,...,...
3914,I Think About You,pop,David Devant & His Spirit Wife,1997,Now get this clear\nI'm being sincere\nYour bi...,935693,get clear sincere biggest fan also ran someone...,103,"[-0.009474361315369606, 0.008337694220244884, ...","[4.77995370110509, -1.1205131626892788, 1.3503..."
3915,About Face,rock,Have Heart,2006,Fucked it up again but I'm not looking the oth...,1244191,fucked looking way see error know wrong going ...,61,"[-0.0020460819359868765, -0.002099560108035803...","[9.763049977820986, -1.7150736690525434, 1.082..."
3916,Let Em Know,pop,Grade 8,2003,Spit shit right into my ear kid\nI don't wanna...,924470,spit shit right ear kid wan hear look alive fe...,167,"[-0.003494808217510581, 0.014166220091283321, ...","[3.04179688109523, 2.5467838952043875, -2.8859..."
3917,Wings Of Desire,rock,Wishbone Ash,1991,I've seen you in black and white\nLike a silen...,1053952,seen black white like silent movie colour came...,70,"[0.010648715309798717, 0.011540566571056843, 0...","[4.271864823526334, -1.7694516478511675, 0.644..."


In [10]:
def plot_elbow_curve(df, embeddings_column='reduced_embeddings', cluster_range=(2, 50)):
    # Extract the embeddings from the DataFrame
    reduced_embeddings = np.stack(df[embeddings_column].values)

    # Initialize variables to store inertia values
    cluster_range_values = range(cluster_range[0], cluster_range[1] + 1)
    inertia_values = []

    # Compute inertia for each number of clusters
    for n_clusters in cluster_range_values:
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        kmeans.fit(reduced_embeddings)
        inertia_values.append(kmeans.inertia_)

    # Calculate the best number of clusters using the elbow method
    deltas = np.diff(inertia_values, 2)
    elbow_index = np.argmin(deltas) + 1  # Add 1 because of the diff operation
    best_clusters = cluster_range_values[8]

    # Plot the elbow curve using Plotly
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=list(cluster_range_values)[::1],
        y=inertia_values,
        mode='lines+markers',
        name='Inertia'
    ))

    fig.add_shape(
        type="line",
        x0=best_clusters,
        y0=min(inertia_values),
        x1=best_clusters,
        y1=max(inertia_values),
        line=dict(
            color="red",
            width=2,
            dash="dashdot",
        ),
    )

    fig.update_layout(
        title='Elbow Curve for KMeans Clustering',
        xaxis_title='Number of Clusters',
        yaxis_title='Inertia',
        xaxis=dict(tickmode='linear', dtick=5),
        template='plotly_white'
    )

    fig.show()

plot_elbow_curve(df)

In [11]:
def perform_clustering_and_visualize(df, embeddings_column='reduced_embeddings', n_clusters=10):
    # Extract the embeddings from the DataFrame
    reduced_embeddings = np.stack(df[embeddings_column].values)

    # Apply KMeans clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(reduced_embeddings)

    # Apply t-SNE for visualization
    tsne = TSNE(n_components=2, random_state=42)
    tsne_embeddings = tsne.fit_transform(reduced_embeddings)

    # Create a DataFrame for visualization
    tsne_df = pd.DataFrame(tsne_embeddings, columns=['x', 'y'])
    tsne_df['cluster'] = cluster_labels

    # Visualize using Plotly
    fig = px.scatter(
        tsne_df, x='x', y='y', color='cluster',
        title='t-SNE Visualization of Clusters',
        labels={'x': 't-SNE Dimension 1', 'y': 't-SNE Dimension 2'}
    )
    fig.show()
    return cluster_labels

cluster_labels = perform_clustering_and_visualize(df)
# Add cluster labels to the lyrics embeddings
df['cluster'] = cluster_labels

In [12]:
def top_n_similar_embeddings(df, target_index, n=10):

    # Extract the target embedding and its cluster
    target_embedding = df.at[target_index, 'reduced_embeddings']
    target_cluster = df.at[target_index, 'cluster']

    # Filter the DataFrame to get only the embeddings in the same cluster
    cluster_df = df[df['cluster'] == target_cluster]
    cluster_embeddings = np.stack(cluster_df['reduced_embeddings'].values)

    # Calculate cosine similarities
    similarities = cosine_similarity([target_embedding], cluster_embeddings)[0]

    # Get the top n similar embeddings
    top_n_indices = np.argsort(similarities)[-n-1:][::-1]
    top_n_similar_ids = cluster_df.iloc[top_n_indices]['id'].values
    top_n_similarities = similarities[top_n_indices]

    return pd.DataFrame({
        'id': top_n_similar_ids,
        'similarity': top_n_similarities
    })

# Show top 10 similar song to the first song (the first row is the song itself)
top_n_similar_embeddings(df,target_index=0,n=10)

Unnamed: 0,id,similarity
0,1166787,1.0
1,1307544,0.999901
2,859037,0.998684
3,1047589,0.997558
4,190114,0.994388
5,116860,0.993862
6,1322671,0.993588
7,1165355,0.992939
8,1225450,0.992498
9,806933,0.991813
