In [12]:
import json
import os
import pandas as pd
import spotipy
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import seaborn as sns
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from dask.distributed import Client
import joblib

In [13]:
# Directory that holds the four Spotify CSV exports. Set the SPOTIFY_DATA_DIR
# environment variable to point somewhere else; the fallback is the location
# this notebook was originally run from.
DATA_DIR = os.environ.get("SPOTIFY_DATA_DIR", "/Users/ak1050588/Downloads/archive")

spt_data = pd.read_csv(os.path.join(DATA_DIR, "data.csv"))
genre_data = pd.read_csv(os.path.join(DATA_DIR, "data_by_genres.csv"))
artist_data = pd.read_csv(os.path.join(DATA_DIR, "data_by_artist.csv"))
data_by_yr = pd.read_csv(os.path.join(DATA_DIR, "data_by_year.csv"))

In [14]:
# Preview the first 15 rows of the track table.
spt_data.head(n=15)

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.991,['Mamie Smith'],0.598,168333,0.224,0,0cS0A1fUEUd1EW3FcF8AEI,0.000522,5,0.379,-12.628,0,Keep A Song In Your Soul,12,1920,0.0936,149.976,0.634,1920
1,0.643,"[""Screamin' Jay Hawkins""]",0.852,150200,0.517,0,0hbkKFIJm7Z05H8Zl9w30f,0.0264,5,0.0809,-7.261,0,I Put A Spell On You,7,1920-01-05,0.0534,86.889,0.95,1920
2,0.993,['Mamie Smith'],0.647,163827,0.186,0,11m7laMUgmOKqI3oYzuhne,1.8e-05,0,0.519,-12.098,1,Golfing Papa,4,1920,0.174,97.6,0.689,1920
3,0.000173,['Oscar Velazquez'],0.73,422087,0.798,0,19Lc5SfJJ5O1oaxY0fpwfh,0.801,2,0.128,-7.311,1,True House Music - Xavier Santos & Carlos Gomi...,17,1920-01-01,0.0425,127.997,0.0422,1920
4,0.295,['Mixe'],0.704,165224,0.707,1,2hJjbsLCytGsnAHfdsLejp,0.000246,10,0.402,-6.036,0,Xuniverxe,2,1920-10-01,0.0768,122.076,0.299,1920
5,0.996,['Mamie Smith & Her Jazz Hounds'],0.424,198627,0.245,0,3HnrHGLE9u2MjHtdobfWl9,0.799,5,0.235,-11.47,1,Crazy Blues - 78rpm Version,9,1920,0.0397,103.87,0.477,1920
6,0.992,['Mamie Smith'],0.782,195200,0.0573,0,5DlCyqLyX2AOVDTjjkDZ8x,2e-06,5,0.176,-12.453,1,Don't You Advertise Your Man,5,1920,0.0592,85.652,0.487,1920
7,0.996,['Mamie Smith & Her Jazz Hounds'],0.474,186173,0.239,0,02FzJbHtqElixxCmrpSCUa,0.186,9,0.195,-9.712,1,Arkansas Blues,0,1920,0.0289,78.784,0.366,1920
8,0.996,['Francisco Canaro'],0.469,146840,0.238,0,02i59gYdjlhBmbbWhf8YuK,0.96,8,0.149,-18.717,1,La Chacarera - Remasterizado,0,1920-07-08,0.0741,130.06,0.621,1920
9,0.00682,['Meetya'],0.571,476304,0.753,0,06NUxS2XL3efRh0bloxkHm,0.873,8,0.092,-6.943,1,Broken Puppet - Original Mix,0,1920-01-01,0.0446,126.993,0.119,1920


In [18]:
"Data Visualization"
# Reuse the Dask client when this kernel already has one. Creating a fresh
# client on every re-run starts another scheduler/dashboard and produces the
# "Port 8787 is already in use" warning.
try:
    client = Client.current()
except ValueError:
    client = Client(processes=False)

# Expectation-Maximization (EM) clustering with a Gaussian Mixture Model (GMM),
# grouping the genres into 20 clusters.
# Compared with k-means, a GMM models every cluster with its own mean and
# covariance and yields probabilistic assignments. The number of components
# still has to be chosen up front (n_components below).
clustering_pipe = Pipeline([
    ('scalar', StandardScaler()),
    # random_state pins the EM initialisation so cluster ids are reproducible.
    ('GaussianMixture', GaussianMixture(n_components=20, random_state=42)),
])

# Numeric genre features only. A 'cluster' column left over from a previous run
# of this cell is excluded so that re-running does not feed labels back in.
X = genre_data.select_dtypes(np.number).drop(columns='cluster', errors='ignore')
with joblib.parallel_backend('dask'):
    genre_data['cluster'] = clustering_pipe.fit_predict(X)



Port 8787 is already in use.
Perhaps you already have a cluster running?
Hosting the HTTP server on port 54376 instead



In [19]:
# Using t-distributed stochastic neighbor embedding (t-SNE) for dimensionality reduction.
# t-SNE is stochastic, so random_state is fixed to make the 2-D layout
# reproducible from one run of the notebook to the next.
tsne_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('tsne', TSNE(n_components=2, verbose=2, random_state=42)),
])
genre_embedding = tsne_pipe.fit_transform(X)

# One row per genre: 2-D coordinates plus the genre name and its GMM cluster.
projection = pd.DataFrame(columns=['x', 'y'], data=genre_embedding)
projection['genres'] = genre_data['genres']
projection['cluster'] = genre_data['cluster']

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 3232 samples in 0.002s...
[t-SNE] Computed neighbors for 3232 samples in 0.268s...
[t-SNE] Computed conditional probabilities for sample 1000 / 3232
[t-SNE] Computed conditional probabilities for sample 2000 / 3232
[t-SNE] Computed conditional probabilities for sample 3000 / 3232
[t-SNE] Computed conditional probabilities for sample 3232 / 3232
[t-SNE] Mean sigma: 0.789973
[t-SNE] Computed conditional probabilities in 0.076s
[t-SNE] Iteration 50: error = 82.9452667, gradient norm = 0.0122687 (50 iterations in 0.579s)
[t-SNE] Iteration 100: error = 76.8378296, gradient norm = 0.0134217 (50 iterations in 0.615s)
[t-SNE] Iteration 150: error = 76.1805878, gradient norm = 0.0019175 (50 iterations in 0.500s)
[t-SNE] Iteration 200: error = 76.1669540, gradient norm = 0.0021351 (50 iterations in 0.499s)
[t-SNE] Iteration 250: error = 76.1775513, gradient norm = 0.0007121 (50 iterations in 0.500s)
[t-SNE] KL divergence after 250 iterati

In [20]:
# Interactive scatter of the t-SNE embedding, coloured by cluster id;
# hovering over a point shows its coordinates and genre name.
fig = px.scatter(
    data_frame=projection,
    x='x',
    y='y',
    color='cluster',
    hover_data=['x', 'y', 'genres'],
)
fig.show()

In [21]:
# Principal component analysis (PCA) as an alternative 2-D projection of the
# scaled genre features.
pca_pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('PCA', PCA(n_components=2)),
])
genre_embedding = pca_pipe.fit_transform(X)

# Attach the genre name and cluster id to each projected point.
projection = pd.DataFrame(genre_embedding, columns=['x', 'y']).assign(
    genres__pca=genre_data['genres'],
    cluster_pca=genre_data['cluster'],
)

fig = px.scatter(
    data_frame=projection,
    x='x',
    y='y',
    color='cluster_pca',
    hover_data=['x', 'y', 'genres__pca'],
)
fig.show()


In [22]:
# Cluster the individual tracks. This re-fits the same scaler + GMM pipeline
# on the track-level numeric features and stores the label as 'general_genre'.
# A 'general_genre' column left over from a previous run of this cell is
# excluded, so re-running never uses old labels as an input feature and
# X.columns stays a pure feature list for the cells below.
X = spt_data.select_dtypes(np.number).drop(columns='general_genre', errors='ignore')
with joblib.parallel_backend('dask'):
    cluster_labels = clustering_pipe.fit_predict(X)
spt_data['general_genre'] = cluster_labels

In [23]:
# Recommendation system: feature preparation.
# Logic used by the recommender:
#   1. Keep only the songs that fall in the same genre cluster as the query song.
#   2. Rank those songs by their distance to the query song in feature space
#      and recommend the ones with the smallest distance.

# Min-max scale every feature column to [0, 1] so that no single feature
# dominates the distance. A constant column has a zero range; dividing by 1
# instead maps it to 0.0 rather than filling it with NaN.
spt_data[list(X.columns)] = spt_data[list(X.columns)].apply(
    lambda col: (col - col.min()) / ((col.max() - col.min()) or 1)
)

# Song lookup is done by name, so keep only the first row for each name.
spt_data = spt_data.drop_duplicates(subset='name', keep='first')


In [24]:
def recommend(song_name,no_of_recommendation=1):
    """Print the songs closest to ``song_name`` within its genre cluster.

    Reads two notebook-level globals: ``spt_data`` (the track table, which must
    contain ``name``, ``artists``, ``general_genre`` and the feature columns)
    and ``X`` (whose columns name the features used for the distance).

    Parameters
    ----------
    song_name : str
        Exact value of the ``name`` column identifying the query song. If
        several rows share the name, the first one is used.
    no_of_recommendation : int, default 1
        Maximum number of songs to print. Fewer are printed when the cluster
        does not contain that many other songs.

    Returns
    -------
    None
        The recommendations are printed, one per line.

    Raises
    ------
    ValueError
        If no row of ``spt_data`` has ``name == song_name``.
    """
    feature_cols = list(X.columns)

    matches = spt_data[spt_data["name"] == song_name]
    if matches.empty:
        raise ValueError(f"Song {song_name!r} was not found in spt_data")
    genre = int(matches["general_genre"].iloc[0])
    query_vector = matches[feature_cols].iloc[0].to_numpy(dtype=float)

    # Candidates: every other song in the same genre cluster. The explicit
    # copy lets us add a column without touching (or warning about) spt_data.
    same_cluster = (spt_data["name"] != song_name) & (spt_data["general_genre"] == genre)
    candidates = spt_data[same_cluster].copy()

    # Euclidean distance from the query song to every candidate, in one shot.
    candidate_matrix = candidates[feature_cols].to_numpy(dtype=float)
    candidates["distance"] = np.linalg.norm(candidate_matrix - query_vector, axis=1)

    # head() caps the result at what is available, so a small cluster simply
    # yields fewer recommendations instead of an index error.
    closest = candidates.sort_values(by="distance", ascending=True).head(
        max(no_of_recommendation, 0)
    )

    # Printing the song recommendation
    print("Based on your preference, we recommend ")
    for title, artists in zip(closest["name"], closest["artists"]):
        print("--> "+ title + " by "+artists)

In [25]:
# Example: the ten songs closest to "Radioactive" within its genre cluster.
recommend("Radioactive", no_of_recommendation=10)

Based on your preference, we recommend 
--> Numb by ['Linkin Park']
--> Dirty Little Secret by ['The All-American Rejects']
--> Let It Go - From "Frozen / Single Version by ['Demi Lovato']
--> In My Arms Instead by ['Randy Rogers Band']
--> Tu Poeta by ['Alex Campos']
--> I Saw You Close Your Eyes by ['Local Natives']
--> Stop by ["Jane's Addiction"]
--> Santa Monica - Remastered by ['Everclear']
--> Blow My Brains Out by ['Tikkle Me']
--> Never Let Me Go by ['Florence + The Machine']
