In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline

In [2]:
# Load the song dataset from the CSV file in the working directory; `df` is the
# notebook-level frame that every later cell reads (and that clean_df extends).
df = pd.read_csv('data.csv')

In [3]:
# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0,name,album,artist,release_date,popularity,id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Purple Haze,Are You Experienced,Jimi Hendrix,1967-05-12,69,0wJoRiX5K5BxlqZTolB2LD,0.533,0.905,2,-5.27,1,0.0754,0.00876,0.578,0.0698,0.486,108.9,170813,4
1,Born To Be Wild - Single Version,Steppenwolf,Steppenwolf,1968-01-01,66,3lN8PP6R2IxbLP05QpYXng,0.439,0.734,2,-12.168,1,0.097,0.262,0.333,0.244,0.54,145.703,212893,4
2,"Oh, Pretty Woman","Oh, Pretty Woman",Roy Orbison,1962,71,48i055G1OT5KxGGftwFxWy,0.619,0.603,9,-9.481,1,0.0342,0.712,0.0,0.0721,0.958,127.433,178933,4
3,The Weight - Remastered,Music From Big Pink (Expanded Edition),The Band,1968-07-01,0,0sDqo9UPzPUtu9wEkI3zRB,0.63,0.519,9,-10.997,1,0.0528,0.225,4e-06,0.0974,0.502,143.942,278627,4
4,Hey Jude - Remastered 2015,1 (Remastered),The Beatles,2000-11-13,72,0aym2LBJBk9DAYuHHutrIl,0.386,0.607,10,-7.7,1,0.0261,0.0112,1.4e-05,0.088,0.532,147.207,425653,4


In [4]:
# (number of rows, number of columns) of the loaded dataset.
df.shape

(1684, 19)

In [None]:
# plt.figure()
# for i, feature in enumerate(audio_feature_cols):
#     plt.subplot(4, 4, i+1)
#     plt.hist(df[feature])
#     plt.title(feature)

# plt.tight_layout()
# plt.show()

def plot(feat):
    """Show a histogram of one column of the notebook-level ``df``.

    :param feat: name of the column in ``df`` to plot; also used as the x-axis label
    """
    _, ax = plt.subplots()
    ax.hist(df[feat])
    ax.set_xlabel(feat)
    plt.show()

# Numeric columns of `df` to inspect one histogram at a time.
audio_feature_cols = ['popularity', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
                          'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']

# One figure per column (plot() opens and shows its own figure).
for feat in audio_feature_cols:
    plot(feat)

In [5]:
# Columns used for the pairwise scatter plots further down.
# NOTE: 'year' is not in the CSV; it is added to `df` by clean_df(), so the plotting
# cells that index df with these names only work after clean_df() has run on `df`.
continuous_features_cols = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
                          'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'year', 'popularity']

In [6]:
def clean_df(df):
    """Derive a numeric ``year`` column and drop the columns not used as features.

    Side effect: the ``year`` column is written onto the frame that was passed in,
    not onto a copy, so the caller's ``df`` gains that column as well.

    :param df: song dataframe containing ``release_date`` strings that start with a
        four-digit year, plus the columns listed in ``non_feature_cols``
    :returns: a new dataframe without the dropped columns (``id`` is kept)
    """
    # Release dates come as 'YYYY-MM-DD' or just 'YYYY'; the first four characters are the year.
    df['year'] = [int(release_date[:4]) for release_date in df['release_date']]

    non_feature_cols = ['name', 'artist', 'album', 'key', 'mode', 'time_signature', 'release_date']
    return df.drop(columns=non_feature_cols)

In [7]:
from sklearn.preprocessing import MinMaxScaler

def scale_min_max(df):
    """Rescale every column of ``df`` with scikit-learn's ``MinMaxScaler``.

    :param df: dataframe of numeric feature columns
    :returns: dataframe of the scaled values with the same column names and the same
        index as ``df``
    """
    scaler = MinMaxScaler()
    scaled_features = scaler.fit_transform(df)
    # fit_transform returns a bare array; carry the original index over so the result
    # still lines up row-for-row with `df` in any later index-aligned operation.
    return pd.DataFrame(scaled_features, columns=df.columns, index=df.index)

In [8]:
from sklearn.preprocessing import StandardScaler

def scale_standard(df):
    """Standardize every column of ``df`` with scikit-learn's ``StandardScaler``.

    :param df: dataframe of numeric feature columns
    :returns: dataframe of the scaled values with the same column names and the same
        index as ``df``
    """
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(df)
    # fit_transform returns a bare array; carry the original index over so the result
    # still lines up row-for-row with `df` in any later index-aligned operation.
    return pd.DataFrame(scaled_features, columns=df.columns, index=df.index)

In [9]:
def scale_df(df):
    """Scale the feature columns with the notebook's chosen scaler.

    Single switch point for the scaling strategy; it currently delegates to
    ``scale_standard``.

    :param df: dataframe of numeric feature columns
    :returns: whatever ``scale_standard`` returns for ``df``
    """
    scaled = scale_standard(df)
    return scaled

In [10]:
from sklearn.decomposition import PCA

def apply_PCA(df, n):
    """Fit a PCA on ``df`` and return the projected data.

    :param df: dataframe (or array) of scaled numeric features
    :param n: passed to ``PCA`` as ``n_components``; the notebook calls this with 0.9,
        which scikit-learn documents as "keep enough components to explain that
        fraction of the variance"
    :returns: the array produced by ``PCA.fit_transform``
    """
    reducer = PCA(n_components=n)
    projected = reducer.fit_transform(df)
    return projected

In [11]:
def preprocess_df(df, n):
    """Clean, scale and PCA-project the song dataframe.

    Side effect: ``clean_df`` adds a ``year`` column to the ``df`` passed in.

    :param df: raw song dataframe (must contain ``id`` and the columns ``clean_df`` expects)
    :param n: forwarded to ``apply_PCA`` as the number of components
    :returns: dataframe with columns ``'feature 1'`` ... ``'feature m'`` followed by ``id``,
        one row per row of ``df`` in the same order
    """
    df_cont = clean_df(df)

    # Take the IDs as a plain array: the PCA frame below gets a fresh RangeIndex, so
    # assigning a Series would align on index labels and scramble/NaN the IDs whenever
    # `df` does not have the default 0..n-1 index.
    track_ids = df_cont['id'].to_numpy()

    scaled = scale_df(df_cont.drop('id', axis=1))
    pca_arr = apply_PCA(scaled, n)

    # shape[1] (rather than len(pca_arr[0])) also works when there are zero rows.
    pca_df_cols = ['feature {}'.format(i + 1) for i in range(pca_arr.shape[1])]

    result = pd.DataFrame(pca_arr, columns=pca_df_cols)
    result['id'] = track_ids
    return result

In [12]:
def fit_min_euclid_dist_recommender(df, given_track_id):
    """
    Ranks every track by its Euclidean distance to a given track.

    Distances are measured in the PCA space produced by preprocess_df(df, 0.9).

    :param df: song dataset dataframe
    :param given_track_id: ID of the track that should be used for the basis of recommendations
    :returns: list of (track_id, distance) tuples for every row of df, sorted by
        ascending distance (the given track itself is included, at distance 0)
    :raises ValueError: if given_track_id is not in df
    :precondition: The columns dropped by clean_df are in df.
    """
    df_cont = preprocess_df(df, 0.9)

    track_ids = df_cont['id'].to_numpy()
    features = df_cont.drop(columns='id').to_numpy(dtype=float)

    matches = np.flatnonzero(track_ids == given_track_id)
    if matches.size == 0:
        raise ValueError('track id {!r} is not in the dataset'.format(given_track_id))
    # If the ID occurs more than once, the first occurrence is used as the reference.
    given_track_array = features[matches[0]]

    # One vectorized pass: distance from the reference vector to every row.
    distances = np.linalg.norm(features - given_track_array, axis=1)

    track_distances = list(zip(track_ids.tolist(), distances.tolist()))
    return sorted(track_distances, key=lambda pair: pair[1])

In [17]:
def min_euclid_dist_recommender(sorted_tracks, num_tracks, closeness_index):
    """
    Picks recommended songs from a distance-sorted track list.

    Song names are looked up in the notebook-level ``df``.

    :param sorted_tracks: list of sorted tracks returned from fit_min_euclid_dist_recommender
    :param num_tracks: number of similar songs that should be returned
    :param closeness_index: determines how similar the recommended songs are, with 0 being the most similar
    :returns: list of tuples that contain the song names and IDs of the recommended songs
    :precondition: closeness_index >= 0 and closeness_index + num_tracks < len(sorted_tracks)
    :precondition: every ID in sorted_tracks is present in df
    """
    # The +1 skips the first entry of the ranking, which is expected to be the
    # reference track itself (distance 0 to itself).
    first = closeness_index + 1

    rec_songs = []
    for position in range(first, first + num_tracks):
        track_id = sorted_tracks[position][0]
        song_row = df[df['id'] == track_id].iloc[0]
        rec_songs.append((song_row['name'], song_row['id']))

    return rec_songs

In [14]:
# Look up the row(s) titled 'Drag Me Down' to read off the track ID used as the seed below.
df[df['name']=='Drag Me Down']

Unnamed: 0,name,album,artist,release_date,popularity,id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
927,Drag Me Down,Made In The A.M. (Deluxe Edition),One Direction,2015-11-13,79,2K87XMYnUMqLcX3zvtAF4G,0.73,0.703,0,-5.672,0,0.0369,0.109,0.0,0.0657,0.595,138.113,192120,4


In [15]:
# Rank every track in the dataset by distance to the seed track.
# Side effect: preprocessing runs clean_df on `df`, which adds the 'year' column to it.
sorted_tracks = fit_min_euclid_dist_recommender(df, '2K87XMYnUMqLcX3zvtAF4G')

In [21]:
# Take the 5 closest tracks (closeness_index 0 = start right after the first ranked entry).
rec_songs = min_euclid_dist_recommender(sorted_tracks, 5, 0)
print(rec_songs)

[('Down', '6cmm1LMvZdB5zsCwX5BjqE'), ('Only Girl (In The World)', '0AH6WMe3OlAlUb5miXt2FQ'), ('Call Me Maybe', '20I6sIOMTCkB6w7ryavxtO'), ('Bad Boy', '5GKwq4sO5ZHKuWaDmdwMQc'), ('Stitches', '3zkWCteF82vJwv0hRLba76')]


In [None]:
def get_ids(rec_songs):
    """Extract the track IDs from a list of recommended songs.

    :param rec_songs: iterable of (name, id) tuples as returned by the recommenders
    :returns: list with the second element of every tuple, in the same order
    """
    return [song[1] for song in rec_songs]

In [None]:
def plot_closeness(df, given_song_id, rec_songs_id, dims):
    """Scatter-plot all songs on two feature axes and highlight a seed song and its recommendations.

    Non-recommended songs use the default colour, recommended songs are orange and the
    seed song is red (drawn last, so it sits on top).

    :param df: dataframe containing an ``id`` column and the two columns named in ``dims``
    :param given_song_id: ID of the seed song; must be present in ``df``
    :param rec_songs_id: list of IDs of the recommended songs; each must be present in ``df``
    :param dims: sequence whose first two entries are the column names for the x and y axes
    :raises IndexError: if the seed ID or a recommended ID is not found in ``df``
    """
    x_dim, y_dim = dims[0], dims[1]
    df_dims = df[[x_dim, y_dim, 'id']]

    # First matching row for the seed song.
    given_row = df_dims.loc[df_dims['id'] == given_song_id].iloc[0]

    # First matching row for each recommended ID, in the order given.
    rec_rows = [df_dims.loc[df_dims['id'] == song_id].iloc[0] for song_id in rec_songs_id]
    rec_x = [row[x_dim] for row in rec_rows]
    rec_y = [row[y_dim] for row in rec_rows]

    # Everything that is not a recommendation. Columns are read by name: the previous
    # per-row `row[0]` / `row[1]` lookups relied on positional integer indexing of a
    # label-indexed Series, which pandas has deprecated.
    df_other = df_dims[~df_dims['id'].isin(rec_songs_id)]

    plt.figure()
    plt.scatter(df_other[x_dim], df_other[y_dim])
    plt.scatter(rec_x, rec_y, c='orange')
    plt.scatter(given_row[x_dim], given_row[y_dim], c='red')
    plt.xlabel(x_dim)
    plt.ylabel(y_dim)
    plt.show()

In [None]:
# `rec_songs` holds the recommendations computed for this track ID, so the same ID
# must be the highlighted seed point; plotting a different track next to these
# recommendations would be misleading.
given_song_id = '2K87XMYnUMqLcX3zvtAF4G'
rec_song_ids = get_ids(rec_songs)

# One figure per unordered pair of distinct features (each pair is plotted once).
for i, feat_1 in enumerate(continuous_features_cols):
    for feat_2 in continuous_features_cols[i + 1:]:
        plot_closeness(df, given_song_id, rec_song_ids, [feat_1, feat_2])

In [None]:
from numpy.random import default_rng

def init_centroids(feat_range, k, df, seed=None):
    """Draw ``k`` random starting centroids for k-means.

    :param feat_range: (low, high) bounds; every coordinate is drawn uniformly from [low, high)
    :param k: number of centroids to create
    :param df: dataframe whose column count minus one is the centroid dimensionality
        (the notebook passes a frame whose only non-feature column is ``id``)
    :param seed: optional seed for the random generator; pass a value to make the
        initialisation reproducible. ``None`` (the default) keeps it non-deterministic.
    :returns: list of ``(label, vector)`` tuples with labels ``0 .. k-1``
    """
    rng = default_rng(seed)
    low, high = feat_range[0], feat_range[1]
    n_features = len(df.columns) - 1

    return [(label, rng.uniform(low=low, high=high, size=n_features)) for label in range(k)]

In [None]:
def fit_k_means_clustering(df, k, stopping_dist):
    """Cluster the songs with a hand-written k-means on the PCA-projected features.

    :param df: song dataset dataframe (preprocessed internally with preprocess_df(df, 0.9))
    :param k: number of clusters
    :param stopping_dist: the loop stops once every centroid moved less than this
        Euclidean distance in one iteration (or after 100 iterations)
    :returns: dict with ``'centroids'`` (list of k centroid vectors) and ``'labels'``
        (list with one cluster label per row of df, in row order)
    """
    df_cont = preprocess_df(df, 0.9)
    # Feature matrix, one row per track.
    features = df_cont.drop(columns='id').to_numpy(dtype=float)

    # initialize centroids; init_centroids returns (label, vector) pairs with labels 0..k-1,
    # so the row position in this array is the cluster label
    centroids = np.array(
        [vector for _, vector in init_centroids((-1, 1), k, df_cont)], dtype=float
    )
    labels = np.zeros(len(features), dtype=int)

    # iterate for an arbitrarily large number of times
    max_iterations = 100
    for iteration in range(1, max_iterations + 1):
        print('iterations: {}'.format(iteration))

        # keep track of previous centroids
        prev_centroids = centroids

        # Euclidean distance of every track to every centroid, then assign each track to
        # the closest one (argmin takes the lowest label on ties)
        distances = np.linalg.norm(features[:, None, :] - centroids[None, :, :], axis=2)
        labels = distances.argmin(axis=1)

        # recompute each centroid as the mean of its members; a cluster that received no
        # tracks keeps its previous centroid instead of crashing on an empty mean
        centroids = prev_centroids.copy()
        for c in range(k):
            members = features[labels == c]
            if len(members) > 0:
                centroids[c] = members.mean(axis=0)

        # terminate loop if every centroid moved less than stopping_dist
        centroid_distances = np.linalg.norm(centroids - prev_centroids, axis=1)
        if np.all(centroid_distances < stopping_dist):
            break

    return {'centroids': list(centroids), 'labels': labels.tolist()}

In [None]:
from numpy.random import default_rng

def k_means_clustering_recommender(df, given_track_id, num_tracks, labels, seed=None):
    """
    Recommends songs that share a k-means cluster with a given song.

    Songs are drawn at random, without repetition, from the other members of the given
    song's cluster. If the cluster has fewer than num_tracks other members, all of them
    are returned.

    :param df: song dataset dataframe
    :param given_track_id: ID of the track that should be used for the basis of recommendations
    :param num_tracks: number of similar songs that should be returned
    :param labels: cluster label of every row of df, in row order (list or array)
    :param seed: optional seed for the random draw; None (default) is non-deterministic
    :returns: list of tuples that contain the song names and IDs of the recommended songs
    :raises IndexError: if given_track_id is not in df
    :precondition: len(labels) == len(df)
    """
    labels = np.asarray(labels)

    # Row *position* of the given track (not its index label), so it can index `labels`
    # regardless of what index df carries.
    given_track_position = np.flatnonzero((df['id'] == given_track_id).to_numpy())[0]
    given_track_label = labels[given_track_position]

    # Positions of the other members of the same cluster.
    candidate_positions = np.flatnonzero(labels == given_track_label)
    candidate_positions = candidate_positions[candidate_positions != given_track_position]

    sample_size = min(num_tracks, len(candidate_positions))
    if sample_size <= 0:
        return []

    rng = default_rng(seed)
    chosen_positions = rng.choice(candidate_positions, size=sample_size, replace=False)

    names = df['name'].to_numpy()
    ids = df['id'].to_numpy()
    return [(names[position], ids[position]) for position in chosen_positions]

In [None]:
# Fit the hand-written k-means with k=6 clusters and a centroid-shift stopping distance of 0.1.
model = fit_k_means_clustering(df, 6, 0.1)

In [None]:
# Look up the seed song again to read off its track ID for the k-means recommender.
df[df['name']=='Drag Me Down']

In [None]:
# Ask for 5 recommendations for the seed track using the labels of the hand-written model.
# Note: this rebinds `rec_songs`, replacing the Euclidean-distance recommendations.
rec_songs = k_means_clustering_recommender(df, '2K87XMYnUMqLcX3zvtAF4G', 5, model['labels'])
print(rec_songs)

In [None]:
def plot_clusters(df, dims, labels):
    """Scatter-plot every song on two feature axes, coloured by cluster label.

    :param df: dataframe containing the two columns named in ``dims``
    :param dims: sequence whose first two entries are the column names for the x and y axes
    :param labels: one cluster label per row of ``df``, used as the colour value
    """
    x_dim, y_dim = dims[0], dims[1]
    x = df[x_dim].tolist()
    y = df[y_dim].tolist()

    plt.figure()
    plt.scatter(x, y, c=labels, cmap='rainbow')
    plt.xlabel(x_dim)
    plt.ylabel(y_dim)
    plt.show()

In [None]:
# One cluster plot per unordered pair of distinct features, coloured by the
# hand-written model's labels. Requires df['year'], which clean_df adds to df.
for i, feat_1 in enumerate(continuous_features_cols):
    for feat_2 in continuous_features_cols[i + 1:]:
        plot_clusters(df, [feat_1, feat_2], model['labels'])

In [None]:
from sklearn.cluster import KMeans

def fit_k_means_clustering_sk(k):
    """Cluster the songs with scikit-learn's ``KMeans``.

    Reads the notebook-level ``df`` and runs it through ``preprocess_df(df, 0.9)``;
    the ``id`` column is removed before fitting.

    :param k: number of clusters
    :returns: the fitted ``KMeans`` estimator
    """
    features = preprocess_df(df, 0.9).drop('id', axis=1)
    # KMeans.fit returns the estimator itself.
    return KMeans(n_clusters=k).fit(features)

In [None]:
from numpy.random import default_rng

def k_means_clustering_recommender_sk(df, given_track_id, num_tracks, labels, seed=None):
    """
    Recommends songs that share a k-means cluster with a given song.

    Songs are drawn at random, without repetition, from the other members of the given
    song's cluster. If the cluster has fewer than num_tracks other members, all of them
    are returned.

    :param df: song dataset dataframe
    :param given_track_id: ID of the track that should be used for the basis of recommendations
    :param num_tracks: number of similar songs that should be returned
    :param labels: cluster label of every row of df, in row order (list or array)
    :param seed: optional seed for the random draw; None (default) is non-deterministic
    :returns: list of tuples that contain the song names and IDs of the recommended songs
    :raises IndexError: if given_track_id is not in df
    :precondition: len(labels) == len(df)
    """
    labels = np.asarray(labels)

    # Row *position* of the given track (not its index label), so it can index `labels`
    # regardless of what index df carries.
    given_track_position = np.flatnonzero((df['id'] == given_track_id).to_numpy())[0]
    given_track_label = labels[given_track_position]

    # Positions of the other members of the same cluster.
    candidate_positions = np.flatnonzero(labels == given_track_label)
    candidate_positions = candidate_positions[candidate_positions != given_track_position]

    sample_size = min(num_tracks, len(candidate_positions))
    if sample_size <= 0:
        return []

    rng = default_rng(seed)
    chosen_positions = rng.choice(candidate_positions, size=sample_size, replace=False)

    names = df['name'].to_numpy()
    ids = df['id'].to_numpy()
    return [(names[position], ids[position]) for position in chosen_positions]

In [None]:
# Fit scikit-learn's KMeans with k=6 on the same preprocessed features, for comparison.
model_sk = fit_k_means_clustering_sk(6)

In [None]:
# Ask for 5 recommendations for the seed track using the scikit-learn model's labels.
rec_songs_sk = k_means_clustering_recommender_sk(df, '2K87XMYnUMqLcX3zvtAF4G', 5, model_sk.labels_)
print(rec_songs_sk)

In [None]:
# One cluster plot per unordered pair of distinct features, coloured by the
# scikit-learn model's labels. enumerate + slice replaces the manual counter and the
# repeated list.index() scans, matching the loop used for the hand-written model.
# Requires df['year'], which clean_df adds to df.
for i, feat_1 in enumerate(continuous_features_cols):
    for feat_2 in continuous_features_cols[i + 1:]:
        plot_clusters(df, [feat_1, feat_2], model_sk.labels_)