# Import modules

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import json
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pickle

# Imports
from sklearn.cluster import KMeans
from sklearn import metrics 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn import preprocessing

# Functions to read artist and songs database

## Function to read artist database

In [2]:
artist_path ='/home/ernek/Main/Erdos/song_recommender/artist_data/'
def get_artist_dataframe(artist_path): 
    # returns a dataframe of all of the artists in the million playlist data set
    # The only attributes included are "followers", "name", "uri", "genres", "popularity"
    # WARNING - Some artists have no genres, in this case the value is an empty list
    # path - a string containing a path to the folder where the csv files are held.
    #        for example, on my machine, the folder where I ran this script also contained a folder
    #        called "song_data" which had the csv files in it. So I called get_song_dataframe('song_data/')
    #        NOTE - Make sure to include the slash!
    dfs = [pd.read_csv(artist_path + 'artist.slice.0-99999.csv'), pd.read_csv(artist_path + 'artist.slice.100000-199999.csv'), pd.read_csv(artist_path + 'artist.slice.200000-295859.csv')]
    df = pd.concat(dfs)
    del df['Unnamed: 0.1']
    del df['Unnamed: 0']
    
    return df

## Function to read songs database

In [3]:
songs_path = '/home/ernek/Main/Erdos/song_recommender/song_data/'
def get_song_dataframe(songs_path):
    # returns a dataframe of all of the songs in the million playlist data set
    # path - a string containing a path to the folder where the csv files are held.
    #        for example, on my machine, the folder where I ran this script also contained a folder
    #        called "song_data" which had the csv files in it. So I called get_song_dataframe('song_data/')
    #        NOTE - Make sure to include the slash!
    file_name_list = ['song.slice.' + str(i) + '-' + str(i + 49999) + '.csv' for i in range(0,2212292, 50000)] 
    file_name_list = file_name_list + ['song.slice.2250000-2262292.csv']
                        
    df_list = []
    for file_name in file_name_list:
        df_list.append(pd.read_csv(songs_path + file_name))
    
    data = pd.concat(df_list)
    return data

# Songs DataFrame

In [4]:
# read songs data into dataframe
data = get_song_dataframe(songs_path)
# drop unimportant columns
data.drop(['Unnamed: 0','pos', 'album_uri', 'duration_ms', 'album_name', 'num_playlist_appearances', 'time_signature', 'type'],axis=1, inplace=True)
# clean elements with with nan values on important columns
data = data[~data.isna().any(axis=1)]

In [5]:
data.head()

Unnamed: 0,artist_name,track_uri,artist_uri,track_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,uri
0,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),0.904,0.813,4.0,-7.105,0.0,0.121,0.0311,0.00697,0.0471,0.81,125.461,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI
1,Britney Spears,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,spotify:artist:26dSoYclwsYLMAKD3tpOr4,Toxic,0.774,0.838,5.0,-3.914,0.0,0.114,0.0249,0.025,0.242,0.924,143.04,spotify:track:6I9VzXrHxO9rA9A5euc8Ak
2,Beyoncé,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,spotify:artist:6vWDO969PvNqNYHIOW5v0m,Crazy In Love,0.664,0.758,2.0,-6.583,0.0,0.21,0.00238,0.0,0.0598,0.701,99.259,spotify:track:0WqIKmW4BTrj3eJFmnCKMv
3,Justin Timberlake,spotify:track:1AWQoqb9bSvzTjaLralEkT,spotify:artist:31TPClRtHm23RisEBtV3X7,Rock Your Body,0.892,0.714,4.0,-6.055,0.0,0.141,0.201,0.000234,0.0521,0.817,100.972,spotify:track:1AWQoqb9bSvzTjaLralEkT
4,Shaggy,spotify:track:1lzr43nnXAijIGYnCT8M8H,spotify:artist:5EvFsr3kj42KNv97ZEnqij,It Wasn't Me,0.853,0.606,0.0,-4.596,1.0,0.0713,0.0561,0.0,0.313,0.654,94.759,spotify:track:1lzr43nnXAijIGYnCT8M8H


In [6]:
len(data.index)

2258242

# Using only 60% of the songs to create the model

In [7]:
percentage = 60
cut = int(len(data)*percentage/100)
print(cut)
training_data = data.iloc[0:cut]

1354945


In [8]:
training_data.head()

Unnamed: 0,artist_name,track_uri,artist_uri,track_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,uri
0,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),0.904,0.813,4.0,-7.105,0.0,0.121,0.0311,0.00697,0.0471,0.81,125.461,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI
1,Britney Spears,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,spotify:artist:26dSoYclwsYLMAKD3tpOr4,Toxic,0.774,0.838,5.0,-3.914,0.0,0.114,0.0249,0.025,0.242,0.924,143.04,spotify:track:6I9VzXrHxO9rA9A5euc8Ak
2,Beyoncé,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,spotify:artist:6vWDO969PvNqNYHIOW5v0m,Crazy In Love,0.664,0.758,2.0,-6.583,0.0,0.21,0.00238,0.0,0.0598,0.701,99.259,spotify:track:0WqIKmW4BTrj3eJFmnCKMv
3,Justin Timberlake,spotify:track:1AWQoqb9bSvzTjaLralEkT,spotify:artist:31TPClRtHm23RisEBtV3X7,Rock Your Body,0.892,0.714,4.0,-6.055,0.0,0.141,0.201,0.000234,0.0521,0.817,100.972,spotify:track:1AWQoqb9bSvzTjaLralEkT
4,Shaggy,spotify:track:1lzr43nnXAijIGYnCT8M8H,spotify:artist:5EvFsr3kj42KNv97ZEnqij,It Wasn't Me,0.853,0.606,0.0,-4.596,1.0,0.0713,0.0561,0.0,0.313,0.654,94.759,spotify:track:1lzr43nnXAijIGYnCT8M8H


In [9]:
song_features = ['danceability', 'energy', 'key', 'loudness',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo']

# Fitting the model with all song features but one 

In [10]:
# Number of cluster
true_k = 10

In [None]:
# How to drop a column from a subdataframe
# training_data[song_features].drop('danceability', axis=1)
training_song_df = training_data[song_features]
X_train_song = np.array(training_song_df)
X_train_song

for i in song_features:
    print(i)
    X_train_song = np.array(training_data[song_features].drop(i,axis=1))
    model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=100, random_state=1)
    model.fit(X_train_song)
    # save the model as a pickle file
    model_pkl_file = f"model_no_{i}.pkl"  

    with open(model_pkl_file, 'wb') as file:  
        pickle.dump(model, file)
    print(f"model with no {i} feature")

In [None]:
# Check functionality of the model
for i in song_features:
    print(i)
    with open(f"model_no_{i}.pkl", 'rb') as file:  
        loaded_model = pickle.load(file)
        print(loaded_model.labels_[0:5])
        # evaluate model 
        print(training_data[['artist_name', 'track_name']].iloc[0:5])
        y_predict = loaded_model.predict(np.array(training_data[song_features].drop(i,axis=1).iloc[0:5]))
        print(y_predict)
        file.close()

# Fitting the model with all song features 

In [None]:
X_train_all = np.array(training_data[song_features])
#print(X_train_all[0:5])
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=100, random_state=1)
model.fit(X_train_all)
# save the model as a pickle file
model_pkl_file = f"model_all.pkl"  
with open(model_pkl_file, 'wb') as file:  
    pickle.dump(model, file)
    print(f"model with all features completed")

In [None]:
with open(f"model_all.pkl", 'rb') as file:  
    loaded_model = pickle.load(file)
    print(loaded_model.labels_[0:5])
    print(training_data[['artist_name', 'track_name']].iloc[0:5])
    # evaluate model 
    y_predict = loaded_model.predict(np.array(training_data[song_features].iloc[0:5]))
    print(y_predict)
    file.close()

# Fitting the model with only one song features 

In [None]:
for i in song_features:
    print(i)
    print(training_data[i][0:5])
    X_train_song = np.array(training_data[i]).reshape(-1,1)
    #print(X_train_song[0:5])
    model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=100, random_state=1)
    model.fit(X_train_song)
    # save the model as a pickle file
    model_pkl_file = f"model_{i}.pkl"  

    with open(model_pkl_file, 'wb') as file:  
        pickle.dump(model, file)
    print(f"model with only {i} feature completed")

In [None]:
# load model from pickle file
for i in song_features:
    print(i)
    with open(f"model_{i}.pkl", 'rb') as file:  
        loaded_model = pickle.load(file)
        print(loaded_model.labels_[0:5])
        # evaluate model 
        print(training_data[['artist_name', 'track_name']].iloc[0:5])
        y_predict = loaded_model.predict(np.array(training_data[i].iloc[0:5]).reshape(-1,1))
        print(y_predict)
        file.close()


### Obtain model predictions for all songs based on models trained with one parameters

In [None]:
# def call_loaded_model(feature, data):
#     print(feature)
#     with open(f"model_{feature}.pkl", 'rb') as file:  
#         loaded_model = pickle.load(file)
#         y_predict = loaded_model.predict(np.array(single_input).reshape(-1,1))
#         file.close()
#     return y_predict

In [None]:
# data['danceability']
# data['danceability'].apply(lambda x: print(np.array(x).reshape(-1,1)))

In [20]:
# call_loaded_model('danceability', data['danceability'].iloc[i])
subset = data.iloc[0:50000]
subset

Unnamed: 0,artist_name,track_uri,artist_uri,track_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,uri
0,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),0.904,0.813,4.0,-7.105,0.0,0.1210,0.03110,0.006970,0.0471,0.810,125.461,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI
1,Britney Spears,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,spotify:artist:26dSoYclwsYLMAKD3tpOr4,Toxic,0.774,0.838,5.0,-3.914,0.0,0.1140,0.02490,0.025000,0.2420,0.924,143.040,spotify:track:6I9VzXrHxO9rA9A5euc8Ak
2,Beyoncé,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,spotify:artist:6vWDO969PvNqNYHIOW5v0m,Crazy In Love,0.664,0.758,2.0,-6.583,0.0,0.2100,0.00238,0.000000,0.0598,0.701,99.259,spotify:track:0WqIKmW4BTrj3eJFmnCKMv
3,Justin Timberlake,spotify:track:1AWQoqb9bSvzTjaLralEkT,spotify:artist:31TPClRtHm23RisEBtV3X7,Rock Your Body,0.892,0.714,4.0,-6.055,0.0,0.1410,0.20100,0.000234,0.0521,0.817,100.972,spotify:track:1AWQoqb9bSvzTjaLralEkT
4,Shaggy,spotify:track:1lzr43nnXAijIGYnCT8M8H,spotify:artist:5EvFsr3kj42KNv97ZEnqij,It Wasn't Me,0.853,0.606,0.0,-4.596,1.0,0.0713,0.05610,0.000000,0.3130,0.654,94.759,spotify:track:1lzr43nnXAijIGYnCT8M8H
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,Headhunterz,spotify:track:5QBQ6uA3sQFBSJJt8w9iat,spotify:artist:6C0KWmCdqrLU2LzzWBPbOy,Lift Me Up,0.409,0.610,7.0,-6.011,0.0,0.0447,0.06960,0.000006,0.0655,0.468,101.576,spotify:track:5QBQ6uA3sQFBSJJt8w9iat
49996,Keanu Silva,spotify:track:52DhbZEJ6lPZqJSrOPqSKo,spotify:artist:1zLMhO4zzzxt5PMV4wMS3y,Children - RMFB Remix Edit,0.222,0.814,3.0,-5.196,0.0,0.1010,0.00226,0.902000,0.2610,0.242,153.407,spotify:track:52DhbZEJ6lPZqJSrOPqSKo
49997,Myon,spotify:track:7dCmQbhItGA7JgdBUgY18J,spotify:artist:0nTbVTXLLbBA4xCtn0cFkv,Round We Go - Radio Edit,0.490,0.905,0.0,-3.928,0.0,0.0717,0.03950,0.000059,0.6860,0.210,128.016,spotify:track:7dCmQbhItGA7JgdBUgY18J
49998,Alison Wonderland,spotify:track:28QlLenBGFRjxMmKONi9x2,spotify:artist:11gWrKZMBsGQWmobv3oNfW,U Don't Know - Vincent Remix,0.414,0.767,2.0,-4.554,0.0,0.0650,0.00515,0.000000,0.0996,0.210,137.945,spotify:track:28QlLenBGFRjxMmKONi9x2


In [21]:
feature = 'danceability'
with open(f"model_{feature}.pkl", 'rb') as file:  
    loaded_model = pickle.load(file)
    subset[f"pred_{feature}"] = subset[feature].apply(lambda x: int(loaded_model.predict(np.array(x).reshape(-1,1))))
# data[f"pred_{i}"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset[f"pred_{feature}"] = subset[feature].apply(lambda x: int(loaded_model.predict(np.array(x).reshape(-1,1))))


In [23]:
subset.to_csv(f"pred_{feature}.csv")

Unnamed: 0,artist_name,track_uri,artist_uri,track_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,uri,pred_danceability
0,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),0.904,0.813,4.0,-7.105,0.0,0.1210,0.03110,0.006970,0.0471,0.810,125.461,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,7
1,Britney Spears,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,spotify:artist:26dSoYclwsYLMAKD3tpOr4,Toxic,0.774,0.838,5.0,-3.914,0.0,0.1140,0.02490,0.025000,0.2420,0.924,143.040,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,0
2,Beyoncé,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,spotify:artist:6vWDO969PvNqNYHIOW5v0m,Crazy In Love,0.664,0.758,2.0,-6.583,0.0,0.2100,0.00238,0.000000,0.0598,0.701,99.259,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,9
3,Justin Timberlake,spotify:track:1AWQoqb9bSvzTjaLralEkT,spotify:artist:31TPClRtHm23RisEBtV3X7,Rock Your Body,0.892,0.714,4.0,-6.055,0.0,0.1410,0.20100,0.000234,0.0521,0.817,100.972,spotify:track:1AWQoqb9bSvzTjaLralEkT,7
4,Shaggy,spotify:track:1lzr43nnXAijIGYnCT8M8H,spotify:artist:5EvFsr3kj42KNv97ZEnqij,It Wasn't Me,0.853,0.606,0.0,-4.596,1.0,0.0713,0.05610,0.000000,0.3130,0.654,94.759,spotify:track:1lzr43nnXAijIGYnCT8M8H,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,Headhunterz,spotify:track:5QBQ6uA3sQFBSJJt8w9iat,spotify:artist:6C0KWmCdqrLU2LzzWBPbOy,Lift Me Up,0.409,0.610,7.0,-6.011,0.0,0.0447,0.06960,0.000006,0.0655,0.468,101.576,spotify:track:5QBQ6uA3sQFBSJJt8w9iat,1
49996,Keanu Silva,spotify:track:52DhbZEJ6lPZqJSrOPqSKo,spotify:artist:1zLMhO4zzzxt5PMV4wMS3y,Children - RMFB Remix Edit,0.222,0.814,3.0,-5.196,0.0,0.1010,0.00226,0.902000,0.2610,0.242,153.407,spotify:track:52DhbZEJ6lPZqJSrOPqSKo,2
49997,Myon,spotify:track:7dCmQbhItGA7JgdBUgY18J,spotify:artist:0nTbVTXLLbBA4xCtn0cFkv,Round We Go - Radio Edit,0.490,0.905,0.0,-3.928,0.0,0.0717,0.03950,0.000059,0.6860,0.210,128.016,spotify:track:7dCmQbhItGA7JgdBUgY18J,5
49998,Alison Wonderland,spotify:track:28QlLenBGFRjxMmKONi9x2,spotify:artist:11gWrKZMBsGQWmobv3oNfW,U Don't Know - Vincent Remix,0.414,0.767,2.0,-4.554,0.0,0.0650,0.00515,0.000000,0.0996,0.210,137.945,spotify:track:28QlLenBGFRjxMmKONi9x2,1


# Artist DataFrame

In [None]:
artist_df = get_artist_dataframe(artist_path)

In [None]:
artist_df.drop('followers', axis=1, inplace=True)

In [None]:
artist_df.head()

In [None]:
artist_df[artist_df.isna().any(axis=1)]

In [None]:
print(len(artist_df.index))
artist_df = artist_df[~artist_df.isna().any(axis=1)]
print(len(artist_df))

In [None]:
artist_df.isna().any()

# Full data set SONG + ARTIST

In [None]:
full_df = data.merge(artist_df, how='left' , left_on='artist_uri', right_on='uri')

In [None]:
full_df.drop(['uri_x', 'uri_y'], axis=1, inplace=True)

In [None]:
full_df.head()

In [None]:
print('full length: ', len(full_df.index), 'data length: ', len(data.index)) 
full_df.isna().any()

In [None]:
len(full_df[full_df.isna().any(axis=1)])

In [None]:
full_df.replace('[]', np.nan, inplace=True)

In [None]:
len(full_df[full_df.isna().any(axis=1)])

In [None]:
song_artist_df = full_df[~full_df.isna().any(axis=1)]

In [None]:
song_artist_df.isna().any()

In [None]:
len(song_artist_df)

In [None]:
# ' '.join([i.replace(' ', '_') for i in song_artist_df.genres[0].strip('][').replace('\'','').split(', ')])

In [None]:
song_artist_df['genres'] = song_artist_df['genres'].apply(lambda x: ' '.join([i.replace(' ', '_') for i in x.strip('][').replace('\'','').split(', ')]))
#song_artist_df.drop('genres', axis=1, inplace=True)
# song_artist_df['genres'] = song_artist_df['genres'].apply(lambda x: ' '.join(list(set(x.split()))))
# song_artist_df['bgenres'] = song_artist_df.agenres.replace({"r\&b": "rhythm_blues"}, regex = True)
song_artist_df['genres'] = song_artist_df.genres.replace({"[^A-Za-z ]+": ""}, regex = True)
#song_artist_df.drop('agenres', axis=1, inplace=True)
song_artist_df.reset_index(inplace=True)

In [None]:
song_artist_df.drop('index', axis=1, inplace=True)

In [None]:
song_artist_df

In [None]:
np.where(song_artist_df.applymap(lambda x: x == ''))

In [None]:
print(f"There are {len(song_artist_df['genres'].unique())} unique genre combinations")

In [None]:
song_artist_df['genres'][0:5000]

# Vectorizing genre 

In [None]:
vectorizer = TfidfVectorizer(stop_words='english')

X_train = vectorizer.fit_transform(song_artist_df['genres'][0:50000])
print(X_train.shape)
# svd = TruncatedSVD(n_components=100)
# svp_X_train = svd.fit_transform(X_train)
# print(svp_X_train.shape)
# x_train_array = svp_X_train.toarray()
# for index, value in enumerate(text_X_train):
#     print(song_artist_df['genre'].iloc[index], value)

# print(x_train_array.shape)
# print(x_train_array[0][44])
# f_xtrain_nosvp = np.concatenate((song_features_array, x_train_array), axis=1)
# print('xtrain_nosvp', f_xtrain_nosvp.shape)
# f_xtrain = np.concatenate((song_features_array, text_X_train), axis=1)
# #print(f_xtrain)
# print(f_xtrain.shape)

#print(X_train)
#print(f"n_samples: {X_train.shape[0]}, n_features: {X_train.shape[1]}")

# Training a model with genre 

In [None]:
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=100, random_state=1)
model.fit(X_train)
# model.fit(svp_X_train)
# save the model as a pickle file
model_pkl_file = f"model_genre.pkl"  

with open(model_pkl_file, 'wb') as file:  
    pickle.dump(model, file)
print(f"model with genre feature completed")

In [None]:
def input_vectorization(str_input):
    #print(test_song)
    if isinstance(str_input, str):
        # It is a single string element thus we cannot use list() and have to use brackets [] to transform to list
        X_test = vectorizer.transform([str_input])
#         svd = TruncatedSVD(n_components=100)
#         svp_X_train = svd.fit_transform(X_test)
#         prediction = model.predict(X_test)
    else:
        # if is not a single string element so we can transform into a list directly using list()
        X_test = vectorizer.transform(list(str_input))
#         svd = TruncatedSVD(n_components=100)
#         svp_X_train = svd.fit_transform(X_test)
#         prediction = model.predict(X_test)
    return X_test

In [None]:
# Check functionality of the model
# print(song_artist_df['genres'].iloc[0:5])
# a = song_artist_df['genres'].iloc[0:5]
# X_test = vectorizer.transform(a)
# print(X_test.shape)
# svd = TruncatedSVD(n_components=4)
# svp_X_train = svd.fit_transform(X_test)
# print(svp_X_train.shape)
with open(f"model_genre.pkl", 'rb') as file:  
        loaded_model = pickle.load(file)
        print(loaded_model.labels_[0:5])
        # evaluate model 
        print(training_data[['artist_name', 'track_name']].iloc[0:5])
        y_predict = loaded_model.predict(input_vectorization(song_artist_df['genres'].iloc[0:1]))
        print(y_predict)
        file.close()

# Reading Json files

In [None]:
# # path to playlist file
# root_path = !pwd
# root_path = str(root_path[0])
# # filename 
# filepath = f"/home/ernek/Main/Erdos/song_recommender/playlist_data/sampledata/"
# filename = 'mpd.slice.0-999.json'
# # path + filename
# fpath_name = f"{filepath}{filename}"
# # Open file of playlist and obtain fields of json file 
# with open(fpath_name) as data_file:    
#     data = json.load(data_file)  

# # Extract keys
# num_keys_old = 0
# for index, playlist in enumerate(data['playlists']):
#     num_keys =  len(playlist.keys())
#     if index == 0:
#         num_key_old = num_keys
#         continue
#     if num_keys > num_key_old:
#         keys = playlist.keys()
#     num_key_old = num_keys
    
# # Construct keys of playlist
# keys = list(keys)
# keys.remove('tracks')
# print(" Playlist keys: ", keys)

# # Create dataframe with track and playlist info
# music_df = pd.json_normalize(data['playlists'],  meta = keys, meta_prefix = 'playlist_', errors='ignore', record_path=['tracks'], record_prefix = 'track_')
# music_df.head()

# # Check column values 
# music_df[music_df['playlist_pid'] == 0].columns

In [None]:
# Select indexes of playlists that have a description
# playlists_id_with_description = music_df[~music_df['playlist_description'].isna()]['playlist_pid'].unique()
# print(playlists_id_with_description)
# Working only with playlists that contain a "description"
# subset_df = music_df[music_df['playlist_pid'].isin(playlists_id_with_description)]
# print(subset_df['playlist_pid'].unique())

In [None]:
# num_records = len(subset_df)
# num_unique_records = len(subset_df[['track_artist_name','track_track_name']].apply(lambda x: ',.'.join(x),axis=1).unique())
# print('Number of table records: ', num_records)
# print('Number of unique artist, song pairs: ', num_unique_records )
# print('Number of repeated songs: ', num_records - num_unique_records )
# print('Number of playlists: ', len(subset_df['playlist_pid'].unique()))
# print('Number of unique artists: ', len(subset_df['track_artist_name'].unique()))

In [None]:
# # Function to select track_uri ONLY Nrecords records for now
# Nrecords = 250
# def get_features(df, Nrecords):
#     indexes = []
#     audio_features = []
#     track_popularity = []
#     artist_genre = []
#     artist_popularity = []
#     album_popularity = []
#     start = 0
#     for row_index, row in df.iloc[0:Nrecords].iterrows():
#         #print(row_index, row['track_track_uri'])
#         track_uri = row['track_track_uri']
#         artist_uri = row['track_artist_uri']
#         album_uri = row['track_album_uri']
#         if start == 0:
#             keys = spotify.audio_features(tracks=track_uri)[0].keys()
#         start += 1
    
#         track_popularity.append(spotify.track(track_uri)['popularity'])
        
#         indexes.append(row_index) 
#         audio_features.append(spotify.audio_features(tracks=track_uri)[0].values())
        
#         artist_results = spotify.artist(artist_uri)
        
#         artist_genre.append(artist_results['genres'])
#         artist_popularity.append(artist_results['popularity'])
#         #print(artist_results['genres'], artist_results['popularity'])
#         album_results = spotify.album(album_uri)
#         album_popularity.append(album_results['popularity'])
        
        
#     features_df = pd.DataFrame(audio_features, columns=keys)
#     features_df['song_popularity'] = track_popularity
#     features_df['artist_genre'] = artist_genre
#     features_df['artist_popularity']  = artist_popularity
#     features_df['album_popularity'] = album_popularity
#     features_df['index'] = indexes
#     features_df.set_index('index', inplace=True)
#     return features_df

# spotify = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials())

In [None]:
# audio_sp_df = get_features(subset_df , Nrecords)
# audio_sp_df.index.name = None
# audio_sp_df.drop(['type', 'id', 'track_href', 'analysis_url'], inplace=True, axis=1)
# audio_sp_df

In [None]:
# Parameters to use for the model
# audio_sp_df.keys()

In [None]:
# check_df = subset_df.iloc[0:Nrecords].merge(audio_sp_df, how='left' , left_on = 'track_track_uri', right_on='uri')
# check_df.drop(['track_pos', 'uri', 'mode', 'playlist_duration_ms','playlist_num_albums','playlist_num_artists',  'track_artist_uri', 'track_album_uri', 'track_duration_ms','playlist_num_followers', 'playlist_num_edits', 'playlist_collaborative', 'playlist_modified_at', 'playlist_num_tracks'], inplace = True, axis=1)
# check_df

In [None]:
# Make album popularity equal to artist popularity if album popularity is 0
# check_df['album_popularity'] = np.where(check_df['album_popularity'] == 0, check_df['artist_popularity'], check_df['album_popularity'])
# check_df['song_popularity'] = np.where(check_df['song_popularity'] == 0, check_df['artist_popularity'], check_df['song_popularity'])

In [None]:
# NLP workflow
# Need to vectorize string fields
# Use all string fields and combine them in a column containing all the words f
# from collections import Counter

In [None]:
# check_df['artist_genre']

In [None]:
#check_df['artist_genre'] = check_df['artist_genre'].apply(lambda x: ' '.join([i.replace('-', '_') for i in x]))
# check_df['artist_genre'] = check_df['artist_genre'].apply(lambda x: ' '.join([i.replace('-', '_').replace('_', '') for i in x]))

In [None]:
# check_df['artist_genre'] = check_df['artist_genre'].apply(lambda x: ' '.join(list(set(x.split()))))

In [None]:
# recheck = check.apply(lambda x: list(set(x.split())))
# print(check)
# print(recheck)
# # check_df['artist_genre'] = check_df['artist_genre'].apply(lambda x: ''.join([i.replace('', '_') for i in x]))

In [None]:
label_columns = ['track_track_name', 'artist_genre']
check_df[label_columns]

In [None]:
# Combine 5 fields into one
# string_field = check_df.track_track_name.str.cat(" " + check_df.artist_genre)
string_field = check_df.artist_genre

string_field = string_field.replace({"r\&b": "rhythm blues"}, regex = True)
string_field = string_field.replace({"[^A-Za-z ]+": ""}, regex = True)
#print('last song: ',string_field.tail())
string_field

In [None]:
test_song = string_field[0:int(len(string_field)*0.4)]
print(test_song[0])
#fraction = 0.0
#string_field = string_field[int(len(string_field)*fraction):]


In [None]:
# for i in string_field:
#     print(set(i.split()))

In [None]:
#check_df['All_text'] = check_df.track_track_name.str.cat(" " + check_df.artist_genre)
check_df['All_text'] = check_df.artist_genre
# check_df

In [None]:
# import nltk
# from nltk.stem import WordNetLemmatizer

In [None]:
# stopwords = nltk.corpus.stopwords.words('english')
# lemmatizer = WordNetLemmatizer()
# nltk.download('stopwords')

In [None]:
check_df.keys()

In [None]:
song_features = ['danceability', 'energy', 'key', 'loudness',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'song_popularity', 'artist_popularity', 'album_popularity']
df_song_features = check_df[song_features]

In [None]:
song_features_array = np.array(df_song_features)

In [None]:
song_features = ['danceability', 'energy', 'key', 'loudness',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo']
df_song_features = check_df[song_features]

In [None]:
df_song_features

In [None]:
song_features_array = np.array(df_song_features)

In [None]:
vectorizer = TfidfVectorizer(stop_words='english')

X_train = vectorizer.fit_transform(string_field)
#print(X_train[0])
x_train_array = X_train.toarray()

svd = TruncatedSVD(n_components=100)
#print(type(svd))
text_X_train = svd.fit_transform(X_train)
#print(text_X_train)
#print(text_X_train.shape)

#for index, value in enumerate(text_X_train):
#    print(string_field[index], value)

#print(text_X_train)
#print(x_train_array)
print(x_train_array.shape)
print(x_train_array[0][44])
f_xtrain_nosvp = np.concatenate((song_features_array, x_train_array), axis=1)
print('xtrain_nosvp', f_xtrain_nosvp.shape)
f_xtrain = np.concatenate((song_features_array, text_X_train), axis=1)
#print(f_xtrain)
print(f_xtrain.shape)

#print(X_train)
#print(f"n_samples: {X_train.shape[0]}, n_features: {X_train.shape[1]}")

In [None]:
print(f"{X_train.nnz / np.prod(X_train.shape):.3f}")
print(f"About {X_train.nnz / np.prod(X_train.shape)*100:.3f}\% of the entries in the matrix are non zero")

In [None]:
# Fitting the Full MATRIX of text vectors without song_features
true_k = 10

model = KMeans(n_clusters=true_k, init='k-means++', max_iter=500, n_init=100, random_state=1)
model.fit(preprocessing.normalize(X_train))
X_train_dist = model.transform(X_train)**2
print(X_train_dist.shape)
print(X_train_dist)
dist_df = pd.DataFrame(X_train_dist.sum(axis=1).round(2), columns=['sqdist'])
print(X_train_dist[0].sum())
print(X_train_dist[0].min())
dist_df['label'] = model.labels_
print(dist_df.head())
print(dist_df.shape)
#print(model.labels_)
#print(len(model.labels_))

for index, value in enumerate(model.labels_):
    print(f"STRING: {string_field[index]}", f"CLUSTER_INDEX: {value}")
    if index == 5:
        break
print('Top terms per cluster')
order_centroids = model.cluster_centers_.argsort()[:,::-1]
terms = vectorizer.get_feature_names_out()
#print(terms)
for i in range(true_k):
    print(f"Cluster {i}: ", end="")
    for ind in order_centroids[i,:10]:
        print(f"{terms[ind]} ", end="")
    print()

In [None]:
# Fitting the Full MATRIX of text vectors + song_features
true_k = 10

model = KMeans(n_clusters=true_k, init='k-means++', max_iter=500, n_init=100, random_state=1)
model.fit(preprocessing.normalize(f_xtrain_nosvp))
#print(model.labels_)
#print(len(model.labels_))

for index, value in enumerate(model.labels_):
    print(f"STRING: {string_field[index]}", f"CLUSTER_INDEX: {value}")
    if index == 5:
        break
print('Top terms per cluster')
order_centroids = model.cluster_centers_.argsort()[:,::-1]
terms = vectorizer.get_feature_names_out()
#print(terms)
for i in range(true_k):
    print(f"Cluster {i}: ", end="")
    for ind in order_centroids[i,:10]:
        print(f"{terms[ind]} ", end="")
    print()

#print(list(test_song[:1]))
#print(list(test_song)[0])

# X_test = vectorizer.transform(list(test_song))
# #print(X_test)
# print(f"n_samples: {X_test.shape[0]}, n_features: {X_test.shape[1]}")
# # feature_names = vectorizer.get_feature_names_out()
# # print(feature_names)
# prediction = model.predict(X_test)
# print(prediction)

In [None]:
# Fitting the SVD MATRIX of text vectors without song_features
true_k = 10

model = KMeans(n_clusters=true_k, init='k-means++', max_iter=500, n_init=100, random_state=1)
model.fit(preprocessing.normalize(text_X_train))
#print(model.labels_)
#print(len(model.labels_))

for index, value in enumerate(model.labels_):
    print(f"STRING: {string_field[index]}", f"CLUSTER_INDEX: {value}")
    if index == 5:
        break
print('Top terms per cluster')
order_centroids = model.cluster_centers_.argsort()[:,::-1]
terms = vectorizer.get_feature_names_out()
#print(terms)
for i in range(true_k):
    print(f"Cluster {i}: ", end="")
    for ind in order_centroids[i,:5]:
        print(f"{terms[ind]} ", end="")
    print()

In [None]:
# Fitting the SVD MATRIX of text vectors with song_features
true_k = 10

model = KMeans(n_clusters=true_k, init='k-means++', max_iter=500, n_init=100)
model.fit(preprocessing.normalize(f_xtrain))
#print(model.labels_)
#print(len(model.labels_))

for index, value in enumerate(model.labels_):
    print(f"STRING: {string_field[index]}", f"CLUSTER_INDEX: {value}")
    if index == 5:
        break
    
print('Top terms per cluster')
order_centroids = model.cluster_centers_.argsort()[:,::-1]
terms = vectorizer.get_feature_names_out()
#print(terms)
for i in range(true_k):
    print(f"Cluster {i}: ", end="")
    for ind in order_centroids[i,:5]:
        print(f"{terms[ind]} ", end="")
    print()

In [None]:
#print(list(test_song[:1]))
#print(list(test_song)[0])
X_test = vectorizer.transform(list(test_song))
#print(X_test)
print(f"n_samples: {X_test.shape[0]}, n_features: {X_test.shape[1]}")
# feature_names = vectorizer.get_feature_names_out()
# print(feature_names)
prediction = model.predict(X_test)
print(prediction)

In [None]:
print(model.labels_)
print(prediction)

In [None]:
# # Save machine learning model
# filename = 'finalized_model.sav'
# pickle.dump(model, open(filename, 'wb'))

In [None]:
# # load model only once
# with open('finalized_model.sav', 'rb') as fid:
#     model = pickle.load(fid)

In [None]:
check_df['ClusterPrediction'] = ""
check_df['All_text']

In [None]:
def cluster_predict(str_input):
    #print(test_song)
    if isinstance(str_input, str):
        # It is a single string element thus we cannot use list() and have to use brackets [] to transform to list
        X_test = vectorizer.transform([str_input])
        prediction = model.predict(X_test)
    else:
        # if is not a single string element so we can transform into a list directly using list()
        X_test = vectorizer.transform(list(str_input))
        prediction = model.predict(X_test)
    return int(prediction)

In [None]:
check_df['All_text'][0]

In [None]:
cluster_predict(check_df['All_text'][0])

In [None]:
cluster_predict([check_df['All_text'][0]])

In [None]:
result = check_df['All_text'].apply(lambda x: cluster_predict(x))
print(result)

In [None]:
# for i in check_df['All_text']:
#     pred = cluster_predict(i)
#     print(i, pred)
check_df['ClusterPrediction'] = check_df['All_text'].apply(lambda x: cluster_predict(x))

In [None]:
check_df

In [None]:
# for seed in range(5):
#     model = KMeans(
#         n_clusters=true_k,
#         max_iter=500,
#         n_init=1,
#         random_state=seed,
#     ).fit(X)
#     cluster_ids, cluster_sizes = np.unique(model.labels_, return_counts=True)
#     print(f"Number of elements asigned to each cluster: {cluster_sizes}")
# print()

In [None]:
def recommend_util(artist_name, song_name):
    
    # Predict category of input string category
    chosen_song_df = check_df.loc[(check_df['track_artist_name'] == artist_name) & (check_df['track_track_name'] == song_name)]
    str_input = chosen_song_df.track_track_name.str.cat(" " + chosen_song_df.artist_genre)
        
    prediction_inp = cluster_predict(str_input)
    prediction_inp = int(prediction_inp)
    
    temp_df = check_df.loc[check_df['ClusterPrediction'] == prediction_inp]
    new_temp_df = temp_df.sample(5)
    
    return chosen_song_df[['track_artist_name', 'track_track_name']], new_temp_df[['track_artist_name', 'track_track_name']]

In [None]:
song_choice = 2
print(check_df['track_artist_name'][song_choice], check_df['track_track_name'][song_choice])
temp_df = check_df.loc[(check_df['track_artist_name'] == check_df['track_artist_name'][0]) & (check_df['track_track_name'] == check_df['track_track_name'][0])]
string_input = temp_df.track_track_name.str.cat(" " + temp_df.artist_genre)
string_input
prediction_inp = cluster_predict(string_input)
print(prediction_inp)

In [None]:
song_choice = 1
original_song , recommended_songs = recommend_util(check_df['track_artist_name'][song_choice], check_df['track_track_name'][song_choice])
print(original_song)
print('\n')
print(recommended_songs)

In [None]:
# Example of how to pull metadata from a single track 
spotify = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials())

Row = 5 

track_uri = subset_df.iloc[Row]['track_track_uri']
artist_uri = subset_df.iloc[Row]['track_artist_uri']
album_uri = subset_df.iloc[Row]['track_album_uri']

print('ARTIST: ', subset_df.iloc[Row]['track_artist_name'])
print('ALBUM: ', subset_df.iloc[Row]['track_album_name'])
print('SONG: ', subset_df.iloc[Row]['track_track_name'])
track_popularity = spotify.track(track_uri)['popularity']
print('track_popularity: ', track_popularity)
track_results = spotify.audio_features(tracks=track_uri)
print('audio_features: ', track_results[0])
artist_results = spotify.artist(artist_uri)
print('artist_genre: ', artist_results['genres'])
print('artist_popularity: ', artist_results['popularity'])
artist_albums_results = spotify.artist_albums(artist_uri)
#print(artist_albums_results)
album_results = spotify.album(album_uri)
print('album_popularity: ',album_results['popularity'])
print('album_keys: ', album_results.keys())
# print(artist_results['genres'])
# print(artist_results['popularity'])

In [None]:
# Work only with important features
important_features = ['track_artist_name', 'track_track_name', 'playlist_name', 'playlist_duration_ms', 'playlist_description']

In [None]:
artist_count_df = subset_df[['playlist_name','track_artist_name']].groupby(['playlist_name'], sort=False).value_counts(sort=False).reset_index(name="artist_count")

In [None]:
artist_count_df

In [None]:
subset_df[['playlist_name', 'track_artist_name']]

In [None]:
subset_df[['playlist_name','track_artist_name']].groupby(['playlist_name'], sort = False).value_counts(sort=False).reset_index(name="artist_count").info()

In [None]:
music_df = pd.merge(subset_df, artist_count_df,  how='left', on = ['playlist_name','track_artist_name'])

In [None]:
music_df

In [None]:
# subset_df.join(.sum(), on='playlist_name', rsuffix='_count')
subset_df.groupby(['playlist_name','track_artist_name']).size().unstack(fill_value=0)

In [None]:
music_df[['track_artist_name','track_track_name']].apply(lambda x: ',.'.join(x),axis=1)

In [None]:
important_features.append("artist_count")

In [None]:
music_df[important_features]

In [None]:
music_df['artist_fraction'] =  music_df['artist_count'] / music_df['playlist_num_tracks'] 
music_df

In [None]:
subset_df.groupby(['playlist_name','track_artist_name']).size().unstack(fill_value=0)

In [None]:
music_df[music_df['playlist_num_tracks'] == Ntracks][['track_artist_name','track_track_name']].apply(lambda x: ',.'.join(x),axis=1).value_counts() 

In [None]:
vc = music_df[music_df['playlist_num_tracks'] == Ntracks][['track_artist_name','track_track_name']].apply(lambda x: ',.'.join(x), axis=1).value_counts()
vc[vc > 1].sum() - len(vc[vc > 1])

In [None]:
# for i in df[['track_artist_name','track_track_name']].apply(lambda x: ',.'.join(x),axis=1).unique():
# #     print(i.split(',.'))
#     if len(i.split(',.')) == 2:
#         print(i)

In [None]:
np.unique(music_df[music_df['playlist_num_tracks'] == Ntracks]['track_track_name'])

In [None]:
important_features = ['track_artist_name', 'track_track_name', 'playlist_name', 'playlist_num_tracks', 'playlist_num_albums', 'playlist_duration_ms', 'playlist_num_artists']

In [None]:
music_df[music_df['playlist_num_tracks'] == 20][['track_artist_name', 'track_track_name', 'playlist_name', 'playlist_num_tracks', 'playlist_num_albums', 'playlist_duration_ms', 'playlist_num_artists', 'playlist_description']]

In [None]:
music_df[music_df['playlist_num_tracks'] == Ntracks][important_features].groupby('playlist_name')['track_artist_name'].value_counts()