In [None]:
import os
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
import spotipy
import sqlite3
from sklearn.cluster import KMeans
from sqlite3 import Error
import seaborn as sns
import matplotlib.pyplot as plt
from spotipy.oauth2 import SpotifyOAuth
from spotipy.oauth2 import SpotifyClientCredentials
import time
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from openTSNE import TSNE as openTSNE
from wordcloud import WordCloud
from wordcloud import STOPWORDS as STOPWORDS

In [None]:
def connect_db(db_file):
    """Open a SQLite connection to ``db_file``.

    Returns the ``sqlite3.Connection`` on success. If ``sqlite3`` raises an
    ``Error`` while connecting, the exception is printed and ``None`` is
    returned instead, so callers must be prepared for a ``None`` result.
    """
    try:
        return sqlite3.connect(db_file)
    except Error as exc:
        # Best-effort: report the problem and let the caller decide what to do.
        print(exc)
        return None

In [None]:
# connect to database
conn = connect_db('data.db')
if conn is None:
    # connect_db prints the sqlite3 error and returns None on failure; stop
    # here with a clear message instead of an AttributeError on None.cursor().
    raise RuntimeError("Could not open 'data.db'; see the sqlite3 error printed above")
cur = conn.cursor()

In [None]:
# Getting the target data
# Extract playlists that have the input words in their titles to use as training data
input_words = ['summer', 'beach', 'throwbacks']
if not input_words:
    # Without at least one word the WHERE clause would be empty (invalid SQL).
    raise ValueError("input_words must contain at least one word")

# One "name LIKE ?" term per word, OR-ed together. The words are bound as
# parameters so quotes or other special characters in a word cannot break
# (or inject into) the statement.
like_terms = " OR ".join(["name LIKE ?"] * len(input_words))
sql_select = "SELECT * FROM avg_features_by_playlist WHERE " + like_terms
like_patterns = ['%' + word + '%' for word in input_words]
pl_train = cur.execute(sql_select, like_patterns).fetchall()

In [None]:
# Get X data (average playlist features)
# Every row of avg_features_by_playlist is read as two identifier columns
# followed by the averaged audio features, so the three column lists share
# a single definition of the feature names.
feature_cols = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'duration_ms', 'time_signature',
]
data_cols = ['pid', 'name'] + feature_cols
test_cols = list(data_cols)

# Corpus: every playlist except the one with the smallest pid.
corpus_sql = 'select * from avg_features_by_playlist where pid > (select MIN(pid) from avg_features_by_playlist) order by pid'
data_rows = cur.execute(corpus_sql).fetchall()

# Earlier per-track variant of the held-out query, kept for reference:
# test_pl = cur.execute('''select tracks_in_playlist.pid, tracks_in_playlist.track_uri, danceability, energy, key, loudness, mode, speechiness, acousticness, instrumentalness, liveness, valence, tempo, duration_ms, time_signature 
# from features_by_track left join tracks_in_playlist on
# features_by_track.track_uri = tracks_in_playlist.track_uri 
# where pid = (select MIN(pid) from avg_features_by_playlist)''').fetchall()

# test_data holds the title-matched playlists (pl_train); data holds the corpus.
data = pd.DataFrame(data_rows, columns=data_cols)
test_data = pd.DataFrame(pl_train, columns=test_cols)

print(test_data.head())
print(data.head())

In [None]:
# Done with the notebook-level connection; the helper functions further down
# open (and close) their own connections through connect_db.
conn.close()

In [None]:
# Scale the data
if test_data.empty:
    # An empty target set would make every feature mean NaN and poison
    # everything downstream (scaling, cluster prediction).
    raise ValueError("No playlist title matched input_words; cannot build a target vector")

# Target vector: mean of every audio feature over the title-matched playlists.
y = test_data[feature_cols].mean()

# Fit the scaler on the corpus only, then express the target in the same
# standardized space.
scaler = StandardScaler(with_mean=True, with_std=True).fit(data[feature_cols].values)

scaled_x = scaler.transform(data[feature_cols].values)
scaled_y = scaler.transform(np.array(y).reshape(1,-1))
scaled_features = pd.DataFrame(scaled_x)

tsne = openTSNE(perplexity=30, metric='euclidean', n_jobs=-1, random_state=0, verbose=False)
# fit() already returns the optimized 2-D embedding of scaled_x (an ndarray
# subclass). Calling .transform(scaled_x) on it would instead re-embed the
# training rows as if they were *new* points, which is slower and gives a
# degraded copy of the layout, so the fitted embedding is used directly.
tsne_transformer = tsne.fit(scaled_x)
data_df = pd.DataFrame(np.asarray(tsne_transformer), columns =['X', 'Y'])

print(data_df)

In [None]:
# 2-D t-SNE layout of all playlists, before any cluster labels are attached.
sns.scatterplot(data=data_df, x='X', y='Y', legend=None)

In [None]:
# TODO: the elbow estimate below still needs to be validated, and the K used
# when KMeans is initialized updated to match it.
# Elbow method: record the within-cluster sum of squares (inertia) for each
# candidate number of clusters.
ks = range(1, 30)
wcss = []
for k in ks:
    kmeans = KMeans(random_state=42, n_clusters=k).fit(scaled_x)
    wcss.append(kmeans.inertia_)

plt.plot(ks, wcss)
# Dashed marker at the currently assumed elbow position.
plt.axvline(4, linestyle='--', color='r')

In [None]:
# initialize KMeans
n_clusters = 20
kmeans = KMeans(random_state=0, n_clusters=n_clusters)

# fit() returns the estimator itself, so `clusters` and `kmeans` name the
# same fitted model.
clusters = kmeans.fit(scaled_x)
labels = clusters.labels_

# Attach each playlist's cluster id to its t-SNE coordinates.
data_df['cluster'] = pd.Categorical(labels)

# Cluster that the (scaled) target vector falls into; a one-element array.
target_cluster = kmeans.predict(scaled_y)
print(target_cluster)

In [None]:
# Same t-SNE layout, now coloured and marked by k-means cluster.
sns.scatterplot(data=data_df, x='X', y='Y', hue='cluster', style='cluster', legend=None)
plt.show()

In [None]:
data_df["playlist_name"] = data['name'].str.lower()

# Pivot data_df so that every cluster id becomes a column whose cells hold the
# playlist name (rows are keyed by the X coordinate). Joining one column then
# yields the whole list of playlist names of that cluster as a single string.
# Only 'playlist_name' is pivoted; pivoting every remaining column and then
# selecting 'playlist_name' gives the same frame with more work.
original_df = data_df.pivot(index='X', columns='cluster', values='playlist_name').reset_index()
original_df.columns.name = None
# Cells of playlists that belong to a different cluster are NaN after the pivot.
original_df = original_df.fillna('')

# list of words to ignore
extra_stop_words = ["i", "it", "me", "my", "that", "the", "of", "than", "then", 
"when", "if", "a", "there", "playlist", "music", "song", "songs", "to", "too", "get", "as", "this", 
"am", "is", "are", "has", "and", "aa", "aaa", 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 
'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
# set.update() returns None and mutates the library-wide STOPWORDS set, so
# build a private copy instead and pass that to WordCloud explicitly.
stop_words = set(STOPWORDS).union(extra_stop_words)

# One word cloud per cluster, keyed "wordcloud<cluster id>".
wordclouds = {}
for i in range(n_clusters):
    cluster_text = ' '.join(original_df[i].astype(str))
    wordclouds["wordcloud" + str(i)] = WordCloud(stopwords=stop_words).generate(cluster_text)

for val in wordclouds.values():
    plt.imshow(val)
    plt.axis("off")
    plt.show()

In [None]:
def get_target_cluster_songs(target_cluster, min_occurrences=3, db_file='data.db'):
    """Fetch the audio features of tracks that recur in one cluster's playlists.

    Parameters
    ----------
    target_cluster : int
        Cluster id to select, compared against ``data_df['cluster']``.
    min_occurrences : int, optional
        A track is kept only if it appears in the fetched rows strictly more
        than this many times (default 3, the original hard-coded threshold).
    db_file : str, optional
        SQLite database to read from (default ``'data.db'``).

    Returns
    -------
    pandas.DataFrame
        One row per kept track: ``track_uri``, the audio-feature columns and
        ``counts`` (number of fetched rows for that track with a non-null
        ``time_signature``). The index is reset to 0..n-1 so the frame lines
        up positionally with arrays derived from it.

    Raises
    ------
    RuntimeError
        If the database cannot be opened.
    """
    # Side effect kept from the original: expose each playlist's pid next to
    # its embedding in the notebook-level data_df.
    data_df["pid"] = data['pid']
    in_cluster = data_df['cluster'] == target_cluster
    pids = [int(pid) for pid in data_df.loc[in_cluster, 'pid'].dropna()]

    cols = ["track_uri",
        "danceability",
        "energy",
        "key",
        "loudness",
        "mode",
        "speechiness",
        "acousticness",
        "instrumentalness",
        "liveness",
        "valence",
        "tempo",
        "duration_ms",
        "time_signature"]

    sql_prefix = ('SELECT features_by_track.* FROM features_by_track '
                  'JOIN tracks_in_playlist ON features_by_track.track_uri = tracks_in_playlist.track_uri '
                  'WHERE tracks_in_playlist.pid IN (')
    # Query in batches of bound parameters: a single long "pid = .. OR .."
    # chain can exceed SQLite's expression-depth limit, and one statement may
    # only carry a limited number of bound variables.
    batch_size = 900
    rows = []
    if pids:
        conn = connect_db(db_file)
        if conn is None:
            raise RuntimeError("Could not open database " + repr(db_file))
        try:
            cur = conn.cursor()
            for start in range(0, len(pids), batch_size):
                batch = pids[start:start + batch_size]
                placeholders = ",".join("?" * len(batch))
                rows.extend(cur.execute(sql_prefix + placeholders + ")", batch).fetchall())
        finally:
            # Always release the connection, even if a query fails.
            conn.close()

    tracks = pd.DataFrame(rows, columns = cols)
    if tracks.empty:
        tracks = tracks.assign(counts=pd.Series(dtype='int64'))
    else:
        # Keep only tracks that occur often enough across the cluster's playlists.
        occurrences = tracks['track_uri'].map(tracks['track_uri'].value_counts())
        # .copy() so the column added below is written to an independent frame
        # rather than to a slice of the unfiltered one.
        tracks = tracks[occurrences > min_occurrences].copy()
        tracks['counts'] = tracks.groupby(['track_uri'])['time_signature'].transform('count')
        # Drop the repeated rows and renumber: a gappy index would misalign
        # any later index-based assignment from this frame.
        tracks = tracks.drop_duplicates(subset=['track_uri']).reset_index(drop=True)

    song_instances = tracks['counts'].sum()

    print(song_instances, " song instances fetched")
    print(len(tracks), " unique songs fetched")
    print(tracks.head(5))
    return tracks
    

In [None]:
# kmeans.predict was given a single row (scaled_y), so target_cluster is a
# one-element array; unwrap it to a scalar cluster id.
target_cluster_songs = get_target_cluster_songs(target_cluster[0])

In [None]:
# Display the track table returned by get_target_cluster_songs.
target_cluster_songs

In [None]:
# Song-level features = the playlist-level features plus the per-track
# occurrence count. Built as a new list instead of appending to feature_cols:
# appending made this cell non-idempotent and broke re-running the playlist
# scaling cell (the playlist frame has no 'counts' column). The filter also
# tolerates a feature_cols that already contains 'counts'.
song_feature_cols = [col for col in feature_cols if col != 'counts'] + ['counts']
song_features = target_cluster_songs[song_feature_cols].values

scaler_song = StandardScaler(with_mean=True, with_std=True).fit(song_features)

scaled_song_x = scaler_song.transform(song_features)
scaled_song_features = pd.DataFrame(scaled_song_x)
tsne_song = openTSNE(perplexity=30, metric='euclidean', n_jobs=-1, random_state=0, verbose=False)
# fit() already returns the optimized embedding of its input; .transform() is
# meant for embedding *new* points into an existing layout, so re-transforming
# the training rows is slower and yields a degraded copy.
tsne_transformer_song = tsne_song.fit(scaled_song_x)
song_data_df = pd.DataFrame(np.asarray(tsne_transformer_song), columns =['X', 'Y'])



In [None]:
def get_song_target_cluster_songs(target_cluster):
    """Look up the tracks that fall into one song-level cluster.

    Reads the notebook-level ``song_data_df`` (which must carry a ``cluster``
    column) and ``target_cluster_songs``, and queries the ``tracks`` table of
    ``data.db`` for the selected URIs.

    Parameters
    ----------
    target_cluster : int
        Cluster id to select, compared against ``song_data_df['cluster']``.

    Returns
    -------
    numpy.ndarray
        The distinct ``track_uri`` values found in the ``tracks`` table.

    Raises
    ------
    KeyError
        If ``song_data_df`` has no ``cluster`` column.
    RuntimeError
        If the database cannot be opened.
    """
    if 'cluster' not in song_data_df.columns:
        raise KeyError("song_data_df has no 'cluster' column; assign song-level "
                       "cluster labels before calling get_song_target_cluster_songs")

    # Copy the URIs by position (.values): song_data_df has a fresh 0..n-1
    # index while target_cluster_songs may not, and an index-aligned
    # assignment would leave most URIs NaN or attached to the wrong row.
    song_data_df["track_uri"] = target_cluster_songs['track_uri'].values
    in_cluster = song_data_df['cluster'] == target_cluster
    uris = song_data_df.loc[in_cluster, 'track_uri'].dropna().tolist()

    cols = ["track_name", "track_uri"]
    sql_prefix = "SELECT track_name, track_uri FROM tracks WHERE track_uri IN ("
    # Bound parameters in batches: avoids hand-quoting the URIs and keeps each
    # statement within SQLite's expression-depth and bound-variable limits.
    batch_size = 900
    rows = []
    if uris:
        conn = connect_db('data.db')
        if conn is None:
            raise RuntimeError("Could not open database 'data.db'")
        try:
            cur = conn.cursor()
            for start in range(0, len(uris), batch_size):
                batch = uris[start:start + batch_size]
                placeholders = ",".join("?" * len(batch))
                rows.extend(cur.execute(sql_prefix + placeholders + ")", batch).fetchall())
        finally:
            # Always release the connection, even if a query fails.
            conn.close()

    tracks = pd.DataFrame(rows, columns = cols)

    unique_songs = pd.unique(tracks['track_uri'])
    print(len(unique_songs), " songs fetched")
    print(unique_songs)
    return unique_songs


In [None]:
def connect_to_spotify():
    """Create a Spotify client that authenticates with client credentials.

    The client id and secret are read from the ``SPOTIPY_CLIENT_ID`` and
    ``SPOTIPY_CLIENT_SECRET`` environment variables; they must never be
    written into the notebook. ``SPOTIPY_REDIRECT_URI`` is given a local
    default when it is not already set.

    Returns
    -------
    spotipy.Spotify
        Client backed by ``SpotifyClientCredentials``.

    Raises
    ------
    RuntimeError
        If either credential variable is missing or empty.
    """
    required = ("SPOTIPY_CLIENT_ID", "SPOTIPY_CLIENT_SECRET")
    missing = [name for name in required if not os.environ.get(name)]
    if missing:
        raise RuntimeError(
            "Missing Spotify credentials; set " + ", ".join(missing) + " in the environment"
        )
    # Respect a redirect URI that is already configured.
    os.environ.setdefault('SPOTIPY_REDIRECT_URI', "http://127.0.0.1:8080")

    # SpotifyClientCredentials() picks the id/secret up from the environment.
    sp = spotipy.Spotify(client_credentials_manager = SpotifyClientCredentials())

    return sp

In [None]:
# Client-credentials Spotify client. The playlist-upload cell at the end of
# the notebook rebinds `sp` to a client built with SpotifyOAuth.
sp = connect_to_spotify()

In [None]:
import math


def generate_playlist(song_df, center_x, center_y, ids):
    """Rank songs by Euclidean distance from a centre point in the X/Y plane.

    Parameters
    ----------
    song_df : pandas.DataFrame
        Must contain numeric ``X`` and ``Y`` columns. Any other column
        (e.g. ``track_uri``) is ignored, and ``song_df`` is not modified.
    center_x, center_y : float
        Coordinates of the reference point.
    ids : list-like
        One song id per row of ``song_df``; a plain list is attached by
        position.

    Returns
    -------
    pandas.DataFrame
        Column ``0`` holds the distance and column ``'id'`` the song id,
        sorted by ascending distance.
    """
    # Only X and Y are needed. The original dropped 'track_uri' first, which
    # raised KeyError whenever that column had not been added to the frame.
    dx = song_df['X'] - center_x
    dy = song_df['Y'] - center_y

    # Distance between the centre and every point.
    distances = pd.DataFrame({0: np.sqrt(dx ** 2 + dy ** 2)})

    # Attach the song ids, then order from closest to farthest.
    distances['id'] = ids
    closest_song_ids = distances.sort_values(by=0)
    return closest_song_ids



In [None]:
import spotipy.util as util

# Spotify expects full track URIs, i.e. ids prefixed with "spotify:track:".
URI_PREFIX = "spotify:track:"
# Upper bound on the number of tracks pushed to the new playlist.
MAX_TRACKS = 99

user_id = "1a0f1b9085db4f49"
username = "31y7j5k3jeidd5rzhaznsdregg34"
scope = "playlist-modify-public"

# Playlist title = the search words separated by single spaces (no trailing space).
playlistName = " ".join(input_words)

results = target_cluster_songs['track_uri'].values.tolist()

# NOTE(review): cluster_centers_ lives in the standardized playlist *feature*
# space, so the two values passed below are the first two feature coordinates
# of the k-means centre, while song_data_df holds t-SNE coordinates of songs.
# The distance ranking therefore compares two different spaces; confirm the
# intended reference point before relying on the ordering.
centers = clusters.cluster_centers_[target_cluster]

results = generate_playlist(song_data_df, centers[:, 0][0], centers[:, 1][0], results)['id'].tolist()

# Take the closest tracks; slicing copes with fewer than MAX_TRACKS results
# (indexing 99 times raised IndexError), and the prefix is only added when the
# stored id does not already carry it.
songs_to_add = [
    uri if uri.startswith(URI_PREFIX) else URI_PREFIX + uri
    for uri in results[:MAX_TRACKS]
]

# A SpotifyOAuth object is always truthy, so the former "if token / else"
# check could never reach its else branch; authentication problems surface
# as exceptions from the API calls below.
token = SpotifyOAuth(scope = scope, username = username)
sp = spotipy.Spotify(auth_manager=token)
if songs_to_add:
    # Use the id returned by the create call rather than assuming the new
    # playlist is the first entry of the user's playlist listing.
    created = sp.user_playlist_create(user = username, name = playlistName)
    playlist = created['id']
    sp.user_playlist_add_tracks(user = username, playlist_id= playlist, tracks=songs_to_add)
else:
    print("No songs to add; playlist not created")
