In [2]:
import os
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
import spotipy
import sqlite3
from sklearn.cluster import KMeans
from sqlite3 import Error
import seaborn as sns
import matplotlib.pyplot as plt
from spotipy.oauth2 import SpotifyClientCredentials
import time
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from openTSNE import TSNE as openTSNE
from wordcloud import WordCloud
from wordcloud import STOPWORDS as STOPWORDS

In [3]:
def connect_db(db_file):
    conn = None
    try:
        conn = sqlite3.connect(db_file)
    except Error as e:
        print(e)
    return conn

In [4]:
# connect to database
# conn = connect_db('data.db')
# cur = conn.cursor()

In [5]:
# Getting the target data
# Extract playlists that have the input words in their titles to use as training data
# input_words = ['summer', 'beach', 'throwbacks']
# sql_select = '''SELECT AVG(danceability), AVG(energy), AVG(key), AVG(loudness), 
# AVG(mode), AVG(speechiness), AVG(acousticness), AVG(instrumentalness), AVG(liveness), 
# AVG(valence), AVG(tempo), AVG(duration_ms), AVG(time_signature) FROM avg_features_by_playlist WHERE'''
def create_train(input_words, cur):
    sql_select = '''SELECT * FROM avg_features_by_playlist WHERE'''
    for w in range(len(input_words)):
        sql_select += " name LIKE '%" + input_words[w] + "%'"
        if w != len(input_words) -1:
            sql_select += " OR" 
    pl_train = cur.execute(sql_select).fetchall()
    return pl_train

In [6]:
def get_x_data(pl_train, cur):
    # Get X data (average playlist features)
    data_cols = ['pid', 'name', 'danceability','energy','key','loudness','mode','speechiness','acousticness','instrumentalness','liveness','valence','tempo','duration_ms','time_signature']
    feature_cols = ['danceability','energy','key','loudness','mode','speechiness','acousticness','instrumentalness','liveness','valence','tempo','duration_ms','time_signature']
    test_cols = ['pid', 'name', 'danceability','energy','key','loudness','mode','speechiness','acousticness','instrumentalness','liveness','valence','tempo','duration_ms','time_signature']

    data_rows = cur.execute('select * from avg_features_by_playlist where pid > (select MIN(pid) from avg_features_by_playlist) order by pid').fetchall()


    # test_pl = cur.execute('''select tracks_in_playlist.pid, tracks_in_playlist.track_uri, danceability, energy, key, loudness, mode, speechiness, acousticness, instrumentalness, liveness, valence, tempo, duration_ms, time_signature 
    # from features_by_track left join tracks_in_playlist on
    # features_by_track.track_uri = tracks_in_playlist.track_uri 
    # where pid = (select MIN(pid) from avg_features_by_playlist)''').fetchall()


    test_data = pd.DataFrame(pl_train, columns=test_cols)
    data = pd.DataFrame(data_rows, columns=data_cols)

    return test_data, test_cols, data, data_cols, feature_cols
    # print(test_data.head())
    # print(data.head(5))

In [7]:
# conn.close()

In [8]:
# Scale the data

def scale_data(test_data, data, feature_cols):
    y = test_data[feature_cols].mean()

    scaler = StandardScaler(with_mean=True, with_std=True).fit(data[feature_cols].values)

    scaled_x = scaler.transform(data[feature_cols].values)
    scaled_y = scaler.transform(np.array(y).reshape(1,-1))
    scaled_features = pd.DataFrame(scaled_x)
    tsne = openTSNE(perplexity=30, metric='euclidean', n_jobs=-1, random_state=0, verbose=False)
    tsne_transformer = tsne.fit(scaled_x)
    data_df = pd.DataFrame(tsne_transformer.transform(scaled_x), columns =['X', 'Y'])

    #print(data_df, scaled_x, scaled_y, scaled_features)
    return data_df, scaled_x, scaled_y, scaled_features

In [9]:
def draw_scatterplot(data_df):
    sns.scatterplot(x='X', y='Y', data=data_df, legend=None)

In [10]:
def calculate_num_clusters(scaled_x):
    #TODO:
    # needs to be tested and K updated below
    #calculate how many K clusters there should be 
    wcss = [] 
    for number_of_clusters in range(1, 30): 
        kmeans = KMeans(n_clusters = number_of_clusters, random_state = 42)
        kmeans.fit(scaled_x) 
        wcss.append(kmeans.inertia_)
    wcss

    ks = range(1, 30)
    plt.plot(ks, wcss)
    plt.axvline(4, linestyle='--', color='r')

In [11]:
def kmeans_init(data_df, scaled_x, scaled_y, n_clusters):
    # initialize KMeans
    
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    clusters = kmeans.fit(scaled_x)
    labels = clusters.labels_
    data_df['cluster'] = pd.Categorical(labels)
    
    target_cluster = kmeans.predict(scaled_y)
    return (target_cluster, labels, data_df['cluster'])



In [12]:
def draw_colored_scatterplot(data_df):
    sns.scatterplot(x='X', y='Y', hue='cluster', style='cluster', data=data_df, legend=None)
    plt.show()

In [13]:
def wordclouds(data_df, data, n_clusters):
    data_df["playlist_name"] = data['name'].str.lower()
    # display(data_df)

    # we want to perform a pivot on data_df so that each cluster number is a column with row value equal to the playlist name.
    # from there we can sum up that column to get the whole lsit of strings of playlist names for each cluster (column)
    original_df = data_df.pivot(index='X', columns='cluster')['playlist_name'].reset_index()
    original_df.columns.name = None
    original_df = original_df.fillna('')
    # original_df.head(20)
    # print(original_df.columns)

    #list of words to ignore
    stop_words = STOPWORDS.update(["i", "it", "me", "my", "that", "the", "of", "than", "then", 
    "when", "if", "a", "there", "playlist", "music", "song", "songs", "to", "too", "get", "as", "this", 
    "am", "is", "are", "has", "and", "aa", "aaa", 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'])

    wordclouds = {}
    for i in range(n_clusters):
        original_df.replace(np.nan, '')
        original_df[i] = original_df[i].astype(str) #.sum(skipna=True))
        wordclouds["wordcloud" + str(i)] = WordCloud(stopwords=stop_words).generate(' '.join(original_df[i]))

    for val in wordclouds.values():
        plt.imshow(val)
        plt.axis("off")
        plt.show()

In [43]:
def get_target_cluster_songs(target_cluster, data_df, data):
    data_df["pid"] = data['pid']
    original_df = data_df.pivot(columns='cluster', values='pid')
    target_cluster_df = original_df[[target_cluster]].dropna()
    conn = connect_db('data.db')
    cur = conn.cursor()
    
    print(target_cluster_df)
    exit()
    num_of_ors = 0
    
    sql = '''SELECT features_by_track.* FROM features_by_track join tracks_in_playlist on features_by_track.track_uri = tracks_in_playlist.track_uri
        WHERE '''
    
    result = []
    
    for pid in target_cluster_df[target_cluster].values:
        sql += "tracks_in_playlist.pid = " + str(int(pid))
        sql += " OR "
        num_of_ors += 1
        if (num_of_ors == 100):
            sql = sql[:-3]
            cols = ["track_uri",
                "danceability",
                "energy",
                "key",
                "loudness",
                "mode",
                "speechiness",
                "acousticness",
                "instrumentalness",
                "liveness",
                "valence",
                "tempo",
                "duration_ms",
                "time_signature"]
            result.append(cur.execute(sql).fetchall())
            sql = '''SELECT features_by_track.* FROM features_by_track join tracks_in_playlist on features_by_track.track_uri = tracks_in_playlist.track_uri
            WHERE '''
            num_of_ors = 0
        
    conn.close()
    tracks = pd.DataFrame(result, columns = cols)
    tracks = tracks[tracks['track_uri'].map(tracks['track_uri'].value_counts()) > 3]
    
    tracks['counts'] = tracks.groupby(['track_uri'])['time_signature'].transform('count')
    
    tracks = tracks.drop_duplicates(subset=['track_uri'])
    song_instances = tracks['counts'].sum()

    
    #num_unique_songs = len(pd.unique(tracks['track_uri']))
    print(song_instances, " song instances fetched")
    print(len(tracks), " unique songs fetched")
    print(tracks.head(5))
    return tracks

In [15]:
'''
generate_playlist

Parameters:
input_words: (String) to pass to get_target_cluster_songs
obscurity: (float) int
max_song_length: (int) max_length of any song in the playlist
use_minutes: (bool) use minutes instead of number of songs for playlist length (default false)
playlist_length: number of songs in the playlist. If use_minutes=true, minutes in the playlist
'''

def generate_playlist(tracks, obscurity=1, max_song_length=10, use_minutes=False, playlist_length=10):
    playlist = []
    
    
    # obscurity
    # length




In [16]:
# pipeline
def pipeline():
    # connect to database
    conn = connect_db('data.db')
    cur = conn.cursor()
    input_words = ["beach","sun"]
    pl_train = create_train(input_words, cur)
    test_data, test_cols, data, data_cols, feature_cols= get_x_data(pl_train, cur)
    conn.close()
    data_df, scaled_x, scaled_y, scaled_features = scale_data(test_data, data, feature_cols)
    #draw_scatterplot(data_df) 
    #calculate_num_clusters(scaled_x) # unfinished
    n_clusters = 20 # manually set after looking at calculate_num_clusters
    target_cluster = kmeans_init(data_df, scaled_x, scaled_y, n_clusters)
    #wordclouds(data_df, data, n_clusters)
    # tracks = get_target_cluster_songs(target_cluster, data_df, data)

    #generate_playlist(tracks)

In [17]:
# connect to database
conn = connect_db('data.db')
cur = conn.cursor()
input_words = ["beach","sun"]
pl_train = create_train(input_words, cur)
test_data, test_cols, data, data_cols, feature_cols= get_x_data(pl_train, cur)
conn.close()
data_df, scaled_x, scaled_y, scaled_features = scale_data(test_data, data, feature_cols)
#draw_scatterplot(data_df) 
#calculate_num_clusters(scaled_x) # unfinished
n_clusters = 20 # manually set after looking at calculate_num_clusters
target_cluster, labels, clusters = kmeans_init(data_df, scaled_x, scaled_y, n_clusters)


In [44]:
target_cluster_songs = get_target_cluster_songs(target_cluster[0], data_df, data)

cluster        19
8             9.0
32           33.0
53           54.0
62           63.0
82           83.0
...           ...
999940   999941.0
999944   999945.0
999966   999967.0
999977   999978.0
999979   999980.0

[71109 rows x 1 columns]


KeyboardInterrupt: 

: 

In [18]:
print(target_cluster)
draw_colored_scatterplot(target_cluster)
#wordclouds(data_df, data, n_clusters)
# tracks = get_target_cluster_songs(target_cluster, data_df, data)

#generate_playlist(tracks)

[19]


ValueError: Could not interpret value `X` for parameter `x`