In [1]:
import os 
os.chdir('../src/')
print("Current working directory is now: ", os.getcwd())

import pandas as pd 
import numpy as np
from pandas.io.json import json_normalize
from IPython.display import display

from d00_utils.load_confs import load_credentials, load_paths
from d00_utils.data_loader_sql import DataLoaderSQL
from d01_data_processing.data_cleaning import *
from d01_data_processing.spotify_user import SpotifyUser

creds = load_credentials()
paths = load_paths()

Current working directory is now:  C:\Users\Caroline Wang\OneDrive\Duke\Senior Year\CS316\Project\src


In this notebook we will create functions to pull data from the VM database and process the data into features 

## Query Data From VM Database Using Python

1. Need a SQL query that returns the top tracks for a FIXED listener_id, time_span
For these top tracks, need a SQL query that returns the release date of each track (associated with the album of the track), popularity of each track, audio features of each track (i.e., energy level, danceability, speechiness, instrumentalness, valence)

Get toptracks info from tracks table. Still need to get toptracks genre from the AlbumHasGenre table. 
```
 select track_pop, acousticness, danceability, energy, valence, 
        loudness, tempo, instrumentalness, speechiness, 
        mode, time_signature, liveness
        from tracks as t1
        where t1.track_id in (
                     select t2.track_id 
                      from toptracks as t2 
                      where listener_id='{listener_id}'and time_span='long_term')                  
```


2. Need a SQL query that returns the top artists for a FIXED listener_id, time_span
For these top artists, need a SQL query that returns the genres of each artist, the popularity of each artist, number followers of each artist

Get topartists info from tracks table. Still need to get topartists genre info from the ArtistHasGenre table 
```
select * 
from artisthasgenre 
natural join 
(select a1.artist_id, a1.artist_pop, a1.num_followers 
from artists as a1 
where a1.artist_id in 
(select a2.artist_id 
 from topartists as a2
 where listener_id='ninjakichi' and time_span='long_term' ))
as artist_info
```

In [2]:
dl = DataLoaderSQL(creds=creds,
                   paths=paths)

listener_id = "ninjakichi"
num_tracks = 20


## Queried Data Into Features 

#### Track related features 

1. Avg energy level, avg danceability, avg speechiness, avg instrumentalness, avg valence of your top tracks from x-term songs, weighted by track_pop, a number from 0 - 100 (100 being most popular)
       - Energy level, acousticness, danceability, speechiness, instrumentalness, liveness, loudness, speechiness, valence, are from 0 -1
2. Tempo: maxed out at around 220 bpm
3. Mode: 1 is major, 0 is minor 
       - Percent of songs that are major

In [39]:

track_cmd = f"""
        select track_pop, acousticness, danceability, energy, valence, 
        loudness, tempo, instrumentalness, speechiness, 
        mode, time_signature, liveness
        from tracks as t1
        where t1.track_id in (
                     select t2.track_id 
                      from toptracks as t2 
                      where listener_id='{listener_id}'and time_span='long_term')                  
            """

track_info = dl.select_from_table(sql=track_cmd)
cols = track_info.columns
track_info[cols] = track_info[cols].apply(pd.to_numeric, errors='coerce')
track_info.head()

Unnamed: 0,track_pop,acousticness,danceability,energy,valence,loudness,tempo,instrumentalness,speechiness,mode,time_signature,liveness
0,23,0.675,0.197,0.32,0.108,-16.696,83.893,0.748,0.0387,1,4,0.803
1,73,0.228,0.653,0.816,0.816,-4.353,178.086,0.0,0.167,1,4,0.0967
2,83,0.257,0.671,0.373,0.732,-18.064,92.717,7.9e-05,0.0323,1,4,0.0481
3,73,0.0901,0.446,0.952,0.624,-5.321,125.303,0.0857,0.0523,1,4,0.112
4,74,0.649,0.571,0.307,0.108,-10.958,83.72,0.0,0.0499,0,4,0.178


In [56]:
# compute average weighted by the idf of track_pop 
def compute_inv_pop(series):
    '''vec is a pd series of integers from 0 to 100'''
    return np.log(100/series).replace({np.inf: 5.7})
    
def compute_weighted_avg(series, weights):
    '''compute the weighted avg of series'''
    return np.average(series, weights=weights)
    
track_info['inv_pop'] = compute_inv_pop(track_info['track_pop'])
compute_weighted_avg(track_info['loudness'], track_info['inv_pop'])

-11.53396334195145

In [143]:
# computed avg weighted by the track inverse popularity 
col_list= ['acousticness', 'danceability', 'energy', 
            'valence', 'loudness', 'tempo', 'instrumentalness',
            'speechiness', 'time_signature', 'liveness'] 

features_tracks_dict = {f"avg_{col}": compute_weighted_avg(series=track_info[col],
                                                    weights=track_info['inv_pop']) 
                 for col in col_list }
# features_dict
features_tracks_dict["%_major"] = np.sum(track_info['mode'])/num_tracks
features_tracks_dict

{'avg_acousticness': 0.3849086253539318,
 'avg_danceability': 0.49208783113630666,
 'avg_energy': 0.5548612411256385,
 'avg_valence': 0.45680829315132543,
 'avg_loudness': -11.53396334195145,
 'avg_tempo': 112.33350984716522,
 'avg_instrumentalness': 0.6190045290597034,
 'avg_speechiness': 0.0626400337980007,
 'avg_time_signature': 3.803718044328438,
 'avg_liveness': 0.167641725741742,
 '%_major': 0.65}

#### Album related features 

1. % of artists you listen to that fall into well-defined subcategories 
2. Avg popularity of your top artists (artist_pop)


In [151]:
artist_cmd = f"""
            select * 
            from artisthasgenre 
            natural join 
                (select a1.artist_id, a1.artist_pop, a1.num_followers 
                 from artists as a1 
                 where a1.artist_id in 
                    (select a2.artist_id 
                     from topartists as a2
                     where listener_id='ninjakichi' and time_span='long_term' ))
                 as artist_info
            """
artist_info = dl.select_from_table(sql=artist_cmd)
artist_info.head()

Unnamed: 0,artist_id,genre_name,artist_pop,num_followers
0,1HY2Jd0NmPuamShAr6KMms,dance pop,87,11076948
1,1HY2Jd0NmPuamShAr6KMms,pop,87,11076948
2,1HY2Jd0NmPuamShAr6KMms,post-teen pop,87,11076948
3,3dRfiJ2650SZu6GbydcHNb,soundtrack,75,849382
4,1hCkSJcXREhrodeIHQdav8,german soundtrack,74,396012


In [152]:
genre_map = {
                # new
                'alternative': 'alternative', 
                'indie': 'alternative', 
                'experimental': 'experimental', 
                'avant': 'experimental', 
                
                # typical gentres 
                'country': 'country', 
                'folk': 'country', 
                'rock': 'rock', 
                'punk': 'rock', 
                'metal': 'metal',
                'rap': 'hip_hop', 
                'hip': 'hip_hop', 
                'hop': 'hip_hop',
                'trap': 'hip_hop', 
                'pop': 'pop', 
                
                # r&b, african-american insp. 
                'r&b': 'r&b_soul', 
                'soul': 'r&b_soul', 
                'funk': 'r&b_soul',
                'afro': 'jazz', 
                'jazz': 'jazz',
                
                # religious 
                'gospel': 'christian', 
                'christian': 'christian', 

                # ethnic
                'reggae': 'latin', 
                'latin': 'latin', 
                 
                # soundtrack 
                'soundtrack': 'soundtrack',
                'video': 'soundtrack', 
                'score': 'soundtrack',
                'tunes': 'soundtrack', 
                'cartoon': 'soundtrack', 
                'anime': 'soundtrack', 
                'otacore': 'soundtrack',
                
                # classical type / vocal
                'classical': 'classical', 
                'orchestra': 'classical',                
                'chamber': 'classical', 
                'contemporary': 'contemporary', 
                # vocal
                'capella': 'vocal',
                'choir': 'vocal',
                'alto': 'vocal',
                'soprano': 'vocal', 
                'vocal': 'vocal',
                
                # electronic 
                'edm': 'edm', 
                'rave': 'edm', 
                'house': 'edm', 
                'tech': 'edm',
                'room': 'edm',
                'step': 'edm', 
                'dance': 'edm', 
                'electronic': 'edm',
                'trance': 'edm',
                
                # misc chill : 
                'lo-fi': 'chill',
                'meditation': 'chill', 
                'drone': 'chill', 
                'focus': 'chill', 
                'zen': 'chill'
               }

# flip genre mapping 
inv_genre_map = {}
for k, v in genre_map.items():
    inv_genre_map[v] = inv_genre_map.get(v, [])
    inv_genre_map[v].append(k)

In [154]:
genre_str_arr = np.array(artist_info['genre_name'], dtype=str)

for genre, subgenres in inv_genre_map.items(): 
    # test for subgenre in each entry of genre_str_arr
    subgenre_bool_filters = []
    for subgen in subgenres: 
        subgenre_bool_filters.append((np.core.defchararray.find(genre_str_arr, 
                                                                subgen) != -1))
    # elementwise logical or on all subgenre boolean arrays 
    logical_or_bool = np.logical_or.reduce(subgenre_bool_filters)
    artist_info[f"is_{genre}"] = logical_or_bool
    
artist_genres = artist_info.copy()
del artist_genres['genre_name']
artist_genres = artist_genres.groupby(['artist_id', 'artist_pop', 'num_followers']).any()

features_artists_dict = (artist_genres.reset_index().drop(['artist_id', 'artist_pop', 'num_followers'], axis=1).sum()/num_tracks).to_dict()
# compute avg artist popularity and rescale
features_artists_dict['avg_artist_pop'] = np.mean(artist_info['artist_pop'])/100

features_artists_dict

{'is_alternative': 0.1,
 'is_experimental': 0.0,
 'is_country': 0.0,
 'is_rock': 0.05,
 'is_metal': 0.05,
 'is_hip_hop': 0.0,
 'is_pop': 0.25,
 'is_r&b_soul': 0.0,
 'is_jazz': 0.0,
 'is_christian': 0.0,
 'is_latin': 0.0,
 'is_soundtrack': 0.6,
 'is_classical': 0.2,
 'is_contemporary': 0.0,
 'is_vocal': 0.0,
 'is_edm': 0.05,
 'is_chill': 0.0,
 'avg_artist_pop': 0.5884615384615385}

In [155]:
features_dict = {**features_artists_dict, **features_tracks_dict}
features_dict

{'is_alternative': 0.1,
 'is_experimental': 0.0,
 'is_country': 0.0,
 'is_rock': 0.05,
 'is_metal': 0.05,
 'is_hip_hop': 0.0,
 'is_pop': 0.25,
 'is_r&b_soul': 0.0,
 'is_jazz': 0.0,
 'is_christian': 0.0,
 'is_latin': 0.0,
 'is_soundtrack': 0.6,
 'is_classical': 0.2,
 'is_contemporary': 0.0,
 'is_vocal': 0.0,
 'is_edm': 0.05,
 'is_chill': 0.0,
 'avg_artist_pop': 0.5884615384615385,
 'avg_acousticness': 0.3849086253539318,
 'avg_danceability': 0.49208783113630666,
 'avg_energy': 0.5548612411256385,
 'avg_valence': 0.45680829315132543,
 'avg_loudness': -11.53396334195145,
 'avg_tempo': 112.33350984716522,
 'avg_instrumentalness': 0.6190045290597034,
 'avg_speechiness': 0.0626400337980007,
 'avg_time_signature': 3.803718044328438,
 'avg_liveness': 0.167641725741742,
 '%_major': 0.65}