In [1]:
# Switch the working directory to the project's src/ folder before importing
# project code; presumably this is what makes the d00_utils / d01_data_processing
# imports below resolve -- TODO confirm.
import os 
os.chdir('../src/')
print("Current working directory is now: ", os.getcwd())

import pandas as pd 
import numpy as np
from IPython.display import display

from d00_utils.load_confs import load_credentials, load_paths
from d00_utils.data_loader_sql import DataLoaderSQL
# NOTE(review): later cells call compute_track_features, compute_artists_features
# and compute_features_all, and reference genre_map / num_tracks, none of which are
# assigned in the visible cells; presumably they all come from this star import.
# Consider importing them by name so the dependency is explicit.
from d01_data_processing.compute_features import * 

# Credentials and path settings from the project's config helpers; both are
# handed to DataLoaderSQL when the loader is constructed.
creds = load_credentials()
paths = load_paths()

Current working directory is now:  C:\Users\Caroline Wang\OneDrive\Duke\Senior Year\CS316\Project\src


In this notebook we will create functions to pull data from the VM database and process that data into features.

## Queried Data Into Features 

### Compute Features for One Individual

In [2]:
# Loader object through which the cells below issue their SQL queries.
dl = DataLoaderSQL(creds=creds, paths=paths)

# Listener used for the single-individual feature computation.
listener_id = "ninjakichi"

In [6]:
# Print the display_name of every row in the listeners table.
# NOTE(review): the rendered output contains real user names -- consider clearing
# it before sharing the notebook.
listener_rows = np.array(dl.select_from_table("select display_name from listeners"))
for name in listener_rows:
    # each row is a 1-element array holding the display_name value
    print(name[0])


Caroline Wang
ninjakichi
Elise Brown
1214600613
Martha
bnativi17


#### Track related features 

In [6]:
# Track attributes for the tracks that appear in this listener's long_term
# top tracks. `listener_id` is interpolated directly into the SQL text, which is
# only acceptable because it is a constant assigned in this notebook -- never
# route untrusted input through this f-string.
track_cmd = f"""
        select track_pop, acousticness, danceability, energy, valence, 
        loudness, tempo, instrumentalness, speechiness, 
        mode, time_signature, liveness
        from tracks as t1
        where t1.track_id in (
                     select t2.track_id 
                      from toptracks as t2 
                      where listener_id='{listener_id}' and time_span='long_term')
            """

track_info = dl.select_from_table(sql=track_cmd)
# Coerce every selected column to a numeric dtype; values that cannot be parsed
# become NaN (errors='coerce') instead of raising.
cols = track_info.columns
track_info[cols] = track_info[cols].apply(pd.to_numeric, errors='coerce')
track_info.head()

Unnamed: 0,track_pop,acousticness,danceability,energy,valence,loudness,tempo,instrumentalness,speechiness,mode,time_signature,liveness
0,23,0.675,0.197,0.32,0.108,-16.696,83.893,0.748,0.0387,1,4,0.803
1,73,0.228,0.653,0.816,0.816,-4.353,178.086,0.0,0.167,1,4,0.0967
2,83,0.257,0.671,0.373,0.732,-18.064,92.717,7.9e-05,0.0323,1,4,0.0481
3,73,0.0901,0.446,0.952,0.624,-5.321,125.303,0.0857,0.0523,1,4,0.112
4,74,0.649,0.571,0.307,0.108,-10.958,83.72,0.0,0.0499,0,4,0.178


In [7]:
# Compute this listener's track features from the rows queried above and show them.
features_tracks_dict = compute_track_features(track_info)
features_tracks_dict

{'avg_acousticness': 0.3849086253539318,
 'avg_danceability': 0.49208783113630666,
 'avg_energy': 0.5548612411256385,
 'avg_valence': 0.45680829315132543,
 'avg_loudness': -11.53396334195145,
 'avg_tempo': 112.33350984716522,
 'avg_instrumentalness': 0.6190045290597034,
 'avg_speechiness': 0.0626400337980007,
 'avg_time_signature': 3.803718044328438,
 'avg_liveness': 0.167641725741742,
 '%_major': 0.65}

#### Artist related features

In [8]:
# Genre rows (one per artist/genre pair) plus popularity and follower counts for
# the artists in this listener's long_term top artists.
# The listener is taken from `listener_id` rather than a hard-coded literal, so
# the track and artist queries always describe the same person. As with the
# track query, the value is interpolated into the SQL text and must stay a
# notebook-controlled constant.
artist_cmd = f"""
            select * 
            from artisthasgenre 
            natural join 
                (select a1.artist_id, a1.artist_pop, a1.num_followers 
                 from artists as a1 
                 where a1.artist_id in 
                    (select a2.artist_id 
                     from topartists as a2
                     where listener_id='{listener_id}' and time_span='long_term' ))
                 as artist_info
            """
artist_info = dl.select_from_table(sql=artist_cmd)
artist_info.head()

Unnamed: 0,artist_id,genre_name,artist_pop,num_followers
0,1HY2Jd0NmPuamShAr6KMms,dance pop,87,11076948
1,1HY2Jd0NmPuamShAr6KMms,pop,87,11076948
2,1HY2Jd0NmPuamShAr6KMms,post-teen pop,87,11076948
3,3dRfiJ2650SZu6GbydcHNb,soundtrack,75,849382
4,1hCkSJcXREhrodeIHQdav8,german soundtrack,74,396012


In [9]:
# NOTE(review): `genre_map` and `num_tracks` are not assigned in any visible cell;
# presumably they arrive via `from d01_data_processing.compute_features import *`
# -- TODO confirm, otherwise this cell fails on a fresh kernel.
features_artists_dict = compute_artists_features(
    artist_info=artist_info,
    genre_map=genre_map,
    num_tracks=num_tracks,
)
features_artists_dict

{'is_alternative': 0.1,
 'is_experimental': 0.0,
 'is_country': 0.0,
 'is_rock': 0.05,
 'is_metal': 0.05,
 'is_hip_hop': 0.0,
 'is_pop': 0.25,
 'is_r&b_soul': 0.0,
 'is_jazz': 0.0,
 'is_christian': 0.0,
 'is_latin': 0.0,
 'is_soundtrack': 0.6,
 'is_classical': 0.2,
 'is_contemporary': 0.0,
 'is_vocal': 0.0,
 'is_edm': 0.05,
 'is_chill': 0.0,
 'avg_artist_pop': 0.5884615384615385}

### Compute features for all individuals

In [10]:
# Same track attributes as the single-listener query, but for all listeners:
# `tracks` is natural-joined to the long_term rows of `toptracks`, and
# listener_id is kept in the select list (presumably so the rows can be grouped
# per listener downstream -- TODO confirm).
# NOTE: this rebinds `track_cmd`, replacing the single-listener query text.
track_cmd = """
          select toptrack_info.listener_id, 
               track_pop, acousticness, 
               danceability, energy, valence, 
               loudness, tempo, 
               instrumentalness, speechiness, 
               mode, time_signature, liveness
          from tracks as t1
          natural join 
            (select t2.track_id, t2.listener_id
             from toptracks as t2
             where time_span='long_term'
            )
          as toptrack_info
         """

# NOTE(review): unlike the single-listener cell, the result is not passed through
# pd.to_numeric here; presumably compute_features_all handles dtypes -- verify.
top_track_info = dl.select_from_table(sql=track_cmd)

# All-listener counterpart of the single-listener artist query: `artists` is
# natural-joined to the long_term rows of `topartists` (keeping listener_id), and
# that result is natural-joined to `artisthasgenre`.
# NOTE: this rebinds `artist_cmd`, replacing the single-listener query text.
artist_cmd = """
           select * 
           from artisthasgenre 
           natural join 
              (select topartist_info.listener_id, a1.artist_id, a1.artist_pop, a1.num_followers 
               from artists as a1
               natural join 
                  (select topart.listener_id, topart.artist_id
                   from topartists as topart 
                   where time_span='long_term')
               as topartist_info) 
           as artist_info
           """

top_artist_info = dl.select_from_table(sql=artist_cmd)

In [11]:
# Build the combined track + artist feature table from the all-listener queries
# (the rendered output has six rows, presumably one per listener) and display it.
# NOTE(review): this call emits pandas SettingWithCopyWarning from inside the
# project's feature code (assignments to `track_info['inv_pop']` and
# `artist_info[f"is_{genre}"]`); the fix belongs in that module, not here.
all_listener_features = compute_features_all(
    top_track_info=top_track_info,
    top_artist_info=top_artist_info,
)
all_listener_features

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  track_info['inv_pop'] = compute_inv_pop(track_info['track_pop'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  artist_info[f"is_{genre}"] = logical_or_bool


Unnamed: 0,avg_acousticness,avg_danceability,avg_energy,avg_valence,avg_loudness,avg_tempo,avg_instrumentalness,avg_speechiness,avg_time_signature,avg_liveness,...,is_jazz,is_christian,is_latin,is_soundtrack,is_classical,is_contemporary,is_vocal,is_edm,is_chill,avg_artist_pop
0,0.524948,0.632793,0.496907,0.420923,-9.222899,101.786011,0.044895,0.160565,3.965484,0.18336,...,0.1,0.0,0.0,0.0,0.0,0.2,0.0,0.5,0.0,0.7654
1,0.384909,0.492088,0.554861,0.456808,-11.533963,112.33351,0.619005,0.06264,3.803718,0.167642,...,0.0,0.0,0.0,0.6,0.2,0.0,0.0,0.05,0.0,0.588462
2,0.463738,0.513752,0.543231,0.426426,-10.188628,112.664519,0.002306,0.170076,3.940402,0.250314,...,0.0,0.25,0.0,0.05,0.0,0.05,0.0,0.25,0.0,0.741905
3,0.024226,0.58355,0.889773,0.288178,-4.829371,139.831849,0.285106,0.096484,4.0,0.359377,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.65,0.0,0.686832
4,0.481755,0.428288,0.449242,0.310486,-11.866537,105.15272,0.2826,0.07026,3.607879,0.172975,...,0.0,0.0,0.0,0.0,0.25,0.05,0.0,0.3,0.0,0.705
5,0.063211,0.658945,0.781255,0.460777,-4.950297,123.898813,0.009605,0.063384,3.971549,0.214984,...,0.0,0.0,0.05,0.0,0.0,0.05,0.0,0.65,0.0,0.800595
