# Spotify Data

## data collection

* Use Spotipy to get Spotify public playlists data
* For each playlist, we get our response variable: # of followers
* For predictors, we pick the following attributes:
    * the number of tracks in the playlist
    * Audio Features
        * Popularity
        * Acousticness
        * Danceability
        * Instrumentalness
        * Liveness
        * Loudness
        * Valence
    * Artists Information 
        * Genre
        * Popularity
    * Available markets: if a playlist can reach a wider range of markets, it might get more followers

In total, we are getting 18 features for now.

In [181]:
import pandas as pd 
import numpy as np
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Authenticate against the Spotify Web API with the client-credentials flow.
# NOTE(review): client_id / client_secret are not defined in the cells shown
# here -- presumably they are set in a cell kept out of the export; confirm.
client_credentials_manager = SpotifyClientCredentials(client_id, client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

# Page through the playlists of the 'spotify' user and build one summary dict
# per playlist: id, name, follower count, reported track count, song names,
# number of distinct available markets and mean/std of track popularity.
playlists = sp.user_playlists('spotify')
playlists_data = []
while playlists:
    for i, playlist in enumerate(playlists['items']):
        playlist_data = {}
        playlist_data["songs"] = []

        # Expected URI shape is "spotify:user:<owner>:playlist:<id>". Taking
        # the last field for the id also tolerates a shorter URI without the
        # owner segment, in which case the owner falls back to 'spotify'.
        uri_parts = playlist["uri"].split(':')
        playlist_id = uri_parts[-1]
        username = uri_parts[2] if len(uri_parts) >= 5 else 'spotify'

        # The listing entry is fetched again in full to read the follower count.
        results = sp.user_playlist(username, playlist_id)

        playlist_data["playlist_id"] = playlist_id
        playlist_data["followers"] = results['followers']['total']
        playlist_data["tracks_num"] = playlist['tracks']['total']
        playlist_data["name"] = playlist["name"]

        # Progress output; `i` restarts at 0 on every page of playlists.
        if i % 100 == 0:
            print(i, playlist_data["name"])

        tracks = sp.user_playlist_tracks(username, playlist_id)
        markets = set()
        pop = []
        # Iterate over the items actually returned by this single call (no
        # paging with sp.next) instead of indexing up to the reported total,
        # which can exceed the number of items in the response.
        for item in tracks["items"]:
            track = (item or {}).get("track")
            if not track:
                # An entry without track data has nothing to summarise.
                continue

            name = track.get("name")
            if name is not None:
                playlist_data["songs"].append(name)

            # Accumulate the union of markets across all tracks.
            markets.update(track.get("available_markets") or [])

            popularity = track.get("popularity")
            if popularity is not None:
                pop.append(popularity)

        playlist_data["avaiable_market"] = len(markets)
        if pop:
            playlist_data["pop_mean"] = np.mean(pop)
            playlist_data["pop_std"] = np.std(pop)
        else:
            # No usable popularity values: keep the playlist, mark as missing.
            playlist_data["pop_mean"] = np.nan
            playlist_data["pop_std"] = np.nan

        playlists_data.append(playlist_data)

    # Follow the pagination link until the listing is exhausted.
    playlists = sp.next(playlists) if playlists['next'] else None

In [None]:
# def audio_features(self, tracks=[]):
#     """ Get audio features for one or multiple tracks based upon their Spotify IDs
#     Parameters:
#     - tracks - a list of track URIs, URLs or IDs, maximum: 50 ids
#     """
#     if isinstance(tracks, str):
#         trackid = self._get_id('track', tracks)
#         results = self._get('audio-features/?ids=' + trackid)
#     else:
#         tlist = [self._get_id('track', t) for t in tracks]
#         results = self._get('audio-features/?ids=' + ','.join(tlist))
#     # the response has changed, look for the new style first, and if
#     # its not there, fallback on the old style
#     if 'audio_features' in results:
#         return results['audio_features']
#     else:
#         return results

In [None]:
# Enrich every track of the playlists in pl_more[start:end] in place with the
# result of sp.audio_features for the track and the result of an artist
# search for its first listed artist.
start = 0
end = 1700
for idx, playlist in enumerate(pl_more[start:end], start=start):
    for track in playlist['tracks']:
        if track is None:
            continue
        try:
            track_audio = sp.audio_features(track['id'])
            # Check the container before indexing into it, so a missing
            # response does not abort the artist lookup below.
            if track_audio and track_audio[0] is not None:
                track.update(track_audio[0])

            artists = track['artists']
            if isinstance(artists, list):
                artist_name = artists[0]['name']
            elif isinstance(artists, dict):
                # NOTE(review): presumably this is a track whose 'artists'
                # entry was already replaced by a search response on an
                # earlier run -- confirm.
                artist_name = artists["items"][0]['name']
            else:
                print(idx, "no artist")
                artist_name = None

            if artist_name is not None:
                track_artist = sp.search(q='artist:' + artist_name, type='artist')
                if track_artist:
                    # Merges the search response's top-level keys into the track.
                    track.update(track_artist)
        except Exception as exc:
            # Best effort: report the failing playlist and carry on. Catching
            # Exception (not a bare except) keeps Ctrl-C able to stop the run.
            print("!!", idx, repr(exc))
    print(idx)

## Store the data as JSON and read it back

In [2]:
import json
# with open("playlists_data.json","w") as js:
#     json.dump(playlists_data,js)
# playlists_dataframe = pd.DataFrame.from_dict(playlists_data)
# playlists_dataframe.to_json("playlists_df.json")

In [3]:
# playlists_dataframe= pd.read_json("playlists_df.json")
# Load the contents of data_1700.json into playlists_data.
with open('data_1700.json', mode='r') as js:
    playlists_data = json.load(js)

## Feature Engineering

In [2]:
import json
# Parse data_1700.json into pl_1700, the input of the feature-engineering cell.
with open('data_1700.json', mode='r') as js:
    pl_1700 = json.loads(js.read())

In [1]:
from collections import Counter


def _mean_std(values):
    """Return ``(mean, std)`` of *values*, or ``(nan, nan)`` when it is empty."""
    if not values:
        return np.nan, np.nan
    return np.mean(values), np.std(values)


# (column prefix in the output, key read from each track dict), listed in the
# order in which the columns are emitted.
_TRACK_FEATURES = (
    ("pop", "popularity"),
    ("acous", "acousticness"),
    ("dance", "danceability"),
    ("energy", "energy"),
    ("instru", "instrumentalness"),
    ("live", "liveness"),
    ("loud", "loudness"),
    ("valence", "valence"),
)


def _summarize_playlist(pl):
    """Collapse one playlist record into a flat dict of features.

    Parameters:
        pl: dict with a "list" entry (playlist metadata: name, followers,
            tracks total) and a "tracks" entry (list of track dicts carrying
            the per-track feature keys and an "artists" -> "items" list).

    Returns:
        dict with name, followers, tracks_num, avaiable_market (number of
        distinct markets over all tracks), genre (most frequent genre of the
        first artist of each track, "unknown" if there is none) and a
        ``<prefix>_mean`` / ``<prefix>_std`` pair for artist popularity and
        for every entry of ``_TRACK_FEATURES``.

    Raises:
        KeyError, IndexError, TypeError: when a record or one of its tracks
            lacks an expected field.
    """
    playlist_data = {
        "name": pl["list"]["name"],
        "followers": pl["list"]["followers"]["total"],
        "tracks_num": pl["list"]["tracks"]["total"],
    }

    markets = set()
    genres = []
    artist_pop = []
    per_feature = {prefix: [] for prefix, _ in _TRACK_FEATURES}

    # Walk the tracks actually stored rather than range(tracks_num): the
    # reported total can exceed the stored list, and indexing past its end
    # used to discard the whole playlist.
    for track in pl["tracks"]:
        if track is None:
            continue
        markets.update(track["available_markets"])

        lead_artist = track["artists"]["items"][0]
        genres.extend(lead_artist["genres"])
        artist_pop.append(lead_artist["popularity"])

        for prefix, key in _TRACK_FEATURES:
            per_feature[prefix].append(track[key])

    playlist_data["avaiable_market"] = len(markets)

    # A playlist without any genre information keeps a placeholder instead of
    # failing on most_common()[0]; it matches none of the genre keywords used
    # later in the notebook.
    if genres:
        playlist_data["genre"] = Counter(genres).most_common()[0][0]
    else:
        playlist_data["genre"] = "unknown"

    mean, std = _mean_std(artist_pop)
    playlist_data["artist_pop_mean"] = mean
    playlist_data["artist_pop_std"] = std

    for prefix, _ in _TRACK_FEATURES:
        mean, std = _mean_std(per_feature[prefix])
        playlist_data[prefix + "_mean"] = mean
        playlist_data[prefix + "_std"] = std

    return playlist_data


# Build one feature dict per playlist; malformed records are skipped, but
# reported, so that a systematic problem cannot silently empty the data set.
playlists_data = []
for pl in pl_1700:
    try:
        playlists_data.append(_summarize_playlist(pl))
    except (KeyError, IndexError, TypeError) as exc:
        print("skipping playlist:", repr(exc))

In [None]:
# One row per playlist, one column per key of the feature dicts.
playlists_dataframe = pd.DataFrame.from_dict(playlists_data, orient="columns")

In [16]:
# playlists_dataframe = pd.read_json("playlists_dataframe_904_with_name.json")

In [17]:
# Ordered (substring, label) rules. They are applied top to bottom, so when a
# genre matches several substrings the LAST matching rule decides the label.
_GENRE_RULES = [
    ('pop', "pop"),
    ('metal', "metal"),
    ('jazz', "jazz"),
    ('rock', "rock"),
    ('punk', "punk"),
    ('hop', "hiphop"),
    ('new', "modern"),
    ('post', "modern"),
    ('modern', "modern"),
    ('classical', "classical"),
    ('soul', "soul"),
    ('blues', "blues"),
    ('core', "hardcore"),
    ('children', "children"),
    ('dance', "dance"),
    ('rap', "rap"),
    ('drama', "drama"),
    ('funk', "funk"),
    ('edm', "electronic"),
    ('electronic', "electronic"),
    ('ambient', "healing"),
    ('environmental', "healing"),
    ('healing', "healing"),
    ('hollywood', "hollywood"),
]

# Work on a copy so playlists_dataframe itself is left untouched.
df = playlists_dataframe.copy()
df["genre_narrow"] = ""
for _substring, _label in _GENRE_RULES:
    df.loc[df["genre"].str.contains(_substring), "genre_narrow"] = _label

# Anything no rule matched falls into the catch-all bucket.
df.loc[df["genre_narrow"] == "", "genre_narrow"] = "other"

In [18]:
# The fine-grained genre is superseded by genre_narrow, which is then expanded
# into one indicator column per narrowed genre.
df = df.drop(columns=["genre"])
df_with_dummies = pd.get_dummies(data=df, columns=["genre_narrow"])

In [23]:
# Persist the one-hot encoded feature table as JSON.
df_with_dummies.to_json(path_or_buf="playlists_dataframe_904_with_name_dummy.json")

In [9]:
# df.to_csv("playlists_dataframe_841_narrow_genre.csv")

## Data Description (Feature table)

Now we have 1700 playlists with 21 features. The following table gives the details of each feature. (For more analysis, see the EDA page.)


| Key                             | Value Type   | Value Description                                                                                                                                                                                                                                                                                                                                                                                                    | Feature Engineering                                                                                                                 |
|---------------------------------|--------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------|
| followers                       | int          | Number of Followers: from 0 to inf, indicate how many people follow this playlist.                                                                                                                                                                                                                                                                                                                                   | Probably need to do some log transformation later.                                                                                  |
| acous_mean  acous_std           | float  float | Acousticness: A confidence measure from 0.0 to 1.0 of whether the track is acoustic. 1.0 represents high confidence the track is acoustic.                                                                                                                                                                                                                                                                           | Get the mean/standard deviation of acousticness for all track in the playlist.                                                      |
| artist_pop_mean  artist_pop_std | float  float | Popularity (Artist): How popular this artist is and measured from 0.0 to 100.0. 100.0 represents most popular.                                                                                                                                                                                                                                                                                                       | Get the mean/standard deviation of popularity for all artists in each track inside one playlist.                                    |
| avaiable_market                 | int          | The number of available markets: how many markets can download/listen to at least one track of this playlist. | Take the union of the available-market lists of all tracks in the playlist and count the distinct markets. |
| dance_mean  dance_std           | float  float | Danceability describes how suitable a track is for dancing based on a combination of musical elements including tempo, rhythm stability, beat strength, and overall regularity. A value of 0.0 is least danceable and 1.0 is most danceable.                                                                                                                                                                         | Get the mean/standard deviation of danceability for all track inside one playlist.                                                  |
| energy_mean  energy_std         | float  float | Energy is a measure from 0.0 to 1.0 and represents a perceptual measure of intensity and activity. Typically, energetic tracks feel fast, loud, and noisy. For example, death metal has high energy, while a Bach prelude scores low on the scale. Perceptual features contributing to this attribute include dynamic range, perceived loudness, timbre, onset rate, and general entropy.                            | Get the mean/standard deviation of energy for all track inside one playlist.                                                        |
| instru_mean  instru_std         | float  float | Instrumentalness: Predicts whether a track contains no vocals. "Ooh" and "aah" sounds are treated as instrumental in this context. Rap or spoken word tracks are clearly "vocal". The closer the instrumentalness value is to 1.0, the greater likelihood the track contains no vocal content. Values above 0.5 are intended to represent instrumental tracks, but confidence is higher as the value approaches 1.0. | Get the mean/standard deviation of instrumentalness for all track inside one playlist.                                              |
| live_mean  live_std             | float  float | Liveness: Detects the presence of an audience in the recording. Higher liveness values represent an increased probability that the track was performed live. A value above 0.8 provides strong likelihood that the track is live.                                                                                                                                                                                    | Get the mean/standard deviation of liveness for all track inside one playlist.                                                      |
| loud_mean  loud_std             | float  float | Loudness: The overall loudness of a track in decibels (dB). Loudness values are averaged across the entire track and are useful for comparing relative loudness of tracks. Loudness is the quality of a sound that is the primary psychological correlate of physical strength (amplitude). Values typical range between -60 and 0 db.                                                                               | Get the mean/standard deviation of loudness for all track inside one playlist.                                                      |
| pop_mean  pop_std               | float  float | Popularity (track): How popular this track is, measured from 0.0 to 100.0. 100.0 represents the most popular. | Get the mean/standard deviation of popularity for all tracks inside one playlist. Need to check the collinearity with the artist popularity. |
| tracks_num                      | int          | How many tracks are inside this playlist. From 0 to inf.                                                                                                                                                                                                                                                                                                                                                             |                                                                                                                                     |
| valence_mean   valence_std      | float  float | Valence: A measure from 0.0 to 1.0 describing the musical positiveness conveyed by a track. Tracks with high valence sound more positive (e.g. happy, cheerful, euphoric), while tracks with low valence sound more negative (e.g. sad, depressed, angry).                                                                                                                                                           | Get the mean/standard deviation of valence for all track inside one playlist.                                                       |
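| genre_narrow                    | string       | Genre: From over 130 genres, narrow down to 20 general genres. For example: "pop", "rap", "classical", etc. | Take the most common artist genre among the tracks of the playlist, map it to a general genre by keyword matching ("other" if nothing matches), then one-hot encode it. |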



