# Data preprocessing

## Importing libraries

In [1]:
import json
from copy import deepcopy
import numpy as np
import pandas as pd
import collections

import warnings
warnings.filterwarnings('ignore')

## Helper functions used to preprocess the data

In [2]:
def load_data(file):
    """Load and parse a JSON file.

    Parameters
    ----------
    file : str or path-like
        Path of the JSON file to read.

    Returns
    -------
    object
        The decoded JSON document, exactly as produced by ``json.load``.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file content is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, so non-ASCII text (e.g. artist or track names) is read the
    # same way on every machine.
    with open(file, 'r', encoding='utf-8') as fd:
        return json.load(fd)
    
def create_playlist_dataframe(tracks_db, playlists):
    """Attach per-track feature lists to a deep copy of every playlist.

    For each playlist, one new key ``'track_<feature>'`` is created per
    feature name below, holding the values of that feature for the
    playlist's tracks that are present in ``tracks_db``.

    Parameters
    ----------
    tracks_db : dict
        Maps a track id to a dict of that track's features.
    playlists : list of dict
        Playlists; each must have a ``'track_ids'`` entry. Not modified.

    Returns
    -------
    list of dict
        Copies of the playlists, extended with the ``'track_*'`` lists.

    Notes
    -----
    Track ids absent from ``tracks_db`` are skipped and counted; the total
    is printed. A feature that a found track does not have is skipped for
    that track only, so the lists of one playlist can differ in length.
    """
    # Features copied from each track; this order is the order in which the
    # 'track_*' keys are added to every playlist.
    feature_names = (
        'acousticness', 'album_id', 'album_name', 'album_popularity', 'artists_genres',
        'artists_ids', 'artists_names', 'artists_num_followers', 'artists_popularities',
        'avg_artist_num_followers', 'avg_artist_popularity', 'danceability', 'duration_ms',
        'energy', 'explicit', 'instrumentalness', 'isrc', 'key', 'liveness',
        'loudness', 'mode', 'mode_artist_genre', 'name', 'num_available_markets',
        'popularity', 'speechiness', 'std_artist_num_followers', 'std_artist_popularity',
        'tempo', 'time_signature', 'valence',
    )

    enriched = deepcopy(playlists)
    n_missing = 0

    for entry in enriched:
        ids = entry['track_ids']

        # Start every feature with an empty list for this playlist.
        entry.update(('track_' + name, []) for name in feature_names)

        for tid in ids:
            if tid not in tracks_db:
                # Track was never collected into tracks_db.
                n_missing += 1
                continue

            track = tracks_db[tid]
            for name in feature_names:
                if name in track:
                    entry['track_' + name].append(track[name])

    print('tracks that are missing : {}'.format(n_missing))
    return enriched


def build_playlist_dataframe(playlists_dictionary_list):
    """Aggregate per-track feature lists into one summary row per playlist.

    Parameters
    ----------
    playlists_dictionary_list : list of dict
        Playlists carrying an ``'id'`` plus the ``'track_*'`` feature lists
        (as produced by ``create_playlist_dataframe``). Not modified.

    Returns
    -------
    pandas.DataFrame
        One row per complete playlist with:

        * ``id``, and ``collaborative`` / ``num_followers`` / ``num_tracks``
          copied as is;
        * ``<key>_avg`` and ``<key>_std`` for every numeric track feature,
          plus ``<key>_max`` for the three popularity features;
        * ``<key>_mode`` and ``<key>_unique`` for key, time signature and
          artist genres (the genre mode column itself is replaced by
          ``genre``);
        * ``genre``: the first entry of the predefined genre list that is a
          substring of the playlist's most common artist genre, or
          ``'other'`` when none matches.

        Playlists with any missing value are dropped.

    Raises
    ------
    KeyError
        If no playlist yields a ``track_artists_genres_mode`` value, since
        the genre label is derived from that column.
    """
    # This playlist id is hard-coded for exclusion (the reason is not
    # recorded here). It is filtered by id rather than by a fixed list
    # position, so shorter or reordered inputs work and the caller's list
    # is left untouched.
    excluded_playlist_id = '4krpfadGaaW42C7cEm2O0A'
    playlists = [playlist for playlist in playlists_dictionary_list
                 if playlist['id'] != excluded_playlist_id]

    # features to take the average and the std
    features_avg = ['track_acousticness', 'track_avg_artist_num_followers', 'track_album_popularity',
                    'track_avg_artist_popularity', 'track_danceability', 'track_duration_ms',
                    'track_energy', 'track_explicit', 'track_instrumentalness', 'track_liveness',
                    'track_loudness', 'track_mode', 'track_num_available_markets',
                    'track_std_artist_num_followers', 'track_std_artist_popularity',
                    'track_popularity', 'track_speechiness', 'track_tempo', 'track_valence'
                    ]

    # subset of features_avg for which the maximum is also recorded
    features_max = {'track_popularity', 'track_album_popularity', 'track_avg_artist_popularity'}

    # features to take the mode
    features_mode = ['track_artists_genres', 'track_key', 'track_time_signature']

    # features as is
    features = ['collaborative', 'num_followers', 'num_tracks']

    processed_playlists = {}

    for index, playlist in enumerate(playlists):
        playlist_data = {'id': playlist['id']}

        for key, values in playlist.items():
            if key in features_avg:
                # An empty list (no track of the playlist was found) yields
                # NaN, so the playlist is removed by dropna below instead of
                # crashing on max([]).
                has_values = len(values) > 0
                playlist_data[key + '_avg'] = np.mean(values) if has_values else np.nan
                playlist_data[key + '_std'] = np.std(values) if has_values else np.nan
                if key in features_max:
                    playlist_data[key + '_max'] = max(values) if has_values else np.nan
            elif key in features_mode:
                if not values:
                    continue
                if key == 'track_artists_genres':
                    # each track contributes a list of genres: flatten them
                    values = [genre for track_genres in values for genre in track_genres]
                if values:
                    counts = collections.Counter(values)
                    playlist_data[key + '_mode'] = counts.most_common(1)[0][0]
                    playlist_data[key + '_unique'] = len(counts)
            elif key in features:
                playlist_data[key] = values

        processed_playlists[index] = playlist_data

    df = pd.DataFrame(processed_playlists).T

    # Drop all incomplete playlists; reset_index returns a fresh frame, so
    # the assignments below never write into a view of `df`.
    full_df = df.dropna(axis=0, how='any').reset_index(drop=True)

    # Define our genre labels. Order matters: when several labels match,
    # the one listed first wins (e.g. 'pop rap' over 'pop' and 'rap').
    predefined_genres = ['pop rap', 'punk', 'korean pop', 'pop christmas', 'folk', 'indie pop', 'pop',
                         'rock', 'rap', 'house', 'indie', 'dance', 'edm', 'mellow', 'hip hop',
                         'alternative', 'jazz', 'r&b', 'soul', 'reggae', 'classical', 'funk', 'country',
                         'metal', 'blues', 'elect']

    # Label every playlist, defaulting to 'other'. Iterating in reverse lets
    # earlier (higher-priority) labels overwrite later ones. A single .loc
    # assignment is used instead of chained indexing, which does not write
    # back under pandas Copy-on-Write.
    genres = full_df['track_artists_genres_mode']
    genre_labels = pd.Series('other', index=full_df.index, dtype=object)
    for g in reversed(predefined_genres):
        genre_labels.loc[genres.str.contains(g, regex=False, na=False)] = g

    full_df = full_df.drop('track_artists_genres_mode', axis=1)
    full_df['genre'] = genre_labels

    return full_df
    

def build_track_dataframe(tracks_db):
    """Build a one-row-per-track dataframe with a coarse ``genre`` label.

    Parameters
    ----------
    tracks_db : dict
        Maps a track id to a dict of that track's features. Every track is
        expected to have ``'album_genres'`` and ``'mode_artist_genre'``.

    Returns
    -------
    pandas.DataFrame
        The track features with the dict key exposed as ``trackID``,
        ``album_genres`` and ``mode_artist_genre`` removed, rows with any
        missing value dropped, and a ``genre`` column holding the first
        entry of the predefined genre list that is a substring of the
        track's ``mode_artist_genre`` (``'other'`` when none matches).

    Raises
    ------
    KeyError
        If the ``album_genres`` or ``mode_artist_genre`` column is absent.
    """
    # Define our genre labels. Order matters: when several labels match,
    # the one listed first wins (e.g. 'pop rap' over 'pop' and 'rap').
    predefined_genres = ['pop rap', 'punk', 'korean pop', 'pop christmas', 'folk', 'indie pop', 'pop',
                         'rock', 'rap', 'house', 'indie', 'dance', 'edm', 'mellow', 'hip hop',
                         'alternative', 'jazz', 'r&b', 'soul', 'reggae', 'classical', 'funk', 'country',
                         'metal', 'blues', 'elect']

    df = (
        pd.DataFrame(tracks_db).T
        .reset_index()
        .rename(columns={'index': 'trackID'})
        # drop album genre because it's null for all tracks (it would
        # otherwise make dropna remove every row)
        .drop('album_genres', axis=1)
    )

    # Drop all observations (tracks) with missingness; reset_index returns a
    # fresh frame, so later assignments never write into a view of `df`.
    full_df = df.dropna(axis=0, how='any').reset_index(drop=True)

    # Label every track, defaulting to 'other'. Iterating in reverse lets
    # earlier (higher-priority) labels overwrite later ones. A single .loc
    # assignment is used instead of chained indexing, which does not write
    # back under pandas Copy-on-Write.
    genres = full_df['mode_artist_genre']
    genre_labels = pd.Series('other', index=full_df.index, dtype=object)
    for g in reversed(predefined_genres):
        genre_labels.loc[genres.str.contains(g, regex=False, na=False)] = g

    full_df = full_df.drop('mode_artist_genre', axis=1)
    full_df['genre'] = genre_labels

    return full_df

## Preprocessing the playlists

In [3]:
# Load the playlists and tracks databases from their JSON files
playlists = load_data('./Data/playlists_from_200_search_words.json')
tracks_db = load_data('./Data/tracks.json')

# Extract track features for each playlist (prints how many track ids were not found in tracks_db)
playlists_with_track_features = create_playlist_dataframe(tracks_db, playlists)

# Build the playlists dataframe: one aggregated row per playlist
playlists_df = build_playlist_dataframe(playlists_with_track_features)

# Preview the first rows
playlists_df.head()


tracks that are missing : 161


Unnamed: 0,id,num_followers,collaborative,num_tracks,track_acousticness_avg,track_acousticness_std,track_album_popularity_avg,track_album_popularity_std,track_album_popularity_max,track_artists_genres_unique,...,track_std_artist_num_followers_std,track_std_artist_popularity_avg,track_std_artist_popularity_std,track_tempo_avg,track_tempo_std,track_time_signature_mode,track_time_signature_unique,track_valence_avg,track_valence_std,genre
0,37i9dQZF1DX9sIqqvKsjG8,1359200,False,304,0.99104,0.00704971,50.62,12.1324,69,6,...,0.0497494,0.0,0.0,116.403,33.6361,4,4,0.244111,0.137205,other
1,5oxZIYU1L9N1CczN0C4JkM,372240,False,519,0.781945,0.276167,42.81,5.61194,53,10,...,1050.02,0.309889,1.5076,116.96,34.7439,4,4,0.318979,0.20249,other
2,37i9dQZF1DWYmSg58uBxin,50518,False,125,0.95229,0.0998561,44.45,8.85819,64,21,...,15705.5,0.485,2.16501,109.473,36.7067,4,4,0.157351,0.103388,other
3,4rmUneFYK2N9lxkaxJ3Qko,16862,False,55,0.518699,0.381347,42.3455,17.8867,65,28,...,151868.0,1.15455,3.46525,119.323,24.1115,4,3,0.0926964,0.075302,other
4,37i9dQZF1DX8f5qTGj8FYl,204994,False,80,0.114581,0.135392,26.7,13.9905,54,64,...,29321.1,0.692723,2.16282,113.945,21.8186,4,3,0.759212,0.163188,funk


In [4]:
# Count the rows (playlists) that contain at least one missing value.
# any(axis=1) reduces across the columns of each row; the default axis=0
# would count columns with missing values instead of observations.
print('Number of observations with missing values: ', playlists_df.isnull().any(axis=1).sum())

Number of observations with missing values:  0


In [5]:
# Save the playlists dataframe to CSV, without writing the row index
playlists_df.to_csv('../../data/playlists.csv', index=False)
# Next section: build a dataframe from the tracks database (i.e. tracks.json)


## Preprocessing the tracks

In [6]:
# Build the tracks dataframe from the tracks database loaded earlier (tracks_db)
tracks_df = build_track_dataframe(tracks_db)

In [7]:
# Preview the first rows of the tracks dataframe
tracks_df.head()

Unnamed: 0,trackID,id,name,explicit,isrc,num_available_markets,danceability,energy,key,loudness,...,artists_names,artists_ids,artists_popularities,artists_num_followers,artists_genres,avg_artist_popularity,std_artist_popularity,avg_artist_num_followers,std_artist_num_followers,genre
0,6VKid1zSeLVUEV3oQR2i0k,6VKid1zSeLVUEV3oQR2i0k,Die Blaue Blume,False,SEXGF2013201,92,0.378,0.0603,0,-24.635,...,[Pontiver Bogross],[3dlALRQ5FDfoMDLFkzwhcw],[44],[61],[background piano],44,0,61,0,other
1,1BZo2CHWy9gSgidvexgNYM,1BZo2CHWy9gSgidvexgNYM,Sabuelum,False,QZAKB2003543,92,0.291,0.0121,10,-28.978,...,[Johann Kurzweil],[7u64sG9W2BjhBS7qfjBYxQ],[42],[54],[background piano],42,0,54,0,other
2,7nC2EOpMnpDT2DkvniimSm,7nC2EOpMnpDT2DkvniimSm,Lost,False,NLF9E2000005,92,0.389,0.1,10,-22.464,...,[Annelie],[0Rm9NmU9uyvf7tfVt4YNKC],[58],[2846],[neo-classical],58,0,2846,0,classical
3,7BbUNLqsUWQAM0QUNoFZWs,7BbUNLqsUWQAM0QUNoFZWs,Equus,False,SE5Q52001556,92,0.39,0.0434,2,-22.244,...,[S.A. Karl],[5wvtkvwPR6pF2h7H6f08tM],[59],[1017],"[background music, background piano, calming i...",59,0,1017,0,other
4,39ObnHa9VOJIwR1nULLXJI,39ObnHa9VOJIwR1nULLXJI,Reisida,False,SE5W32046105,92,0.499,0.0321,0,-27.78,...,[Jan Thiel],[2WrpycwxpswRBTsSnNMWCb],[54],[519],[background piano],54,0,519,0,other


In [8]:
# Save tracks_df to CSV, without writing the row index
tracks_df.to_csv('../../data/tracks.csv', index=False)