# Recommendation 

In [1]:
# importing libraries
import pandas as pd
from copy import deepcopy
import copy
import numpy as np
import collections
import json
import joblib
from itertools import combinations
from IPython.display import display

## Helper functions to work with the database

In [2]:
def create_track_features_dictionary(tracks, playlists):
    playlists_dict = copy.deepcopy(playlists)
    
    missing_counts = 0
    # Loop over each playlist
    for index, playlist in enumerate(playlists_dict):
        track_feature_list = ['acousticness', 'album_id', 'album_name', 'album_popularity','artists_genres', 
                              'artists_ids', 'artists_names', 'artists_num_followers', 'artists_popularities',
                              'avg_artist_num_followers', 'avg_artist_popularity', 'danceability', 'duration_ms',
                              'energy', 'explicit', 'instrumentalness', 'isrc', 'key', 'liveness', 
                              'loudness', 'mode', 'genre', 'name', 'num_available_markets',
                              'popularity', 'speechiness', 'std_artist_num_followers', 'std_artist_popularity',
                              'tempo', 'time_signature', 'valence']
        
        # new entries of audio features for each playlist as a list to append each track's audio feature
        for track_feature in track_feature_list:
            playlist['track_' + track_feature] = []
        
        # append each tracks' audio features into the entries of the playlist
        selected_tracks = tracks[tracks['trackID'].isin(playlist['track_ids'])]
        for j, track in selected_tracks.iterrows():
            # append each track's audio feature into the playlist dictionary
            for track_feature in track_feature_list:
                if track_feature in list(selected_tracks.columns):
                    playlist['track_' + track_feature].append(track[track_feature])
        playlists_dict[index] = playlist
    print('tracks that are missing : {}'.format(missing_counts))
    return playlists_dict


# Function that builds a playlist dataframe from a playlist dictionary

def create_playlist_dataframe(playlists_dictionary_list):
    
    # features to take the avg and std
    features_avg = ['track_acousticness', 'track_avg_artist_num_followers', 'track_album_popularity',
                    'track_avg_artist_popularity', 'track_danceability', 'track_duration_ms', 
                    'track_energy', 'track_explicit', 'track_instrumentalness','track_liveness', 
                    'track_loudness', 'track_mode', 'track_num_available_markets',
                    'track_std_artist_num_followers', 'track_std_artist_popularity',
                    'track_popularity', 'track_speechiness', 'track_tempo', 'track_valence'
                   ]          
                      
    # features to take the mode, # of uniques
    features_mode = ['track_key','track_time_signature', 'track_genre']

    # features as is
    features = ['collaborative', 'num_followers', 'num_tracks']

    playlists_dict = {}

    for index, playlist in enumerate(playlists_dictionary_list):
        playlist_data = {}

        for key in playlist.keys():
            if key in features_avg: # take avg and std
                playlist_data[key + '_avg'] = np.mean(playlist[key])
                playlist_data[key + '_std'] = np.std(playlist[key])
                if key in set(['track_popularity', 'track_album_popularity', 'track_avg_artist_popularity']):
                    playlist_data[key + '_max'] = max(playlist[key])
            elif key in features_mode: # take mode
                if playlist[key]:
                    if key == 'track_artists_genres':
                        flatten = lambda l: [item for sublist in l for item in sublist]
                        flattened_value = flatten(playlist[key])
                        if flattened_value:
                            counter = collections.Counter(flattened_value)
                            playlist_data[key + '_mode'] = counter.most_common()[0][0]
                            playlist_data[key + '_unique'] = len(set(flattened_value))
                    else:
                        counter = collections.Counter(playlist[key])
                        playlist_data[key + '_mode'] = counter.most_common()[0][0]
                        playlist_data[key + '_unique'] = len(set(playlist[key]))
            elif key in features:
                playlist_data[key] = playlist[key]

        playlists_dict[index] = playlist_data
    df = pd.DataFrame(playlists_dict).T
    df.rename(columns = {'track_genre_mode': 'genre'}, inplace = True)
    
    return df

# Function that loads json files

# Load json file
def load_data(file):
    with open(file, 'r') as fd:
        data_from_json = json.load(fd)
        return data_from_json

## Loading data

In [3]:
# Loading the files

tracks_json = load_data('./Data/tracks.json')
tracks_csv = pd.read_csv('./Data/tracks.csv')
playlists_json = load_data('./Data/playlists_from_200_search_words.json')
playlists_csv = pd.read_csv('./Data/playlists.csv')
X_train = pd.read_csv('./Data/X_train.csv')    

## Functions to build the recommendation system

In [4]:
def generate_playlists(genre, num_tracks):
    
    num_tracks_pool = num_tracks + 2
    
    # Order tracks by popularity in ascending order
    tracks_filtered = tracks_csv[tracks_csv['genre'] == genre].sort_values('popularity', ascending = False).iloc[:num_tracks_pool]
    
    # generate combinations of track ids from the filtered tracks
    combinations_tracks_ids = list(combinations(list(tracks_filtered['trackID']), num_tracks))
    
    # We create a list of dictionaries with track ids and size = num_tracks
    playlist_track_ids = [{'track_ids':track_ids, 'num_tracks':num_tracks_pool} for track_ids in combinations_tracks_ids]
    
    # Apply the function to create a dicionary with track features
    playlists_dict = create_track_features_dictionary(tracks_csv, playlist_track_ids)
    
    # create playlists from the playlists dictionary
    genre_playlists_csv = create_playlist_dataframe(playlists_dict)
    
    return genre_playlists_csv, playlist_track_ids


def preprocess_playlist_candidates(candidates_playlists_csv):
    
    # One hot encoding for categorical variable
    categorical_predictors = ['genre', 'track_time_signature_mode', 'track_key_mode']
    df_encoded = pd.get_dummies(candidates_playlists_csv, prefix = categorical_predictors, columns = categorical_predictors)
    encoded_df_full = pd.DataFrame()
    cur_columns = set(list(df_encoded.columns))
    for col in X_train.columns:
        if col in cur_columns:
            encoded_df_full = pd.concat([encoded_df_full, df_encoded[col]], axis = 1)
        else:
            encoded_df_full[col] = 0
    

    return encoded_df_full


def recommended_playlist(genre, num_tracks):
    genre_playlists_csv, playlist_track_ids = generate_playlists(genre, num_tracks)
    df_recommendation = preprocess_playlist_candidates(genre_playlists_csv)
    
    # Load meta model
    meta_model = joblib.load('./Data/meta_mod_3.pkl')
    prefix = './Data/'
    suffix = '.pkl'
    models = ['RF_regression', 'AdaBoost_regression','SVM_regression']

    # Record each single model's predicted results
    meta_X_recommendation = np.zeros((df_recommendation.shape[0], len(models)))
    for i, name in enumerate(models):
        model_name = prefix + name + suffix
        model = joblib.load(model_name) 
        meta_X_recommendation[:, i] = model.predict(df_recommendation)
    
    predicted_log_num_followers = meta_model.predict(meta_X_recommendation)
    predicted_num_followers = np.exp(predicted_log_num_followers) - 1

    recommendation_playlist = playlist_track_ids[np.argmax(predicted_num_followers)]
    recommendation_playlist_pred_num_followers = max(predicted_num_followers)
    display_recommendation_df = get_recommendation_tracks_display_info(recommendation_playlist)
    
    print('The recommended playlist is:')
    display(display_recommendation_df)
    print('Predicted num_followers: {}'.format(recommendation_playlist_pred_num_followers))
    return recommendation_playlist, recommendation_playlist_pred_num_followers

def get_recommendation_tracks_display_info(recommendation):
    display_info_list = []
    recommend_track_ids = recommendation['track_ids']
    for track_id in recommend_track_ids:
        display_info = {}
        track_info = tracks_json[track_id]
        display_info['track ID'] = track_id
        display_info['track name'] = track_info['name']
        display_info['artists names'] = track_info['artists_names']
        display_info_list.append(display_info)
    return pd.DataFrame(display_info_list)    

## Functions to validate our recommendation system

In [5]:
def get_most_similar_playlist(recommendation, genre):
    
    # Return the playlist id of the most similar playlist
    
    dissimilarity_list = []
    for playlist in playlists_json:
        cur_pl = playlists_csv[playlists_csv['id'] == playlist['id']]
        if not cur_pl.empty:
            cur_genre = cur_pl['genre'].values[0]
            if cur_genre == genre:
                dissimilarity_list.append(len(set(recommendation['track_ids']) - set(playlist['track_ids'])))
            else:
                dissimilarity_list.append(float('inf'))
        else:
            dissimilarity_list.append(float('inf'))

    most_similar_playlist_id = playlists_json[dissimilarity_list.index(min(dissimilarity_list))]['id']
    return most_similar_playlist_id

def predict_most_similar_playlist(most_similar_playlist_id):
    most_similar_playlist = playlists_csv[playlists_csv['id'] == most_similar_playlist_id]
    
    
    processed_most_similar_playlist = preprocess_playlist_candidates(most_similar_playlist)
    
    
    meta_model = joblib.load('./Data/meta_mod_3.pkl')
    prefix = './Data/'
    suffix = '.pkl'
    models = ['RF_regression', 'AdaBoost_regression','SVM_regression']

    # Record model's predicted results on validation set as the train set for the meta regressor
    meta_X_processed_most_similar_playlist = np.zeros((processed_most_similar_playlist.shape[0], len(models)))
    for i, name in enumerate(models):
        model_name = prefix + name + suffix
        model = joblib.load(model_name) 
        meta_X_processed_most_similar_playlist[:, i] = model.predict(processed_most_similar_playlist)

    
    most_similar_predicted_log_num_followers = meta_model.predict(meta_X_processed_most_similar_playlist)
    print(most_similar_predicted_log_num_followers )
    most_similar_predicted_num_followers = (np.exp(most_similar_predicted_log_num_followers) - 1)[0]
    print('The most similar playlist\'s predicted num_followers: {}'.format(most_similar_predicted_num_followers))

def get_rank_within_genre(playlist_id, genre):
    n_genre = len(playlists_csv[playlists_csv['genre']==genre])
    print('There are {} playlists in genre = {}'.format(n_genre, genre))
    
    df = deepcopy(playlists_csv[playlists_csv['genre']==genre])
    df.sort_values(by=['num_followers'], ascending=False, inplace=True)
    
    # Get the rank of the playlist in the database(by genre)
    df.reset_index(inplace=True)
    rank = df[df['id'] == playlist_id].index.values[0] + 1
    print('The most similar playlist\'s rank within genre is: {}'.format(rank))
    return rank

## Some results

In [6]:
recommendation, recommendation_pred_num_followers = recommended_playlist('pop', 10)
most_similar_pl_id = get_most_similar_playlist(recommendation, 'pop')
predict_most_similar_playlist(most_similar_pl_id)
rank = get_rank_within_genre(most_similar_pl_id, 'pop')

tracks that are missing : 0
The recommended playlist is:


Unnamed: 0,track ID,track name,artists names
0,4Oun2ylbjFKMPTiaSbbCih,WAP (feat. Megan Thee Stallion),"[Cardi B, Megan Thee Stallion]"
1,6UelLqGlWMcVH1E5c4H7lY,Watermelon Sugar,[Harry Styles]
2,1xQ6trAsedVPCdbtDAmk0c,Savage Love (Laxed - Siren Beat),"[Jawsh 685, Jason Derulo]"
3,2XU0oxnq2qxCpomAAuJY8K,Dance Monkey,[Tones And I]
4,3ZG8N7aWw2meb6UrI5ZmnZ,Relación,[Sech]
5,4HBZA5flZLE435QTztThqH,Stuck with U (with Justin Bieber),"[Ariana Grande, Justin Bieber]"
6,4xqrdfXkTW4T0RauPLv3WA,Heather,[Conan Gray]
7,45bE4HXI0AwGZXfZtMp8JR,you broke me first,[Tate McRae]
8,2ygvZOXrIeVL4xZmAWJT2C,my future,[Billie Eilish]
9,7qEHsqek33rTcFNT9PFqLf,Someone You Loved,[Lewis Capaldi]


Predicted num_followers: 372.0338844803864
[5.34970171 5.34970171 5.34970171 5.34970171]
The most similar playlist's predicted num_followers: 209.54548419235527
There are 2626 playlists in genre = pop
The most similar playlist's rank within genre is: 5


In [13]:
recommendation, recommendation_pred_num_followers = recommended_playlist('pop', 5)
most_similar_pl_id = get_most_similar_playlist(recommendation, 'pop')
predict_most_similar_playlist(most_similar_pl_id)
rank = get_rank_within_genre(most_similar_pl_id, 'pop')

tracks that are missing : 0
The recommended playlist is:


Unnamed: 0,track ID,track name,artists names
0,4Oun2ylbjFKMPTiaSbbCih,WAP (feat. Megan Thee Stallion),"[Cardi B, Megan Thee Stallion]"
1,0v1x6rN6JHRapa03JElljE,Dynamite,[BTS]
2,4ZRrLHqzhGRXYj2qcB4s5S,Tattoo - Remix with Camilo,"[Rauw Alejandro, Camilo]"
3,2XU0oxnq2qxCpomAAuJY8K,Dance Monkey,[Tones And I]
4,3ZG8N7aWw2meb6UrI5ZmnZ,Relación,[Sech]


Predicted num_followers: 297.5116219615801
[5.35198221]
The most similar playlist's predicted num_followers: 210.02618234173931
There are 2626 playlists in genre = pop
The most similar playlist's rank within genre is: 2136


In [8]:
recommendation, recommendation_pred_num_followers = recommended_playlist('rock', 10)
most_similar_pl_id = get_most_similar_playlist(recommendation, 'rock')
predict_most_similar_playlist(most_similar_pl_id)
rank = get_rank_within_genre(most_similar_pl_id, 'rock')

tracks that are missing : 0
The recommended playlist is:


Unnamed: 0,track ID,track name,artists names
0,0pqnGHJpmpxLKifKRmU6WP,Believer,[Imagine Dragons]
1,2374M0fQpWi3dLnB54qaLX,Africa,[TOTO]
2,2WfaOiMkCvy7F5fcp2zZ8L,Take on Me,[a-ha]
3,1zB4vmk8tFRmM9UULNzbLB,Thunder,[Imagine Dragons]
4,2QjOHCTQ1Jl3zawyYOpxh6,Sweater Weather,[The Neighbourhood]
5,3CRDbSIZ4r5MsZ0YwxuEkn,Stressed Out,[Twenty One Pilots]
6,4u7EnebtmKWzUH433cf5Qv,Bohemian Rhapsody - 2011 Mix,[Queen]
7,5FVd6KXrgO9B3JPmC8OPst,Do I Wanna Know?,[Arctic Monkeys]
8,40riOy7x9W7GXjyGp4pjAv,Hotel California - 2013 Remaster,[Eagles]
9,7e89621JPkKaeDSTQ3avtg,Sweet Home Alabama,[Lynyrd Skynyrd]


Predicted num_followers: 291.26827603189736
[5.30324008]
The most similar playlist's predicted num_followers: 199.98697092640552
There are 788 playlists in genre = rock
The most similar playlist's rank within genre is: 761


In [14]:
recommendation, recommendation_pred_num_followers = recommended_playlist('rock', 5)
most_similar_pl_id = get_most_similar_playlist(recommendation, 'rock')
predict_most_similar_playlist(most_similar_pl_id)
rank = get_rank_within_genre(most_similar_pl_id, 'rock')

tracks that are missing : 0
The recommended playlist is:


Unnamed: 0,track ID,track name,artists names
0,0pqnGHJpmpxLKifKRmU6WP,Believer,[Imagine Dragons]
1,08mG3Y1vljYA6bvDt4Wqkj,Back In Black,[AC/DC]
2,2374M0fQpWi3dLnB54qaLX,Africa,[TOTO]
3,1zB4vmk8tFRmM9UULNzbLB,Thunder,[Imagine Dragons]
4,1JSTJqkT5qHq8MDJnJbRE1,Every Breath You Take,[The Police]


Predicted num_followers: 307.67530067397064
[5.52169429 5.52169429]
The most similar playlist's predicted num_followers: 249.05835063967493
There are 788 playlists in genre = rock
The most similar playlist's rank within genre is: 55


In [10]:
recommendation, recommendation_pred_num_followers = recommended_playlist('funk', 10)
most_similar_pl_id = get_most_similar_playlist(recommendation, 'funk')
predict_most_similar_playlist(most_similar_pl_id)
rank = get_rank_within_genre(most_similar_pl_id, 'funk')

tracks that are missing : 0
The recommended playlist is:


Unnamed: 0,track ID,track name,artists names
0,4xWzZmX4K1yyrdtRfbUvjt,Oh Juliana,[Niack]
1,0AGS6ZRgzobrazmCi6pYMe,Na Raba Toma Tapão,[Niack]
2,12v6LfkX9YIR3uLefIAAYZ,Comprei um Lança,[Mc Jacare]
3,67mqU3FeErlvWwsxnMfvQo,Tudo Aconteceu,"[MC Du Black, Delacruz]"
4,60wF0XnX7A0jVpDNA5v8vi,Tudo no Sigilo (Vytinho NG e Bianca),"[Vytinho NG, Bianca]"
5,7lQWRAjyhTpCWFC0jmclT4,Gangsta's Paradise,"[Coolio, L.V.]"
6,6jcGoHrqfRW6fI2I6xtwBQ,Deus é por nós,[MC Marks]
7,7Ac3BmqTQoLdAt7HtZyfgN,Te Prometo,"[Dennis DJ, Mc Don Juan]"
8,2WlZuBDgLfT7Kc0admhFdg,BRABA,[Luísa Sonza]
9,6Q9NH5cNBY58nJkp5Jbs9j,Deve ser horrível dormir sem mim,"[Manu Gavassi, Gloria Groove]"


Predicted num_followers: 376.81091628301283
[5.40602466 5.40602466]
The most similar playlist's predicted num_followers: 221.7443414272273
There are 93 playlists in genre = funk
The most similar playlist's rank within genre is: 5


In [15]:
recommendation, recommendation_pred_num_followers = recommended_playlist('funk', 5)
most_similar_pl_id = get_most_similar_playlist(recommendation, 'funk')
predict_most_similar_playlist(most_similar_pl_id)
rank = get_rank_within_genre(most_similar_pl_id, 'funk')

tracks that are missing : 0
The recommended playlist is:


Unnamed: 0,track ID,track name,artists names
0,4xWzZmX4K1yyrdtRfbUvjt,Oh Juliana,[Niack]
1,0AGS6ZRgzobrazmCi6pYMe,Na Raba Toma Tapão,[Niack]
2,5GAdMAL3tRJkVN1HtwohK2,Vai Ter Que Aguentar,"[Mc Don Juan, Maiara & Maraisa]"
3,12v6LfkX9YIR3uLefIAAYZ,Comprei um Lança,[Mc Jacare]
4,60wF0XnX7A0jVpDNA5v8vi,Tudo no Sigilo (Vytinho NG e Bianca),"[Vytinho NG, Bianca]"


Predicted num_followers: 372.3373889956647
[5.40602466 5.40602466]
The most similar playlist's predicted num_followers: 221.7443414272277
There are 93 playlists in genre = funk
The most similar playlist's rank within genre is: 5
