##  1.) Make data into a df which can be easily modified on
## 2.) Add OHE and TD-IDF
## 3.) Utilize Spotify API to get user data
## 4.) Create recommended playlist by comparing playlist vector and song vector through cosine similarity
## 5.) Output

## Imports

In [1]:
import pandas as pd
import numpy as np
import json
import re 
import sys
import itertools

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt


import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth
import spotipy.util as util

import warnings
warnings.filterwarnings("ignore")

## Preference and viewing options

In [2]:
## Sets the viewing to 500
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

## 1 ---------------------------------------------------------------------------------------------------------

## Import the two data sets


In [3]:
data_w_genre = pd.read_csv('artists.csv')
data_wo_genre = pd.read_csv('tracks.csv')

## Explode and merge the two datasets


In [4]:
data_w_genre['genres_upd'] = data_w_genre['genres'].apply(lambda x: [re.sub(' ','_',i) for i in re.findall(r"'([^']*)'", x)])

data_wo_genre['artists_upd_v1'] = data_wo_genre['artists'].apply(lambda x: re.findall(r"'([^']*)'", x))
data_wo_genre[data_wo_genre['artists_upd_v1'].apply(lambda x: not x)]

data_wo_genre['artists_upd_v2'] = data_wo_genre['artists'].apply(lambda x: re.findall('\"(.*?)\"',x))
data_wo_genre['artists_upd'] = np.where(data_wo_genre['artists_upd_v1'].apply(lambda x: not x), data_wo_genre['artists_upd_v2'], data_wo_genre['artists_upd_v1'] )

data_wo_genre['artists_song'] = data_wo_genre.apply(lambda row: str(row['artists_upd'][0]) + str(row['name']), axis=1)

data_wo_genre.sort_values(['artists_song','release_date'], ascending = False, inplace = True)

data_wo_genre.drop_duplicates('artists_song',inplace = True)

## Explode the dataset

In [5]:
artists_exploded = data_wo_genre[['artists_upd', 'id', 'name', 'popularity', 'duration_ms', 'explicit', 'id_artists', 'release_date', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']].explode('artists_upd')

artists_exploded

Unnamed: 0,artists_upd,id,name,popularity,duration_ms,explicit,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
538433,최진희,3u1C6nWVRoP5F0w8gGrDL3,사랑의 미로,25,222380,0,['1NSrAf8XJYJVgAXKoxaMet'],1987-06-01,0.367,0.194,7,-19.057,1,0.0400,0.617,0.000006,0.1620,0.3670,144.316,4
404349,지수,1Mv4u308L16NZDZiD6HZCy,사랑은 힘든가봐,28,213440,0,['4c9QIMfEbIIynuaswyxGx9'],2005-12-23,0.675,0.785,4,-5.026,0,0.0280,0.379,0.000000,0.3530,0.6230,103.008,4
210091,지선,1jvoY322nxyKXq8OBhgmSY,어떡하죠,44,244360,0,['2Mo9NQaNCFCWSR5CnlfmbN'],2011-10-13,0.606,0.341,0,-7.094,1,0.0513,0.779,0.000000,0.1440,0.2940,135.667,4
270610,조정현,2ghebdwe2pNXT4eL34T7pW,그아픔까지사랑한거야,32,237688,0,['2WTpsPucygbYRnCnoEUkJQ'],1989-06-15,0.447,0.215,10,-16.478,1,0.0272,0.568,0.000001,0.0649,0.1770,71.979,4
208974,장정우,7rxpWwcXNgDUXl0wN0gUvp,천국의 기억 장정우 Version,31,280372,0,['5L7zKs2ftwENWOMI7LFaN1'],2003-12-24,0.494,0.656,7,-6.347,0,0.0262,0.659,0.000007,0.1110,0.4200,82.003,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281020,"Choir of The Church of The Transfiguration"",",6Pu2leLWEfThVIqJilw6O9,The Passion of Our Lord according to St. Matth...,4,79027,0,"['47EwjzhfZKotjVtvlQFNFS', '5H155SiWINLkYRElqw...",1963,0.177,0.208,11,-22.988,0,0.0388,0.988,0.946000,0.0982,0.0587,132.053,3
281020,",",6Pu2leLWEfThVIqJilw6O9,The Passion of Our Lord according to St. Matth...,4,79027,0,"['47EwjzhfZKotjVtvlQFNFS', '5H155SiWINLkYRElqw...",1963,0.177,0.208,11,-22.988,0,0.0388,0.988,0.946000,0.0982,0.0587,132.053,3
281020,",",6Pu2leLWEfThVIqJilw6O9,The Passion of Our Lord according to St. Matth...,4,79027,0,"['47EwjzhfZKotjVtvlQFNFS', '5H155SiWINLkYRElqw...",1963,0.177,0.208,11,-22.988,0,0.0388,0.988,0.946000,0.0982,0.0587,132.053,3
294380,Children,7f092ECp06XbFS6Ms5Yk6R,Mia Oraia Petalouda,26,100313,0,['7JJrBciIAQBPOBB5U6u49p'],2014-06-08,0.773,0.183,0,-9.370,1,0.0370,0.956,0.000000,0.1190,0.8640,79.950,4


## Merge dataset


In [6]:
artists_exploded_enriched = artists_exploded.merge(data_w_genre, how = 'left', left_on = 'artists_upd',right_on = 'name')
final_df = artists_exploded_enriched[~artists_exploded_enriched.genres_upd.isnull()]

## Get rid of the rest of the _y endings and rename the _x

In [7]:
# Find the columns that end with "_y"
cols_to_drop = [col for col in final_df.columns if col.endswith("_y")]

# Drop the columns
final_df.drop(columns=cols_to_drop, inplace=True)

# Find the columns that end with "_x"
cols_to_rename = [col for col in final_df.columns if col.endswith("_x")]

# Rename the columns
final_df.rename(columns={col: col[:-2] for col in cols_to_rename}, inplace=True)

final_df

Unnamed: 0,artists_upd,id,name,popularity,duration_ms,explicit,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,followers,genres,genres_upd
0,최진희,3u1C6nWVRoP5F0w8gGrDL3,사랑의 미로,25,222380,0,['1NSrAf8XJYJVgAXKoxaMet'],1987-06-01,0.367,0.194,7,-19.057,1,0.0400,0.6170,0.000006,0.1620,0.367,144.316,4,788.0,['trot'],[trot]
2,지선,1jvoY322nxyKXq8OBhgmSY,어떡하죠,44,244360,0,['2Mo9NQaNCFCWSR5CnlfmbN'],2011-10-13,0.606,0.341,0,-7.094,1,0.0513,0.7790,0.000000,0.1440,0.294,135.667,4,262.0,[],[]
4,장정우,7rxpWwcXNgDUXl0wN0gUvp,천국의 기억 장정우 Version,31,280372,0,['5L7zKs2ftwENWOMI7LFaN1'],2003-12-24,0.494,0.656,7,-6.347,0,0.0262,0.6590,0.000007,0.1110,0.420,82.003,4,11.0,[],[]
5,장정우,0cEvzbXjxkOxgBUmBUcHZW,그것만은..,32,294452,0,['5L7zKs2ftwENWOMI7LFaN1'],2003-12-24,0.311,0.487,4,-6.847,0,0.0265,0.4840,0.000000,0.2410,0.159,59.464,4,11.0,[],[]
8,이경섭,6uRXQW8BqB3N9WKfe9gfdw,Title 허밍,37,102307,0,['191huMISbbIeUELiiEGZ7L'],2007-11-09,0.825,0.604,7,-6.348,0,0.0417,0.3760,0.000001,0.0875,0.898,98.272,4,22.0,[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
763039,Tim Rice,5UbK7BJAIhkUIYYLUABLdN,The Last Supper,30,428507,0,"['1T1Hnf7jWiJPFUTi6voWwU', '479Yp6DvyXoIaCssAx...",1970-01-01,0.397,0.396,7,-8.094,1,0.0345,0.5930,0.000000,0.0997,0.232,121.013,4,1553.0,[],[]
763040,"""Ipi Ntombi"" 1975 Original Cast",2itG1gLI0n9tH1DZKJbPd2,The Warrior,11,249493,0,['2rV4kKW3Yvqh40hpDH1gpy'],1975-01-01,0.749,0.590,0,-5.699,1,0.0528,0.1580,0.003260,0.1440,0.574,116.904,4,506.0,['xhosa'],[xhosa]
763041,"""Ipi Ntombi"" 1975 Original Cast",52V0laIHD3rvFZrwqDPHFe,Shosholoza,17,231733,0,['2rV4kKW3Yvqh40hpDH1gpy'],1975-01-01,0.721,0.534,0,-10.546,1,0.0506,0.4960,0.000000,0.1110,0.923,114.075,4,506.0,['xhosa'],[xhosa]
763042,"""Ipi Ntombi"" 1975 Original Cast",5qhaazvxbhR8mBlo1LXTuY,Narration: Mama Tembu's Wedding,7,182733,0,['2rV4kKW3Yvqh40hpDH1gpy'],1975-01-01,0.604,0.803,0,-6.785,1,0.4300,0.4260,0.000000,0.2480,0.916,134.034,4,506.0,['xhosa'],[xhosa]


## Consolidate the lists for genre

In [8]:
final_consalidated = final_df.groupby('id')['genres_upd'].apply(list).reset_index()
final_consalidated['consolidates_genre_lists'] = final_consalidated['genres_upd'].apply(lambda x: list(set(list(itertools.chain.from_iterable(x)))))


In [9]:
final_consalidated.head()

Unnamed: 0,id,genres_upd,consolidates_genre_lists
0,0004Uy71ku11n3LMpuyf59,[[polish_rock]],[polish_rock]
1,000CSYu4rvd8cQ7JilfxhZ,"[[country_quebecois, rock_quebecois]]","[country_quebecois, rock_quebecois]"
2,000DsoWJKHdaUmhgcnpr8j,[[barnmusik]],[barnmusik]
3,000G1xMMuwxNHmwVsBdtj1,"[[candy_pop, new_wave, new_wave_pop, permanent...","[new_wave_pop, new_wave, power_pop, candy_pop,..."
4,000KblXP5csWFFFsD6smOy,"[[chamame, folclore_salteno, folklore_argentino]]","[folclore_salteno, folklore_argentino, chamame]"


In [10]:
data_wo_genre = data_wo_genre.merge(final_consalidated[['id','consolidates_genre_lists']], on = 'id',how = 'left')

In [11]:
ready_de = data_wo_genre

In [12]:
## Remove all NaN values
ready_de['consolidates_genre_lists'] = ready_de['consolidates_genre_lists'].apply(lambda d: d if isinstance(d, list) else [])

## Final data set ready for data engineering

In [13]:
ready_de

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,artists_upd_v1,artists_upd_v2,artists_upd,artists_song,consolidates_genre_lists
0,3u1C6nWVRoP5F0w8gGrDL3,사랑의 미로,25,222380,0,['최진희'],['1NSrAf8XJYJVgAXKoxaMet'],1987-06-01,0.367,0.194,7,-19.057,1,0.0400,0.617,0.000006,0.1620,0.3670,144.316,4,[최진희],[],[최진희],최진희사랑의 미로,[trot]
1,1Mv4u308L16NZDZiD6HZCy,사랑은 힘든가봐,28,213440,0,['지수'],['4c9QIMfEbIIynuaswyxGx9'],2005-12-23,0.675,0.785,4,-5.026,0,0.0280,0.379,0.000000,0.3530,0.6230,103.008,4,[지수],[],[지수],지수사랑은 힘든가봐,[]
2,1jvoY322nxyKXq8OBhgmSY,어떡하죠,44,244360,0,['지선'],['2Mo9NQaNCFCWSR5CnlfmbN'],2011-10-13,0.606,0.341,0,-7.094,1,0.0513,0.779,0.000000,0.1440,0.2940,135.667,4,[지선],[],[지선],지선어떡하죠,[]
3,2ghebdwe2pNXT4eL34T7pW,그아픔까지사랑한거야,32,237688,0,['조정현'],['2WTpsPucygbYRnCnoEUkJQ'],1989-06-15,0.447,0.215,10,-16.478,1,0.0272,0.568,0.000001,0.0649,0.1770,71.979,4,[조정현],[],[조정현],조정현그아픔까지사랑한거야,[]
4,7rxpWwcXNgDUXl0wN0gUvp,천국의 기억 장정우 Version,31,280372,0,['장정우'],['5L7zKs2ftwENWOMI7LFaN1'],2003-12-24,0.494,0.656,7,-6.347,0,0.0262,0.659,0.000007,0.1110,0.4200,82.003,4,[장정우],[],[장정우],장정우천국의 기억 장정우 Version,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
523470,3YCC4oCzagcfx4s3fp3XFz,Bandolera,35,322926,1,"[""At' Fat"", 'Yemil', 'Boza', 'El Tachi']","['455JBMYIT1lWf1Djj4rDc7', '7g4cPtKxNx146qdaY9...",2020-11-17,0.502,0.814,5,-2.688,0,0.1330,0.524,0.000000,0.1510,0.6720,72.344,4,"[ Fat"", , , , , ]",[At' Fat],"[ Fat"", , , , , ]","Fat"", Bandolera",[]
523471,5sr38tHMSPpaENiNcjUpFi,Cathrina,10,184560,0,"[""Henry D' Cruz"", ""Helen D'Souza""]","['3nH2WLSi1MhvlmJNkbLbIv', '4NR8lI9YoPWo5v2Kv6...",1976-01-01,0.661,0.677,1,-8.447,0,0.1160,0.683,0.001540,0.0361,0.8090,116.855,4,"[ Cruz"", ""Helen D]","[Henry D' Cruz, Helen D'Souza]","[ Cruz"", ""Helen D]","Cruz"", ""Helen DCathrina",[]
523472,6Pu2leLWEfThVIqJilw6O9,The Passion of Our Lord according to St. Matth...,4,79027,0,"[""Boys' Choir of The Church of The Transfigura...","['47EwjzhfZKotjVtvlQFNFS', '5H155SiWINLkYRElqw...",1963,0.177,0.208,11,-22.988,0,0.0388,0.988,0.946000,0.0982,0.0587,132.053,3,"[ Choir of The Church of The Transfiguration"",...",[Boys' Choir of The Church of The Transfigurat...,"[ Choir of The Church of The Transfiguration"",...","Choir of The Church of The Transfiguration"", ...",[]
523473,7f092ECp06XbFS6Ms5Yk6R,Mia Oraia Petalouda,26,100313,0,"[""Typaldos' Children's Choire""]",['7JJrBciIAQBPOBB5U6u49p'],2014-06-08,0.773,0.183,0,-9.370,1,0.0370,0.956,0.000000,0.1190,0.8640,79.950,4,[ Children],[Typaldos' Children's Choire],[ Children],ChildrenMia Oraia Petalouda,[]


## 2 ---------------------------------------------------------------------------------------------------------

In [14]:
## Insert release date in year increment so it's easier to process. AKA normalizing variables
ready_de['year_of_release'] = ready_de['release_date'].apply(lambda x: x.split('-')[0])

## TF-IDF: log(corpus/# of occurence of genre)
This is to place less emphasis on pop songs since pop songs are saturated within Spotify

In [15]:
float_cols = ready_de.dtypes[ready_de.dtypes == 'float64'].index.values

In [16]:
ohe_cols = 'popularity'

In [17]:
ready_de['popularity'].describe()

count    523475.000000
mean         27.235870
std          18.030233
min           0.000000
25%          13.000000
50%          27.000000
75%          40.000000
max          99.000000
Name: popularity, dtype: float64

## OHE template function to encode release date and popularity

In [18]:
## Section off the popularity so we can one-hot encode it
ready_de['popularity_red'] = ready_de['popularity'].apply(lambda x: int(x/5))

In [19]:
ready_de.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,artists_upd_v1,artists_upd_v2,artists_upd,artists_song,consolidates_genre_lists,year_of_release,popularity_red
0,3u1C6nWVRoP5F0w8gGrDL3,사랑의 미로,25,222380,0,['최진희'],['1NSrAf8XJYJVgAXKoxaMet'],1987-06-01,0.367,0.194,7,-19.057,1,0.04,0.617,6e-06,0.162,0.367,144.316,4,[최진희],[],[최진희],최진희사랑의 미로,[trot],1987,5
1,1Mv4u308L16NZDZiD6HZCy,사랑은 힘든가봐,28,213440,0,['지수'],['4c9QIMfEbIIynuaswyxGx9'],2005-12-23,0.675,0.785,4,-5.026,0,0.028,0.379,0.0,0.353,0.623,103.008,4,[지수],[],[지수],지수사랑은 힘든가봐,[],2005,5
2,1jvoY322nxyKXq8OBhgmSY,어떡하죠,44,244360,0,['지선'],['2Mo9NQaNCFCWSR5CnlfmbN'],2011-10-13,0.606,0.341,0,-7.094,1,0.0513,0.779,0.0,0.144,0.294,135.667,4,[지선],[],[지선],지선어떡하죠,[],2011,8
3,2ghebdwe2pNXT4eL34T7pW,그아픔까지사랑한거야,32,237688,0,['조정현'],['2WTpsPucygbYRnCnoEUkJQ'],1989-06-15,0.447,0.215,10,-16.478,1,0.0272,0.568,1e-06,0.0649,0.177,71.979,4,[조정현],[],[조정현],조정현그아픔까지사랑한거야,[],1989,6
4,7rxpWwcXNgDUXl0wN0gUvp,천국의 기억 장정우 Version,31,280372,0,['장정우'],['5L7zKs2ftwENWOMI7LFaN1'],2003-12-24,0.494,0.656,7,-6.347,0,0.0262,0.659,7e-06,0.111,0.42,82.003,4,[장정우],[],[장정우],장정우천국의 기억 장정우 Version,[],2003,6


In [20]:
## OHE template function
## Purpose: Takes a column and outputs another column with the column ready to be ohe
## Contract: Dataframe, Series/Column, String -> Series/Column

def ohe_prep(df, column, new_name): 
    """ 
    Create One Hot Encoded features of a specific column

    Parameters: 
        df (pandas dataframe): Spotify Dataframe
        column (str): Column to be processed
        new_name (str): new column name to be used
        
    Returns: 
        tf_df: One hot encoded features 
    """
    
    tf_df = pd.get_dummies(df[column])
    feature_names = tf_df.columns
    tf_df.columns = [new_name + "|" + str(i) for i in feature_names]
    tf_df.reset_index(drop = True, inplace = True)    
    return tf_df

In [21]:
## Feature set function
## Purpose: Creates the one hot encoding of the dataframe. eg all the genres will have a bucket from 0-5 based on their popularity
## Contract: Dataframe, Float Column -> Dataframe
#function to build entire feature set
def create_feature_set(df, float_cols):
    """ 
    Process spotify df to create a final set of features that will be used to generate recommendations

    Parameters: 
        df (pandas dataframe): Spotify Dataframe
        float_cols (list(str)): List of float columns that will be scaled 
        
    Returns: 
        final: final set of features 
    """
    
    #tfidf genre lists
    tfidf = TfidfVectorizer()
    tfidf_matrix =  tfidf.fit_transform(df['consolidates_genre_lists'].apply(lambda x: " ".join(x)))
    genre_df = pd.DataFrame(tfidf_matrix.toarray())
    genre_df.columns = ['genre' + "|" + i for i in tfidf.get_feature_names()]
    genre_df.reset_index(drop = True, inplace=True)

    #explicity_ohe = ohe_prep(df, 'explicit','exp')      
    year_ohe = ohe_prep(df, 'year_of_release','year') * 0.5
    popularity_ohe = ohe_prep(df, 'popularity_red','pop') * 0.15

    #scale float columns
    floats = df[float_cols].reset_index(drop = True)
    scaler = MinMaxScaler()
    floats_scaled = pd.DataFrame(scaler.fit_transform(floats), columns = floats.columns) * 0.2

    #concanenate all features
    final = pd.concat([genre_df, floats_scaled, popularity_ohe, year_ohe], axis = 1)
     
    #add song id
    final['id']=df['id'].values
    
    return final

In [None]:
complete_feature_set = create_feature_set(ready_de, float_cols=float_cols)#.mean(axis = 0)

In [None]:
complete_feature_set.head()

## 3 ---------------------------------------------------------------------------------------------------------

In [None]:
## Import Spotify API using Spotipy
client_id = 'b872cb2322494950b8de21f24d45fe1b'
client_secret = '757646e87d9f4dbb83b31c1e63060c7f'
redirect_uri = 'http://localhost:3000/callback'

credentials = oauth2.SpotifyClientCredentials(
        client_id=CLIENT_ID,
        client_secret=CLIENT_SECRET)

token = credentials.get_access_token()
spotify = spotipy.Spotify(auth=token)

In [None]:
## Get playlists and their meta data
user_playlists = {}
user_playlists_cover = {}

for sogns in spotify.current_user_playlists()['items']:
    user_playlists[songs['name']] = songs['uri'].split(':')[2]
    user_playlists_cover[songs['uri'].split(':')[2]] = songs['images'][0]['url']