Importing Necessary Packages

In [1]:
import pandas as pd
import numpy as np
import json
import re 
import sys
import itertools

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

#another useful command to make data exploration easier <Br>
#NOTE: if you are using a massive dataset, this could slow down your code. <br>
pd.set_option('display.max_columns', None)<br>
pd.set_option("max_rows", None)

In [3]:
spotify_df = pd.read_csv('data.csv')

In [4]:
spotify_df.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
0,0.0594,1921,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954
1,0.963,1921,0.732,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.0,7,0.16,-12.441,1,Clancy Lowered the Boom,5,1921,0.415,60.936
2,0.0394,1921,0.961,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913,3,0.101,-14.85,1,Gati Bali,5,1921,0.0339,110.339
3,0.165,1921,0.967,['Frank Parker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,2.8e-05,5,0.381,-9.316,1,Danny Boy,3,1921,0.0354,100.109
4,0.253,1921,0.957,['Phil Regan'],0.418,166693,0.193,0,4d6HGyGT8e121BsdKmw9v6,2e-06,3,0.229,-10.096,1,When Irish Eyes Are Smiling,2,1921,0.038,101.665


In [5]:
spotify_df.columns

Index(['valence', 'year', 'acousticness', 'artists', 'danceability',
       'duration_ms', 'energy', 'explicit', 'id', 'instrumentalness', 'key',
       'liveness', 'loudness', 'mode', 'name', 'popularity', 'release_date',
       'speechiness', 'tempo'],
      dtype='object')

In [6]:
spotify_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170653 entries, 0 to 170652
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   valence           170653 non-null  float64
 1   year              170653 non-null  int64  
 2   acousticness      170653 non-null  float64
 3   artists           170653 non-null  object 
 4   danceability      170653 non-null  float64
 5   duration_ms       170653 non-null  int64  
 6   energy            170653 non-null  float64
 7   explicit          170653 non-null  int64  
 8   id                170653 non-null  object 
 9   instrumentalness  170653 non-null  float64
 10  key               170653 non-null  int64  
 11  liveness          170653 non-null  float64
 12  loudness          170653 non-null  float64
 13  mode              170653 non-null  int64  
 14  name              170653 non-null  object 
 15  popularity        170653 non-null  int64  
 16  release_date      17

In [7]:
data_w_genre = pd.read_csv('data_w_genres.csv')
data_w_genre.tail()

Unnamed: 0,genres,artists,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count
28675,[],麥志誠,0.512,0.356,198773.0,0.306,0.00897,0.108,-10.119,0.0277,150.049,0.328,35.0,10,1,2
28676,"['c-pop', 'classic cantopop', 'classic mandopo...",黃品源,0.541,0.578,293840.0,0.334,6e-06,0.0675,-11.974,0.0267,135.934,0.243,48.0,9,0,2
28677,[],黃國隆,0.785455,0.570818,174582.727273,0.1484,8.3e-05,0.142191,-21.610091,0.054355,119.586273,0.741273,23.0,5,1,11
28678,"['chinese indie', 'chinese indie rock']",黑豹,0.381,0.353,316160.0,0.686,0.0,0.0568,-9.103,0.0395,200.341,0.352,35.0,11,1,2
28679,['classic korean pop'],조정현,0.568,0.447,237688.0,0.215,1e-06,0.0649,-16.478,0.0272,71.979,0.177,31.0,10,1,2


In [8]:
data_w_genre['genres'].values[0]

"['show tunes']"

In [9]:
data_w_genre['genres'].values[10]

"['dark trap', 'meme rap']"

In [10]:
data_w_genre.head(10)

Unnamed: 0,genres,artists,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count
0,['show tunes'],"""Cats"" 1981 Original London Cast",0.590111,0.467222,250318.555556,0.394003,0.0114,0.290833,-14.448,0.210389,117.518111,0.3895,38.333333,5,1,9
1,[],"""Cats"" 1983 Broadway Cast",0.862538,0.441731,287280.0,0.406808,0.081158,0.315215,-10.69,0.176212,103.044154,0.268865,30.576923,5,1,26
2,[],"""Fiddler On The Roof” Motion Picture Chorus",0.856571,0.348286,328920.0,0.286571,0.024593,0.325786,-15.230714,0.118514,77.375857,0.354857,34.857143,0,1,7
3,[],"""Fiddler On The Roof” Motion Picture Orchestra",0.884926,0.425074,262890.962963,0.24577,0.073587,0.275481,-15.63937,0.1232,88.66763,0.37203,34.851852,0,1,27
4,[],"""Joseph And The Amazing Technicolor Dreamcoat""...",0.510714,0.467143,270436.142857,0.488286,0.0094,0.195,-10.236714,0.098543,122.835857,0.482286,43.0,5,1,7
5,[],"""Joseph And The Amazing Technicolor Dreamcoat""...",0.609556,0.487278,205091.944444,0.309906,0.004696,0.274767,-18.266389,0.098022,118.648944,0.441556,32.777778,5,1,36
6,[],"""Mama"" Helen Teagarden",0.725,0.637,135533.0,0.512,0.186,0.426,-20.615,0.21,134.819,0.885,0.0,8,1,2
7,[],"""Test for Victor Young""",0.927,0.734,175693.0,0.474,0.0762,0.737,-10.544,0.256,132.788,0.902,3.0,10,1,2
8,"['comedy rock', 'comic', 'parody']","""Weird Al"" Yankovic",0.173145,0.662787,218948.196721,0.695393,5e-05,0.161102,-9.768705,0.084536,133.03118,0.751344,34.229508,9,1,122
9,"['emo rap', 'florida rap', 'sad rap', 'undergr...",$NOT,0.544467,0.7898,137910.466667,0.532933,0.023063,0.1803,-9.149267,0.293687,112.3448,0.4807,67.533333,1,1,15


As we can see the genres are all in inside the list so we have to extract the word from the list using regex 

In [11]:
data_w_genre.shape

(28680, 16)

In [12]:
data_w_genre['genres_upd'] = data_w_genre['genres'].apply(lambda x: [re.sub(' ','_',i) for i in re.findall(r"'([^']*)'", x)])

In [13]:
data_w_genre['genres_upd'].values[0][0]

'show_tunes'

Now, we solve the problem for the column in Genre, but the same problem was happened in our data set for artist column

In [14]:
spotify_df['artist_upd_v1'] = spotify_df['artists'].apply(lambda x:[re.sub(' ','_',i) for i in re.findall(r"'([^']*)'",x)])

In [15]:
spotify_df['artists'].values[0]

"['Sergei Rachmaninoff', 'James Levine', 'Berliner Philharmoniker']"

In [16]:
spotify_df['artist_upd_v1'].values[0]

['Sergei_Rachmaninoff', 'James_Levine', 'Berliner_Philharmoniker']

In [17]:
spotify_df[spotify_df['artist_upd_v1'].apply(lambda x: not x)].head(5)

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,artist_upd_v1
143,0.3,1921,0.772,"[""Scarlet D'Carpio""]",0.56,249370,0.313,0,7b4eHImKQ51DYaQvNTdtEp,5e-06,6,0.115,-8.346,0,Himno Nacional del Perú,0,1921-09-23,0.0376,107.501,[]
234,0.902,1923,0.994,"[""King Oliver's Creole Jazz Band""]",0.708,194533,0.361,0,1xEEYhWxT4WhDQdxfPCT8D,0.883,0,0.103,-11.764,0,Snake Rag,20,1923,0.0441,105.695,[]
238,0.554,1923,0.996,"[""King Oliver's Creole Jazz Band""]",0.546,170827,0.189,0,3rauXVLOOM5BlxWqUcDpkg,0.908,0,0.339,-15.984,1,Chimes Blues,13,1923,0.0581,80.318,[]
244,0.319,1923,0.995,"[""Clarence Williams' Blue Five""]",0.52,197493,0.153,0,1UdqHVRFYMZKU2Q7xkLtYc,0.131,0,0.353,-14.042,1,Pickin' On Your Baby,11,1923,0.044,102.937,[]
249,0.753,1923,0.994,"[""King Oliver's Creole Jazz Band""]",0.359,187227,0.357,0,5SvyP1ZeJX1jA7AOZD08NA,0.819,3,0.29,-11.81,1,Tears,10,1923,0.0511,205.053,[]


In [18]:
spotify_df['artists_upd_v2'] = spotify_df['artists'].apply(lambda x: re.findall('\"(.*?)\"',x))
spotify_df['artists_upd'] = np.where(spotify_df['artist_upd_v1'].apply(lambda x: not x), spotify_df['artists_upd_v2'], spotify_df['artist_upd_v1'] )

In [19]:
spotify_df.head(10)

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,...,loudness,mode,name,popularity,release_date,speechiness,tempo,artist_upd_v1,artists_upd_v2,artists_upd
0,0.0594,1921,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,...,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954,"[Sergei_Rachmaninoff, James_Levine, Berliner_P...",[],"[Sergei_Rachmaninoff, James_Levine, Berliner_P..."
1,0.963,1921,0.732,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.0,...,-12.441,1,Clancy Lowered the Boom,5,1921,0.415,60.936,[Dennis_Day],[],[Dennis_Day]
2,0.0394,1921,0.961,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913,...,-14.85,1,Gati Bali,5,1921,0.0339,110.339,[KHP_Kridhamardawa_Karaton_Ngayogyakarta_Hadin...,[],[KHP_Kridhamardawa_Karaton_Ngayogyakarta_Hadin...
3,0.165,1921,0.967,['Frank Parker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,2.8e-05,...,-9.316,1,Danny Boy,3,1921,0.0354,100.109,[Frank_Parker],[],[Frank_Parker]
4,0.253,1921,0.957,['Phil Regan'],0.418,166693,0.193,0,4d6HGyGT8e121BsdKmw9v6,2e-06,...,-10.096,1,When Irish Eyes Are Smiling,2,1921,0.038,101.665,[Phil_Regan],[],[Phil_Regan]
5,0.196,1921,0.579,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.697,395076,0.346,0,4pyw9DVHGStUre4J6hPngr,0.168,...,-12.506,1,Gati Mardika,6,1921,0.07,119.824,[KHP_Kridhamardawa_Karaton_Ngayogyakarta_Hadin...,[],[KHP_Kridhamardawa_Karaton_Ngayogyakarta_Hadin...
6,0.406,1921,0.996,['John McCormack'],0.518,159507,0.203,0,5uNZnElqOS3W4fRmRYPk4T,0.0,...,-10.589,1,The Wearing of the Green,4,1921,0.0615,66.221,[John_McCormack],[],[John_McCormack]
7,0.0731,1921,0.993,['Sergei Rachmaninoff'],0.389,218773,0.088,0,02GDntOXexBFUvSgaXLPkd,0.527,...,-21.091,0,"Morceaux de fantaisie, Op. 3: No. 2, Prélude i...",2,1921,0.0456,92.867,[Sergei_Rachmaninoff],[],[Sergei_Rachmaninoff]
8,0.721,1921,0.996,['Ignacio Corsini'],0.485,161520,0.13,0,05xDjWH9ub67nJJk82yfGf,0.151,...,-21.508,0,La Mañanita - Remasterizado,0,1921-03-20,0.0483,64.678,[Ignacio_Corsini],[],[Ignacio_Corsini]
9,0.771,1921,0.982,['Fortugé'],0.684,196560,0.257,0,08zfJvRLp7pjAb94MA9JmF,0.0,...,-16.415,1,Il Etait Syndiqué,0,1921,0.399,109.378,[Fortugé],[],[Fortugé]


In [20]:
#need to create my own song identifier because there are duplicates of the same song with different ids. I see different
spotify_df['artists_song'] = spotify_df.apply(lambda row: row['artists_upd'][0]+row['name'],axis = 1)

In [21]:
spotify_df['artists_song'].value_counts

<bound method IndexOpsMixin.value_counts of 0         Sergei_RachmaninoffPiano Concerto No. 3 in D M...
1                         Dennis_DayClancy Lowered the Boom
2         KHP_Kridhamardawa_Karaton_Ngayogyakarta_Hadini...
3                                     Frank_ParkerDanny Boy
4                     Phil_ReganWhen Irish Eyes Are Smiling
                                ...                        
170648                                        Anuel_AAChina
170649                  AshnikkoHalloweenie III: Seven Days
170650                                           MAMAMOOAYA
170651                                       EminemDarkness
170652                 KEVVOBilletes Azules (with J Balvin)
Name: artists_song, Length: 170653, dtype: object>

In [22]:
spotify_df.sort_values(['artists_song','release_date'], ascending = False, inplace = True)

In [23]:
spotify_df[spotify_df['name']=='Adore You']

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,...,mode,name,popularity,release_date,speechiness,tempo,artist_upd_v1,artists_upd_v2,artists_upd,artists_song
19425,0.569,2019,0.0237,['Harry Styles'],0.676,207133,0.771,0,3jjujdWJ72nww5eGnfs2E7,7e-06,...,1,Adore You,88,2019-12-13,0.0483,99.048,[Harry_Styles],[],[Harry_Styles],Harry_StylesAdore You
38319,0.569,2019,0.0237,['Harry Styles'],0.676,207133,0.771,0,1M4qEo4HE3PRaCOM7EXNJq,7e-06,...,1,Adore You,77,2019-12-06,0.0483,99.048,[Harry_Styles],[],[Harry_Styles],Harry_StylesAdore You


In [24]:
artists_exploded = spotify_df[['artists_upd','id']].explode('artists_upd')

In [25]:
artists_exploded_enriched = artists_exploded.merge(data_w_genre, how = 'left', left_on = 'artists_upd',right_on = 'artists')
artists_exploded_enriched_nonnull = artists_exploded_enriched[~artists_exploded_enriched.genres_upd.isnull()]

In [26]:
artists_exploded_enriched_nonnull[artists_exploded_enriched_nonnull['id'] =='3jjujdWJ72nww5eGnfs2E7']

Unnamed: 0,artists_upd,id,genres,artists,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count,genres_upd


In [27]:
artists_genres_consolidated = artists_exploded_enriched_nonnull.groupby('id')['genres_upd'].apply(list).reset_index()

In [28]:
artists_genres_consolidated['consolidates_genre_lists'] = artists_genres_consolidated['genres_upd'].apply(lambda x: list(set(list(itertools.chain.from_iterable(x)))))

In [29]:
artists_genres_consolidated.head()

Unnamed: 0,id,genres_upd,consolidates_genre_lists
0,000G1xMMuwxNHmwVsBdtj1,"[[candy_pop, dance_rock, new_wave, new_wave_po...","[rock, candy_pop, new_wave_pop, dance_rock, ne..."
1,000GyYHG4uWmlXieKLij8u,"[[alternative_hip_hop, conscious_hip_hop, minn...","[conscious_hip_hop, minnesota_hip_hop, pop_rap..."
2,000u1dTg7y1XCDXi80hbBX,"[[country, country_road, country_rock]]","[country, country_road, country_rock]"
3,000x2qE0ZI3hodeVrnJK8A,"[[folk-pop, indie_folk, modern_rock, new_ameri...","[new_americana, seattle_indie, stomp_and_holle..."
4,0024tEymsoc9FyKUauQngQ,"[[cyberpunk, new_age]]","[cyberpunk, new_age]"


In [30]:
spotify_df = spotify_df.merge(artists_genres_consolidated[['id','consolidates_genre_lists']], on = 'id',how = 'left')

In [31]:
spotify_df.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,...,name,popularity,release_date,speechiness,tempo,artist_upd_v1,artists_upd_v2,artists_upd,artists_song,consolidates_genre_lists
0,0.177,1989,0.568,['조정현'],0.447,237688,0.215,0,2ghebdwe2pNXT4eL34T7pW,1e-06,...,그아픔까지사랑한거야,31,1989-06-15,0.0272,71.979,[조정현],[],[조정현],조정현그아픔까지사랑한거야,[classic_korean_pop]
1,0.352,1992,0.381,['黑豹'],0.353,316160,0.686,0,3KIuCzckjdeeVuswPo20mC,0.0,...,DON'T BREAK MY HEART,35,1992-12-22,0.0395,200.341,[黑豹],[],[黑豹],黑豹DON'T BREAK MY HEART,"[chinese_indie, chinese_indie_rock]"
2,0.458,1963,0.987,['黃國隆'],0.241,193480,0.0437,0,4prhqrLXYMjHJ6vpRAlasx,0.000453,...,藝旦調,23,1963-05-28,0.0443,85.936,[黃國隆],[],[黃國隆],黃國隆藝旦調,[]
3,0.796,1963,0.852,"['黃國隆', '王秋玉']",0.711,145720,0.111,0,5xFXTvnEe03SyvFpo6pEaE,0.0,...,草螟弄雞公,23,1963-05-28,0.0697,124.273,"[黃國隆, 王秋玉]",[],"[黃國隆, 王秋玉]",黃國隆草螟弄雞公,[]
4,0.704,1963,0.771,['黃國隆'],0.61,208760,0.175,0,6Pqs2suXEqCGx7Lxg5dlrB,0.0,...,思想起,23,1963-05-28,0.0419,124.662,[黃國隆],[],[黃國隆],黃國隆思想起,[]


In [32]:
spotify_df.shape

(170653, 24)

## 2. Feature Engineering 

-Normalize float variable

-OHE year and Popularity variables

-Create TF-IDF features off of artist genres

In [33]:
spotify_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 170653 entries, 0 to 170652
Data columns (total 24 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   valence                   170653 non-null  float64
 1   year                      170653 non-null  int64  
 2   acousticness              170653 non-null  float64
 3   artists                   170653 non-null  object 
 4   danceability              170653 non-null  float64
 5   duration_ms               170653 non-null  int64  
 6   energy                    170653 non-null  float64
 7   explicit                  170653 non-null  int64  
 8   id                        170653 non-null  object 
 9   instrumentalness          170653 non-null  float64
 10  key                       170653 non-null  int64  
 11  liveness                  170653 non-null  float64
 12  loudness                  170653 non-null  float64
 13  mode                      170653 non-null  i

In [34]:
spotify_df['year'] = spotify_df['release_date'].apply(lambda x: x.split('-')[0])

In [35]:
float_cols = spotify_df.dtypes[spotify_df.dtypes == 'float64'].index.values

In [36]:
ohe_cols = 'popularity'

In [37]:
spotify_df['popularity'].describe()

count    170653.000000
mean         31.431794
std          21.826615
min           0.000000
25%          11.000000
50%          33.000000
75%          48.000000
max         100.000000
Name: popularity, dtype: float64

In [39]:
# create 5 point buckets for popularity 
spotify_df['popularity_red'] = spotify_df['popularity'].apply(lambda x: int(x/5))

In [40]:
# tfidf can't handle nulls so fill any null values with an empty list
spotify_df['consolidates_genre_lists'] = spotify_df['consolidates_genre_lists'].apply(lambda d: d if isinstance(d, list) else [])

In [41]:
spotify_df.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,...,popularity,release_date,speechiness,tempo,artist_upd_v1,artists_upd_v2,artists_upd,artists_song,consolidates_genre_lists,popularity_red
0,0.177,1989,0.568,['조정현'],0.447,237688,0.215,0,2ghebdwe2pNXT4eL34T7pW,1e-06,...,31,1989-06-15,0.0272,71.979,[조정현],[],[조정현],조정현그아픔까지사랑한거야,[classic_korean_pop],6
1,0.352,1992,0.381,['黑豹'],0.353,316160,0.686,0,3KIuCzckjdeeVuswPo20mC,0.0,...,35,1992-12-22,0.0395,200.341,[黑豹],[],[黑豹],黑豹DON'T BREAK MY HEART,"[chinese_indie, chinese_indie_rock]",7
2,0.458,1963,0.987,['黃國隆'],0.241,193480,0.0437,0,4prhqrLXYMjHJ6vpRAlasx,0.000453,...,23,1963-05-28,0.0443,85.936,[黃國隆],[],[黃國隆],黃國隆藝旦調,[],4
3,0.796,1963,0.852,"['黃國隆', '王秋玉']",0.711,145720,0.111,0,5xFXTvnEe03SyvFpo6pEaE,0.0,...,23,1963-05-28,0.0697,124.273,"[黃國隆, 王秋玉]",[],"[黃國隆, 王秋玉]",黃國隆草螟弄雞公,[],4
4,0.704,1963,0.771,['黃國隆'],0.61,208760,0.175,0,6Pqs2suXEqCGx7Lxg5dlrB,0.0,...,23,1963-05-28,0.0419,124.662,[黃國隆],[],[黃國隆],黃國隆思想起,[],4


In [42]:
#simple function to create OHE features
#this gets passed later on
def ohe_prep(df, column, new_name): 
    """ 
    Create One Hot Encoded features of a specific column

    Parameters: 
        df (pandas dataframe): Spotify Dataframe
        column (str): Column to be processed
        new_name (str): new column name to be used
        
    Returns: 
        tf_df: One hot encoded features 
    """
    
    tf_df = pd.get_dummies(df[column])
    feature_names = tf_df.columns
    tf_df.columns = [new_name + "|" + str(i) for i in feature_names]
    tf_df.reset_index(drop = True, inplace = True)    
    return tf_df


### TF-IDF 

In [43]:
#function to build entire feature set
def create_feature_set(df, float_cols):
    """ 
    Process spotify df to create a final set of features that will be used to generate recommendations

    Parameters: 
        df (pandas dataframe): Spotify Dataframe
        float_cols (list(str)): List of float columns that will be scaled 
        
    Returns: 
        final: final set of features 
    """
    
    #tfidf genre lists
    tfidf = TfidfVectorizer()
    tfidf_matrix =  tfidf.fit_transform(df['consolidates_genre_lists'].apply(lambda x: " ".join(x)))
    genre_df = pd.DataFrame(tfidf_matrix.toarray())
    genre_df.columns = ['genre' + "|" + i for i in tfidf.get_feature_names()]
    genre_df.reset_index(drop = True, inplace=True)

    #explicity_ohe = ohe_prep(df, 'explicit','exp')    
    year_ohe = ohe_prep(df, 'year','year') * 0.5
    popularity_ohe = ohe_prep(df, 'popularity_red','pop') * 0.15

    #scale float columns
    floats = df[float_cols].reset_index(drop = True)
    scaler = MinMaxScaler()
    floats_scaled = pd.DataFrame(scaler.fit_transform(floats), columns = floats.columns) * 0.2

    #concanenate all features
    final = pd.concat([genre_df, floats_scaled, popularity_ohe, year_ohe], axis = 1)
     
    #add song id
    final['id']=df['id'].values
    
    return final

In [44]:
complete_feature_set = create_feature_set(spotify_df, float_cols=float_cols)#.mean(axis = 0)



In [45]:
complete_feature_set.head()

Unnamed: 0,genre|_hip_hop,genre|a_cappella,genre|abstract,genre|abstract_beats,genre|abstract_hip_hop,genre|acid_house,genre|acid_rock,genre|acoustic_blues,genre|acoustic_pop,genre|acoustic_punk,...,year|2012,year|2013,year|2014,year|2015,year|2016,year|2017,year|2018,year|2019,year|2020,id
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2ghebdwe2pNXT4eL34T7pW
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3KIuCzckjdeeVuswPo20mC
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4prhqrLXYMjHJ6vpRAlasx
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5xFXTvnEe03SyvFpo6pEaE
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6Pqs2suXEqCGx7Lxg5dlrB


### 3. Connect to Spotify API

In [46]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth
import spotipy.util as util

import warnings
warnings.filterwarnings("ignore")

In [47]:
client_id="**"
client_secret = "**"

In [48]:
scope = 'user-library-read'

if len(sys.argv) > 1:
    username = sys.argv[1]
else:
    print("Usage: %s username" % (sys.argv[0],))
    sys.exit()
    

In [49]:
auth_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(auth_manager=auth_manager)

In [50]:
token = util.prompt_for_user_token(scope, client_id= client_id, client_secret=client_secret, redirect_uri='http://localhost:8881/')

In [51]:
sp = spotipy.Spotify(auth=token)

In [52]:
#gather playlist names and images. 
#images aren't going to be used until I start building a UI
id_name = {}
list_photo = {}
for i in sp.current_user_playlists()['items']:

    id_name[i['name']] = i['uri'].split(':')[2]
    list_photo[i['uri'].split(':')[2]] = i['images'][0]['url']

In [54]:
id_name

{'Try': '0UnyxoU5f0C5HjMd4LBQhg',
 'Tamil': '1cZQKEpEDAt8YSRhoMDygL',
 'Peel': '3y6B7pPmI33reGUkm0rC2p',
 'Eng': '1HoeZCNCdZHZA8Wo6nd3Dx',
 'Broken': '5dFHKcyCLP66tBWR6DbfFf',
 'Bae': '7jR8iTevvz9ABWV5nAdl77',
 'Other for relax': '3lgp1fNWFKfzjZGNUgnzUo'}

In [55]:
def create_necessary_outputs(playlist_name,id_dic, df):
    """ 
    Pull songs from a specific playlist.

    Parameters: 
        playlist_name (str): name of the playlist you'd like to pull from the spotify API
        id_dic (dic): dictionary that maps playlist_name to playlist_id
        df (pandas dataframe): spotify datafram
        
    Returns: 
        playlist: all songs in the playlist THAT ARE AVAILABLE IN THE KAGGLE DATASET
    """
    
    #generate playlist dataframe
    playlist = pd.DataFrame()
    playlist_name = playlist_name

    for ix, i in enumerate(sp.playlist(id_dic[playlist_name])['tracks']['items']):
        #print(i['track']['artists'][0]['name'])
        playlist.loc[ix, 'artist'] = i['track']['artists'][0]['name']
        playlist.loc[ix, 'name'] = i['track']['name']
        playlist.loc[ix, 'id'] = i['track']['id'] # ['uri'].split(':')[2]
        playlist.loc[ix, 'url'] = i['track']['album']['images'][1]['url']
        playlist.loc[ix, 'date_added'] = i['added_at']

    playlist['date_added'] = pd.to_datetime(playlist['date_added'])  
    
    playlist = playlist[playlist['id'].isin(df['id'].values)].sort_values('date_added',ascending = False)
    
    return playlist

In [56]:
id_name

{'Try': '0UnyxoU5f0C5HjMd4LBQhg',
 'Tamil': '1cZQKEpEDAt8YSRhoMDygL',
 'Peel': '3y6B7pPmI33reGUkm0rC2p',
 'Eng': '1HoeZCNCdZHZA8Wo6nd3Dx',
 'Broken': '5dFHKcyCLP66tBWR6DbfFf',
 'Bae': '7jR8iTevvz9ABWV5nAdl77',
 'Other for relax': '3lgp1fNWFKfzjZGNUgnzUo'}

In [59]:
playlist_EDM = create_necessary_outputs('Broken', id_name,spotify_df)
#playlist_chill = create_necessary_outputs('chill',id_name, spotify_df)
#playlist_classical = create_necessary_outputs('Epic Classical',id_name, spotify_df)

In [74]:
from skimage import io
import matplotlib.pyplot as plt

def visualize_songs(df):
    """ 
    Visualize cover art of the songs in the inputted dataframe

    Parameters: 
        df (pandas dataframe): Playlist Dataframe
    """
    
    temp = df['url'].values
    plt.figure(figsize=(15,int(2 * len(temp))))
    columns = 5
    
    for i, url in enumerate(temp):
        #plt.subplot(len(temp) / columns + 1, columns, i + 1)

        image = io.imread(url)
        plt.imshow(image)
        plt.xticks(color = 'w', fontsize = 0.1)
        plt.yticks(color = 'w', fontsize = 0.1)
        plt.xlabel(df['name'].values[i], fontsize = 12)
        plt.tight_layout(h_pad=0.4, w_pad=0)
        plt.subplots_adjust(wspace=None, hspace=None)

    plt.show()

In [76]:
playlist_EDM

Unnamed: 0,artist,name,id,url,date_added
2,Kina,Can We Kiss Forever?,58wyJLv6yH1La9NIZPl3ne,https://i.scdn.co/image/ab67616d00001e02135847...,2021-12-08 08:33:11+00:00
60,Drake,God's Plan,6DCZcSspjsKoFjzjrWoCdn,https://i.scdn.co/image/ab67616d00001e02f907de...,2021-09-13 06:51:48+00:00
57,Joel Adams,Please Don't Go,3cNjgVBKTJ1SvKhunrCdVy,https://i.scdn.co/image/ab67616d00001e023a22ad...,2021-06-27 07:22:31+00:00
56,Billie Eilish,lovely (with Khalid),0u2P5u6lvoDfwTYjAADbn4,https://i.scdn.co/image/ab67616d00001e028a3f0a...,2021-06-01 18:55:06+00:00
49,Christina Perri,A Thousand Years,6lanRgr6wXibZr8KgzXxBl,https://i.scdn.co/image/ab67616d00001e023dea4a...,2021-05-17 17:45:46+00:00
48,XXXTENTACION,changes,7AFASza1mXqntmGtbxXprO,https://i.scdn.co/image/ab67616d00001e02806c16...,2021-05-17 17:41:37+00:00
47,Seon,Never Be Alright,6zJffE1wHyAu5WwNYwt0VI,https://i.scdn.co/image/ab67616d00001e02edb45d...,2021-05-17 17:41:17+00:00
44,Lewis Capaldi,Before You Go,2gMXnyrvIjhVBUZwvLZDMP,https://i.scdn.co/image/ab67616d00001e027b9639...,2021-05-17 17:38:07+00:00
41,Lewis Capaldi,Bruises,4Of7rzpRpV1mWRbhp5rAqG,https://i.scdn.co/image/ab67616d00001e02fc2101...,2021-05-17 17:36:12+00:00
40,Lewis Capaldi,Someone You Loved,7qEHsqek33rTcFNT9PFqLf,https://i.scdn.co/image/ab67616d00001e02fc2101...,2021-05-17 17:33:47+00:00


In [73]:
def generate_playlist_feature(complete_feature_set, playlist_df, weight_factor):
    """ 
    Summarize a user's playlist into a single vector

    Parameters: 
        complete_feature_set (pandas dataframe): Dataframe which includes all of the features for the spotify songs
        playlist_df (pandas dataframe): playlist dataframe
        weight_factor (float): float value that represents the recency bias. The larger the recency bias, the most priority recent songs get. Value should be close to 1. 
        
    Returns: 
        playlist_feature_set_weighted_final (pandas series): single feature that summarizes the playlist
        complete_feature_set_nonplaylist (pandas dataframe): 
    """
    
    complete_feature_set_playlist = complete_feature_set[complete_feature_set['id'].isin(playlist_df['id'].values)]#.drop('id', axis = 1).mean(axis =0)
    complete_feature_set_playlist = complete_feature_set_playlist.merge(playlist_df[['id','date_added']], on = 'id', how = 'inner')
    complete_feature_set_nonplaylist = complete_feature_set[~complete_feature_set['id'].isin(playlist_df['id'].values)]#.drop('id', axis = 1)
    
    playlist_feature_set = complete_feature_set_playlist.sort_values('date_added',ascending=False)

    most_recent_date = playlist_feature_set.iloc[0,-1]
    
    for ix, row in playlist_feature_set.iterrows():
        playlist_feature_set.loc[ix,'months_from_recent'] = int((most_recent_date.to_pydatetime() - row.iloc[-1].to_pydatetime()).days / 30)
        
    playlist_feature_set['weight'] = playlist_feature_set['months_from_recent'].apply(lambda x: weight_factor ** (-x))
    
    playlist_feature_set_weighted = playlist_feature_set.copy()
    #print(playlist_feature_set_weighted.iloc[:,:-4].columns)
    playlist_feature_set_weighted.update(playlist_feature_set_weighted.iloc[:,:-4].mul(playlist_feature_set_weighted.weight,0))
    playlist_feature_set_weighted_final = playlist_feature_set_weighted.iloc[:, :-4]
    #playlist_feature_set_weighted_final['id'] = playlist_feature_set['id']
    
    return playlist_feature_set_weighted_final.sum(axis = 0), complete_feature_set_nonplaylist

In [65]:
complete_feature_set_playlist_vector_EDM, complete_feature_set_nonplaylist_EDM = generate_playlist_feature(complete_feature_set, playlist_EDM, 1.09)
#complete_feature_set_playlist_vector_chill, complete_feature_set_nonplaylist_chill = generate_playlist_feature(complete_feature_set, playlist_chill, 1.09)

In [66]:
complete_feature_set_playlist_vector_EDM.shape

(1936,)

In [67]:
def generate_playlist_recos(df, features, nonplaylist_features):
    """ 
    Pull songs from a specific playlist.

    Parameters: 
        df (pandas dataframe): spotify dataframe
        features (pandas series): summarized playlist feature
        nonplaylist_features (pandas dataframe): feature set of songs that are not in the selected playlist
        
    Returns: 
        non_playlist_df_top_40: Top 40 recommendations for that playlist
    """
    
    non_playlist_df = df[df['id'].isin(nonplaylist_features['id'].values)]
    non_playlist_df['sim'] = cosine_similarity(nonplaylist_features.drop('id', axis = 1).values, features.values.reshape(1, -1))[:,0]
    non_playlist_df_top_40 = non_playlist_df.sort_values('sim',ascending = False).head(40)
    non_playlist_df_top_40['url'] = non_playlist_df_top_40['id'].apply(lambda x: sp.track(x)['album']['images'][1]['url'])
    
    return non_playlist_df_top_40

In [68]:
edm_top40 = generate_playlist_recos(spotify_df, complete_feature_set_playlist_vector_EDM, complete_feature_set_nonplaylist_EDM)

In [69]:
edm_top40

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,...,speechiness,tempo,artist_upd_v1,artists_upd_v2,artists_upd,artists_song,consolidates_genre_lists,popularity_red,sim,url
42189,0.753,2018,0.867,['SadBoyProlific'],0.793,89652,0.441,1,1A43TfohQt3H6K5zg28ExD,0.0,...,0.579,74.746,[SadBoyProlific],[],[SadBoyProlific],SadBoyProlificDead and Cold,"[sad_rap, emo_rap]",15,0.702195,https://i.scdn.co/image/ab67616d00001e02793786...
4943,0.722,2019,0.853,['Yxngxr1'],0.859,135000,0.545,1,5rAM2UW8MeVtBxKoNTRXUE,0.0,...,0.248,128.0,[Yxngxr1],[],[Yxngxr1],Yxngxr1Rather Do,"[sad_rap, emo_rap]",12,0.700991,https://i.scdn.co/image/ab67616d00001e02002421...
3825,0.602,2019,0.615,['guccihighwaters'],0.731,152462,0.736,0,4zsYVlQE4mCI6eyKRYJ0db,0.0,...,0.0463,130.07,[guccihighwaters],[],[guccihighwaters],guccihighwatershighschool,"[sad_rap, emo_rap]",12,0.695442,https://i.scdn.co/image/ab67616d00001e02c27929...
52071,0.434,2019,0.844,"['Powfu', 'Rxseboy']",0.788,149000,0.537,0,2L2mNkEWo818IE4fKbnP0O,0.0,...,0.107,85.03,"[Powfu, Rxseboy]",[],"[Powfu, Rxseboy]",PowfuWould Look Perfect,"[sad_rap, emo_rap]",13,0.693794,https://i.scdn.co/image/ab67616d00001e0252520f...
139870,0.656,2019,0.11,"['Daddy Yankee', 'Snow']",0.737,193227,0.86,0,5w9c2J52mkdntKOmRLeM2m,2e-06,...,0.0593,93.989,"[Daddy_Yankee, Snow]",[],"[Daddy_Yankee, Snow]",Daddy_YankeeCon Calma,"[sad_rap, emo_rap]",16,0.692774,https://i.scdn.co/image/ab67616d00001e02896ffc...
42190,0.436,2018,0.481,"['SadBoyProlific', 'Ivri']",0.87,160105,0.341,1,6tAqYm2Wcy2yrPixShJMS6,0.0027,...,0.439,119.054,"[SadBoyProlific, Ivri]",[],"[SadBoyProlific, Ivri]",SadBoyProlificAlone,"[sad_rap, emo_rap]",14,0.684923,https://i.scdn.co/image/ab67616d00001e020ce638...
42188,0.519,2018,0.502,"['SadBoyProlific', 'Kuzu Mellow']",0.862,205740,0.357,1,32ba0Jf4FfjK5mZOy0Mctq,4.8e-05,...,0.306,140.026,"[SadBoyProlific, Kuzu_Mellow]",[],"[SadBoyProlific, Kuzu_Mellow]",SadBoyProlificHeartbreak,"[sad_rap, emo_rap]",12,0.684015,https://i.scdn.co/image/ab67616d00001e02220ac6...
4942,0.723,2019,0.282,['Yxngxr1'],0.908,126544,0.567,1,409qeWve1NEa9hkbrTjaUo,0.0,...,0.148,97.99,[Yxngxr1],[],[Yxngxr1],Yxngxr1Tyler,"[sad_rap, emo_rap]",12,0.682892,https://i.scdn.co/image/ab67616d00001e0207c739...
3605,0.362,2018,0.358,['sadeyes'],0.591,141928,0.488,1,5MsydV2Ii4A7r85gSgh90x,0.0,...,0.16,109.9,[sadeyes],[],[sadeyes],sadeyesyou deserve better,"[sad_rap, emo_rap]",13,0.676914,https://i.scdn.co/image/ab67616d00001e022bd4ed...
42187,0.624,2018,0.416,['SadBoyProlific'],0.703,124891,0.39,1,19NEYj0nF2FHMr9X2HYxQd,0.0,...,0.454,79.884,[SadBoyProlific],[],[SadBoyProlific],SadBoyProlificMidnight Thoughts,"[sad_rap, emo_rap]",13,0.675464,https://i.scdn.co/image/ab67616d00001e02c5457c...
