## SearchEngine Class

In [44]:
# Data Manipulation
import numpy as np
import pandas as pd

class SearchEngine():
    def __init__(self, data):
        '''
        SearchEngine class
        Input: a songs dataset extracted from spotify API
        '''
        self.data = data

    def target_song(self, title='', artist=''):
        '''
        Search engine function for the target song
        Input: song title and/or artist
        Output: self.target dataset song
        '''
        # transform input strings in lowercase
        title = str(title).lower()
        artist = str(artist).lower()
        # filter self.data on the desired song
        if title != '' and artist != '':
            self.target = self.data[self.data[
                'artists'].str.lower().str.contains(artist)]
            self.target = self.target[self.target[
                'name'].str.lower().str.contains(title)]
        elif title != '':
            self.target = self.data[self.data[
                'name'].str.lower().str.contains(title)]
        elif artist != '':
            self.target = self.data[self.data[
                'artists'].str.lower().str.contains(artist)]
        else:
            print('Please select a song title and artist')
        # keep only the first song result
        self.target = self.target.head(1)
        self.artist = self.target["artists"].to_string(
            index=False).strip("['").strip("']")
        self.title = self.target["name"].to_string(index=False)
        print(f'TITLE: {self.title}')
        print(f'ARTIST: {self.artist}')

## Preprocessor Class

In [45]:
# Data Manipulation
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
from sklearn.utils import shuffle

# Pipeline and Column Transformers
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn import set_config
set_config(display = "diagram")

# Scaling
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler

# Package classes
# from music_similarity.search_engine import SearchEngine

class Preprocessor():
    def __init__(self, se):
        '''
        Preprocessor class
        Input: a songs dataset extracted from spotify API
        '''
        self.se = se

    def scale_data(self):
        '''
        Adapting data function
        '''
        # drop non numerical features before scaling
        self.X=self.se.data.drop(columns=['name','artists', 'uri'])
        self.X_target=self.se.target.drop(columns=['name','artists', 'uri'])
        # fit and transofrm with MinMaxScaler
        mmscaler = MinMaxScaler().fit(self.X)
        self.X_mmscaled=mmscaler.transform(self.X)
        self.X_target_mmscaled=mmscaler.transform(self.X_target)
        # fit and transofrm with RobustScaler
        roscaler = RobustScaler().fit(self.X)
        self.X_roscaled=roscaler.transform(self.X)
        self.X_target_roscaled=roscaler.transform(self.X_target)

## Playlist Class

In [46]:
# Data Manipulation
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

# Unsupervised Learning
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# K-nn modelling
from sklearn.neighbors import NearestNeighbors

# Package classes
# from music_similarity.search_engine import SearchEngine
# from music_similarity.preprocessor import Preprocessor

class Playlist():
    def __init__(self, preprocessor, se):
        '''
        Extractor class
        Input: preprocessor class dataset extracted from spotify API
        '''
        self.preprocessor = preprocessor
        self.se = se
        self.playlist_songs = 10

    def build_model(self):
        '''
        Model builder function
        '''
        # Define the model
        self.model=NearestNeighbors(
            n_neighbors=self.playlist_songs + 1).fit(
            self.preprocessor.X_mmscaled)
        # Extract index and distance of self.playlist_songs+1
        # number of colest songs
        self.distance, self.index=self.model.kneighbors(
            self.preprocessor.X_target_mmscaled,
            n_neighbors=self.playlist_songs + 1)
        # Copy found index rows from the original not scaled dataset
        self.playlist = self.se.data.iloc[self.index[0],:]
        self.playlist['distance'] = self.distance[0]
        # Remove the target song from the list
        self.playlist = self.playlist.tail(self.playlist_songs)
        # Ordering the playlist on distance, ascending order
        self.playlist = self.playlist.sort_values(
            by=['distance'], ascending=True, ignore_index=True)
        # Drop not necessary columns
        self.playlist = self.playlist[['name', 'artists', 'distance']]
        # Strip square brackets from the artists strings
        self.playlist['artists'] = self.playlist['artists'].apply(
            lambda x: x.strip("['").strip("']"))
        # Set starting index from 0 to 1
        self.playlist.index += 1

In [47]:
if 'se' in globals():
    del se
spotify = pd.read_csv('../raw_data/full_data.csv')
se = SearchEngine(spotify)
se.target_song("another one bites", "queen")

if 'preprocessor' in globals():
    del preprocessor
preprocessor = Preprocessor(se)
preprocessor.scale_data()

if 'playlist' in globals():
    del playlist
playlist = Playlist(preprocessor, se)
playlist.build_model()

TITLE: Another One Bites The Dust - Remastered 2011
ARTIST: Queen


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.playlist['distance'] = self.distance[0]


In [48]:
playlist.playlist

Unnamed: 0,name,artists,distance
1,I'm In Love,"Evelyn ""Champagne"" King",0.596029
2,Stomp!,The Brothers Johnson,0.633904
3,Mister Magic,"Grover Washington, Jr.",0.649415
4,Shibuya (feat. Syd),Free,0.656752
5,Cola Bottle Baby,Edwin Birdsong,0.667715
6,Upside Down,Diana Ross,0.673694
7,Roller Skates,Steel Pulse,0.683973
8,Atomic Dog,George Clinton,0.703227
9,Magic Carpet Ride,Steppenwolf,0.704115
10,I Been to Georgia on a Fast Train,Billy Joe Shaver,0.709949


## API DATA query

In [9]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import time

# TO DO: I'LL TURN THIS INTO A CLASS LATER

class ApiExtractor:
    def __init__(self, se):
        '''
        Extractor class
        Input: preprocessor class dataset extracted from spotify API
        '''
        # credentials need to be exported via the command line
        self.auth_manager = SpotifyClientCredentials()
        self.sp_connection = spotipy.Spotify(auth_manager=self.auth_manager)
        self.search_limit = 10
        self.se = se
    
    def get_track_attrs(self, artist, title):
        # get track attributes
        self.ta_response = self.sp_connection.search(
            q="artist:" + artist + " track:" + title,
            type="track",
            limit=1)

        # parse attributes from track
        self.track = self.ta_response['tracks']['items'][0]
        self.track_name = self.track['name']
        self.track_uri = self.track['uri']
        self.track_popularity = self.track['popularity']
        self.track_explicit = self.track['explicit']
        self.track_artists = [artist['name'] for artist in self.track['artists']]

        # get track features
        self.track_features = self.sp_connection.audio_features(tracks = self.track_uri)[0]

        return [self.track_name, self.track_artists, self.track_popularity,
                self.track_features['danceability'], self.track_features['valence'],
                self.track_features['energy'], self.track_explicit, self.track_features['key'],
                self.track_features['liveness'], self.track_features['loudness'],
                self.track_features['speechiness'], self.track_features['tempo']]
    
    def create_df_songs(self):
        '''
        Function that create self.df_songs dataframe
        with the following features:
        
        - uri 
        - name 
        - artists 
        - popylarity
        - explicit 
        features
        '''
        # extract the list of artists in a np array
        # artists_array from the baseline dataset
        self.artists_array = np.array([])
        for artist in self.se.data.artists.unique():
            self.artists_array = np.append(self.artists_array, artist)
        
        # Test array for debugging purposes
        # self.test_artists_array = np.array(["['Robin Trower', 'Jack Bruce', 'Bill Lordan']",   TEST
        #                                     "['Michael Hedges']"])                             TEST
         
        # find 10 most popular songs for each artist
        # creating features np arrays to store first 5 informations
        self.songs_uri_array = np.array([])
        self.songs_name_array = np.array([])
        self.songs_artists_array = np.array([])
        self.songs_popularity_array = np.array([])
        self.songs_explicit_array = np.array([])
        
        # Test array to not overload API requests
        # for artist in self.test_artists_array:                                                 TEST
        for artist in self.artists_array:
            # Perform API search request
            time.sleep(0.015)
            print(f'{artist} ...')
            # Try the query
            try:
                self.asl_response = self.sp_connection.search(
                                    q="artist:" + artist,
                                    type="track", 
                                    limit=self.search_limit)
            # If there is an exception:
            # jump to the next loop
            except:
                print(f"Oops! {artist} not valid.  Skipped...")
                continue
            
            # iteration over the self.search_limit songs
            for i in range(len(self.asl_response['tracks']['items'])):
                # append uri of the track
                self.songs_uri_array = np.append(
                    self.songs_uri_array, self.asl_response[
                        'tracks']['items'][i]['uri'])
                # append name of the track
                self.songs_name_array = np.append(
                    self.songs_name_array, self.asl_response[
                        'tracks']['items'][i]['name'])
                # append artists of the track
                self.songs_artists_array = np.append(
                    self.songs_artists_array, artist)
                # append popularity of the track
                self.songs_popularity_array = np.append(
                    self.songs_popularity_array, self.asl_response[
                        'tracks']['items'][i]['popularity'])
                # append explicit of the track
                self.songs_explicit_array = np.append(
                    self.songs_explicit_array, self.asl_response[
                        'tracks']['items'][i]['explicit'])

        self.df_songs = pd.DataFrame()
        self.df_songs['uri'] = self.songs_uri_array
        self.df_songs['name'] = self.songs_name_array
        self.df_songs['artists'] = self.songs_artists_array
        self.df_songs['popularity'] = self.songs_popularity_array
        self.df_songs['explicit'] = self.songs_explicit_array
        #self.df_songs.to_csv('../raw_data/songs.csv')
    
    def create_df_audio_analysis(self):
        '''
        Function that create a self.df_analysis dataframe 
        with the following features:
        
        - 12 pitch features
        - 12 timbre features 
        - uri 
        '''
        # From uri array list, query pitch and timbre features
        self.df_analysis = pd.DataFrame()
        self.df_songs_csv = pd.read_csv('../raw_data/songs.csv')
        self.songs_uri_array_csv = self.df_songs_csv['uri']
        # For testing purposes only iterate over the first 3 uri
        #for uri in self.songs_uri_array[:3]:
        #for uri in self.songs_uri_array:
        for uri in self.songs_uri_array_csv:
            time.sleep(0.015)
            print(f'{uri} ...')
            # Perform API audio_analysis request
            try:
                self.track_analysis = self.sp_connection.audio_analysis(track_id = uri)
            except:
                print(f"Oops! {uri} not valid.  Skipped...")
                continue
            # Reset segments arrays before new song analysis
            self.segment_pitch = []
            self.segment_timbre = []
            # Extract pitch and timbre for each segment of the song
            for segment in range(len(self.track_analysis['segments'])):
                self.segment_pitch.append(self.track_analysis['segments'][segment]['pitches'])
                self.segment_timbre.append(self.track_analysis['segments'][segment]['timbre'])
            # Reset df_song_segments dataframe
            self.df_song_segments = pd.DataFrame()
            # create pitch and timbre columns with np arrays
            self.df_song_segments['segment_pitch'] = self.segment_pitch
            self.df_song_segments['segment_timbre'] = self.segment_timbre
            # Split the array elements in different columns
            self.df_split_pitches = pd.DataFrame(self.df_song_segments['segment_pitch'].tolist(), columns=['sp1', 'sp2', 'sp3', 'sp4', 'sp5', 'sp6', 'sp7', 'sp8', 'sp9', 'sp10', 'sp11', 'sp12'])
            self.df_split_timbres = pd.DataFrame(self.df_song_segments['segment_timbre'].tolist(), columns=['tm1', 'tm2', 'tm3', 'tm4', 'tm5', 'tm6', 'tm7', 'tm8', 'tm9', 'tm10', 'tm11', 'tm12'])
            # Add new columns to df_segment
            self.df_song_segments = pd.concat([self.df_song_segments, self.df_split_pitches, self.df_split_timbres], axis=1)
            self.df_song_segments.drop(['segment_pitch', 'segment_timbre'], axis = 1, inplace = True)
            # Transpose mean serie into a dataframe row
            self.df_song_segments = self.df_song_segments.mean().to_frame().T
            # Add uri column to perform the future merge
            self.df_song_segments['uri'] = uri
            self.df_analysis = pd.concat([self.df_analysis, self.df_song_segments])
        #self.df_analysis.to_csv('../raw_data/analysis.csv')
    
    def create_df_audio_features(self):
        '''
        Function that create a self.df_features dataframe 
        with the following features:
        
        - danceability
        - energy
        - key
        - loudness
        - mode
        - speechiness
        - acousticness
        - instrumentalness
        - liveness
        - valence
        - tempo
        - uri
        '''
        self.df_features = pd.DataFrame()
        self.df_songs_csv = pd.read_csv('../raw_data/songs.csv')
        self.songs_uri_array_csv = self.df_songs_csv['uri']
        #for uri in self.songs_uri_array[:2]:
        for uri in self.songs_uri_array_csv:
            time.sleep(0.015)
            print(f'{uri} ...')
            # Perform API audio_features request
            try:
                self.track_features = self.sp_connection.audio_features(tracks = uri)[0]
            except:
                print(f"Oops! {uri} not valid.  Skipped...")
                continue
            self.df_track_features = pd.DataFrame()
            self.df_track_features = pd.DataFrame.from_dict([self.track_features])
            try:
                self.df_track_features.drop(columns=[
                'type',
                'track_href', 
                'analysis_url', 
                'duration_ms', 
                'time_signature', 
                'id'], inplace=True)
            except:
                print(f"Oops! {uri} has no features.  Skipped...")
            self.df_features = pd.concat([self.df_features, self.df_track_features])
        #self.df_features.to_csv('../raw_data/features.csv')
    
    def merge_dataframes(self):
        '''
        Function that merge all the dataframe toghether
        '''
        self.full_data = pd.merge(self.df_songs, self.df_analysis, how='outer', on='uri')
        self.full_data = pd.merge(self.full_data, self.df_features, how='outer', on='uri')
        

In [10]:
# if 'ae' in globals():
#     del ae
# ae = ApiExtractor(se)

# ae.create_df_audio_features()
# ae.create_df_songs()
# ae.create_df_audio_analysis()

# ae.df_songs.to_csv('../raw_data/songs.csv')
# ae.df_analysis.to_csv('../raw_data/analysis.csv')
# ae.df_features.to_csv('../raw_data/features.csv')

In [72]:
df_songs_csv = pd.read_csv('../raw_data/songs.csv', index_col='Unnamed: 0')
df_features_csv = pd.read_csv('../raw_data/features.csv', index_col='Unnamed: 0')
df_analysis_csv = pd.read_csv('../raw_data/analysis.csv', index_col='Unnamed: 0')

In [73]:
full_data = pd.merge(df_songs_csv, df_features_csv, how='left', on='uri')
full_data = pd.merge(full_data, df_analysis_csv, how='left', on='uri').drop(columns='0')

# Romove duplicate songs
full_data = full_data.drop_duplicates(
    subset = ['name', 'artists'],
    keep = 'last').reset_index(drop = True)

In [74]:
full_data.dropna(inplace=True)

# Romove duplicate songs
full_data = full_data.drop_duplicates(
    subset = ['sp1', 'sp2', 'sp3', 'sp4', 'sp5', 
              'sp6', 'sp7', 'sp8', 'sp9', 'sp10', 
              'sp11', 'sp12', 'tm1', 'tm2', 'tm3', 
              'tm4', 'tm5', 'tm6', 'tm7', 'tm8', 
              'tm9', 'tm10', 'tm11', 'tm12'],
    keep = 'last').reset_index(drop = True)
full_data

Unnamed: 0,uri,name,artists,popularity,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,sp1,sp2,sp3,sp4,sp5,sp6,sp7,sp8,sp9,sp10,sp11,sp12,tm1,tm2,tm3,tm4,tm5,tm6,tm7,tm8,tm9,tm10,tm11,tm12
0,spotify:track:0zK545STj6P7qbFSpCK9pp,No Matter What - Remastered 2010,['Badfinger'],69.0,0.0,0.529,0.743,9.0,-5.523,1.0,0.0481,0.17700,0.000000,0.0759,0.780,116.817,0.377665,0.519791,0.327122,0.298292,0.521404,0.281361,0.393238,0.261079,0.296045,0.586200,0.266665,0.424601,51.549351,49.471764,28.402259,-6.611691,8.672701,-15.851377,-3.023365,-2.810903,-11.967209,-6.968844,-7.888580,-0.390810
1,spotify:track:6S3JlDAGk3uu3NtZbPnuhS,Baby Blue - Remastered 2010,['Badfinger'],65.0,0.0,0.623,0.876,11.0,-7.788,1.0,0.0660,0.01890,0.013400,0.0728,0.515,124.431,0.552265,0.583575,0.377556,0.324087,0.394371,0.324429,0.408047,0.312939,0.295168,0.296182,0.284657,0.507354,49.332953,51.188700,-0.719471,-7.483537,5.189068,-23.154776,4.812846,-3.114357,-3.276186,-3.877942,-9.597275,6.164654
2,spotify:track:2XKW8CH8nRZH9cF2DNjBHN,Day After Day - Remastered 2010,['Badfinger'],60.0,0.0,0.399,0.686,5.0,-7.112,1.0,0.0311,0.10900,0.000874,0.1120,0.377,101.806,0.377757,0.255924,0.265777,0.247239,0.238636,0.713636,0.380202,0.362070,0.225683,0.257718,0.264996,0.190041,49.577311,29.223746,16.601288,-7.617689,9.076881,-18.725906,16.871063,1.604791,-13.342325,-4.138804,-9.655491,7.142679
3,spotify:track:54qG9hpUjLgkgTJQ9qvB1P,Dear Angie - Remastered 2010,['Badfinger'],58.0,0.0,0.633,0.417,1.0,-9.110,0.0,0.0265,0.30000,0.000003,0.1140,0.506,81.976,0.334110,0.466673,0.210208,0.201350,0.266696,0.234112,0.521972,0.270786,0.329369,0.485270,0.253291,0.361730,44.837955,22.996134,-28.527643,-21.781815,1.009327,-8.241802,-17.597955,-8.792550,-6.962940,-4.061805,-14.220998,-1.930820
4,spotify:track:5CmGnAY7I7Ggj303HZBbkX,I'd Die Babe - Remastered 2010,['Badfinger'],58.0,0.0,0.684,0.669,7.0,-8.327,1.0,0.0366,0.00893,0.000252,0.2830,0.874,126.802,0.374049,0.352641,0.374391,0.216346,0.355064,0.355253,0.296529,0.358758,0.273996,0.433620,0.261253,0.246302,47.798775,73.709352,-10.897862,-10.130981,-6.458335,-12.240673,2.769227,-4.071134,-9.539115,-5.332803,-12.153168,-1.156569
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44909,spotify:track:3htJKHlAjUoRmNLheJ8uTV,Tantrum,['Ashnikko'],63.0,1.0,0.777,0.787,9.0,-4.710,1.0,0.0732,0.13800,0.000000,0.0685,0.698,138.043,0.472469,0.556781,0.430952,0.301991,0.437339,0.357109,0.214463,0.180509,0.252104,0.384439,0.214152,0.379581,51.032744,42.784456,-9.147883,3.434815,26.930922,-13.120420,-4.904157,-9.114287,-5.917470,3.399915,-18.153050,-3.134885
44910,spotify:track:37KAHk68qniQvtacrldNmM,Toxic,['Ashnikko'],62.0,1.0,0.843,0.546,2.0,-6.376,1.0,0.0678,0.05350,0.000000,0.2030,0.718,126.098,0.325847,0.562177,0.433343,0.251156,0.150747,0.173755,0.148056,0.292637,0.149910,0.227466,0.225946,0.165433,48.515961,76.318860,22.737798,11.825056,44.593420,-12.249594,12.510079,-11.247243,-21.436805,6.440411,-20.204795,-7.432665
44911,spotify:track:13gqe2AjtaPexcBoqA10IX,Panic Attacks in Paradise,['Ashnikko'],62.0,0.0,0.695,0.386,8.0,-5.114,0.0,0.0337,0.55400,0.000000,0.0906,0.214,127.947,0.179601,0.219873,0.143695,0.450385,0.287020,0.127325,0.261494,0.142457,0.388980,0.166002,0.318616,0.394261,48.624792,37.820737,-10.998397,-32.279270,38.264513,-5.968915,-10.864087,-5.028580,0.202213,-2.764809,-16.266090,6.203757
44912,spotify:track:2hwUHHqLSEue41V952MtD2,Halloweenie IV: Innards,['Ashnikko'],61.0,0.0,0.760,0.592,1.0,-6.379,1.0,0.1770,0.02460,0.000000,0.1250,0.678,136.045,0.485242,0.506790,0.269729,0.374321,0.322699,0.231216,0.309184,0.265014,0.344039,0.259721,0.216127,0.280809,48.444239,97.938711,3.270557,-2.038748,44.802160,-13.183239,-10.260704,-2.715768,-10.664443,12.549157,-17.083035,-1.722709


In [62]:
#df_songs_csv = df_songs_csv.groupby(['name', 'artists']).first().reset_index()

# drop rows which have same order_id
# and customer_id and keep latest entry
df_songs_csv = df_songs_csv.drop_duplicates(
  subset = ['name', 'artists'],
  keep = 'last').reset_index(drop = True)
  
display(df_songs_csv)

Unnamed: 0,uri,name,artists,popularity,explicit
0,spotify:track:0zK545STj6P7qbFSpCK9pp,No Matter What - Remastered 2010,['Badfinger'],69.0,0.0
1,spotify:track:6S3JlDAGk3uu3NtZbPnuhS,Baby Blue - Remastered 2010,['Badfinger'],65.0,0.0
2,spotify:track:2XKW8CH8nRZH9cF2DNjBHN,Day After Day - Remastered 2010,['Badfinger'],60.0,0.0
3,spotify:track:54qG9hpUjLgkgTJQ9qvB1P,Dear Angie - Remastered 2010,['Badfinger'],58.0,0.0
4,spotify:track:5CmGnAY7I7Ggj303HZBbkX,I'd Die Babe - Remastered 2010,['Badfinger'],58.0,0.0
...,...,...,...,...,...
46192,spotify:track:37KAHk68qniQvtacrldNmM,Toxic,['Ashnikko'],62.0,1.0
46193,spotify:track:13gqe2AjtaPexcBoqA10IX,Panic Attacks in Paradise,['Ashnikko'],62.0,0.0
46194,spotify:track:2hwUHHqLSEue41V952MtD2,Halloweenie IV: Innards,['Ashnikko'],61.0,0.0
46195,spotify:track:7HmnJHfs0BkFzX4x8j0hkl,Billetes Azules (with J Balvin),"['KEVVO', 'J Balvin']",49.0,1.0
