## SearchEngine Class

In [4]:
# Data Manipulation
import numpy as np
import pandas as pd

class SearchEngine():
    def __init__(self, data):
        '''
        SearchEngine class
        Input: a songs dataset extracted from spotify API
        '''
        self.data = data

    def target_song(self, title='', artist=''):
        '''
        Search engine function for the target song
        Input: song title and/or artist
        Output: self.target dataset song
        '''
        # transform input strings in lowercase
        title = str(title).lower()
        artist = str(artist).lower()
        # filter self.data on the desired song
        if title != '' and artist != '':
            self.target = self.data[self.data[
                'artists'].str.lower().str.contains(artist)]
            self.target = self.target[self.target[
                'name'].str.lower().str.contains(title)]
        elif title != '':
            self.target = self.data[self.data[
                'name'].str.lower().str.contains(title)]
        elif artist != '':
            self.target = self.data[self.data[
                'artists'].str.lower().str.contains(artist)]
        else:
            print('Please select a song title and artist')
        # keep only the first song result
        self.target = self.target.head(1)
        self.artist = self.target["artists"].to_string(
            index=False).strip("['").strip("']")
        self.title = self.target["name"].to_string(index=False)
        print(f'TITLE: {self.title}')
        print(f'ARTIST: {self.artist}')

## Preprocessor Class

In [5]:
# Data Manipulation
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
from sklearn.utils import shuffle

# Pipeline and Column Transformers
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn import set_config
set_config(display = "diagram")

# Scaling
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler

# Package classes
# from music_similarity.search_engine import SearchEngine

class Preprocessor():
    def __init__(self, se):
        '''
        Preprocessor class
        Input: a songs dataset extracted from spotify API
        '''
        self.se = se

    def scale_data(self):
        '''
        Adapting data function
        '''
        # drop non numerical features before scaling
        self.X=self.se.data.drop(columns=['name','artists'])
        self.X_target=self.se.target.drop(columns=['name','artists'])
        # fit and transofrm with MinMaxScaler
        mmscaler = MinMaxScaler().fit(self.X)
        self.X_mmscaled=mmscaler.transform(self.X)
        self.X_target_mmscaled=mmscaler.transform(self.X_target)
        # fit and transofrm with RobustScaler
        roscaler = RobustScaler().fit(self.X)
        self.X_roscaled=roscaler.transform(self.X)
        self.X_target_roscaled=roscaler.transform(self.X_target)

## Playlist Class

In [6]:
# Data Manipulation
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

# Unsupervised Learning
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# K-nn modelling
from sklearn.neighbors import NearestNeighbors

# Package classes
# from music_similarity.search_engine import SearchEngine
# from music_similarity.preprocessor import Preprocessor

class Playlist():
    def __init__(self, preprocessor, se):
        '''
        Extractor class
        Input: preprocessor class dataset extracted from spotify API
        '''
        self.preprocessor = preprocessor
        self.se = se
        self.playlist_songs = 10

    def build_model(self):
        '''
        Model builder function
        '''
        # Define the model
        self.model=NearestNeighbors(
            n_neighbors=self.playlist_songs + 1).fit(
            self.preprocessor.X_mmscaled)
        # Extract index and distance of self.playlist_songs+1
        # number of colest songs
        self.distance, self.index=self.model.kneighbors(
            self.preprocessor.X_target_mmscaled,
            n_neighbors=self.playlist_songs + 1)
        # Copy found index rows from the original not scaled dataset
        self.playlist = self.se.data.iloc[self.index[0],:]
        self.playlist['distance'] = self.distance[0]
        # Remove the target song from the list
        self.playlist = self.playlist.tail(self.playlist_songs)
        # Ordering the playlist on distance, ascending order
        self.playlist = self.playlist.sort_values(
            by=['distance'], ascending=True, ignore_index=True)
        # Drop not necessary columns
        self.playlist = self.playlist[['name', 'artists', 'distance']]
        # Strip square brackets from the artists strings
        self.playlist['artists'] = self.playlist['artists'].apply(
            lambda x: x.strip("['").strip("']"))
        # Set starting index from 0 to 1
        self.playlist.index += 1

In [7]:
if 'se' in globals():
    del se
spotify = pd.read_csv('../raw_data/ML_spotify_data.csv')
se = SearchEngine(spotify)
se.target_song("f", "u2")

if 'preprocessor' in globals():
    del preprocessor
preprocessor = Preprocessor(se)
preprocessor.scale_data()

if 'playlist' in globals():
    del playlist
playlist = Playlist(preprocessor, se)
playlist.build_model()

TITLE: A Sort Of Homecoming - Live
ARTIST: U2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.playlist['distance'] = self.distance[0]


In [8]:
playlist.playlist

Unnamed: 0,name,artists,distance
1,You've Got Another Thing Coming - Live from th...,Judas Priest,0.171471
2,Breaking The Silence - Remastered 2003,Queensrÿche,0.223612
3,"It Ain't Me, Babe - Live at LA Forum, Inglewoo...",Bob Dylan,0.232693
4,"Blasphemous Rumours - Live at Rose Bowl, Pasad...",Depeche Mode,0.236184
5,Feats Don't Fail Me Now - Live at Lisner Audit...,Little Feat,0.238395
6,Shoot Shoot - Live / 2008 Remaster,UFO,0.239608
7,"Nutrocker - Live At Newcastle City Hall, 26.3....","Emerson, Lake & Palmer",0.244874
8,Comin' Atcha Live / Truckin' - Live At The Tro...,Tesla,0.245215
9,Battle Angels,Sanctuary,0.269785
10,Baba O'Riley - Live At Shepperton,The Who,0.270837


## API DATA query

In [9]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import time

# TO DO: I'LL TURN THIS INTO A CLASS LATER

class ApiExtractor:
    def __init__(self, se):
        '''
        Extractor class
        Input: preprocessor class dataset extracted from spotify API
        '''
        # credentials need to be exported via the command line
        self.auth_manager = SpotifyClientCredentials()
        self.sp_connection = spotipy.Spotify(auth_manager=self.auth_manager)
        self.search_limit = 10
        self.se = se
    
    def get_track_attrs(self, artist, title):
        # get track attributes
        self.ta_response = self.sp_connection.search(
            q="artist:" + artist + " track:" + title,
            type="track",
            limit=1)

        # parse attributes from track
        self.track = self.ta_response['tracks']['items'][0]
        self.track_name = self.track['name']
        self.track_uri = self.track['uri']
        self.track_popularity = self.track['popularity']
        self.track_explicit = self.track['explicit']
        self.track_artists = [artist['name'] for artist in self.track['artists']]

        # get track features
        self.track_features = self.sp_connection.audio_features(tracks = self.track_uri)[0]

        return [self.track_name, self.track_artists, self.track_popularity,
                self.track_features['danceability'], self.track_features['valence'],
                self.track_features['energy'], self.track_explicit, self.track_features['key'],
                self.track_features['liveness'], self.track_features['loudness'],
                self.track_features['speechiness'], self.track_features['tempo']]
    
    def create_df_songs(self):
        '''
        Function that create self.df_songs dataframe
        with the following features:
        
        - uri 
        - name 
        - artists 
        - popylarity
        - explicit 
        features
        '''
        # extract the list of artists in a np array
        # artists_array from the baseline dataset
        self.artists_array = np.array([])
        for artist in self.se.data.artists.unique():
            self.artists_array = np.append(self.artists_array, artist)
        
        # Test array for debugging purposes
        # self.test_artists_array = np.array(["['Robin Trower', 'Jack Bruce', 'Bill Lordan']",   TEST
        #                                     "['Michael Hedges']"])                             TEST
         
        # find 10 most popular songs for each artist
        # creating features np arrays to store first 5 informations
        self.songs_uri_array = np.array([])
        self.songs_name_array = np.array([])
        self.songs_artists_array = np.array([])
        self.songs_popularity_array = np.array([])
        self.songs_explicit_array = np.array([])
        
        # Test array to not overload API requests
        # for artist in self.test_artists_array:                                                 TEST
        for artist in self.artists_array:
            # Perform API search request
            time.sleep(0.015)
            print(f'{artist} ...')
            # Try the query
            try:
                self.asl_response = self.sp_connection.search(
                                    q="artist:" + artist,
                                    type="track", 
                                    limit=self.search_limit)
            # If there is an exception:
            # jump to the next loop
            except:
                print(f"Oops! {artist} not valid.  Skipped...")
                continue
            
            # iteration over the self.search_limit songs
            for i in range(len(self.asl_response['tracks']['items'])):
                # append uri of the track
                self.songs_uri_array = np.append(
                    self.songs_uri_array, self.asl_response[
                        'tracks']['items'][i]['uri'])
                # append name of the track
                self.songs_name_array = np.append(
                    self.songs_name_array, self.asl_response[
                        'tracks']['items'][i]['name'])
                # append artists of the track
                self.songs_artists_array = np.append(
                    self.songs_artists_array, artist)
                # append popularity of the track
                self.songs_popularity_array = np.append(
                    self.songs_popularity_array, self.asl_response[
                        'tracks']['items'][i]['popularity'])
                # append explicit of the track
                self.songs_explicit_array = np.append(
                    self.songs_explicit_array, self.asl_response[
                        'tracks']['items'][i]['explicit'])

        self.df_songs = pd.DataFrame()
        self.df_songs['uri'] = self.songs_uri_array
        self.df_songs['name'] = self.songs_name_array
        self.df_songs['artists'] = self.songs_artists_array
        self.df_songs['popularity'] = self.songs_popularity_array
        self.df_songs['explicit'] = self.songs_explicit_array
        #self.df_songs.to_csv('../raw_data/songs.csv')
    
    def create_df_audio_analysis(self):
        '''
        Function that create a self.df_analysis dataframe 
        with the following features:
        
        - 12 pitch features
        - 12 timbre features 
        - uri 
        '''
        # From uri array list, query pitch and timbre features
        self.df_analysis = pd.DataFrame()
        self.df_songs_csv = pd.read_csv('../raw_data/songs.csv')
        self.songs_uri_array_csv = self.df_songs_csv['uri']
        # For testing purposes only iterate over the first 3 uri
        #for uri in self.songs_uri_array[:3]:
        #for uri in self.songs_uri_array:
        for uri in self.songs_uri_array_csv:
            time.sleep(0.015)
            print(f'{uri} ...')
            # Perform API audio_analysis request
            try:
                self.track_analysis = self.sp_connection.audio_analysis(track_id = uri)
            except:
                print(f"Oops! {uri} not valid.  Skipped...")
                continue
            # Reset segments arrays before new song analysis
            self.segment_pitch = []
            self.segment_timbre = []
            # Extract pitch and timbre for each segment of the song
            for segment in range(len(self.track_analysis['segments'])):
                self.segment_pitch.append(self.track_analysis['segments'][segment]['pitches'])
                self.segment_timbre.append(self.track_analysis['segments'][segment]['timbre'])
            # Reset df_song_segments dataframe
            self.df_song_segments = pd.DataFrame()
            # create pitch and timbre columns with np arrays
            self.df_song_segments['segment_pitch'] = self.segment_pitch
            self.df_song_segments['segment_timbre'] = self.segment_timbre
            # Split the array elements in different columns
            self.df_split_pitches = pd.DataFrame(self.df_song_segments['segment_pitch'].tolist(), columns=['sp1', 'sp2', 'sp3', 'sp4', 'sp5', 'sp6', 'sp7', 'sp8', 'sp9', 'sp10', 'sp11', 'sp12'])
            self.df_split_timbres = pd.DataFrame(self.df_song_segments['segment_timbre'].tolist(), columns=['tm1', 'tm2', 'tm3', 'tm4', 'tm5', 'tm6', 'tm7', 'tm8', 'tm9', 'tm10', 'tm11', 'tm12'])
            # Add new columns to df_segment
            self.df_song_segments = pd.concat([self.df_song_segments, self.df_split_pitches, self.df_split_timbres], axis=1)
            self.df_song_segments.drop(['segment_pitch', 'segment_timbre'], axis = 1, inplace = True)
            # Transpose mean serie into a dataframe row
            self.df_song_segments = self.df_song_segments.mean().to_frame().T
            # Add uri column to perform the future merge
            self.df_song_segments['uri'] = uri
            self.df_analysis = pd.concat([self.df_analysis, self.df_song_segments])
        #self.df_analysis.to_csv('../raw_data/analysis.csv')
    
    def create_df_audio_features(self):
        '''
        Function that create a self.df_features dataframe 
        with the following features:
        
        - danceability
        - energy
        - key
        - loudness
        - mode
        - speechiness
        - acousticness
        - instrumentalness
        - liveness
        - valence
        - tempo
        - uri
        '''
        self.df_features = pd.DataFrame()
        self.df_songs_csv = pd.read_csv('../raw_data/songs.csv')
        self.songs_uri_array_csv = self.df_songs_csv['uri']
        #for uri in self.songs_uri_array[:2]:
        for uri in self.songs_uri_array_csv:
            time.sleep(0.015)
            print(f'{uri} ...')
            # Perform API audio_features request
            try:
                self.track_features = self.sp_connection.audio_features(tracks = uri)[0]
            except:
                print(f"Oops! {uri} not valid.  Skipped...")
                continue
            self.df_track_features = pd.DataFrame()
            self.df_track_features = pd.DataFrame.from_dict([self.track_features])
            try:
                self.df_track_features.drop(columns=[
                'type',
                'track_href', 
                'analysis_url', 
                'duration_ms', 
                'time_signature', 
                'id'], inplace=True)
            except:
                print(f"Oops! {uri} has no features.  Skipped...")
            self.df_features = pd.concat([self.df_features, self.df_track_features])
        #self.df_features.to_csv('../raw_data/features.csv')
    
    def merge_dataframes(self):
        '''
        Function that merge all the dataframe toghether
        '''
        self.full_data = pd.merge(self.df_songs, self.df_analysis, how='outer', on='uri')
        self.full_data = pd.merge(self.full_data, self.df_features, how='outer', on='uri')
        

In [10]:
# if 'ae' in globals():
#     del ae
# ae = ApiExtractor(se)

# ae.create_df_audio_features()
# ae.create_df_songs()
# ae.create_df_audio_analysis()

# ae.df_songs.to_csv('../raw_data/songs.csv')
# ae.df_analysis.to_csv('../raw_data/analysis.csv')
# ae.df_features.to_csv('../raw_data/features.csv')

In [81]:
df_songs_csv = pd.read_csv('../raw_data/songs.csv', index_col='Unnamed: 0')
df_features_csv = pd.read_csv('../raw_data/features.csv', index_col='Unnamed: 0')
df_analysis_csv = pd.read_csv('../raw_data/analysis.csv', index_col='Unnamed: 0')

In [82]:
full_data = pd.merge(df_songs_csv, df_features_csv, how='left', on='uri')
full_data = pd.merge(full_data, df_analysis_csv, how='left', on='uri').drop(columns='0')

full_data = full_data.drop_duplicates(
    subset = ['sp1', 'sp2', 'sp3', 'sp4', 'sp5', 
              'sp6', 'sp7', 'sp8', 'sp9', 'sp10', 
              'sp11', 'sp12', 'tm1', 'tm2', 'tm3', 
              'tm4', 'tm5', 'tm6', 'tm7', 'tm8', 
              'tm9', 'tm10', 'tm11', 'tm12'],
    keep = 'last').reset_index(drop = True)

full_data.dropna(inplace=True)
full_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47769 entries, 0 to 47788
Data columns (total 40 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   uri               47769 non-null  object 
 1   name              47769 non-null  object 
 2   artists           47769 non-null  object 
 3   popularity        47769 non-null  float64
 4   explicit          47769 non-null  float64
 5   danceability      47769 non-null  float64
 6   energy            47769 non-null  float64
 7   key               47769 non-null  float64
 8   loudness          47769 non-null  float64
 9   mode              47769 non-null  float64
 10  speechiness       47769 non-null  float64
 11  acousticness      47769 non-null  float64
 12  instrumentalness  47769 non-null  float64
 13  liveness          47769 non-null  float64
 14  valence           47769 non-null  float64
 15  tempo             47769 non-null  float64
 16  sp1               47769 non-null  float6