In [80]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

# Spotify

In [62]:
# Read the spotify dataset
spotify_df = pd.read_csv('dataset.csv')
spotify_df.head(10)

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,...,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,...,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic
5,5,01MVOl9KtVTNfFiBU9I7dc,Tyrone Wells,Days I Will Remember,Days I Will Remember,58,214240,False,0.688,0.481,...,-8.807,1,0.105,0.289,0.0,0.189,0.666,98.017,4,acoustic
6,6,6Vc5wAMmXdKIAM7WUoEb7N,A Great Big World;Christina Aguilera,Is There Anybody Out There?,Say Something,74,229400,False,0.407,0.147,...,-8.822,1,0.0355,0.857,3e-06,0.0913,0.0765,141.284,3,acoustic
7,7,1EzrEOXmMH3G43AXT1y7pA,Jason Mraz,We Sing. We Dance. We Steal Things.,I'm Yours,80,242946,False,0.703,0.444,...,-9.331,1,0.0417,0.559,0.0,0.0973,0.712,150.96,4,acoustic
8,8,0IktbUcnAGrvD03AWnz3Q8,Jason Mraz;Colbie Caillat,We Sing. We Dance. We Steal Things.,Lucky,74,189613,False,0.625,0.414,...,-8.7,1,0.0369,0.294,0.0,0.151,0.669,130.088,4,acoustic
9,9,7k9GuJYLp2AzqokyEdwEw2,Ross Copperman,Hunger,Hunger,56,205594,False,0.442,0.632,...,-6.77,1,0.0295,0.426,0.00419,0.0735,0.196,78.899,4,acoustic


## Preprocessing

In [63]:
# Drop the first columns
spotify_df.drop(columns=spotify_df.columns[0], axis=1, inplace=True)

In [64]:
# Remove unused columns
unused_col = ['track_id', 'artists', 'album_name', 'track_name', 'mode', 'time_signature', 'instrumentalness', 'duration_ms']
spotify_df = spotify_df.drop(columns=unused_col)

In [65]:
spotify_df.head(10)

Unnamed: 0,popularity,explicit,danceability,energy,key,loudness,speechiness,acousticness,liveness,valence,tempo,track_genre
0,73,False,0.676,0.461,1,-6.746,0.143,0.0322,0.358,0.715,87.917,acoustic
1,55,False,0.42,0.166,1,-17.235,0.0763,0.924,0.101,0.267,77.489,acoustic
2,57,False,0.438,0.359,0,-9.734,0.0557,0.21,0.117,0.12,76.332,acoustic
3,71,False,0.266,0.0596,0,-18.515,0.0363,0.905,0.132,0.143,181.74,acoustic
4,82,False,0.618,0.443,2,-9.681,0.0526,0.469,0.0829,0.167,119.949,acoustic
5,58,False,0.688,0.481,6,-8.807,0.105,0.289,0.189,0.666,98.017,acoustic
6,74,False,0.407,0.147,2,-8.822,0.0355,0.857,0.0913,0.0765,141.284,acoustic
7,80,False,0.703,0.444,11,-9.331,0.0417,0.559,0.0973,0.712,150.96,acoustic
8,74,False,0.625,0.414,0,-8.7,0.0369,0.294,0.151,0.669,130.088,acoustic
9,56,False,0.442,0.632,1,-6.77,0.0295,0.426,0.0735,0.196,78.899,acoustic


In [66]:
# Check if dataset contain any null value
spotify_df.isnull().values.any()

False

In [67]:
# Inspect dataset structure
spotify_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114000 entries, 0 to 113999
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   popularity    114000 non-null  int64  
 1   explicit      114000 non-null  bool   
 2   danceability  114000 non-null  float64
 3   energy        114000 non-null  float64
 4   key           114000 non-null  int64  
 5   loudness      114000 non-null  float64
 6   speechiness   114000 non-null  float64
 7   acousticness  114000 non-null  float64
 8   liveness      114000 non-null  float64
 9   valence       114000 non-null  float64
 10  tempo         114000 non-null  float64
 11  track_genre   114000 non-null  object 
dtypes: bool(1), float64(8), int64(2), object(1)
memory usage: 9.7+ MB


In [68]:
# Check columns number of unique values
spotify_df.nunique()

popularity        101
explicit            2
danceability     1174
energy           2083
key                12
loudness        19480
speechiness      1489
acousticness     5061
liveness         1722
valence          1790
tempo           45653
track_genre       114
dtype: int64

In [69]:
# Get all track_genre columns unique values
track_genre_list = np.array(spotify_df['track_genre'].unique().tolist())

track_genre_list

array(['acoustic', 'afrobeat', 'alt-rock', 'alternative', 'ambient',
       'anime', 'black-metal', 'bluegrass', 'blues', 'brazil',
       'breakbeat', 'british', 'cantopop', 'chicago-house', 'children',
       'chill', 'classical', 'club', 'comedy', 'country', 'dance',
       'dancehall', 'death-metal', 'deep-house', 'detroit-techno',
       'disco', 'disney', 'drum-and-bass', 'dub', 'dubstep', 'edm',
       'electro', 'electronic', 'emo', 'folk', 'forro', 'french', 'funk',
       'garage', 'german', 'gospel', 'goth', 'grindcore', 'groove',
       'grunge', 'guitar', 'happy', 'hard-rock', 'hardcore', 'hardstyle',
       'heavy-metal', 'hip-hop', 'honky-tonk', 'house', 'idm', 'indian',
       'indie-pop', 'indie', 'industrial', 'iranian', 'j-dance', 'j-idol',
       'j-pop', 'j-rock', 'jazz', 'k-pop', 'kids', 'latin', 'latino',
       'malay', 'mandopop', 'metal', 'metalcore', 'minimal-techno', 'mpb',
       'new-age', 'opera', 'pagode', 'party', 'piano', 'pop-film', 'pop',
       'pow

In [70]:
# Change the explicit columns to intergers
# It only has two unique values: TRUE, FALSE
explicit_dict = {'FALSE': 0, 'TRUE': 1}
spotify_df['explicit'] = spotify_df['explicit'].replace(explicit_dict).astype(int)

In [71]:
spotify_df.head(10)

Unnamed: 0,popularity,explicit,danceability,energy,key,loudness,speechiness,acousticness,liveness,valence,tempo,track_genre
0,73,0,0.676,0.461,1,-6.746,0.143,0.0322,0.358,0.715,87.917,acoustic
1,55,0,0.42,0.166,1,-17.235,0.0763,0.924,0.101,0.267,77.489,acoustic
2,57,0,0.438,0.359,0,-9.734,0.0557,0.21,0.117,0.12,76.332,acoustic
3,71,0,0.266,0.0596,0,-18.515,0.0363,0.905,0.132,0.143,181.74,acoustic
4,82,0,0.618,0.443,2,-9.681,0.0526,0.469,0.0829,0.167,119.949,acoustic
5,58,0,0.688,0.481,6,-8.807,0.105,0.289,0.189,0.666,98.017,acoustic
6,74,0,0.407,0.147,2,-8.822,0.0355,0.857,0.0913,0.0765,141.284,acoustic
7,80,0,0.703,0.444,11,-9.331,0.0417,0.559,0.0973,0.712,150.96,acoustic
8,74,0,0.625,0.414,0,-8.7,0.0369,0.294,0.151,0.669,130.088,acoustic
9,56,0,0.442,0.632,1,-6.77,0.0295,0.426,0.0735,0.196,78.899,acoustic


In [72]:
# Check explicit unique values still remain at 2
spotify_df['explicit'].nunique()

2

## Classication

In [73]:
# Get the data and labels 
data = spotify_df.drop(columns='track_genre')
labels = spotify_df['track_genre']

In [74]:
print(labels.shape, data.shape)

(114000,) (114000, 11)


In [75]:
# split the dataset
x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

### Classification with KNN

In [76]:
# Model
model = KNeighborsClassifier(n_neighbors=3)

In [77]:
# Fit
model.fit(x_train, y_train)

In [78]:
knn_predict = model.predict(x_test)

In [79]:
print(classification_report(y_test, knn_predict))

                   precision    recall  f1-score   support

         acoustic       0.04      0.09      0.05       213
         afrobeat       0.07      0.18      0.10       203
         alt-rock       0.05      0.15      0.08       215
      alternative       0.07      0.12      0.09       184
          ambient       0.10      0.21      0.13       197
            anime       0.02      0.05      0.03       193
      black-metal       0.11      0.21      0.15       210
        bluegrass       0.07      0.17      0.10       205
            blues       0.05      0.12      0.07       214
           brazil       0.06      0.14      0.08       197
        breakbeat       0.18      0.33      0.23       199
          british       0.03      0.05      0.04       214
         cantopop       0.04      0.08      0.05       193
    chicago-house       0.27      0.40      0.32       206
         children       0.08      0.15      0.11       214
            chill       0.06      0.11      0.08       