# 2. Grouping songs together!

In [24]:
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

## 2.1 Getting your data!

In [22]:
# Load the three source tables; each one carries a `track_id` column.
echonest, features, tracks = (
    pd.read_csv(csv_path)
    for csv_path in ('csv/echonest.csv', 'csv/features.csv', 'csv/tracks.csv')
)

In [3]:
# Summarise echonest: index range, column count, dtype breakdown and memory use.
echonest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13129 entries, 0 to 13128
Columns: 250 entries, track_id to temporal_features_223
dtypes: float64(244), int64(1), object(5)
memory usage: 25.0+ MB


In [4]:
# Summarise features: index range, column count, dtype breakdown and memory use.
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106574 entries, 0 to 106573
Columns: 519 entries, track_id to zcr_std_01
dtypes: float64(518), int64(1)
memory usage: 422.0 MB


In [5]:
# Summarise tracks: per-column non-null counts and dtypes.
tracks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106574 entries, 0 to 106573
Data columns (total 53 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   track_id                  106574 non-null  int64  
 1   album_comments            106574 non-null  int64  
 2   album_date_created        103045 non-null  object 
 3   album_date_released       70294 non-null   object 
 4   album_engineer            15295 non-null   object 
 5   album_favorites           106574 non-null  int64  
 6   album_id                  106574 non-null  int64  
 7   album_information         83149 non-null   object 
 8   album_listens             106574 non-null  int64  
 9   album_producer            18060 non-null   object 
 10  album_tags                106574 non-null  object 
 11  album_title               105549 non-null  object 
 12  album_tracks              106574 non-null  int64  
 13  album_type                100066 non-null  o

In [6]:
# Print all echonest column names as a plain Python list.
print(echonest.columns.to_list())

['track_id', 'audio_features_acousticness', 'audio_features_danceability', 'audio_features_energy', 'audio_features_instrumentalness', 'audio_features_liveness', 'audio_features_speechiness', 'audio_features_tempo', 'audio_features_valence', 'metadata_album_date', 'metadata_album_name', 'metadata_artist_latitude', 'metadata_artist_location', 'metadata_artist_longitude', 'metadata_artist_name', 'metadata_release', 'ranks_artist_discovery_rank', 'ranks_artist_familiarity_rank', 'ranks_artist_hotttnesss_rank', 'ranks_song_currency_rank', 'ranks_song_hotttnesss_rank', 'social_features_artist_discovery', 'social_features_artist_familiarity', 'social_features_artist_hotttnesss', 'social_features_song_currency', 'social_features_song_hotttnesss', 'temporal_features_000', 'temporal_features_001', 'temporal_features_002', 'temporal_features_003', 'temporal_features_004', 'temporal_features_005', 'temporal_features_006', 'temporal_features_007', 'temporal_features_008', 'temporal_features_009', 

In [7]:
# Print all features column names as a plain Python list.
print(features.columns.to_list())

['track_id', 'chroma_cens_kurtosis_01', 'chroma_cens_kurtosis_02', 'chroma_cens_kurtosis_03', 'chroma_cens_kurtosis_04', 'chroma_cens_kurtosis_05', 'chroma_cens_kurtosis_06', 'chroma_cens_kurtosis_07', 'chroma_cens_kurtosis_08', 'chroma_cens_kurtosis_09', 'chroma_cens_kurtosis_10', 'chroma_cens_kurtosis_11', 'chroma_cens_kurtosis_12', 'chroma_cens_max_01', 'chroma_cens_max_02', 'chroma_cens_max_03', 'chroma_cens_max_04', 'chroma_cens_max_05', 'chroma_cens_max_06', 'chroma_cens_max_07', 'chroma_cens_max_08', 'chroma_cens_max_09', 'chroma_cens_max_10', 'chroma_cens_max_11', 'chroma_cens_max_12', 'chroma_cens_mean_01', 'chroma_cens_mean_02', 'chroma_cens_mean_03', 'chroma_cens_mean_04', 'chroma_cens_mean_05', 'chroma_cens_mean_06', 'chroma_cens_mean_07', 'chroma_cens_mean_08', 'chroma_cens_mean_09', 'chroma_cens_mean_10', 'chroma_cens_mean_11', 'chroma_cens_mean_12', 'chroma_cens_median_01', 'chroma_cens_median_02', 'chroma_cens_median_03', 'chroma_cens_median_04', 'chroma_cens_median_05'

In [8]:
# Print all tracks column names as a plain Python list.
print(tracks.columns.to_list())

['track_id', 'album_comments', 'album_date_created', 'album_date_released', 'album_engineer', 'album_favorites', 'album_id', 'album_information', 'album_listens', 'album_producer', 'album_tags', 'album_title', 'album_tracks', 'album_type', 'artist_active_year_begin', 'artist_active_year_end', 'artist_associated_labels', 'artist_bio', 'artist_comments', 'artist_date_created', 'artist_favorites', 'artist_id', 'artist_latitude', 'artist_location', 'artist_longitude', 'artist_members', 'artist_name', 'artist_related_projects', 'artist_tags', 'artist_website', 'artist_wikipedia_page', 'set_split', 'set_subset', 'track_bit_rate', 'track_comments', 'track_composer', 'track_date_created', 'track_date_recorded', 'track_duration', 'track_favorites', 'track_genre_top', 'track_genres', 'track_genres_all', 'track_information', 'track_interest', 'track_language_code', 'track_license', 'track_listens', 'track_lyricist', 'track_number', 'track_publisher', 'track_tags', 'track_title']


In [9]:
# Preview the first 10 rows of echonest.
echonest.head(10)

Unnamed: 0,track_id,audio_features_acousticness,audio_features_danceability,audio_features_energy,audio_features_instrumentalness,audio_features_liveness,audio_features_speechiness,audio_features_tempo,audio_features_valence,metadata_album_date,...,temporal_features_214,temporal_features_215,temporal_features_216,temporal_features_217,temporal_features_218,temporal_features_219,temporal_features_220,temporal_features_221,temporal_features_222,temporal_features_223
0,2,0.416675,0.675894,0.634476,0.010628,0.177647,0.15931,165.922,0.576661,,...,-1.992303,6.805694,0.23307,0.19288,0.027455,0.06408,3.67696,3.61288,13.31669,262.929749
1,3,0.374408,0.528643,0.817461,0.001851,0.10588,0.461818,126.957,0.26924,,...,-1.582331,8.889308,0.258464,0.220905,0.081368,0.06413,6.08277,6.01864,16.673548,325.581085
2,5,0.043567,0.745566,0.70147,0.000697,0.373143,0.124595,100.26,0.621661,,...,-2.288358,11.527109,0.256821,0.23782,0.060122,0.06014,5.92649,5.86635,16.013849,356.755737
3,10,0.95167,0.658179,0.924525,0.965427,0.115474,0.032985,111.562,0.96359,2008-03-11,...,-3.662988,21.508228,0.283352,0.26707,0.125704,0.08082,8.41401,8.33319,21.317064,483.403809
4,134,0.452217,0.513238,0.56041,0.019443,0.096567,0.525519,114.29,0.894072,,...,-1.452696,2.356398,0.234686,0.19955,0.149332,0.0644,11.26707,11.20267,26.45418,751.147705
5,139,0.10655,0.260911,0.607067,0.835087,0.223676,0.030569,196.961,0.160267,,...,-3.078667,12.411567,0.270802,0.2727,0.025242,0.06404,2.43669,2.37265,3.897095,37.866043
6,140,0.376312,0.734079,0.265685,0.669581,0.085995,0.039068,107.952,0.609991,,...,-0.934696,-0.260981,0.322232,0.27798,0.136747,0.07533,9.86272,9.78739,21.981621,562.229431
7,141,0.963657,0.435933,0.075632,0.345493,0.105686,0.026658,33.477,0.16395,,...,-0.457298,-0.165962,0.437503,0.30739,0.368764,0.06263,11.18884,11.12621,13.481531,232.993546
8,142,0.662881,0.379065,0.823856,0.910266,0.088705,0.07909,147.781,0.092868,2005,...,-1.125605,1.478418,0.215844,0.20966,0.015754,0.06921,3.91102,3.84181,12.598523,346.813049
9,144,0.909011,0.443643,0.641997,0.924092,0.267669,0.089659,128.537,0.788251,,...,-3.454056,16.247305,0.245023,0.22608,0.033509,0.07066,2.98367,2.91301,10.123219,146.145737


In [10]:
# Preview the first 10 rows of features.
features.head(10)

Unnamed: 0,track_id,chroma_cens_kurtosis_01,chroma_cens_kurtosis_02,chroma_cens_kurtosis_03,chroma_cens_kurtosis_04,chroma_cens_kurtosis_05,chroma_cens_kurtosis_06,chroma_cens_kurtosis_07,chroma_cens_kurtosis_08,chroma_cens_kurtosis_09,...,tonnetz_std_04,tonnetz_std_05,tonnetz_std_06,zcr_kurtosis_01,zcr_max_01,zcr_mean_01,zcr_median_01,zcr_min_01,zcr_skew_01,zcr_std_01
0,2,7.180653,5.230309,0.249321,1.34762,1.482478,0.531371,1.481593,2.691455,0.866868,...,0.054125,0.012226,0.012111,5.75889,0.459473,0.085629,0.071289,0.0,2.089872,0.061448
1,3,1.888963,0.760539,0.345297,2.295201,1.654031,0.067592,1.366848,1.054094,0.108103,...,0.063831,0.014212,0.01774,2.824694,0.466309,0.084578,0.063965,0.0,1.716724,0.06933
2,5,0.527563,-0.077654,-0.27961,0.685883,1.93757,0.880839,-0.923192,-0.927232,0.666617,...,0.04073,0.012691,0.014759,6.808415,0.375,0.053114,0.041504,0.0,2.193303,0.044861
3,10,3.702245,-0.291193,2.196742,-0.234449,1.367364,0.998411,1.770694,1.604566,0.521217,...,0.074358,0.017952,0.013921,21.434212,0.452148,0.077515,0.071777,0.0,3.542325,0.0408
4,20,-0.193837,-0.198527,0.201546,0.258556,0.775204,0.084794,-0.289294,-0.81641,0.043851,...,0.095003,0.022492,0.021355,16.669037,0.469727,0.047225,0.040039,0.000977,3.189831,0.030993
5,26,-0.699535,-0.684158,0.048825,0.042659,-0.818967,-0.917123,-0.901834,-0.066845,-0.291037,...,0.103717,0.025541,0.023846,41.645809,0.250488,0.018388,0.015625,0.0,4.690596,0.014598
6,30,-0.721487,-0.84856,0.890904,0.08862,-0.445513,-1.27117,-1.24019,-1.343765,-0.9056,...,0.141693,0.020426,0.025418,8.166595,0.546875,0.054417,0.036133,0.002441,2.244708,0.052674
7,46,-0.119708,-0.858814,2.362546,0.106584,-1.315912,-1.220354,-1.668162,-0.516033,-0.536395,...,0.132632,0.033212,0.02131,14.731083,0.223633,0.036601,0.032227,0.0,2.848736,0.020713
8,48,-1.054053,0.932339,0.528064,-1.035338,-1.000682,-1.119304,1.16699,-1.002603,-1.094999,...,0.141955,0.024654,0.025203,24.550789,0.371582,0.033015,0.022461,0.0,4.406733,0.039016
9,134,0.918445,0.674147,0.577818,1.281117,0.933746,0.078177,1.199204,-0.175223,0.925482,...,0.058766,0.016322,0.015819,4.731087,0.419434,0.06437,0.050781,0.0,1.806106,0.054623


In [11]:
# Preview the first 10 rows of tracks.
tracks.head(10)

Unnamed: 0,track_id,album_comments,album_date_created,album_date_released,album_engineer,album_favorites,album_id,album_information,album_listens,album_producer,...,track_information,track_interest,track_language_code,track_license,track_listens,track_lyricist,track_number,track_publisher,track_tags,track_title
0,2,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,...,,4656,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1293,,3,,[],Food
1,3,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,...,,1470,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,514,,4,,[],Electric Ave
2,5,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,...,,1933,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1151,,6,,[],This World
3,10,0,2008-11-26 01:45:08,2008-02-06 00:00:00,,4,6,,47632,,...,,54881,en,Attribution-NonCommercial-NoDerivatives (aka M...,50135,,1,,[],Freeway
4,20,0,2008-11-26 01:45:05,2009-01-06 00:00:00,,2,4,"<p> ""spiritual songs"" from Nicky Cook</p>",2710,,...,,978,en,Attribution-NonCommercial-NoDerivatives (aka M...,361,,3,,[],Spiritual Level
5,26,0,2008-11-26 01:45:05,2009-01-06 00:00:00,,2,4,"<p> ""spiritual songs"" from Nicky Cook</p>",2710,,...,,1060,en,Attribution-NonCommercial-NoDerivatives (aka M...,193,,4,,[],Where is your Love?
6,30,0,2008-11-26 01:45:05,2009-01-06 00:00:00,,2,4,"<p> ""spiritual songs"" from Nicky Cook</p>",2710,,...,,718,en,Attribution-NonCommercial-NoDerivatives (aka M...,612,,5,,[],Too Happy
7,46,0,2008-11-26 01:45:05,2009-01-06 00:00:00,,2,4,"<p> ""spiritual songs"" from Nicky Cook</p>",2710,,...,,252,en,Attribution-NonCommercial-NoDerivatives (aka M...,171,,8,,[],Yosemite
8,48,0,2008-11-26 01:45:05,2009-01-06 00:00:00,,2,4,"<p> ""spiritual songs"" from Nicky Cook</p>",2710,,...,,247,en,Attribution-NonCommercial-NoDerivatives (aka M...,173,,9,,[],Light of Light
9,134,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,...,,1126,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,943,,5,,[],Street Music


In [12]:
# Check that every echonest track_id also appears in features and in tracks,
# i.e. that an inner join on track_id will not lose any echonest row.
set_echonest = set(echonest['track_id'].tolist())
set_features = set(features['track_id'].tolist())
set_tracks = set(tracks['track_id'].tolist())
echo_feat = set_echonest <= set_features
echo_trac = set_echonest <= set_tracks
print(echo_feat, echo_trac, sep='\n')

True
True


In [13]:
# Merge the three tables on track_id. Inner joins keep only the tracks present in
# all three tables; echonest is the smallest table, so it bounds the result.
new_df = pd.merge(echonest, features, on='track_id', how='inner')
main_df = pd.merge(new_df, tracks, on='track_id', how='inner')
# Guard against silent row changes: a duplicated track_id on either side would
# multiply rows and a missing one would drop rows. Either case breaks this equality.
assert len(main_df) == len(echonest), (
    'merge changed the number of rows: '
    + str(len(echonest)) + ' echonest rows -> ' + str(len(main_df)) + ' merged rows'
)

In [15]:
# Expected width: 250 (echonest) + 519 (features) + 53 (tracks)
# - 2 (the shared track_id key is kept only once) = 820 columns.
main_df.head()

Unnamed: 0,track_id,audio_features_acousticness,audio_features_danceability,audio_features_energy,audio_features_instrumentalness,audio_features_liveness,audio_features_speechiness,audio_features_tempo,audio_features_valence,metadata_album_date,...,track_information,track_interest,track_language_code,track_license,track_listens,track_lyricist,track_number,track_publisher,track_tags,track_title
0,2,0.416675,0.675894,0.634476,0.010628,0.177647,0.15931,165.922,0.576661,,...,,4656,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1293,,3,,[],Food
1,3,0.374408,0.528643,0.817461,0.001851,0.10588,0.461818,126.957,0.26924,,...,,1470,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,514,,4,,[],Electric Ave
2,5,0.043567,0.745566,0.70147,0.000697,0.373143,0.124595,100.26,0.621661,,...,,1933,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1151,,6,,[],This World
3,10,0.95167,0.658179,0.924525,0.965427,0.115474,0.032985,111.562,0.96359,2008-03-11,...,,54881,en,Attribution-NonCommercial-NoDerivatives (aka M...,50135,,1,,[],Freeway
4,134,0.452217,0.513238,0.56041,0.019443,0.096567,0.525519,114.29,0.894072,,...,,1126,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,943,,5,,[],Street Music


In [16]:
# Optional: uncomment to write the merged frame to disk.
# main_df.to_csv('csv/main_df.csv', index=False)

In [17]:
# Summarise the merged frame: row count, column count and dtype breakdown.
main_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13129 entries, 0 to 13128
Columns: 820 entries, track_id to track_title
dtypes: float64(764), int64(16), object(40)
memory usage: 82.2+ MB


## 2.2 Choose your features (variables)!

As you may notice, you have plenty of features to work with. So, you need to find a way to reduce the dimensionality (reduce the number of variables to work with). You can follow the next directions to achieve it:

1) Select **one** method for dimensionality reduction and apply it to your data. Some suggestions are Principal Component Analysis, Multiple Correspondence Analysis, Singular Value Decomposition, Factor Analysis for Mixed Data, Two-Steps clustering. Make sure that the method you choose is applicable for the features you have or modify your data to be able to use it. Explain why you chose that method and the limitations it may have.

HINT: We don't want to miss relevant variables like song's duration, language, etc., after the dimensionality reduction. To keep those variables, you can apply the dimensionality reduction method(s) on features coming from the same file. Later you can stack them with the variables selected from another file.

2) Apply the selected method(s) to your data. Make sure that the chosen method retains > 70% of the total variance.

In [20]:
# try_df is an independent copy of main_df, so edits to it leave main_df untouched.
try_df = main_df.copy()

In [21]:
# Summarise the copy: row count, column count and dtype breakdown.
try_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13129 entries, 0 to 13128
Columns: 820 entries, track_id to track_title
dtypes: float64(764), int64(16), object(40)
memory usage: 82.2+ MB


In [26]:
# Take working copies of the three raw tables so the dimensionality reduction
# below never modifies the frames loaded from disk.
echo_cp, feat_cp, tracks_cp = (
    raw_frame.copy() for raw_frame in (echonest, features, tracks)
)

### PCA on Echonest

In [47]:
# PCA is applied only to the temporal_* columns, so split echonest's columns into
# the temporal block (PCA input) and everything else (kept as-is).
is_temporal = echo_cp.columns.str.startswith('temporal')
col_temp = echo_cp.columns[is_temporal].to_list()
other_col = echo_cp.columns[~is_temporal].to_list()
echo_PCA = echo_cp[col_temp]

In [48]:
# Display the temporal_* columns selected as PCA input.
echo_PCA

Unnamed: 0,temporal_features_000,temporal_features_001,temporal_features_002,temporal_features_003,temporal_features_004,temporal_features_005,temporal_features_006,temporal_features_007,temporal_features_008,temporal_features_009,...,temporal_features_214,temporal_features_215,temporal_features_216,temporal_features_217,temporal_features_218,temporal_features_219,temporal_features_220,temporal_features_221,temporal_features_222,temporal_features_223
0,0.877233,0.588911,0.354243,0.295090,0.298413,0.309430,0.304496,0.334579,0.249495,0.259656,...,-1.992303,6.805694,0.233070,0.192880,0.027455,0.06408,3.676960,3.61288,13.316690,262.929749
1,0.534429,0.537414,0.443299,0.390879,0.344573,0.366448,0.419455,0.747766,0.460901,0.392379,...,-1.582331,8.889308,0.258464,0.220905,0.081368,0.06413,6.082770,6.01864,16.673548,325.581085
2,0.548093,0.720192,0.389257,0.344934,0.361300,0.402543,0.434044,0.388137,0.512487,0.525755,...,-2.288358,11.527109,0.256821,0.237820,0.060122,0.06014,5.926490,5.86635,16.013849,356.755737
3,0.311404,0.711402,0.321914,0.500601,0.250963,0.321316,0.734250,0.325188,0.373012,0.235840,...,-3.662988,21.508228,0.283352,0.267070,0.125704,0.08082,8.414010,8.33319,21.317064,483.403809
4,0.610849,0.569169,0.428494,0.345796,0.376920,0.460590,0.401371,0.449900,0.428946,0.446736,...,-1.452696,2.356398,0.234686,0.199550,0.149332,0.06440,11.267070,11.20267,26.454180,751.147705
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13124,0.717013,0.686557,0.411056,0.342718,0.341934,0.482926,0.419219,0.408946,0.393060,0.382778,...,-1.721207,4.686078,0.213789,0.208800,0.007911,0.06395,2.040730,1.97678,8.144532,147.040405
13125,0.673395,0.846995,0.447772,0.425936,0.407817,0.405924,0.290565,0.314019,0.318129,0.310359,...,-0.647897,1.282306,0.214586,0.181860,0.011247,0.06240,0.922360,0.85996,1.794739,6.321268
13126,0.842368,0.719091,0.351503,0.354707,0.314619,0.276266,0.340571,0.342762,0.449963,0.456690,...,-0.771613,1.623510,0.180471,0.128185,0.010103,0.06222,2.251160,2.18894,5.578341,89.180328
13127,0.346748,0.311817,0.220864,0.185269,0.333642,0.290699,0.558345,0.397021,0.217570,0.297939,...,-2.054143,7.927149,0.250178,0.219205,0.014851,0.06390,1.487440,1.42354,2.173092,12.503966


#### Standardize the Data

In [49]:
# Standardise every temporal column so that no column dominates the PCA
# purely because of its scale.
# Note: echo_PCA is rebound from a DataFrame to the array returned by the scaler.
scaler = StandardScaler().fit(echo_PCA)
echo_PCA = scaler.transform(echo_PCA)

In [50]:
# Display the standardised values (now an array rather than a DataFrame).
echo_PCA

array([[ 2.60237192,  0.91956571, -0.08275171, ..., -0.1010752 ,
         0.88620311,  0.54969139],
       [ 0.52227969,  0.61013805,  0.5894801 , ...,  0.3832916 ,
         1.40707909,  0.82743032],
       [ 0.60518755,  1.70838789,  0.18154766, ...,  0.35263003,
         1.30471517,  0.96563031],
       ...,
       [ 2.39081103,  1.70177654, -0.10343334, ..., -0.38776602,
        -0.31453905, -0.2205552 ],
       [-0.61654685, -0.74539844, -1.08954698, ..., -0.54186878,
        -0.84292396, -0.56046831],
       [-0.78071297, -1.62777553, -2.21479625, ...,  2.49184428,
         1.17362516,  0.72708853]])

#### Apply PCA

In [51]:
# A float n_components asks PCA to keep just enough principal components for the
# cumulative explained variance to reach that fraction (here 90%).
pca = PCA(n_components=0.90)
pc_echo = pca.fit_transform(echo_PCA)
pca_num = pca.components_.shape[0]  # number of components actually kept
print(pca.explained_variance_ratio_.sum())
print(pca_num)

0.9006240048197013


In [52]:
# Wrap the echonest principal components in a DataFrame.
# - The column count is read from the array itself rather than from the shared
#   `pca_num` variable, which every later PCA cell overwrites; re-running this
#   cell out of order therefore cannot produce a shape mismatch.
# - Reusing echo_cp's index keeps the rows label-aligned with echo_cp, which
#   matters because pd.concat(axis=1) aligns on index labels, not on position.
pc_echo_df = pd.DataFrame(
    data=pc_echo,
    index=echo_cp.index,
    columns=['pc_echo_' + str(i) for i in range(pc_echo.shape[1])],
)
pc_echo_df

Unnamed: 0,pc_echo_0,pc_echo_1,pc_echo_2,pc_echo_3,pc_echo_4,pc_echo_5,pc_echo_6,pc_echo_7,pc_echo_8,pc_echo_9,...,pc_echo_61,pc_echo_62,pc_echo_63,pc_echo_64,pc_echo_65,pc_echo_66,pc_echo_67,pc_echo_68,pc_echo_69,pc_echo_70
0,0.745696,-1.449742,0.243402,0.285016,-1.417538,1.032981,-1.149841,-0.091138,-1.395834,-2.404001,...,0.353559,0.111561,-0.262992,-0.504016,0.604185,-0.128035,-0.608444,-0.370103,-0.411444,-0.955747
1,2.610432,-3.990186,2.287702,-0.523855,0.171606,0.027428,0.318255,-2.011168,-0.017749,-3.278246,...,0.611444,-0.878695,0.244091,0.416863,0.594802,-0.338894,-0.034221,-1.845181,-0.276950,0.296630
2,1.293987,-4.319219,2.530143,-0.218655,-0.523152,0.400669,0.965578,-0.076320,1.447202,-2.345793,...,1.318349,-0.753971,-1.039863,0.777191,0.509547,0.417779,-0.091032,-1.025886,0.068314,-0.272380
3,6.132185,-1.496205,1.317994,1.178291,1.206192,0.521756,-3.728041,1.896103,-0.732778,-3.408716,...,-0.572486,0.454861,-1.059819,1.491763,-1.397941,-0.579548,0.327253,-0.043924,1.003379,-0.564259
4,-2.953884,-5.177221,2.735791,-0.619944,0.793249,0.984330,0.524088,0.027225,-1.042982,0.384810,...,0.547149,-0.094636,0.154925,-0.278138,0.315183,0.768817,-0.381592,0.186775,-0.449920,0.061162
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13124,2.379205,-4.313090,-0.721039,0.162001,-2.092707,1.371075,0.333089,-1.045140,2.615540,-1.809308,...,-0.028087,-0.057589,0.620655,-0.427503,0.640985,0.253803,0.077692,-0.012697,0.109495,-0.648873
13125,0.076222,-4.946320,2.234761,3.859317,-3.213747,2.025050,2.920090,-0.261611,1.246771,-1.434390,...,1.174578,0.102255,0.819572,0.209645,0.276500,-0.497789,0.424493,-0.239482,0.408978,1.193789
13126,2.384627,-4.320515,-0.286861,0.112463,-2.184722,2.877628,0.018145,-0.147263,1.016730,-3.700848,...,1.033169,-0.566303,0.512557,-0.682360,-0.380061,0.023999,-0.741503,0.144168,-0.182448,0.207433
13127,-0.351606,0.562424,-3.333085,-2.336173,-0.199171,0.274648,1.251824,0.622272,2.572821,-2.715374,...,-0.262435,-0.174925,-0.055767,0.459823,0.147709,0.412693,0.162588,-0.276375,0.841430,0.173692


#### Final Echonest dataset

In [53]:
# Build the final echonest table: every non-temporal column, followed by the
# principal components that replace the temporal_* columns.
# The columns to keep are derived from echo_cp directly instead of the shared
# `other_col` variable: the Features section reassigns `other_col`, so re-running
# this cell afterwards would silently select the wrong columns.
echo_keep_cols = [c for c in echo_cp.columns if not c.startswith('temporal')]
echo_final = pd.concat([echo_cp[echo_keep_cols], pc_echo_df], axis=1)
# pd.concat(axis=1) aligns on index labels; misaligned indexes would add rows.
assert len(echo_final) == len(echo_cp), 'echo_cp and pc_echo_df indexes are not aligned'
echo_final

Unnamed: 0,track_id,audio_features_acousticness,audio_features_danceability,audio_features_energy,audio_features_instrumentalness,audio_features_liveness,audio_features_speechiness,audio_features_tempo,audio_features_valence,metadata_album_date,...,pc_echo_61,pc_echo_62,pc_echo_63,pc_echo_64,pc_echo_65,pc_echo_66,pc_echo_67,pc_echo_68,pc_echo_69,pc_echo_70
0,2,0.416675,0.675894,0.634476,0.010628,0.177647,0.159310,165.922,0.576661,,...,0.353559,0.111561,-0.262992,-0.504016,0.604185,-0.128035,-0.608444,-0.370103,-0.411444,-0.955747
1,3,0.374408,0.528643,0.817461,0.001851,0.105880,0.461818,126.957,0.269240,,...,0.611444,-0.878695,0.244091,0.416863,0.594802,-0.338894,-0.034221,-1.845181,-0.276950,0.296630
2,5,0.043567,0.745566,0.701470,0.000697,0.373143,0.124595,100.260,0.621661,,...,1.318349,-0.753971,-1.039863,0.777191,0.509547,0.417779,-0.091032,-1.025886,0.068314,-0.272380
3,10,0.951670,0.658179,0.924525,0.965427,0.115474,0.032985,111.562,0.963590,2008-03-11,...,-0.572486,0.454861,-1.059819,1.491763,-1.397941,-0.579548,0.327253,-0.043924,1.003379,-0.564259
4,134,0.452217,0.513238,0.560410,0.019443,0.096567,0.525519,114.290,0.894072,,...,0.547149,-0.094636,0.154925,-0.278138,0.315183,0.768817,-0.381592,0.186775,-0.449920,0.061162
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13124,124857,0.007592,0.790364,0.719288,0.853114,0.720715,0.082550,141.332,0.890461,,...,-0.028087,-0.057589,0.620655,-0.427503,0.640985,0.253803,0.077692,-0.012697,0.109495,-0.648873
13125,124862,0.041498,0.843077,0.536496,0.865151,0.547949,0.074001,101.975,0.476845,,...,1.174578,0.102255,0.819572,0.209645,0.276500,-0.497789,0.424493,-0.239482,0.408978,1.193789
13126,124863,0.000124,0.609686,0.895136,0.846624,0.632903,0.051517,129.996,0.496667,,...,1.033169,-0.566303,0.512557,-0.682360,-0.380061,0.023999,-0.741503,0.144168,-0.182448,0.207433
13127,124864,0.327576,0.574426,0.548327,0.452867,0.075928,0.033388,142.009,0.569274,,...,-0.262435,-0.174925,-0.055767,0.459823,0.147709,0.412693,0.162588,-0.276375,0.841430,0.173692


In [54]:
# Summarise the reduced echonest table: per-column non-null counts and dtypes.
echo_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13129 entries, 0 to 13128
Data columns (total 97 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   track_id                            13129 non-null  int64  
 1   audio_features_acousticness         13129 non-null  float64
 2   audio_features_danceability         13129 non-null  float64
 3   audio_features_energy               13129 non-null  float64
 4   audio_features_instrumentalness     13129 non-null  float64
 5   audio_features_liveness             13129 non-null  float64
 6   audio_features_speechiness          13129 non-null  float64
 7   audio_features_tempo                13129 non-null  float64
 8   audio_features_valence              13129 non-null  float64
 9   metadata_album_date                 2717 non-null   object 
 10  metadata_album_name                 2872 non-null   object 
 11  metadata_artist_latitude            9770 

### PCA on Features

The features dataset has a very large number of columns. Because these columns fall into families that share a common name prefix (chroma, mfcc, rmse, spectral, tonnetz, zcr), we apply PCA separately to each family.

In [65]:
def _cols_starting_with(prefix):
    """Return feat_cp's column names that begin with `prefix`, in frame order."""
    return [name for name in feat_cp.columns if name.startswith(prefix)]


# One list of column names per feature family (identified by name prefix).
col_temp_chroma = _cols_starting_with('chroma')
col_temp_mfcc = _cols_starting_with('mfcc')
col_temp_rmse = _cols_starting_with('rmse')
col_temp_spectral = _cols_starting_with('spectral')
col_temp_tonnetz = _cols_starting_with('tonnetz')
col_temp_zcr = _cols_starting_with('zcr')

# Every column that goes through PCA, and the remaining columns kept untouched.
col_temp = [
    *col_temp_chroma, *col_temp_mfcc, *col_temp_rmse,
    *col_temp_spectral, *col_temp_tonnetz, *col_temp_zcr,
]
_pca_input_cols = set(col_temp)
other_col = [name for name in feat_cp.columns if name not in _pca_input_cols]

##### chroma column

In [66]:
# Number of chroma* columns that go into the chroma PCA.
print(len(col_temp_chroma))

252


###### Standardize the Data

In [67]:
# Standardise the chroma columns (fit and apply the scaler in one step).
scaler = StandardScaler()
feat_PCA_chroma = scaler.fit_transform(feat_cp[col_temp_chroma])

###### Apply PCA

In [68]:
# Keep just enough chroma components for ~90% cumulative explained variance.
pca = PCA(n_components=0.90)
pc_feat_chroma = pca.fit_transform(feat_PCA_chroma)
pca_num = pca.components_.shape[0]  # number of components actually kept
print(pca.explained_variance_ratio_.sum())
print(pca_num)

0.9008784979584343
69


In [69]:
# Wrap the chroma principal components in a DataFrame.
# The column count comes from the array itself (not the shared `pca_num`, which
# every PCA cell overwrites), and feat_cp's index is reused so the rows stay
# label-aligned with feat_cp when combined with pd.concat(axis=1).
pc_feat_chroma_df = pd.DataFrame(
    data=pc_feat_chroma,
    index=feat_cp.index,
    columns=['pc_feat_chroma_' + str(i) for i in range(pc_feat_chroma.shape[1])],
)

##### mfcc column

In [70]:
# Number of mfcc* columns that go into the mfcc PCA.
print(len(col_temp_mfcc))

140


###### Standardize the Data

In [71]:
# Standardise the mfcc columns (fit and apply the scaler in one step).
scaler = StandardScaler()
feat_PCA_mfcc = scaler.fit_transform(feat_cp[col_temp_mfcc])

###### Apply PCA

In [72]:
# Keep just enough mfcc components for ~90% cumulative explained variance.
pca = PCA(n_components=0.90)
pc_feat_mfcc = pca.fit_transform(feat_PCA_mfcc)
pca_num = pca.components_.shape[0]  # number of components actually kept
print(pca.explained_variance_ratio_.sum())
print(pca_num)

0.900458177630659
46


In [73]:
# Wrap the mfcc principal components in a DataFrame.
# The column count comes from the array itself (not the shared `pca_num`, which
# every PCA cell overwrites), and feat_cp's index is reused so the rows stay
# label-aligned with feat_cp when combined with pd.concat(axis=1).
pc_feat_mfcc_df = pd.DataFrame(
    data=pc_feat_mfcc,
    index=feat_cp.index,
    columns=['pc_feat_mfcc_' + str(i) for i in range(pc_feat_mfcc.shape[1])],
)

##### rmse column

In [74]:
# Number of rmse* columns that go into the rmse PCA.
print(len(col_temp_rmse))

7


###### Standardize the Data

In [75]:
# Standardise the rmse columns (fit and apply the scaler in one step).
scaler = StandardScaler()
feat_PCA_rmse = scaler.fit_transform(feat_cp[col_temp_rmse])

###### Apply PCA

In [76]:
# Keep just enough rmse components for ~90% cumulative explained variance.
pca = PCA(n_components=0.90)
pc_feat_rmse = pca.fit_transform(feat_PCA_rmse)
pca_num = pca.components_.shape[0]  # number of components actually kept
print(pca.explained_variance_ratio_.sum())
print(pca_num)

0.9505190980384978
4


In [77]:
# Wrap the rmse principal components in a DataFrame.
# The column count comes from the array itself (not the shared `pca_num`, which
# every PCA cell overwrites), and feat_cp's index is reused so the rows stay
# label-aligned with feat_cp when combined with pd.concat(axis=1).
pc_feat_rmse_df = pd.DataFrame(
    data=pc_feat_rmse,
    index=feat_cp.index,
    columns=['pc_feat_rmse_' + str(i) for i in range(pc_feat_rmse.shape[1])],
)

##### spectral column

In [78]:
# Number of spectral* columns that go into the spectral PCA.
print(len(col_temp_spectral))

70


###### Standardize the Data

In [79]:
# Standardise the spectral columns (fit and apply the scaler in one step).
scaler = StandardScaler()
feat_PCA_spectral = scaler.fit_transform(feat_cp[col_temp_spectral])

###### Apply PCA

In [80]:
# Keep just enough spectral components for ~90% cumulative explained variance.
pca = PCA(n_components=0.90)
pc_feat_spectral = pca.fit_transform(feat_PCA_spectral)
pca_num = pca.components_.shape[0]  # number of components actually kept
print(pca.explained_variance_ratio_.sum())
print(pca_num)

0.9056235609376357
25


In [81]:
# Wrap the spectral principal components in a DataFrame.
# The column count comes from the array itself (not the shared `pca_num`, which
# every PCA cell overwrites), and feat_cp's index is reused so the rows stay
# label-aligned with feat_cp when combined with pd.concat(axis=1).
pc_feat_spectral_df = pd.DataFrame(
    data=pc_feat_spectral,
    index=feat_cp.index,
    columns=['pc_feat_spectral_' + str(i) for i in range(pc_feat_spectral.shape[1])],
)

##### tonnetz column

In [82]:
# Number of tonnetz* columns that go into the tonnetz PCA.
print(len(col_temp_tonnetz))

42


###### Standardize the Data

In [83]:
# Standardise the tonnetz columns (fit and apply the scaler in one step).
scaler = StandardScaler()
feat_PCA_tonnetz = scaler.fit_transform(feat_cp[col_temp_tonnetz])

###### Apply PCA

In [84]:
# Keep just enough tonnetz components for ~90% cumulative explained variance.
pca = PCA(n_components=0.90)
pc_feat_tonnetz = pca.fit_transform(feat_PCA_tonnetz)
pca_num = pca.components_.shape[0]  # number of components actually kept
print(pca.explained_variance_ratio_.sum())
print(pca_num)

0.9016426078965258
19


In [85]:
# Wrap the tonnetz principal components in a DataFrame.
# The column count comes from the array itself (not the shared `pca_num`, which
# every PCA cell overwrites), and feat_cp's index is reused so the rows stay
# label-aligned with feat_cp when combined with pd.concat(axis=1).
pc_feat_tonnetz_df = pd.DataFrame(
    data=pc_feat_tonnetz,
    index=feat_cp.index,
    columns=['pc_feat_tonnetz_' + str(i) for i in range(pc_feat_tonnetz.shape[1])],
)

##### zcr column

In [86]:
# Number of zcr* columns that go into the zcr PCA.
print(len(col_temp_zcr))

7


###### Standardize the Data

In [87]:
# Standardise the zcr columns (fit and apply the scaler in one step).
scaler = StandardScaler()
feat_PCA_zcr = scaler.fit_transform(feat_cp[col_temp_zcr])

###### Apply PCA

In [88]:
# Keep just enough zcr components for ~90% cumulative explained variance.
pca = PCA(n_components=0.90)
pc_feat_zcr = pca.fit_transform(feat_PCA_zcr)
pca_num = pca.components_.shape[0]  # number of components actually kept
print(pca.explained_variance_ratio_.sum())
print(pca_num)

0.9475564878849736
4


In [89]:
# Wrap the zcr principal components in a DataFrame.
# The column count comes from the array itself (not the shared `pca_num`, which
# every PCA cell overwrites), and feat_cp's index is reused so the rows stay
# label-aligned with feat_cp when combined with pd.concat(axis=1).
pc_feat_zcr_df = pd.DataFrame(
    data=pc_feat_zcr,
    index=feat_cp.index,
    columns=['pc_feat_zcr_' + str(i) for i in range(pc_feat_zcr.shape[1])],
)

#### Final Features dataset

In [90]:
# Build the final features table: the columns that did not go through PCA,
# followed by the principal components of each feature family.
# The untouched columns are obtained by dropping the per-family column lists
# (each has its own unique name) rather than by reading the shared `other_col`
# variable, which is reused under the same name for the echonest table.
feat_reduced_cols = (col_temp_chroma + col_temp_mfcc + col_temp_rmse
                     + col_temp_spectral + col_temp_tonnetz + col_temp_zcr)
feat_final = pd.concat([feat_cp.drop(columns=feat_reduced_cols),
                        pc_feat_chroma_df, pc_feat_mfcc_df,
                        pc_feat_rmse_df, pc_feat_spectral_df, pc_feat_tonnetz_df,
                        pc_feat_zcr_df], axis=1)
# pd.concat(axis=1) aligns on index labels; misaligned indexes would add rows.
assert len(feat_final) == len(feat_cp), 'feat_cp and the PCA frames are not index-aligned'
feat_final

Unnamed: 0,track_id,pc_feat_chroma_0,pc_feat_chroma_1,pc_feat_chroma_2,pc_feat_chroma_3,pc_feat_chroma_4,pc_feat_chroma_5,pc_feat_chroma_6,pc_feat_chroma_7,pc_feat_chroma_8,...,pc_feat_tonnetz_13,pc_feat_tonnetz_14,pc_feat_tonnetz_15,pc_feat_tonnetz_16,pc_feat_tonnetz_17,pc_feat_tonnetz_18,pc_feat_zcr_0,pc_feat_zcr_1,pc_feat_zcr_2,pc_feat_zcr_3
0,2,0.775576,0.363594,1.361543,-3.699529,-6.147773,-5.360282,-1.897546,2.495594,-0.448821,...,1.247071,-1.481287,-0.974557,1.478836,-0.802692,-1.134505,-1.545477,0.216026,-0.536895,0.496658
1,3,2.396017,-2.216482,-5.514670,-0.510122,1.642152,-3.034927,-3.248663,2.734878,-0.931649,...,-1.893135,-0.590861,0.539187,0.068886,-0.248310,-0.548751,-1.577371,0.246390,-0.750121,0.305139
2,5,0.238587,-0.583145,-3.281558,-1.874630,-1.639829,-0.529630,-2.849928,1.959087,-0.719294,...,-1.393036,-3.392923,0.067738,2.889566,-2.384271,-1.169644,-0.114501,-0.275687,-0.554619,0.115641
3,10,6.340997,1.307150,-4.068519,-2.896113,1.793527,-2.828162,-2.905909,1.605703,-0.014508,...,-0.723196,-0.966325,-1.228191,-2.836012,0.106454,0.514897,-0.904230,0.286333,-0.133111,0.686239
4,20,-1.902813,-1.745450,-2.876881,2.710228,-0.255238,0.354165,-0.259853,-0.680845,1.649494,...,-0.820243,-0.194318,-0.568280,-1.378036,0.150246,-0.751232,0.209431,0.075854,-0.298410,-0.085887
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106569,155316,1.563061,-1.342380,6.758747,-1.639735,4.023439,2.421908,6.661562,-0.091210,-1.808362,...,-0.259396,0.601070,0.542979,0.643495,-0.633165,0.268631,1.002904,-1.588391,0.458332,-0.144659
106570,155317,0.976325,-4.937334,2.160015,2.554928,-2.190888,6.302024,-0.158702,1.094481,-0.163301,...,0.833649,1.390238,-0.092980,-0.301943,0.478629,-0.241331,0.918410,-1.372796,0.139755,-0.050931
106571,155318,3.719993,-4.143312,2.385685,-1.277097,1.464155,1.783208,3.163242,1.525225,-0.570905,...,0.379637,0.132028,-0.427939,-0.718780,-0.895336,-0.217903,0.555178,-1.416186,0.351057,-0.013998
106572,155319,4.565423,-3.903901,0.804858,0.306852,1.538969,-1.027913,0.382135,0.735833,0.115947,...,0.305969,-0.487434,0.108620,-0.486189,-0.727463,0.095465,0.557101,-1.318067,0.685069,-0.064413


In [91]:
# Summarise the reduced features table: row count, column count and dtypes.
feat_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106574 entries, 0 to 106573
Columns: 168 entries, track_id to pc_feat_zcr_3
dtypes: float64(167), int64(1)
memory usage: 136.6 MB
