# Data pre-processing notebook
### Dropping non-songs, encoding categorical variables

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv("../../tracks.csv")
df.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.434,0.177,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.321,0.0946,7,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.402,0.158,3,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4


Data cleanup

In [2]:
!pip install category_encoders



In [3]:
import category_encoders as ce

def clean_data(df):
    # Drop the id column, name, release_date
    ### Engineer year feature
    x = df.copy()
    drop_cols = ['id', 'name', 'release_date', 'id_artists', 'artists']
    x = x.drop(labels=drop_cols, axis=1)
    
    # Drop zero tempo songs
    x = x[x['tempo']!=0]
    
    # Do one-hot encoding for key
    ohe = ce.one_hot.OneHotEncoder(cols=['key'])
    x = ohe.fit_transform(x)
    
    # Do ordinal encoding for artists
    #oe = ce.ordinal.OrdinalEncoder()
    #x_oe = oe.fit_transform(x[['artists']])
    #x['artists'] = x_oe
    
    # Normalize numeric features
    num_features = ['popularity', 'duration_ms', 'explicit', 'danceability', 'energy',
       'loudness', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'time_signature']
    
    for i in num_features:
        col_mean = x[i].values.mean()
        col_var = np.var(df[i])
        col_norm = [(o - col_mean) / col_var**(1/2) for o in x[i].values]
        x[i] = col_norm
    
    return x


In [4]:

clean_df = clean_data(df)
print(f'Old df shape: {df.shape}\nOld df columns: {[i for i in df.columns]}\n\n'
        f'Cleaned df shape: {clean_df.shape}\nClean df columns: {[i for i in clean_df.columns]}')

  elif pd.api.types.is_categorical(cols):


Old df shape: (586672, 20)
Old df columns: ['id', 'name', 'popularity', 'duration_ms', 'explicit', 'artists', 'id_artists', 'release_date', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']

Cleaned df shape: (586344, 26)
Clean df columns: ['popularity', 'duration_ms', 'explicit', 'danceability', 'energy', 'key_1', 'key_2', 'key_3', 'key_4', 'key_5', 'key_6', 'key_7', 'key_8', 'key_9', 'key_10', 'key_11', 'key_12', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']


In [5]:
clean_df.describe()

Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key_1,key_2,key_3,key_4,key_5,...,key_12,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
count,586344.0,586344.0,586344.0,586344.0,586344.0,586344.0,586344.0,586344.0,586344.0,586344.0,...,586344.0,586344.0,586344.0,586344.0,586344.0,586344.0,586344.0,586344.0,586344.0,586344.0
mean,2.9083650000000005e-17,6.126956e-17,-6.980077e-18,4.907382e-16,-1.126507e-16,0.127717,0.071086,0.125781,0.036711,0.091388,...,0.111044,-7.367859e-18,0.658792,-1.19243e-16,1.983505e-16,2.088206e-16,6.223902e-17,5.0217770000000006e-17,5.5065050000000004e-17,-1.275803e-16
std,0.9998227,0.9952174,1.000212,0.9970543,0.999428,0.333775,0.256969,0.331603,0.18805,0.288161,...,0.314187,0.9963881,0.474115,1.000185,0.9998803,0.9992351,0.9996187,0.9989943,0.9958386,0.9813526
min,-1.500701,-1.697285,-0.214849,-3.074662,-2.152405,0.0,0.0,0.0,0.0,0.0,...,0.0,-8.802915,0.0,-0.583249,-1.289455,-0.4244867,-1.129392,-2.144603,-2.957428,-8.190745
25%,-0.7930492,-0.4340974,-0.214849,-0.6677147,-0.7869069,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.5279452,0.0,-0.3942474,-1.011674,-0.4244867,-0.6271283,-0.801804,-0.7687832,0.2630195
50%,-0.03096308,-0.1195375,-0.214849,0.07881222,0.02683466,0.0,0.0,0.0,0.0,0.0,...,0.0,0.1884565,1.0,-0.336991,-0.07971846,-0.4243953,-0.4063232,0.04423698,-0.03746548,0.2630195
75%,0.7311231,0.2673159,-0.214849,0.7350335,0.8167594,0.0,0.0,0.0,0.0,0.0,...,0.0,0.7307683,1.0,-0.1585512,0.9580173,-0.3888886,0.3477778,0.839826,0.5982336,0.2630195
max,3.942772,42.60912,4.656401,2.571249,1.817066,1.0,1.0,1.0,1.0,1.0,...,1.0,3.060351,1.0,4.814415,1.565752,3.322689,4.264763,1.736319,4.295441,2.376461


In [8]:
clean_df.to_csv('../../songs_cleaned.csv')

FileNotFoundError: [Errno 2] No such file or directory: './Data/songs_cleaned.csv'