# Data pre-processing notebook
### Dropping non-songs, encoding categorical variables

In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv("data/tracks_drop_duplicates.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,index,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,0,0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,...,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3
1,1,1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,...,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1
2,2,2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,...,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5
3,3,3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,...,7,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3
4,4,4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,...,3,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4


Data cleanup

In [2]:
!pip install category_encoders



In [6]:
import category_encoders as ce

def clean_data(df):
    # Drop the id column, name, release_date
    x = df.copy()
    drop_cols = ['Unnamed: 0', 'index', 'id', 'name', 'release_date', 'id_artists', 'artists']
    x = x.drop(labels=drop_cols, axis=1)
    
    # Drop zero tempo songs
    x = x[x['tempo']!=0]
    
    # Do one-hot encoding for key
    ohe = ce.one_hot.OneHotEncoder(cols=['key'])
    x = ohe.fit_transform(x)
    
    
    # Normalize numeric features
    num_features = ['popularity', 'duration_ms', 'explicit', 'danceability', 'energy',
       'loudness', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'time_signature']
    
    for i in num_features:
        col_mean = x[i].values.mean()
        col_var = np.var(df[i])
        col_norm = [(o - col_mean) / col_var**(1/2) for o in x[i].values]
        x[i] = col_norm
    
    return x


In [7]:

clean_df = clean_data(df)
print(f'Old df shape: {df.shape}\nOld df columns: {[i for i in df.columns]}\n\n'
        f'Cleaned df shape: {clean_df.shape}\nClean df columns: {[i for i in clean_df.columns]}')

  elif pd.api.types.is_categorical(cols):


Old df shape: (446475, 22)
Old df columns: ['Unnamed: 0', 'index', 'id', 'name', 'popularity', 'duration_ms', 'explicit', 'artists', 'id_artists', 'release_date', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']

Cleaned df shape: (446198, 26)
Clean df columns: ['popularity', 'duration_ms', 'explicit', 'danceability', 'energy', 'key_1', 'key_2', 'key_3', 'key_4', 'key_5', 'key_6', 'key_7', 'key_8', 'key_9', 'key_10', 'key_11', 'key_12', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']


In [8]:
clean_df.describe()

Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key_1,key_2,key_3,key_4,key_5,...,key_12,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
count,446198.0,446198.0,446198.0,446198.0,446198.0,446198.0,446198.0,446198.0,446198.0,446198.0,...,446198.0,446198.0,446198.0,446198.0,446198.0,446198.0,446198.0,446198.0,446198.0,446198.0
mean,-2.5988590000000002e-17,-7.732880000000001e-17,4.204037e-17,9.629792e-16,-1.910926e-16,0.126845,0.072347,0.126238,0.035659,0.089841,...,0.111926,3.725031e-16,0.657195,-4.1148600000000006e-17,3.019263e-16,4.700878e-17,2.6880360000000002e-17,-3.765798e-16,-1.133561e-15,-8.917654000000001e-18
std,0.9997841,0.9945503,1.00025,0.9967265,0.9993776,0.3328,0.259062,0.332117,0.185439,0.285955,...,0.315275,0.9971516,0.474648,1.000205,0.9998721,0.9990741,0.9996037,0.9988576,0.9954012,0.9808388
min,-1.480897,-1.595024,-0.2189212,-3.077494,-2.156647,0.0,0.0,0.0,0.0,0.0,...,0.0,-8.715866,0.0,-0.5857257,-1.295942,-0.422701,-1.127532,-2.164436,-2.950973,-7.88551
25%,-0.8253583,-0.4241557,-0.2189212,-0.664467,-0.787286,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.5246672,0.0,-0.4137112,-1.011593,-0.422701,-0.6356965,-0.7950903,-0.7694321,0.2658238
50%,-0.005934939,-0.1168425,-0.2189212,0.08622575,0.02639235,0.0,0.0,0.0,0.0,0.0,...,0.0,0.1908149,1.0,-0.3582066,-0.07074667,-0.422621,-0.4089712,0.05297011,-0.03431833,0.2658238
75%,0.704232,0.2569606,-0.2189212,0.7348243,0.8162558,0.0,0.0,0.0,0.0,0.0,...,0.0,0.7294736,1.0,-0.1711909,0.9564659,-0.3886925,0.3555674,0.8387876,0.6010337,0.2658238
max,3.981925,40.08877,4.570127,2.554504,1.812516,1.0,1.0,1.0,1.0,1.0,...,1.0,3.054693,1.0,4.269682,1.56189,3.335153,4.114989,1.72575,4.194066,2.303657


In [8]:
clean_df.to_csv('data/songs_cleaned.csv')

FileNotFoundError: [Errno 2] No such file or directory: './Data/songs_cleaned.csv'