In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.impute import KNNImputer

In [2]:
df = pd.read_csv('music_genre_train.csv')

print(df.isnull().values.any()) # Checking if there are any NaN values in our dataset
df.dropna() # Removing all rows with NaN values

True


Unnamed: 0,instance_id,artist_name,track_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,obtained_date,valence,music_genre
0,35466.0,Linkin Park,Wth>You (feat. Aceyalone),38.0,0.00696,0.580,252307.0,0.9820,0.000005,E,0.1650,-3.428,Minor,0.0729,98.189,4-Apr,0.5100,Alternative
1,21927.0,Matthew Dear,Bad Ones (feat. Tegan and Sara),52.0,0.00693,0.769,276983.0,0.5110,0.088500,B,0.2230,-8.357,Major,0.0402,104.00299999999999,4-Apr,0.0399,Alternative
2,91335.0,La-33,La Pantera Mambo,47.0,0.47400,0.782,-1.0,0.7910,0.001200,C,0.0555,-6.321,Major,0.0383,99.992,4-Apr,0.8190,Jazz
3,91513.0,Sheena Ringo,あおぞら,35.0,0.64300,0.701,255800.0,0.7620,0.612000,B,0.4320,-6.117,Major,0.0420,130.105,4-Apr,0.6410,Anime
4,80060.0,MGMT,"Of Moons, Birds & Monsters",52.0,0.02180,0.598,286720.0,0.8990,0.013700,C#,0.0814,-3.861,Minor,0.0424,124.061,4-Apr,0.4030,Rock
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39999,66484.0,Nina Simone,Strange Fruit,36.0,0.91200,0.441,208920.0,0.0244,0.000000,B,0.0775,-15.881,Minor,0.0630,92.697,1-Apr,0.2180,Jazz
40000,42198.0,Karol Szymanowski,"Symphony No. 4, Op.60 'Sinfonia Concertante': ...",35.0,0.93800,0.127,495840.0,0.0418,0.535000,E,0.1060,-24.984,Minor,0.0342,86.072,4-Apr,0.0372,Classical
40001,75350.0,empty_field,One Trick Ponies,56.0,0.15000,0.680,321160.0,0.8520,0.002940,C,0.1260,-4.919,Major,0.0257,94.523,4-Apr,0.7070,Rock
40002,41678.0,Subtronics,Depth Perception,27.0,0.11000,0.523,228706.0,0.9040,0.269000,G#,0.2910,-3.038,Minor,0.6080,?,4-Apr,0.5110,Electronic


In [3]:
drop_columns = ['instance_id', 'artist_name', 'track_name', 'obtained_date'] # Removing all non-predictive features
df.drop(drop_columns, inplace=True, axis=1)

df['loudness'] = df['loudness'].abs()

In [4]:
# Changing mode feature into categorical data
df['mode'].replace('Major', 1, inplace=True)
df['mode'].replace('Minor', 0, inplace=True)

In [5]:
# Changing undefined values to NaN
df['duration_ms'] = df['duration_ms'].replace(-1, np.NaN)
df['tempo'] = df['tempo'].replace('?', np.NaN)


In [6]:
# One hot encoding
columns_cat = ['key']
df_cat = df[columns_cat]
df = df.drop(columns_cat, axis=1)
df_cat = pd.get_dummies(df_cat)
df = pd.concat([df_cat, df], axis = 1)
df.head(5)

Unnamed: 0,key_A,key_A#,key_B,key_C,key_C#,key_D,key_D#,key_E,key_F,key_F#,...,duration_ms,energy,instrumentalness,liveness,loudness,mode,speechiness,tempo,valence,music_genre
0,0,0,0,0,0,0,0,1,0,0,...,252307.0,0.982,5e-06,0.165,3.428,0.0,0.0729,98.189,0.51,Alternative
1,0,0,1,0,0,0,0,0,0,0,...,276983.0,0.511,0.0885,0.223,8.357,1.0,0.0402,104.003,0.0399,Alternative
2,0,0,0,1,0,0,0,0,0,0,...,,0.791,0.0012,0.0555,6.321,1.0,0.0383,99.992,0.819,Jazz
3,0,0,1,0,0,0,0,0,0,0,...,255800.0,0.762,0.612,0.432,6.117,1.0,0.042,130.105,0.641,Anime
4,0,0,0,0,1,0,0,0,0,0,...,286720.0,0.899,0.0137,0.0814,3.861,0.0,0.0424,124.061,0.403,Rock


In [7]:
# Changing music_genre feature into categorical data
print(df['music_genre'].value_counts())
encoder_genre = preprocessing.LabelEncoder()
df["music_genre"] = encoder_genre.fit_transform(df["music_genre"])

encoder_genre.classes_

Country        4040
Jazz           4034
Classical      4018
Alternative    4017
Rock           3991
Electronic     3991
Hip-Hop        3991
Blues          3983
Rap            3968
Anime          3967
Name: music_genre, dtype: int64


array(['Alternative', 'Anime', 'Blues', 'Classical', 'Country',
       'Electronic', 'Hip-Hop', 'Jazz', 'Rap', 'Rock', nan], dtype=object)

In [8]:
'''
Columns: "acousticness", "danceability", "energy", "instrumentalness", "liveness", "speechiness", "valence" have
values in the range from 0 to 1 so we change that range from -1 to 1
'''
def convert_range(x):
    NewValue = (((x-0)*(1+1))/(1-0))-1 #NewValue = (((OldValue - OldMin) * (NewMax - NewMin)) / (OldMax - OldMin)) + NewMin
    return NewValue



df['acousticness'] = df['acousticness'].apply(convert_range)
df['danceability'] = df['danceability'].apply(convert_range)
df['energy'] = df['energy'].apply(convert_range)
df['instrumentalness'] = df['instrumentalness'].apply(convert_range)
df['liveness'] = df['liveness'].apply(convert_range)
df['speechiness'] = df['speechiness'].apply(convert_range)
df['valence'] = df['valence'].apply(convert_range)

In [9]:
# Replacing missing data with KNNImputer
imputer = KNNImputer(n_neighbors=121)
df[:] = imputer.fit_transform(df)

df

Unnamed: 0,key_A,key_A#,key_B,key_C,key_C#,key_D,key_D#,key_E,key_F,key_F#,...,duration_ms,energy,instrumentalness,liveness,loudness,mode,speechiness,tempo,valence,music_genre
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,252307.00000,0.9640,-0.99999,-0.6700,3.428,0.0,-0.8542,98.189000,0.0200,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,276983.00000,0.0220,-0.82300,-0.5540,8.357,1.0,-0.9196,104.003000,-0.9202,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,242905.77686,0.5820,-0.99760,-0.8890,6.321,1.0,-0.9234,99.992000,0.6380,7.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255800.00000,0.5240,0.22400,-0.1360,6.117,1.0,-0.9160,130.105000,0.2820,1.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,286720.00000,0.7980,-0.97260,-0.8372,3.861,0.0,-0.9152,124.061000,-0.1940,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39999,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,208920.00000,-0.9512,-1.00000,-0.8450,15.881,0.0,-0.8740,92.697000,-0.5640,7.0
40000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,495840.00000,-0.9164,0.07000,-0.7880,24.984,0.0,-0.9316,86.072000,-0.9256,3.0
40001,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,321160.00000,0.7040,-0.99412,-0.7480,4.919,1.0,-0.9486,94.523000,0.4140,9.0
40002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,228706.00000,0.8080,-0.46200,-0.4180,3.038,0.0,0.2160,131.111851,0.0220,5.0
