In [2]:
# Import modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import os


In [3]:
# Download data (uncomment the two lines below if the CSV files are not already in the working directory)
#!wget -O training_data.csv http://handsonml.control.lth.se/data/training_data.csv
#!wget -O songs_to_classify.csv http://handsonml.control.lth.se/data/songs_to_classify.csv

In [4]:
# Load data: the training songs first, then the songs to classify.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ("training_data.csv", "songs_to_classify.csv")
)

# The training set has 750 examples and 14 columns (label included).
train.shape, test.shape

((750, 14), (200, 13))

In [5]:
# Inspect data: display a random sample of 15 training rows.
# A fixed random_state makes the same rows appear on every
# "Restart Kernel -> Run All", so the displayed table is reproducible.
train.sample(n=15, random_state=0)

Unnamed: 0,acousticness,danceability,duration,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,label
481,0.0632,0.655,243520,0.949,0.0,10,0.298,-3.42,0,0.065,110.99,4,0.776,0
299,0.746,0.486,247227,0.513,0.00213,0,0.113,-9.397,1,0.0349,129.131,4,0.394,1
443,0.464,0.524,203253,0.634,1e-05,7,0.0827,-9.246,1,0.0296,156.197,4,0.549,0
133,0.0141,0.697,228427,0.949,0.0,0,0.748,-7.202,1,0.0353,132.379,4,0.904,0
169,0.0321,0.758,160827,0.71,5e-06,7,0.332,-5.325,1,0.0315,124.069,4,0.961,0
38,0.12,0.702,165173,0.788,0.0,5,0.171,-1.455,0,0.251,163.903,4,0.645,0
415,0.0384,0.841,195474,0.729,0.0,6,0.276,-3.047,0,0.239,132.073,4,0.444,0
193,0.0908,0.683,141367,0.703,0.0,5,0.511,-5.278,1,0.419,90.029,4,0.344,0
493,0.000358,0.626,274213,0.799,0.00155,11,0.36,-6.612,0,0.046,98.992,4,0.369,1
698,0.696,0.821,106684,0.607,0.0,7,0.14,-6.035,1,0.136,97.976,4,0.924,1


In [6]:
# Choose the input columns used by the classifier.
features = [
    'danceability',
    'key',
    'loudness',
    'instrumentalness',
    'liveness',
]

# Extract plain NumPy arrays: inputs for both data sets, labels for training only.
X_train = train[features].values
y_train = train['label'].values
X_test = test[features].values

# Sanity check: (number of training examples, number of selected features).
print(X_train.shape)

(750, 5)


In [7]:
# Normalize data: divide every feature by its largest absolute value in the
# TRAINING set, so each training column ends up in [-1, 1]. The test set is
# scaled with the same training-derived factors so both live in one feature space.
# Can also be done using sklearn methods such as
# MinMaxScaler() or StandardScaler()
feature_scale = np.max(np.abs(X_train), axis=0)

# A feature that is identically zero in the training data would give a zero
# divisor (NaN/inf values that break the k-NN fit); leave such columns unscaled.
feature_scale = np.where(feature_scale == 0, 1.0, feature_scale)

X_trainn = X_train / feature_scale
X_testn = X_test / feature_scale

In [8]:
# Note: every input/feature is treated as quantitative/numeric here.
# Some of them may be more sensibly treated as qualitative/categorical;
# sklearn preprocessing methods such as OneHotEncoder() can be used for that.

# Build the k-NN classifier and train it on the normalized training data.
# n_neighbors is fixed at 5 here; to set it in a systematic way, use cross validation!
knnmodel = KNeighborsClassifier(n_neighbors=5).fit(X_trainn, y_train)

# Predict a label for every normalized test example and show the result.
predictions = knnmodel.predict(X_testn)
print(predictions)

[0 1 0 1 0 0 0 1 0 0 1 0 0 1 1 1 1 0 1 1 0 0 1 0 0 1 1 0 0 1 1 1 1 1 0 1 1
 1 0 1 0 1 0 1 1 1 0 1 1 0 1 0 1 0 0 1 1 1 0 0 1 1 0 0 1 1 1 0 1 1 1 0 1 1
 1 1 0 1 1 0 1 1 0 0 1 0 1 1 1 1 1 0 0 0 1 0 1 1 1 1 0 0 1 0 0 1 0 0 0 1 1
 1 0 1 1 0 0 1 0 0 0 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 0 0 1 1 1 0 0
 1 0 1 1 1 0 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 0 1 1 1 1 1
 1 1 0 0 0 1 0 0 0 1 1 1 1 1 1]
