# Import packages

In [1]:
import numpy as np
import pandas as pd
import graphviz
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, Normalizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, train_test_split, cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn import svm
from sklearn import tree

# Load dataset

In [2]:
training=pd.read_csv('training_data.csv', sep=',')
test=pd.read_csv('songs_to_classify.csv', sep=',')
print("Training data shape:", training.shape)
print("Test data shape:", test.shape)

Training data shape: (750, 14)
Test data shape: (200, 13)


In [3]:
training.head(5)

Unnamed: 0,acousticness,danceability,duration,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,label
0,0.713,0.514,100125,0.521,0.816,8,0.112,-14.835,0,0.0444,119.879,4,0.143,1
1,0.192,0.714,207019,0.614,0.0,4,0.263,-6.935,1,0.0319,123.969,4,0.582,1
2,0.333,0.63,216200,0.455,4e-06,5,0.127,-9.29,1,0.0292,139.931,4,0.199,1
3,0.601,0.81,136413,0.221,0.21,5,0.184,-11.005,1,0.0429,109.96,4,0.798,1
4,0.883,0.465,181440,0.459,0.000173,6,0.0692,-8.137,0,0.0351,90.807,4,0.288,1


# Preprocess
- ## Scale numerical values with `MinMaxScaler()`
  - acousticness, danceability, duration, ...
- ## Encode categorical values with `OneHotEncoder()`
  - key, time_signature, mode

In [4]:
scaling_cols = ['acousticness', 'danceability', 'duration', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence']
categorical_cols = ['key', 'time_signature', 'mode']

scaling_transformer = Pipeline(steps=[
    ('scaler', MinMaxScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('scale', scaling_transformer, scaling_cols),
        ('cat', categorical_transformer, categorical_cols)])

p = Pipeline(steps=[('preprocessor', preprocessor)])

features_train = p.fit_transform(training.drop('label', axis=1))
label_train = training['label']
features_test = p.fit_transform(test)

## Training and test dataset, post-processing

In [5]:
print("features_train shape:", features_train.shape)
print("label_train shape:", label_train.shape)
print("features_test shape:", features_test.shape)

features_train shape: (750, 28)
label_train shape: (750,)
features_test shape: (200, 28)


In [6]:
print(features_train[0])

[0.71730349 0.46302617 0.10332492 0.51914786 0.84384695 0.0921466
 0.50798129 0.03010321 0.43211266 0.11658526 0.         0.
 0.         0.         0.         0.         0.         0.
 1.         0.         0.         0.         0.         0.
 1.         0.         1.         0.        ]


# Try learning models

## 1. K-nearest neighbours

### Define model

In [7]:
knnmodel = KNeighborsClassifier(n_neighbors = 5)

### Cross-validate

In [8]:
cv_scores = cross_val_score(knnmodel, X=features_train, y=label_train, cv=5)
print(cv_scores)
print("cv_scores mean: {}".format(np.mean(cv_scores)))

[0.72       0.68666667 0.77333333 0.74666667 0.66666667]
cv_scores mean: 0.7186666666666667


### Split training data into train and test

In [9]:
X_train, X_test, y_train, y_test = train_test_split(features_train, label_train, test_size=0.33, random_state=42)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

knnmodel.fit(X=X_train, y=y_train)
# make predictions
predictions_knn = knnmodel.predict(X=X_test)
# evaluate predictions
acc = accuracy_score(y_test, predictions_knn)
print('Accuracy: %.3f' % acc)

(502, 28)
(502,)
(248, 28)
(248,)
Accuracy: 0.710


### Fit model on original training dataset, predict labels for test dataset

In [10]:
knnmodel.fit(X=features_train,y=label_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [11]:
predictions_knn = knnmodel.predict(X=features_test).reshape(-1,1).astype(int).reshape(1,-1)
print(predictions_knn)

[[0 0 0 1 0 0 0 1 1 1 1 0 1 1 1 1 1 0 1 1 0 0 1 1 0 0 0 0 0 0 1 0 0 1 1 1
  0 0 1 1 0 1 0 1 1 1 0 1 1 0 0 0 1 0 0 1 1 0 0 1 1 1 1 0 1 1 0 1 0 0 1 1
  0 1 0 0 0 1 1 0 1 1 1 0 0 1 1 0 1 0 1 0 1 0 0 0 1 1 1 1 0 0 1 1 1 0 0 1
  1 1 1 0 0 1 1 1 1 1 0 0 0 1 1 0 1 1 0 0 1 0 1 0 1 1 1 1 1 1 1 0 0 0 1 1
  1 1 0 1 1 0 1 1 0 0 1 1 1 1 1 0 1 1 0 1 1 1 0 1 1 0 0 0 1 0 0 1 1 1 1 0
  1 1 1 1 1 1 1 1 1 0 0 1 1 0 1 1 1 1 1 1]]


## 2. Logistic regression

### Define model

In [12]:
lr = LogisticRegression(random_state=0)

### Cross-validate

In [13]:
cv_scores = cross_val_score(lr, X=features_train, y=label_train, cv=5)
print(cv_scores)
print("cv_scores mean: {}".format(np.mean(cv_scores)))

[0.78666667 0.78       0.82666667 0.80666667 0.77333333]
cv_scores mean: 0.7946666666666667


### Split training data into train and test

In [14]:
X_train, X_test, y_train, y_test = train_test_split(features_train, label_train, test_size=0.33, random_state=42)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

lr.fit(X=X_train, y=y_train)
# make predictions
predictions_lr = lr.predict(X=X_test)
# evaluate predictions
acc = accuracy_score(y_test, predictions_lr)
print('Accuracy: %.3f' % acc)

(502, 28)
(502,)
(248, 28)
(248,)
Accuracy: 0.790


### Fit model on original training dataset, predict labels for test dataset

In [15]:
lr.fit(features_train, label_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
predictions_lr = lr.predict(features_test).reshape(-1,1).astype(int).reshape(1,-1)
print(predictions_lr)

[[0 1 0 1 0 0 1 1 1 0 1 1 0 1 1 0 1 0 1 1 0 0 1 1 0 0 1 1 0 0 1 0 0 1 1 1
  1 1 0 1 0 1 0 1 0 1 0 1 1 0 0 0 1 1 0 1 1 0 0 0 1 1 0 0 1 1 1 1 1 0 1 0
  1 1 0 1 0 1 1 0 1 1 1 1 0 1 1 1 1 0 1 0 0 0 1 0 1 1 1 1 0 0 1 0 0 1 0 1
  1 1 1 0 0 1 1 1 1 0 1 0 0 1 1 0 1 1 1 0 1 0 1 0 1 1 1 1 1 1 0 1 1 0 0 0
  1 1 0 0 1 0 1 1 0 0 1 1 1 1 1 0 0 1 0 1 1 0 1 1 1 0 1 1 1 1 0 0 1 1 1 0
  0 1 1 1 1 1 1 0 1 0 0 1 1 0 1 0 0 1 1 1]]


## 3. Linear Discriminant Analysis

### Define model

In [17]:
lda = LinearDiscriminantAnalysis()

### Cross-validate

In [18]:
cv_scores = cross_val_score(lda, X=features_train, y=label_train, cv=5)
print(cv_scores)
print("cv_scores mean: {}".format(np.mean(cv_scores)))

[0.76       0.82       0.83333333 0.82666667 0.78      ]
cv_scores mean: 0.804


### Split training data into train and test

In [19]:
X_train, X_test, y_train, y_test = train_test_split(features_train, label_train, test_size=0.33, random_state=42)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

lda.fit(X=X_train, y=y_train)
# make predictions
predictions_lda = lda.predict(X=X_test)
# evaluate predictions
acc = accuracy_score(y_test, predictions_lda)
print('Accuracy: %.3f' % acc)

(502, 28)
(502,)
(248, 28)
(248,)
Accuracy: 0.802


### Fit model on original training dataset, predict labels for test dataset

In [20]:
lda.fit(features_train, label_train)

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
                           solver='svd', store_covariance=False, tol=0.0001)

In [21]:
predictions_lda = lda.predict(features_test).reshape(-1,1).astype(int).reshape(1,-1)
print(predictions_lda)

[[0 1 0 1 0 0 1 1 1 0 1 1 0 1 1 0 1 0 1 1 0 0 1 1 0 0 1 0 0 0 1 1 0 1 1 1
  1 1 0 1 0 1 0 1 0 1 0 1 1 0 0 1 1 1 1 1 1 0 0 0 1 1 0 0 1 1 1 1 1 0 1 0
  1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 0 1 0 0 0 1 0 1 1 1 1 0 0 1 1 1 1 0 1
  1 1 1 1 0 1 1 1 1 0 1 1 0 1 1 1 1 1 1 0 1 0 1 0 1 1 1 1 1 1 1 1 1 0 1 0
  1 1 0 0 1 0 1 1 0 0 1 1 1 1 1 0 1 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 0
  1 1 1 1 1 1 1 0 1 0 0 1 1 0 1 0 0 1 1 1]]


## 4. Quadratic Discriminant Analysis

### Define model

In [22]:
qda = QuadraticDiscriminantAnalysis()

### Cross-validate

In [23]:
cv_scores = cross_val_score(qda, X=features_train, y=label_train, cv=5)
print(cv_scores)
print("cv_scores mean: {}".format(np.mean(cv_scores)))

[0.65333333 0.77333333 0.46       0.64666667 0.72666667]
cv_scores mean: 0.6519999999999999




### Split training data into train and test

In [24]:
X_train, X_test, y_train, y_test = train_test_split(features_train, label_train, test_size=0.33, random_state=42)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

qda.fit(X=X_train, y=y_train)
# make predictions
predictions_qda = qda.predict(X=X_test)
# evaluate predictions
acc = accuracy_score(y_test, predictions_qda)
print('Accuracy: %.3f' % acc)

(502, 28)
(502,)
(248, 28)
(248,)
Accuracy: 0.677




### Fit model on original training dataset, predict labels for test dataset

In [25]:
qda.fit(features_train, label_train)



QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0,
                              store_covariance=False, tol=0.0001)

In [26]:
predictions_qda = qda.predict(features_test).reshape(-1,1).astype(int).reshape(1,-1)
print(predictions_qda)

[[0 1 0 1 1 0 0 1 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 1 0
  1 0 1 0 1 0 0 1 0 1 0 1 1 1 1 1 1 0 0 1 1 1 0 0 0 1 1 0 1 0 0 0 1 0 1 0
  1 1 0 0 0 1 1 0 1 1 1 0 0 1 1 0 1 0 1 0 1 0 1 0 1 1 1 0 0 0 1 0 1 0 0 1
  0 1 1 0 0 1 1 0 1 0 1 0 1 1 1 0 0 0 0 0 1 1 1 0 1 1 1 1 1 1 0 1 1 0 1 0
  1 1 0 0 1 0 1 0 1 0 1 1 1 1 1 0 0 0 0 0 1 0 0 1 1 0 0 1 1 0 0 0 1 1 0 1
  1 1 1 1 0 1 0 1 1 0 0 0 1 0 1 0 1 1 1 1]]


## 5. Support Vector Machines

### Define model

In [27]:
svm = svm.SVC()

### Cross-validate

In [28]:
cv_scores = cross_val_score(svm, X=features_train, y=label_train, cv=5)
print(cv_scores)
print("cv_scores mean: {}".format(np.mean(cv_scores)))

[0.78666667 0.78       0.84       0.8        0.77333333]
cv_scores mean: 0.796


### Split training data into train and test

In [29]:
X_train, X_test, y_train, y_test = train_test_split(features_train, label_train, test_size=0.33, random_state=42)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

svm.fit(X=X_train, y=y_train)
# make predictions
predictions_svm = svm.predict(X=X_test)
# evaluate predictions
acc = accuracy_score(y_test, predictions_svm)
print('Accuracy: %.3f' % acc)

(502, 28)
(502,)
(248, 28)
(248,)
Accuracy: 0.782


### Fit model on original training dataset, predict labels for test dataset

In [30]:
svm.fit(features_train, label_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [31]:
predictions_svm = svm.predict(features_test).reshape(-1,1).astype(int).reshape(1,-1)
print(predictions_svm)

[[0 1 0 1 0 0 1 1 0 0 1 0 0 1 1 0 1 0 1 1 0 0 1 1 0 0 0 0 0 0 1 0 0 1 1 1
  1 1 0 1 0 1 0 1 0 1 0 1 1 0 0 0 1 0 0 1 1 0 0 0 1 1 0 0 1 1 1 1 1 0 1 0
  1 1 0 0 0 1 1 0 1 1 0 1 0 1 1 0 1 0 1 0 0 0 1 0 1 1 1 1 0 0 1 0 1 1 0 1
  1 1 1 0 0 1 1 1 1 0 1 0 0 1 1 0 1 1 1 0 1 0 1 0 1 1 1 1 1 1 0 1 1 0 1 0
  1 1 0 0 1 0 1 1 0 0 1 1 1 1 1 0 0 1 0 0 1 0 1 1 1 0 1 1 1 0 0 0 1 1 1 0
  1 1 1 1 1 1 1 0 1 0 0 1 1 0 1 0 0 1 1 1]]
