In [1]:
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn import tree
import pandas as pd
import graphviz

### Load dataset

In [2]:
training=pd.read_csv('training_data.csv', sep=',')
test=pd.read_csv('songs_to_classify.csv', sep=',')
print("Training data shape", training.shape)
print("Test data shape", test.shape)

Training data shape (750, 14)
Test data shape (200, 13)


In [3]:
print(training.head(5))

   acousticness  danceability  duration  energy  instrumentalness  key  \
0         0.713         0.514    100125   0.521          0.816000    8   
1         0.192         0.714    207019   0.614          0.000000    4   
2         0.333         0.630    216200   0.455          0.000004    5   
3         0.601         0.810    136413   0.221          0.210000    5   
4         0.883         0.465    181440   0.459          0.000173    6   

   liveness  loudness  mode  speechiness    tempo  time_signature  valence  \
0    0.1120   -14.835     0       0.0444  119.879               4    0.143   
1    0.2630    -6.935     1       0.0319  123.969               4    0.582   
2    0.1270    -9.290     1       0.0292  139.931               4    0.199   
3    0.1840   -11.005     1       0.0429  109.960               4    0.798   
4    0.0692    -8.137     0       0.0351   90.807               4    0.288   

   label  
0      1  
1      1  
2      1  
3      1  
4      1  


### Preprocess
- Scale numerical values
- Encode categorical values

In [4]:
scaling_cols = ['acousticness', 'danceability', 'duration', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence']
categorical_cols = ['key', 'time_signature', 'mode']

scaling_transformer = Pipeline(steps=[
    ('scaler', MinMaxScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('scale', scaling_transformer, scaling_cols),
        ('cat', categorical_transformer, categorical_cols)])

p = Pipeline(steps=[('preprocessor', preprocessor)])

X_train = p.fit_transform(training.drop('label', axis=1))
y_train = training['label']
print("X_train shape", X_train.shape)
print("y_train shape", y_train.shape)

X_test = p.fit_transform(test)
print("X_test shape", X_test.shape)

X_train shape (750, 28)
y_train shape (750,)
X_test shape (200, 28)


### KNN

In [5]:
knnmodel = KNeighborsClassifier(n_neighbors = 5)

cv_scores = cross_val_score(knnmodel, X=X_train, y=y_train, cv=5)

print(cv_scores)
print("cv_scores mean: {}".format(np.mean(cv_scores)))

[0.72       0.68666667 0.77333333 0.74666667 0.66666667]
cv_scores mean: 0.7186666666666667


In [6]:
knnmodel.fit(X=X_train,y=y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [7]:
predictions_knn = knnmodel.predict(X=X_test).reshape(-1,1).astype(int).reshape(1,-1)
print(predictions_knn)

[[0 0 0 1 0 0 0 1 1 1 1 0 1 1 1 1 1 0 1 1 0 0 1 1 0 0 0 0 0 0 1 0 0 1 1 1
  0 0 1 1 0 1 0 1 1 1 0 1 1 0 0 0 1 0 0 1 1 0 0 1 1 1 1 0 1 1 0 1 0 0 1 1
  0 1 0 0 0 1 1 0 1 1 1 0 0 1 1 0 1 0 1 0 1 0 0 0 1 1 1 1 0 0 1 1 1 0 0 1
  1 1 1 0 0 1 1 1 1 1 0 0 0 1 1 0 1 1 0 0 1 0 1 0 1 1 1 1 1 1 1 0 0 0 1 1
  1 1 0 1 1 0 1 1 0 0 1 1 1 1 1 0 1 1 0 1 1 1 0 1 1 0 0 0 1 0 0 1 1 1 1 0
  1 1 1 1 1 1 1 1 1 0 0 1 1 0 1 1 1 1 1 1]]


### Logistic regression

In [8]:
lr = LogisticRegression(random_state=0)
cv_scores = cross_val_score(lr, X=X_train, y=y_train, cv=5)
print(cv_scores)
print("cv_scores mean: {}".format(np.mean(cv_scores)))

[0.78666667 0.78       0.82666667 0.80666667 0.77333333]
cv_scores mean: 0.7946666666666667


In [9]:
lr.fit(X_train, y_train)
predictions_lr = lr.predict(X_test).reshape(-1,1).astype(int).reshape(1,-1)
print(predictions_lr)

[[0 1 0 1 0 0 1 1 1 0 1 1 0 1 1 0 1 0 1 1 0 0 1 1 0 0 1 1 0 0 1 0 0 1 1 1
  1 1 0 1 0 1 0 1 0 1 0 1 1 0 0 0 1 1 0 1 1 0 0 0 1 1 0 0 1 1 1 1 1 0 1 0
  1 1 0 1 0 1 1 0 1 1 1 1 0 1 1 1 1 0 1 0 0 0 1 0 1 1 1 1 0 0 1 0 0 1 0 1
  1 1 1 0 0 1 1 1 1 0 1 0 0 1 1 0 1 1 1 0 1 0 1 0 1 1 1 1 1 1 0 1 1 0 0 0
  1 1 0 0 1 0 1 1 0 0 1 1 1 1 1 0 0 1 0 1 1 0 1 1 1 0 1 1 1 1 0 0 1 1 1 0
  0 1 1 1 1 1 1 0 1 0 0 1 1 0 1 0 0 1 1 1]]


### LDA

In [10]:
lda = LinearDiscriminantAnalysis()
cv_scores = cross_val_score(lda, X=X_train, y=y_train, cv=5)
print(cv_scores)
print("cv_scores mean: {}".format(np.mean(cv_scores)))

[0.76       0.82       0.83333333 0.82666667 0.78      ]
cv_scores mean: 0.804


In [11]:
lda.fit(X_train, y_train)
predictions_lda = lda.predict(X_test).reshape(-1,1).astype(int).reshape(1,-1)
print(predictions_lda)

[[0 1 0 1 0 0 1 1 1 0 1 1 0 1 1 0 1 0 1 1 0 0 1 1 0 0 1 0 0 0 1 1 0 1 1 1
  1 1 0 1 0 1 0 1 0 1 0 1 1 0 0 1 1 1 1 1 1 0 0 0 1 1 0 0 1 1 1 1 1 0 1 0
  1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 0 1 0 0 0 1 0 1 1 1 1 0 0 1 1 1 1 0 1
  1 1 1 1 0 1 1 1 1 0 1 1 0 1 1 1 1 1 1 0 1 0 1 0 1 1 1 1 1 1 1 1 1 0 1 0
  1 1 0 0 1 0 1 1 0 0 1 1 1 1 1 0 1 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 0
  1 1 1 1 1 1 1 0 1 0 0 1 1 0 1 0 0 1 1 1]]


### QDA

In [12]:
qda = QuadraticDiscriminantAnalysis()
cv_scores = cross_val_score(qda, X=X_train, y=y_train, cv=5)
print(cv_scores)
print("cv_scores mean: {}".format(np.mean(cv_scores)))

[0.65333333 0.77333333 0.46       0.64666667 0.72666667]
cv_scores mean: 0.6519999999999999




In [13]:
qda.fit(X_train, y_train)
predictions_qda = qda.predict(X_test).reshape(-1,1).astype(int).reshape(1,-1)
print(predictions_qda)

[[0 1 0 1 1 0 0 1 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 1 0
  1 0 1 0 1 0 0 1 0 1 0 1 1 1 1 1 1 0 0 1 1 1 0 0 0 1 1 0 1 0 0 0 1 0 1 0
  1 1 0 0 0 1 1 0 1 1 1 0 0 1 1 0 1 0 1 0 1 0 1 0 1 1 1 0 0 0 1 0 1 0 0 1
  0 1 1 0 0 1 1 0 1 0 1 0 1 1 1 0 0 0 0 0 1 1 1 0 1 1 1 1 1 1 0 1 1 0 1 0
  1 1 0 0 1 0 1 0 1 0 1 1 1 1 1 0 0 0 0 0 1 0 0 1 1 0 0 1 1 0 0 0 1 1 0 1
  1 1 1 1 0 1 0 1 1 0 0 0 1 0 1 0 1 1 1 1]]




### Classification tree

In [14]:
tree_clf = tree.DecisionTreeClassifier()
tree_clf.fit(X_train, y_train)
predictions_tree = tree_clf.predict(X_test).reshape(-1,1).astype(int).reshape(1,-1)
print(predictions_tree)

[[0 0 1 1 0 0 1 1 0 1 1 1 0 1 1 0 1 0 1 1 1 0 1 1 0 0 1 0 1 0 1 1 0 1 0 1
  0 1 0 0 0 1 0 1 0 0 0 1 1 0 0 0 1 1 0 1 1 1 0 0 1 1 1 0 1 1 1 1 0 0 0 1
  0 1 1 1 0 1 1 0 1 1 0 0 1 1 1 0 1 0 0 0 0 1 1 0 1 1 1 1 1 1 1 1 1 1 0 0
  1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 0 1 0 0 1 1 1 1 1 1 1 1 0 1 0
  1 1 0 0 0 0 1 1 0 0 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 0 1 0 0 0 0 1 0 0
  0 0 1 1 1 0 1 0 1 0 0 1 1 0 1 1 0 1 1 1]]


In [15]:
# tree.plot_tree(tree_clf) 
# dot_data = tree.export_graphviz(tree_clf, out_file=None, 
#                       filled=True, rounded=True,  
#                       special_characters=True)  
# graph = graphviz.Source(dot_data)
# graph