# KNN Classifier Model

We first estimate accuracy on the training data alone, fitting on the first ~80% of it and holding out the remaining ~20% for validation. Then, we fit the KNN to all training data and classify the test data. After trying multiple values, k=5 seems to work best.

In [1]:
# Short tag for this model; it is embedded in the submission CSV filename.
model = "KNN"

In [2]:
import os
import pandas as pd
import librosa
import numpy as np
from scipy.spatial.distance import euclidean
from collections import Counter
import soundfile as sf
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import mode
import csv
from statistics import mode
from sklearn.preprocessing import StandardScaler
from datetime import datetime

In [3]:
# Size of each run of consecutive clip predictions that is later reduced to
# a single label (presumably the number of clips per recording in this
# dataset variant -- confirm against the preprocessing step).
num_clips = 10

# Clip-level feature matrices for the "Processed(5)" dataset variant.
train_features = np.load("Processed(5)/train_features.npy")
test_features = np.load("Processed(5)/test_features.npy")

# Labels: `train_labels` is fitted against `train_features`, while
# `unshortened_train_labels` is compared against the voted predictions.
train_labels = np.load("Processed(5)/train_labels.npy")
unshortened_train_labels = np.load("Processed(5)/unshortened_train_labels.npy")

In [6]:
# NOTE: this cell rebinds the same names as the "Processed(5)" loading cell,
# so whichever of the two cells ran last decides which dataset variant the
# rest of the notebook uses.

# Size of each run of consecutive clip predictions that is later reduced to
# a single label (presumably the number of clips per recording in this
# dataset variant -- confirm against the preprocessing step).
num_clips = 20

# Clip-level feature matrices for the "Processed(7)" dataset variant.
train_features = np.load("Processed(7)/train_features.npy")
test_features = np.load("Processed(7)/test_features.npy")

# Labels: `train_labels` is fitted against `train_features`, while
# `unshortened_train_labels` is compared against the voted predictions.
train_labels = np.load("Processed(7)/train_labels.npy")
unshortened_train_labels = np.load("Processed(7)/unshortened_train_labels.npy")

In [4]:
# Split the labelled data into a fitting part and a held-out part.
# The first 80% is used for training, the last 20% for testing.
# The split is positional (no shuffling), so consecutive clips stay together.

r = 0.8

# Split position counted in unshortened samples (one label per sample).
unshortened_split_index = int(len(unshortened_train_labels) * r)

# Derive the clip-level split position from the sample-level one instead of
# rounding it independently. Rounding both separately can place the clip
# boundary in the middle of a sample, after which the groups of `num_clips`
# predictions no longer line up with `y_test`.
# Assumes every unshortened sample contributes exactly `num_clips`
# consecutive rows to `train_features` (the per-group voting during
# evaluation relies on the same layout) -- TODO confirm against preprocessing.
split_index = unshortened_split_index * num_clips

# Clip-level features and labels used to fit the classifier.
X_train = np.array(train_features[:split_index])
y_train = np.array(train_labels[:split_index])

# Held-out clips, scored against one label per unshortened sample.
X_test = np.array(train_features[split_index:])
y_test = np.array(unshortened_train_labels[unshortened_split_index:])

In [25]:
# Number of neighbours consulted for every clip-level prediction.
# This value is also read by the later cell that classifies the test data.
k = 100

# Build the classifier and fit it on the clip-level training split.
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train)

# One predicted label per held-out clip.
y_pred = knn.predict(X_test)

# Reduce clip predictions to one label per unshortened sample: each run of
# `num_clips` consecutive predictions is replaced by its mode.
y_pred_unshortened = [
    mode(y_pred[start:start + num_clips])
    for start in range(0, len(y_pred), num_clips)
]

# Share of samples whose voted label matches the true label.
accuracy = np.mean(y_pred_unshortened == y_test)
print("Classification Accuracy:", accuracy)

# Cross-tabulate true labels (rows) against voted labels (columns).
confusion_matrix = pd.crosstab(y_test, y_pred_unshortened, rownames=['Actual'], colnames=['Predicted'])
print("\nConfusion Matrix:")
print(confusion_matrix)

Classification Accuracy: 0.65625

Confusion Matrix:
Predicted  blues  classical  country  disco  hiphop  jazz  metal  pop  reggae  \
Actual                                                                          
blues         17          0        1      0       0     2      0    0       2   
classical      0         15        0      0       0     1      0    0       0   
country        0          0        6      0       1     2      0    2       2   
disco          0          0        0      6       1     0      5    2       0   
hiphop         0          0        1      1       5     0      1    3       1   
jazz           0          2        3      0       1    16      0    0       1   
metal          0          0        0      0       1     0     14    0       0   
pop            0          0        1      0       3     0      0   13       0   
reggae         0          1        0      0       3     0      0    0       8   
rock           1          0        0      1       2     0

## Classify test data

In [6]:
# For the final model nothing is held out: all loaded training clips and
# their labels are used. This rebinds the X_train / y_train / X_test names
# that the evaluation split defined earlier.
X_train, y_train = np.array(train_features), np.array(train_labels)

# Clip-level features of the test set that is going to be classified.
X_test = np.array(test_features)

In [7]:
# Fit a fresh KNN classifier on the full training data and label the test set.
# `k` is the neighbour count chosen in the evaluation cell above.
knn_test = KNeighborsClassifier(n_neighbors=k)
knn_test.fit(X_train, y_train)

# One predicted label per test clip.
y_pred = knn_test.predict(X_test)

# Reduce clip predictions to one label per unshortened sample by taking the
# mode of every run of `num_clips` consecutive predictions.
# The group size must follow `num_clips` (as the evaluation cell does) rather
# than a fixed 10: with 20 clips per sample, a hard-coded 10 yields two votes
# per sample and shifts every ID in the submission.
y_pred_unshortened = [
    mode(y_pred[start:start + num_clips])
    for start in range(0, len(y_pred), num_clips)
]

print(y_pred_unshortened)

['disco', 'pop', 'country', 'blues', 'disco', 'jazz', 'disco', 'classical', 'reggae', 'disco', 'disco', 'hiphop', 'hiphop', 'metal', 'country', 'country', 'hiphop', 'disco', 'hiphop', 'hiphop', 'classical', 'disco', 'jazz', 'metal', 'blues', 'classical', 'country', 'reggae', 'country', 'disco', 'jazz', 'classical', 'country', 'country', 'hiphop', 'pop', 'reggae', 'country', 'jazz', 'country', 'pop', 'metal', 'rock', 'pop', 'jazz', 'disco', 'country', 'pop', 'blues', 'hiphop', 'disco', 'disco', 'hiphop', 'jazz', 'reggae', 'reggae', 'hiphop', 'pop', 'hiphop', 'country', 'classical', 'country', 'country', 'country', 'pop', 'rock', 'pop', 'country', 'rock', 'hiphop', 'rock', 'metal', 'classical', 'metal', 'country', 'rock', 'disco', 'jazz', 'pop', 'blues', 'pop', 'disco', 'reggae', 'blues', 'reggae', 'rock', 'classical', 'disco', 'rock', 'disco', 'reggae', 'classical', 'disco', 'metal', 'rock', 'classical', 'pop', 'blues', 'pop', 'reggae', 'reggae', 'reggae', 'pop', 'disco', 'classical', '

## Submission

In [41]:
# Create the output folder on first use.
if not os.path.isdir("Submissions/"):
    os.mkdir("Submissions")

# Timestamp suffix so that repeated runs never overwrite an earlier file.
current_datetime = datetime.now().strftime("%Y%m%d_%H%M%S")

# newline='' hands line-ending control to the csv module.
with open(f"Submissions/submission_{model}_{current_datetime}.csv", 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)

    # Header row, then one row per prediction. The ID is built from the
    # position of the prediction in the list, zero-padded to three digits.
    csvwriter.writerow(['ID', 'Genre'])
    csvwriter.writerows(
        [f"test{i:03d}.wav", genre]
        for i, genre in enumerate(y_pred_unshortened)
    )