# Random Forest Classifier Model

We first estimate accuracy on the training data alone by fitting on the first ~80% of it and evaluating on the remaining ~20%. Then we fit the Random Forest Classifier on all of the training data and use it to classify the test data. The hyperparameters were tuned with GridSearchCV (that cell is left commented out).

In [None]:
# Short tag for the model used in this notebook; it is interpolated into the
# submission CSV file name in the Submission section.
model = "RFC"

In [None]:
import os
import pandas as pd
import librosa
import numpy as np
from scipy.spatial.distance import euclidean
from collections import Counter
import soundfile as sf
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import mode
import csv
from statistics import mode
from sklearn.preprocessing import StandardScaler
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier

We used Processed(5) for the 0.6-accuracy submission (10 3-second clips, 25 MFCC features, chroma features, and spectral contrast and spectral flatness features). Note that the cell below loads Processed(7).

In [None]:
# Load the four pre-computed arrays from the Processed(7) folder, in the same
# order as the names on the left-hand side.
train_features, test_features, train_labels, unshortened_train_labels = (
    np.load(f"Processed(7)/{stem}.npy")
    for stem in (
        "train_features",
        "test_features",
        "train_labels",
        "unshortened_train_labels",
    )
)

# Size of the window of consecutive row-level predictions that the validation
# cell collapses into one label.
# NOTE(review): presumably the number of clips cut from each audio file by the
# preprocessing that produced Processed(7) - confirm.
num_clips = 20

In [None]:
# Split the labelled data into a fitting part and a held-out part.
# 80% for training, 20% for testing. The split is positional (no shuffling):
# the first part is used for fitting, the tail for evaluation.

r = 0.8

# Boundary expressed in entries of `unshortened_train_labels`.
unshortened_split_index = int(len(unshortened_train_labels) * r)

# Boundary expressed in rows of `train_features` / `train_labels`.
# The evaluation cell votes over windows of `num_clips` consecutive rows and
# compares the i-th window with the i-th held-out unshortened label, so the
# row boundary must be exactly `num_clips` rows per unshortened entry.
# Truncating len(train_features) * r independently can land in the middle of
# a window and shift every window relative to its label.
split_index = unshortened_split_index * num_clips

X_train = np.array(train_features[:split_index])
y_train = np.array(train_labels[:split_index])

# Held-out rows, and one reference label per window of `num_clips` rows.
X_test = np.array(train_features[split_index:])
y_test = np.array(unshortened_train_labels[unshortened_split_index:])

In [None]:
# Fit on the training part of the split and predict a label for every held-out row.
rf_model = RandomForestClassifier(n_estimators=300, random_state=3, max_depth=20, min_samples_split=5)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

# Majority vote: collapse each window of `num_clips` consecutive row-level
# predictions into a single label. `mode` here is statistics.mode, because that
# import comes after (and therefore shadows) scipy.stats.mode.
y_pred_unshortened = [
    mode(y_pred[start:start + num_clips])
    for start in range(0, len(y_pred), num_clips)
]

# Comparing a list with an array of a different length does not compare
# element-wise, so the mean below would be meaningless. Fail loudly instead.
assert len(y_pred_unshortened) == len(y_test), (
    f"{len(y_pred_unshortened)} voted predictions vs {len(y_test)} reference labels"
)

# Display classification accuracy
accuracy = np.mean(np.asarray(y_pred_unshortened) == y_test)
print("Classification Accuracy:", accuracy)

# Display confusion matrix
confusion_matrix = pd.crosstab(y_test, y_pred_unshortened, rownames=['Actual'], colnames=['Predicted'])
print("\nConfusion Matrix:")
print(confusion_matrix)

Classification Accuracy: 0.7375

Confusion Matrix:
Predicted  blues  classical  country  disco  hiphop  jazz  metal  pop  reggae  \
Actual                                                                          
blues         16          1        2      0       0     2      0    0       1   
classical      0         16        0      0       0     0      0    0       0   
country        0          0       12      1       0     0      0    1       0   
disco          1          0        0      8       1     0      0    2       0   
hiphop         0          0        1      0       6     0      1    3       2   
jazz           0          4        1      1       0    17      0    0       0   
metal          0          0        0      0       0     0     15    0       0   
pop            0          0        2      0       1     0      0   14       0   
reggae         0          0        1      0       1     0      0    1      10   
rock           0          0        1      4       0     0 

In [None]:
# Hyperparameter search, kept for reference only (disabled).
# It searches n_estimators, max_depth and min_samples_split with 5-fold
# cross-validation over the full training arrays; the best parameters are
# printed in the cell output and match the values passed to
# RandomForestClassifier elsewhere in this notebook.
# NOTE(review): cv=5 splits individual rows of train_features. If several rows
# come from the same audio file, rows of one file may end up in both the
# fitting and validation folds, which would make the CV score optimistic - confirm.

# from sklearn.model_selection import GridSearchCV

# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [None, 3, 5, 7, 10, 20, 30],
#     'min_samples_split': [2, 5, 7]
# }

# grid_search = GridSearchCV(RandomForestClassifier(random_state=3), param_grid, cv=5)
# grid_search.fit(train_features, train_labels)
# print("Best parameters:", grid_search.best_params_)

Best parameters: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 300}


## Classify test data

In [None]:
# For the final model, fit on every labelled row (no hold-out) and predict on
# the separate test features.
X_train, y_train = np.array(train_features), np.array(train_labels)
X_test = np.array(test_features)

In [None]:
# Refit with the same hyperparameters on all labelled rows, then predict a
# label for every row of the test features.
rf_model = RandomForestClassifier(n_estimators=300, random_state=3, max_depth=20, min_samples_split=5)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

# Majority vote over each window of `num_clips` consecutive row-level
# predictions, using the same window size as the validation cell. A hard-coded
# window of 10 would emit the wrong number of predictions whenever the loaded
# features use a different number of rows per file.
# `mode` here is statistics.mode (that import shadows scipy.stats.mode).
y_pred_unshortened = [
    mode(y_pred[start:start + num_clips])
    for start in range(0, len(y_pred), num_clips)
]

print(y_pred_unshortened)

['metal', 'pop', 'country', 'blues', 'metal', 'pop', 'disco', 'jazz', 'jazz', 'blues', 'disco', 'metal', 'reggae', 'disco', 'country', 'blues', 'hiphop', 'blues', 'pop', 'hiphop', 'classical', 'disco', 'classical', 'metal', 'blues', 'classical', 'country', 'reggae', 'country', 'blues', 'classical', 'classical', 'country', 'country', 'metal', 'pop', 'reggae', 'country', 'classical', 'country', 'pop', 'metal', 'rock', 'pop', 'rock', 'disco', 'country', 'pop', 'metal', 'disco', 'blues', 'hiphop', 'reggae', 'jazz', 'reggae', 'reggae', 'pop', 'pop', 'metal', 'country', 'classical', 'country', 'country', 'country', 'pop', 'rock', 'pop', 'country', 'rock', 'disco', 'blues', 'metal', 'classical', 'metal', 'blues', 'blues', 'disco', 'rock', 'pop', 'disco', 'pop', 'disco', 'country', 'country', 'reggae', 'rock', 'classical', 'disco', 'rock', 'disco', 'blues', 'classical', 'hiphop', 'metal', 'rock', 'classical', 'pop', 'country', 'pop', 'reggae', 'jazz', 'country', 'pop', 'metal', 'classical', 'p

## Submission

In [None]:
# Create the output folder on first use.
if not os.path.isdir("Submissions/"):
    os.mkdir("Submissions")

# Second-resolution timestamp so that runs started at different times get
# distinct file names.
current_datetime = datetime.now().strftime("%Y%m%d_%H%M%S")

# One header row, then one (ID, Genre) row per voted prediction. IDs are
# generated from the position in the list: test000.wav, test001.wav, ...
with open(f"Submissions/submission_{model}_{current_datetime}.csv", 'w', newline='') as out_file:
    writer = csv.writer(out_file)
    writer.writerow(['ID', 'Genre'])
    writer.writerows(
        [f"test{idx:03d}.wav", genre]
        for idx, genre in enumerate(y_pred_unshortened)
    )