In [55]:
from mysklearn.mypytable import MyPyTable
from mysklearn.myclassifiers import MyNaiveBayesClassifier

# Load the cleaned track data from CSV. The code below reads the result's
# `column_names` (to locate columns by name) and `data` (the rows).
table = MyPyTable().load_from_file("cleaned_tracks.csv")

def categorize_popularity(p):
    """Bucket a popularity score into "Low", "Medium", or "High".

    The score is first converted with ``int()`` (so floats are truncated and
    numeric strings are parsed), then compared against fixed cut-offs:

    * up to 33 (inclusive)  -> "Low"
    * 34 through 66         -> "Medium"
    * above 66              -> "High"

    Raises whatever ``int()`` raises when ``p`` cannot be converted.
    """
    score = int(p)
    # Check from the top bucket down so each test needs only one bound.
    if score > 66:
        return "High"
    if score > 33:
        return "Medium"
    return "Low"

# Class labels: one popularity category per row of the table.
pop_idx = table.column_names.index("popularity")
y = [categorize_popularity(row[pop_idx]) for row in table.data]

# Column positions of the six features, looked up by name in one pass.
(
    explicit_idx,
    duration_idx,
    dance_idx,
    energy_idx,
    tempo_idx,
    loudness_idx,
) = (
    table.column_names.index(name)
    for name in (
        "explicit",
        "duration_ms",
        "danceability",
        "energy",
        "tempo",
        "loudness",
    )
)

# Feature matrix: one six-value list per row, in the column order above.
X = [
    [
        row[explicit_idx],
        row[duration_idx],
        row[dance_idx],
        row[energy_idx],
        row[tempo_idx],
        row[loudness_idx],
    ]
    for row in table.data
]

# Fit the classifier on every row (no hold-out here; this cell is only a
# quick single-prediction demo).
nb = MyNaiveBayesClassifier()
nb.fit(X, y)

# Predict an example song.
# Values follow the same order as the columns of X:
#   explicit, duration_ms, danceability, energy, tempo, loudness
example_song = [0, 200000, 0.75, 0.85, 120.0, -6.0]

# predict() is given a list of instances, hence the extra brackets.
print("Prediction:", nb.predict([example_song]))


Prediction: ['High']


In [56]:
import random

# Shuffle dataset.
# Features and labels are zipped first so every row keeps its own label.
# A dedicated, seeded generator is used instead of the global `random` state
# so that the shuffle and the 80/20 split below are reproducible after
# Restart Kernel -> Run All.
# NOTE: this cell rebinds X and y, so re-running it without re-running the
# cell that builds them shuffles the already-shuffled data again.
SHUFFLE_SEED = 42
combined = list(zip(X, y))
random.Random(SHUFFLE_SEED).shuffle(combined)
X, y = zip(*combined)  # X and y are now tuples, in shuffled order

# Split 80/20: the first 80% of the shuffled rows train, the rest test.
split_index = int(0.8 * len(X))
X_train, X_test = list(X[:split_index]), list(X[split_index:])
y_train, y_test = list(y[:split_index]), list(y[split_index:])


In [57]:
from mysklearn.myutils import cross_val_predict
from mysklearn.myclassifiers import MyNaiveBayesClassifier

# 10-fold cross-validation. The classifier class itself (not an instance) is
# passed in, and the four returned values are unpacked for reporting here and
# for the confusion matrix built from true_labels / predictions.
accuracy, error_rate, true_labels, predictions = cross_val_predict(X, y, MyNaiveBayesClassifier, k=10)

# One print call, one metric per line, both rounded to two decimals.
print(
    f"10-fold CV Accuracy: {accuracy:.2f}",
    f"10-fold CV Error Rate: {error_rate:.2f}",
    sep="\n",
)


10-fold CV Accuracy: 0.35
10-fold CV Error Rate: 0.65


In [58]:
from collections import Counter
from mysklearn.myutils import print_confusion_matrix

labels = ["Low", "Medium", "High"]

# Position of each label along both axes of the matrix.
label_to_idx = {label: i for i, label in enumerate(labels)}

# Square matrix of zero counts: rows are true labels, columns are predictions.
matrix = [[0] * len(labels) for _ in labels]

# Tally every (true, predicted) pair from cross-validation.
for t, p in zip(true_labels, predictions):
    matrix[label_to_idx[t]][label_to_idx[p]] += 1

print_confusion_matrix(labels, matrix, "Naive Bayes Confusion Matrix")


Naive Bayes Confusion Matrix
                                   ['Low', 'Medium', 'High']
------  ----  ----  ----  -----  ---------------------------
Low     1264  9928   175  11367                           11
Medium  1168  9816   383  11367                           86
High    1189  9153  1025  11367                            9


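My Naive Bayes classifier had a 10-fold cross-validation accuracy of 0.35, meaning it correctly predicts about 35% of track popularity categories. The error rate was 0.65, meaning that about 65% of the predictions were incorrect. With three popularity categories, this is only slightly better than random guessing (about 33%).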

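High popularity tracks are hard to predict, with very few correct predictions (about 9%), and Low popularity tracks are recognized almost as poorly (about 11%). The classifier labels the large majority of tracks as Medium, which is why Medium is recognized well (about 86%). This is not due to class imbalance: the confusion matrix shows that every class has the same number of tracks (11,367). A more likely explanation is that the chosen features do not separate the popularity categories well for Naive Bayes; for example, continuous features such as duration, tempo, and loudness may need to be discretized into bins before training. This should be verified.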