In [None]:
from mysklearn.mypytable import MyPyTable
import mysklearn.myutils as myutils
from mysklearn.myclassifiers import MyRandomForestClassifier
import mysklearn.myevaluation as myevaluation

table = MyPyTable().load_from_file("cleaned_tracks.csv")

def categorize_popularity(p):
    p = int(p)
    if p <= 33:
        return "Low"
    elif p <= 66:
        return "Medium"
    else:
        return "High"

pop_idx = table.column_names.index("popularity")
y = [categorize_popularity(row[pop_idx]) for row in table.data]

explicit_idx = table.column_names.index("explicit")
duration_idx = table.column_names.index("duration_ms")
dance_idx = table.column_names.index("danceability")
energy_idx = table.column_names.index("energy")
tempo_idx = table.column_names.index("tempo")
loudness_idx = table.column_names.index("loudness")

X = []
for row in table.data:
    X.append([
        row[explicit_idx],
        row[duration_idx],
        row[dance_idx],
        row[energy_idx],
        row[tempo_idx],
        row[loudness_idx]
    ])

X_train, X_test, y_train, y_test = myevaluation.stratified_train_test_split(X, y)

rf = MyRandomForestClassifier()
rf.fit(X_train, y_train)

predictions = rf.predict(X_test)

' # Predict an example song\nexample_song = [\n    1,         # explicit\n    150000,    # duration_ms\n    0.25,      # danceability\n    0.50,      # energy\n    120.0      # tempo\n    -8.0       # loudness\n]\n\nprint("Prediction:", rf.predict([example_song])) '

In [2]:
rf_acc = myevaluation.accuracy_score(y_test, predictions)
rf_err = 1 - rf_acc
rf_prec = myevaluation.binary_precision_score(y_test, predictions)
rf_recall = myevaluation.binary_recall_score(y_test, predictions)
rf_f1 = myevaluation.binary_f1_score(y_test, predictions)

print(f"rf acc: {rf_acc:.2f} rf err: {rf_err:.2f}")
print(f"rf precision: {rf_prec:.2f}")
print(f"rf recall: {rf_recall:.2f}")
print(f"rf f1: {rf_f1:.2f}")

rf acc: 0.39 rf err: 0.61
rf precision: 0.38
rf recall: 0.73
rf f1: 0.50


In [3]:

labels = ["Low", "Medium", "High"]
matrix = [[0 for _ in labels] for _ in labels]
label_to_idx = {label: i for i, label in enumerate(labels)}

for t, p in zip(y_test, predictions):
    i = label_to_idx[t]
    j = label_to_idx[p]
    matrix[i][j] += 1

myutils.print_confusion_matrix(labels, matrix, "Random Forest Confusion Matrix")

Random Forest Confusion Matrix
                                 ['Low', 'Medium', 'High']
------  ----  ---  ----  ----  ---------------------------
Low     2746  551   455  3752                           73
Medium  2355  635   762  3752                           17
High    2043  686  1023  3752                           27
