In [None]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
import numpy as np
from numpy import ndarray
from typing import Tuple
import csv
from tabulate import tabulate

In [None]:
# Integer class codes assigned to the genre labels found in the CSV.
COUNTRY, POP, CHRISTIAN = 0, 1, 2

def load_music_genre_classification(n_samples=float('inf'), normalize=False) -> Tuple[ndarray, ndarray]:
  '''Load the custom music-genre dataset from ``music_genre_classification.csv``.

  The first CSV row is treated as a header and skipped. In every remaining row,
  columns 4 up to (but excluding) the last one are parsed as float features and
  the last column is the genre label.

  Args:
    n_samples: Maximum number of data rows to keep, taken from the top of the
      file. The default (infinity) keeps every row.
    normalize: When True, rescale each feature column with ``MinMaxScaler``.

  Returns:
    ``(X, y)`` where ``X`` is a 2-D float array of features and ``y`` is a 1-D
    float array of class codes (``COUNTRY``, ``POP``, ``CHRISTIAN``).

  Raises:
    ValueError: if ``n_samples`` is negative, a feature is not numeric, or a
      label is neither a known genre nor a number.
  '''
  # Only the read needs the file handle; release it before processing.
  with open('music_genre_classification.csv', 'r') as f:
    rows = list(csv.reader(f))

  # Drop the header row.
  body = np.array(rows)[1:]

  if n_samples is not None and n_samples != float('inf'):
    if n_samples < 0:
      raise ValueError(f'n_samples must be non-negative, got {n_samples}')
    body = body[:int(n_samples)]

  # extract X and y
  X = body[:, 4:-1].astype(float)
  labels = body[:, -1]

  # Explicit lookup instead of np.where on a string array, so the result never
  # depends on how numpy promotes mixed string/int operands.
  genre_codes = {'country': COUNTRY, 'pop': POP, 'christian': CHRISTIAN}
  y = np.empty(len(labels), dtype=float)
  for row, label in enumerate(labels):
    if label in genre_codes:
      y[row] = genre_codes[label]
    else:
      # Already-numeric labels pass through; anything else raises ValueError.
      y[row] = float(label)

  if normalize:
    X = preprocessing.MinMaxScaler().fit_transform(X)

  return X, y

def softmax(clf, X_test: ndarray, y_test: ndarray):
  '''Report how confident a fitted classifier is on the test set.

  Calls ``clf.predict_proba`` on ``X_test``, prints the average-certainty
  summary via ``avg_certainty_correct_answer``, then prints the probability
  vector and the true label for at most the first ten test samples.

  Args:
    clf: fitted estimator exposing ``predict_proba``.
    X_test: test features.
    y_test: true labels, aligned with ``X_test``.
  '''
  class_probs = clf.predict_proba(X_test)
  avg_certainty_correct_answer(class_probs, y_test)

  print("A few of the softmax probabilities using test data:")

  # Preview only: never more than ten rows, fewer when the test set is smaller.
  preview_count = min(10, len(X_test))
  for row in range(preview_count):
    print(f'{class_probs[row]} ({y_test[row]} actual)')

def avg_certainty_correct_answer(probs: ndarray, y: ndarray) -> None:
  '''Print the mean probability given to the true class versus a wrong class.

  For every sample, the probability in the true-class column goes into one
  running total and the probabilities of the two other classes into another.
  The first total is averaged over the samples, the second over twice the
  number of samples (two wrong classes each).

  Args:
    probs: per-sample class probabilities; must have exactly three columns.
    y: true class index (0, 1 or 2) for each row of ``probs``.
  '''
  # EX: probs[i]=[0.21255198 0.55361867 0.23382935]  y[i]=0.0 (actual)
  class_ids = (0, 1, 2)
  sample_count = probs.shape[0]

  # Input sanity checks: three classes, one label per row, labels in range.
  assert probs.shape[1] == 3
  assert sample_count == len(y)
  for label in y:
    assert label in class_ids

  correct_total, incorrect_total = 0, 0
  for row, row_probs in enumerate(probs):
    true_class = int(y[row])
    correct_total += row_probs[true_class]

    for other_class in class_ids:
      if other_class == true_class:
        continue
      incorrect_total += row_probs[other_class]

  avg_correct = correct_total / sample_count
  avg_incorrect = incorrect_total / (sample_count * 2)

  print("Avg certainty for given to correct vs each of the 2 incorrect classification:")
  print(f'   Correct: {avg_correct}, Incorrect: {avg_incorrect}\n')

In [None]:
print("Perceptron - Initial\n")

# Baseline: default Perceptron on the raw (un-normalized) features.
X, y = load_music_genre_classification(normalize=False)

NUM_TRIALS = 10

headers = ["Trial", "Training Accuracy", "Test Accuracy", "Number of Epochs"]
table = []
for trial in range(NUM_TRIALS):
  # The trial index seeds both the split and the model, so each run is repeatable.
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=trial)
  clf = Perceptron(random_state=trial)
  clf.fit(X_train, y_train)

  table.append([trial, clf.score(X_train, y_train), clf.score(X_test, y_test), clf.n_iter_])

# Closing row: per-column mean of the three metric columns over all trials.
averages = ["Average"] + [sum(row[col] for row in table) / len(table) for col in range(1, 4)]
table.append(averages)

print(tabulate(table, headers=headers))

Perceptron - Initial

Trial      Training Accuracy    Test Accuracy    Number of Epochs
-------  -------------------  ---------------  ------------------
0                   0.466964         0.475                   11
1                   0.468452         0.461905                17
2                   0.455357         0.47381                 16
3                   0.330357         0.344048                17
4                   0.39881          0.403571                17
5                   0.364286         0.369048                12
6                   0.383929         0.377381                10
7                   0.340774         0.320238                12
8                   0.395833         0.380952                10
9                   0.411012         0.414286                10
Average             0.401577         0.402024                13.2


In [None]:
print("Perceptron - Improved\n")
# eta0 (the learning rate) is basically the only hyperparameter tuned here.

X, y = load_music_genre_classification(normalize=True)

headers = ["ETA0", "Avg Training Accuracy", "Avg Test Accuracy", "Avg Number of Epochs"]
table = []

NUM_TRIALS = 10

eta0s = [0.001, 0.01, 0.1, 1, 10, 100000]
for eta0 in eta0s:
  # Running totals over the seeded trials for this learning rate.
  train_total, test_total, epoch_total = 0, 0, 0
  for trial in range(NUM_TRIALS):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=trial)
    clf = Perceptron(random_state=trial, eta0=eta0)
    clf.fit(X_train, y_train)

    train_total += clf.score(X_train, y_train)
    test_total += clf.score(X_test, y_test)
    epoch_total += clf.n_iter_

  # One averaged row per learning rate.
  table.append([eta0, train_total / NUM_TRIALS, test_total / NUM_TRIALS, epoch_total / NUM_TRIALS])

print(tabulate(table, headers=headers))

Perceptron - Improved

      ETA0    Avg Training Accuracy    Avg Test Accuracy    Avg Number of Epochs
----------  -----------------------  -------------------  ----------------------
     0.001                 0.729107             0.729524                     6
     0.01                  0.729107             0.729524                     6
     0.1                   0.721905             0.724643                     6.2
     1                     0.694048             0.692976                    13.5
    10                     0.701815             0.699643                    13.7
100000                     0.701815             0.699643                    13.7


In [None]:
print("MLP - Initial\n")

# Baseline: default MLPClassifier on the raw (un-normalized) features.
X, y = load_music_genre_classification(normalize=False)

NUM_TRIALS = 10

headers = ["Trial", "Training Accuracy", "Test Accuracy", "Number of Iterations"]
table = []
avg_n_iter, avg_train_acc, avg_test_acc = 0, 0, 0
for i in range(NUM_TRIALS):
  # The trial index seeds both the split and the model, so each run is repeatable.
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
  clf = MLPClassifier(random_state=i).fit(X_train, y_train)

  # Score once and reuse the values for both the table row and the averages.
  n_iter = clf.n_iter_
  train_acc = clf.score(X_train, y_train)
  test_acc = clf.score(X_test, y_test)

  table.append([i, train_acc, test_acc, n_iter])

  avg_n_iter += n_iter
  avg_train_acc += train_acc
  avg_test_acc += test_acc

  # print softmax probabilities of last trial (% confidence in each prediction)
  if i == NUM_TRIALS - 1:
    softmax(clf, X_test, y_test)

table.append(["Avg", avg_train_acc / NUM_TRIALS, avg_test_acc / NUM_TRIALS, avg_n_iter / NUM_TRIALS])

print("\nResults:")
print(tabulate(table, headers=headers))

MLP - Initial





Avg certainty for given to correct vs each of the 2 incorrect classification:
   Correct: 0.5692809878567462, Incorrect: 0.2153595060716271

A few of the softmax probabilities using test data:
[0.26421209 0.40202849 0.33375942] (0.0 actual)
[0.57557068 0.18165394 0.24277539] (0.0 actual)
[0.06805745 0.26337744 0.66856512] (2.0 actual)
[0.22247661 0.39261629 0.3849071 ] (0.0 actual)
[0.00432576 0.03030331 0.96537093] (2.0 actual)
[0.56567711 0.24537977 0.18894313] (0.0 actual)
[0.83503527 0.14036919 0.02459553] (0.0 actual)
[0.023655   0.95534217 0.02100283] (1.0 actual)
[0.02807646 0.0607433  0.91118024] (2.0 actual)
[0.64188893 0.28976187 0.0683492 ] (0.0 actual)

Results:
Trial      Training Accuracy    Test Accuracy    Number of Iterations
-------  -------------------  ---------------  ----------------------
0                   0.556548         0.577381                    78
1                   0.715476         0.70119                    183
2                   0.733929         0.74



In [None]:
print("MLP - Improved\n")

X, y = load_music_genre_classification(normalize=True)

NUM_TRIALS = 10

# Find the best parameters
# NOTE: as written, this search needs X_train / y_train from a split made before it runs.
# clf = MLPClassifier(activation='logistic',alpha=0,early_stopping=True, n_iter_no_change=10, max_iter=500)
# parameters = {'learning_rate_init':(.01, .1, 1),
#               'hidden_layer_sizes': ([32], [64], [128]),
#               'momentum':(0, .25, .5),
#               # 'solver':('lbfgs', 'sgd', 'adam')
#               }
# grid = GridSearchCV(clf, parameters)
# grid.fit(X_train,y_train) #This takes a while to run
# print(grid.best_params_)
# print(grid.best_score_)

# Score with the best parameters
avg_train_score, avg_test_score = 0, 0
for i in range(NUM_TRIALS):
  # Seed the split and the model with the trial index, as the other experiments
  # do, so the reported scores can be reproduced.
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=i)
  # clf = MLPClassifier(activation='identity',alpha=0,early_stopping=True, n_iter_no_change=10, max_iter=500, learning_rate_init=0.01, momentum=0.25, hidden_layer_sizes=64)
  # NOTE: scikit-learn only uses `momentum` when solver='sgd'; with the default
  # 'adam' solver used here it has no effect.
  clf = MLPClassifier(learning_rate_init=0.01, momentum=0.25, hidden_layer_sizes=64, random_state=i)
  clf.fit(X_train, y_train)

  # Score once per trial and reuse for both the running averages and the printout.
  train_score = clf.score(X_train, y_train)
  test_score = clf.score(X_test, y_test)
  avg_train_score += train_score / NUM_TRIALS
  avg_test_score += test_score / NUM_TRIALS
  print(f'Train_score: {train_score}')
  print(f'   Test_score: {test_score}')

print(f'Avg_train_score: {avg_train_score}')
print(f'Avg_test_score: {avg_test_score}')


MLP - Improved

Train_score: 0.8101190476190476
   Test_score: 0.7833333333333333
Train_score: 0.8113095238095238
   Test_score: 0.7869047619047619
Train_score: 0.8101190476190476
   Test_score: 0.8071428571428572
Train_score: 0.8139880952380952
   Test_score: 0.7988095238095239
Train_score: 0.819047619047619
   Test_score: 0.7904761904761904
Train_score: 0.8116071428571429
   Test_score: 0.794047619047619
Train_score: 0.8011904761904762
   Test_score: 0.8130952380952381
Train_score: 0.8151785714285714
   Test_score: 0.7976190476190477
Train_score: 0.8125
   Test_score: 0.8
Train_score: 0.8160714285714286
   Test_score: 0.7892857142857143
Avg_train_score: 0.8121130952380954
Avg_test_score: 0.7960714285714285


In [None]:
print("KNN - Initial\n")

# Baseline: default KNeighborsClassifier on the raw (un-normalized) features.
X, y = load_music_genre_classification(normalize=False)

NUM_TRIALS = 10

headers = ["Trial", "Training Accuracy", "Test Accuracy"]
table = []

for trial in range(NUM_TRIALS):
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=trial)
  knn = KNeighborsClassifier()
  knn.fit(X_train, y_train)

  table.append([trial, knn.score(X_train, y_train), knn.score(X_test, y_test)])

  # Only the final trial prints per-sample class probabilities (prediction confidence).
  if trial + 1 == NUM_TRIALS:
    softmax(knn, X_test, y_test)

# Summary row: mean of each accuracy column over the trials recorded above.
avg_train_acc = sum(row[1] for row in table) / NUM_TRIALS
avg_test_acc = sum(row[2] for row in table) / NUM_TRIALS
table.append(["Avg", avg_train_acc, avg_test_acc])

print()
print(tabulate(table, headers=headers))

KNN - Initial

Avg certainty for given to correct vs each of the 2 incorrect classification:
   Correct: 0.42166666666666575, Incorrect: 0.28916666666666385

A few of the softmax probabilities using test data:
[0.6 0.4 0. ] (0.0 actual)
[1. 0. 0.] (0.0 actual)
[0.  0.2 0.8] (2.0 actual)
[0.4 0.2 0.4] (0.0 actual)
[0.2 0.2 0.6] (2.0 actual)
[0.2 0.2 0.6] (0.0 actual)
[0.4 0.2 0.4] (0.0 actual)
[0.  0.8 0.2] (1.0 actual)
[0.2 0.2 0.6] (2.0 actual)
[0.4 0.4 0.2] (0.0 actual)

Trial      Training Accuracy    Test Accuracy
-------  -------------------  ---------------
0                   0.637798         0.477381
1                   0.637798         0.466667
2                   0.635417         0.463095
3                   0.637202         0.482143
4                   0.644345         0.453571
5                   0.645536         0.439286
6                   0.648214         0.45119
7                   0.644048         0.447619
8                   0.632143         0.49881
9                 

In [None]:
print("KNN - Improved\n")

# Sweep n_neighbors on normalized features, averaging over seeded 80/20 splits.
X, y = load_music_genre_classification(normalize=True)

headers = ["n_neighbors", "Avg Training Accuracy", "Avg Test Accuracy"]
table = []

NUM_TRIALS = 10

n_neighbors_trials = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 30, 40, 50, 75, 100, 150, 200]
for n_neighbors in n_neighbors_trials:
  # Exactly one cell per header. A fourth, unused element made tabulate shift
  # the headers one column to the right and print a trailing column of zeros.
  row = [n_neighbors, 0, 0]
  for i in range(NUM_TRIALS):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
    knn = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train)

    row[1] += knn.score(X_train, y_train)
    row[2] += knn.score(X_test, y_test)

  table.append(row)

# Turn the accumulated sums into per-trial averages.
for i in range(len(table)):
  for j in range(1, 3):
    table[i][j] /= NUM_TRIALS

print(tabulate(table, headers=headers))

KNN - Improved

       n_neighbors    Avg Training Accuracy    Avg Test Accuracy
---  -------------  -----------------------  -------------------
  1       0.999286                 0.698095                    0
  2       0.849077                 0.701548                    0
  3       0.84872                  0.73881                     0
  4       0.827857                 0.747143                    0
  5       0.82753                  0.76                        0
  6       0.81869                  0.760952                    0
  7       0.82                     0.769048                    0
  8       0.816071                 0.768095                    0
  9       0.816339                 0.770476                    0
 10       0.814792                 0.774405                    0
 15       0.804911                 0.773571                    0
 20       0.800833                 0.776905                    0
 30       0.793601                 0.774762                    0
 40      

In [None]:
print("DT - Initial\n")

# Baseline: default DecisionTreeClassifier on the raw (un-normalized) features.
X, y = load_music_genre_classification(normalize=False)

NUM_TRIALS = 10

headers = ["Trial", "Training Accuracy", "Test Accuracy"]
table = []

for trial in range(NUM_TRIALS):
  # The trial index seeds both the split and the tree.
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=trial)
  clf = DecisionTreeClassifier(random_state=trial)
  clf.fit(X_train, y_train)

  table.append([trial, clf.score(X_train, y_train), clf.score(X_test, y_test)])

  # Only the final trial prints per-sample class probabilities (prediction confidence).
  if trial + 1 == NUM_TRIALS:
    softmax(clf, X_test, y_test)

# Summary row: mean of each accuracy column over the trials recorded above.
avg_train_acc = sum(row[1] for row in table) / NUM_TRIALS
avg_test_acc = sum(row[2] for row in table) / NUM_TRIALS
table.append(["Avg", avg_train_acc, avg_test_acc])

print()
print(tabulate(table, headers=headers))

DT - Initial

Avg certainty for given to correct vs each of the 2 incorrect classification:
   Correct: 0.7095238095238096, Incorrect: 0.14523809523809525

A few of the softmax probabilities using test data:
[0. 0. 1.] (0.0 actual)
[1. 0. 0.] (0.0 actual)
[0. 0. 1.] (2.0 actual)
[0. 0. 1.] (0.0 actual)
[0. 0. 1.] (2.0 actual)
[0. 1. 0.] (0.0 actual)
[1. 0. 0.] (0.0 actual)
[0. 1. 0.] (1.0 actual)
[0. 0. 1.] (2.0 actual)
[1. 0. 0.] (0.0 actual)

Trial      Training Accuracy    Test Accuracy
-------  -------------------  ---------------
0                   0.999107         0.695238
1                   0.999405         0.705952
2                   0.999107         0.690476
3                   0.999107         0.722619
4                   0.999702         0.678571
5                   0.999405         0.670238
6                   0.999702         0.720238
7                   0.99881          0.716667
8                   0.999405         0.697619
9                   0.999107         0.709524

In [None]:
print("DT - Improved\n")

X, y = load_music_genre_classification(normalize=True)

# Hyperparameters were tuned one at a time: criterion, then min_samples_split,
# then max_depth, each stage fixing the value picked in the previous one.
# Every stage used the same loop as the live code below (10 seeded 80/20 splits,
# accuracies averaged); only the swept DecisionTreeClassifier argument changed.
# The two earlier stages are kept as recorded results. The last stage is live,
# so running this cell reproduces the table stored as its output.
#
# Stage 1 - criterion in ['gini', 'entropy', 'log_loss'] - gini best
# criterion      Avg Training Accuracy    Avg Test Accuracy
# -----------  -----------------------  -------------------
# gini                        0.999286             0.700357
# entropy                     0.999286             0.696786
# log_loss                    0.999286             0.696786
#
# Stage 2 - min_samples_split in [2, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
#           criterion='gini' - 50 best
#   min__samples split    Avg Training Accuracy    Avg Test Accuracy
# --------------------  -----------------------  -------------------
#                    2                 0.999286             0.700357
#                    5                 0.960536             0.704643
#                   10                 0.913601             0.717262
#                   20                 0.867262             0.729048
#                   30                 0.844821             0.735119
#                   40                 0.827827             0.739643
#                   50                 0.814673             0.743571 # BEST
#                   60                 0.803571             0.740238
#                   70                 0.795476             0.74
#                   80                 0.789315             0.739048
#                   90                 0.782232             0.736786
#                  100                 0.777738             0.736429

# Stage 3 - max depth (criterion='gini', min_samples_split=50) - 10 best
headers = ["max_depth", "Avg Training Accuracy", "Avg Test Accuracy"]
table = []

NUM_TRIALS = 10

max_depths = [2, 3, 5, 10, 15, 20, 30, 40, 50, 100, 150, 200, None]
for max_depth in max_depths:
  row = [max_depth, 0, 0]
  for i in range(NUM_TRIALS):
    # The trial index seeds both the split and the tree.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
    clf = DecisionTreeClassifier(random_state=i, criterion='gini', min_samples_split=50, max_depth=max_depth).fit(X_train, y_train)

    row[1] += clf.score(X_train, y_train)
    row[2] += clf.score(X_test, y_test)

  table.append(row)

# Turn the accumulated sums into per-trial averages.
for i in range(len(table)):
  for j in range(1, 3):
    table[i][j] /= NUM_TRIALS

# The final row (max_depth=None) is printed with a blank first cell.
print(tabulate(table, headers=headers))

DT - Improved

  max_depth    Avg Training Accuracy    Avg Test Accuracy
-----------  -----------------------  -------------------
          2                 0.636726             0.626071
          3                 0.67247              0.651905
          5                 0.761786             0.7225
         10                 0.813869             0.745833
         15                 0.814673             0.743571
         20                 0.814673             0.743571
         30                 0.814673             0.743571
         40                 0.814673             0.743571
         50                 0.814673             0.743571
        100                 0.814673             0.743571
        150                 0.814673             0.743571
        200                 0.814673             0.743571
                            0.814673             0.743571


Random Forest notes: Random Forest is a bagging algorithm built from decision trees

1. Build a tree
    1. Create a random dataset by sampling rows from the original with replacement until it is the same size as the original (so some samples will be repeats)
    2. At each split, only consider n randomly selected features (columns) (rather than considering all)  
2. Repeat hundreds of times, creating hundreds of trees  
3. To classify
    1. Classify individually with each DT
    2. The classification with the most votes wins

To Improve Performance:
- Test with different numbers of features to consider at each split

In [None]:
print("RF - Initial\n")

# Baseline: default RandomForestClassifier on the raw (un-normalized) features.
X, y = load_music_genre_classification(normalize=False)

NUM_TRIALS = 10

headers = ["Trial", "Training Accuracy", "Test Accuracy"]
table = []

for trial in range(NUM_TRIALS):
  # The trial index seeds both the split and the forest.
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=trial)
  clf = RandomForestClassifier(random_state=trial)
  clf.fit(X_train, y_train)

  table.append([trial, clf.score(X_train, y_train), clf.score(X_test, y_test)])

  # Only the final trial prints per-sample class probabilities (prediction confidence).
  if trial + 1 == NUM_TRIALS:
    softmax(clf, X_test, y_test)

# Summary row: mean of each accuracy column over the trials recorded above.
avg_train_acc = sum(row[1] for row in table) / NUM_TRIALS
avg_test_acc = sum(row[2] for row in table) / NUM_TRIALS
table.append(["Avg", avg_train_acc, avg_test_acc])

print()
print(tabulate(table, headers=headers))

RF - Initial

Avg certainty for given to correct vs each of the 2 incorrect classification:
   Correct: 0.6684238095238104, Incorrect: 0.1657880952380952

A few of the softmax probabilities using test data:
[0.21 0.39 0.4 ] (0.0 actual)
[0.6  0.06 0.34] (0.0 actual)
[0.01 0.07 0.92] (2.0 actual)
[0.53 0.18 0.29] (0.0 actual)
[0.01 0.18 0.81] (2.0 actual)
[0.51 0.45 0.04] (0.0 actual)
[0.83 0.1  0.07] (0.0 actual)
[0.04 0.94 0.02] (1.0 actual)
[0. 0. 1.] (2.0 actual)
[0.94 0.06 0.  ] (0.0 actual)

Trial      Training Accuracy    Test Accuracy
-------  -------------------  ---------------
0                   0.99881          0.77619
1                   0.999405         0.786905
2                   0.999107         0.796429
3                   0.999107         0.788095
4                   0.999702         0.772619
5                   0.999405         0.766667
6                   0.999702         0.783333
7                   0.99881          0.791667
8                   0.999405         0.

In [None]:
print("RF - Improved\n")

X, y = load_music_genre_classification(normalize=True)

# Hyperparameters were tuned one at a time: criterion, then min_samples_split,
# then max_depth, each stage fixing the value picked in the previous one.
# Every stage used the same loop as the live code below (10 seeded 80/20 splits,
# accuracies averaged); only the swept RandomForestClassifier argument changed.
# The two earlier stages are kept as recorded results. The last stage is live,
# so running this cell reproduces the table stored as its output.
#
# Stage 1 - criterion in ['gini', 'entropy', 'log_loss'] - entropy best
# criterion      Avg Training Accuracy    Avg Test Accuracy
# -----------  -----------------------  -------------------
# gini                        0.999256             0.783929
# entropy                     0.999286             0.785 #BEST
# log_loss                    0.999286             0.785
#
# Stage 2 - min_samples_split in [2, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
#           criterion='entropy' - 10 picked
# NOTE(review): min_samples_split=2 scored marginally higher on test (0.785) but
# with a much larger train/test gap; presumably 10 was picked for that reason - confirm.
#   min__samples split    Avg Training Accuracy    Avg Test Accuracy
# --------------------  -----------------------  -------------------
#                    2                 0.999286             0.785
#                    5                 0.991815             0.78369
#                   10                 0.9475               0.784762 # BEST
#                   20                 0.891905             0.782976
#                   30                 0.865595             0.779881
#                   40                 0.848095             0.778214
#                   50                 0.835446             0.776548
#                   60                 0.827024             0.77381
#                   70                 0.82                 0.770357
#                   80                 0.812887             0.768214
#                   90                 0.808452             0.767976
#                  100                 0.803333             0.76369

# Stage 3 - max depth (criterion='entropy', min_samples_split=10) - 12 best
# (highest average test accuracy in the recorded run: 0.785357)
headers = ["max_depth", "Avg Training Accuracy", "Avg Test Accuracy"]
table = []

NUM_TRIALS = 10

max_depths = [2, 3, 5, 10, 11, 12, 13, 14, 15, 20, 30, 40, 50, 100, 150, 200, None]
for max_depth in max_depths:
  row = [max_depth, 0, 0]
  for i in range(NUM_TRIALS):
    # The trial index seeds both the split and the forest.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
    clf = RandomForestClassifier(random_state=i, criterion='entropy', min_samples_split=10, max_depth=max_depth).fit(X_train, y_train)

    row[1] += clf.score(X_train, y_train)
    row[2] += clf.score(X_test, y_test)

  table.append(row)

# Turn the accumulated sums into per-trial averages.
for i in range(len(table)):
  for j in range(1, 3):
    table[i][j] /= NUM_TRIALS

# The final row (max_depth=None) is printed with a blank first cell.
print(tabulate(table, headers=headers))

RF - Improved

  max_depth    Avg Training Accuracy    Avg Test Accuracy
-----------  -----------------------  -------------------
          2                 0.72497              0.7075
          3                 0.75122              0.731071
          5                 0.796875             0.758095
         10                 0.911399             0.783333
         11                 0.924315             0.7825
         12                 0.93494              0.785357
         13                 0.939583             0.783571
         14                 0.943363             0.784405
         15                 0.944613             0.783452
         20                 0.947619             0.785
         30                 0.9475               0.784762
         40                 0.9475               0.784762
         50                 0.9475               0.784762
        100                 0.9475               0.784762
        150                 0.9475               0.784762
      

In [None]:
print("Naive Bayes - Initial\n")

# NOTE(review): unlike the other "Initial" cells, this baseline loads normalized
# features, while the "Naive Bayes - Improved" cell loads raw ones - confirm
# this is intended.
X, y = load_music_genre_classification(normalize=True)

NUM_TRIALS = 10

headers = ["Trial", "Training Accuracy", "Test Accuracy"]
table = []

avg_train_acc, avg_test_acc = 0, 0
for i in range(NUM_TRIALS):
  # Each trial makes its own seeded split. The extra unseeded split that used to
  # precede the loop was overwritten here before anything read it.
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
  clf = GaussianNB().fit(X_train, y_train)

  train_acc = clf.score(X_train, y_train)
  test_acc = clf.score(X_test, y_test)

  avg_train_acc += train_acc
  avg_test_acc += test_acc

  table.append([i, train_acc, test_acc])

  # print softmax probabilities of last trial (% confidence in each prediction)
  if i == NUM_TRIALS - 1:
    softmax(clf, X_test, y_test)

table.append(["Avg", avg_train_acc / NUM_TRIALS, avg_test_acc / NUM_TRIALS])

print()
print(tabulate(table, headers=headers))

Naive Bayes - Initial

Avg certainty for given to correct vs each of the 2 incorrect classification:
   Correct: 0.6530883686724849, Incorrect: 0.17345581566375812

A few of the softmax probabilities using test data:
[0.34996155 0.04158794 0.60845051] (0.0 actual)
[0.84176932 0.01749257 0.14073811] (0.0 actual)
[0.01964259 0.01590244 0.96445497] (2.0 actual)
[0.2583926  0.05251889 0.68908851] (0.0 actual)
[5.96508293e-09 6.23034224e-05 9.99937691e-01] (2.0 actual)
[0.46700475 0.05173814 0.48125712] (0.0 actual)
[0.94915722 0.02780131 0.02304146] (0.0 actual)
[7.96622853e-007 9.99999203e-001 2.52020969e-128] (1.0 actual)
[3.74598303e-04 1.52096724e-03 9.98104434e-01] (2.0 actual)
[0.90006141 0.05491175 0.04502684] (0.0 actual)

Trial      Training Accuracy    Test Accuracy
-------  -------------------  ---------------
0                   0.646429         0.672619
1                   0.684524         0.659524
2                   0.645536         0.638095
3                   0.681548     

In [None]:
print("Naive Bayes - Improved\n")

# Sweep GaussianNB's var_smoothing on the raw (un-normalized) features.
X, y = load_music_genre_classification(normalize=False)

headers = ["var_smoothing", "Avg Training Accuracy", "Avg Test Accuracy"]
table = []

NUM_TRIALS = 10

# Candidate values between 1e-5 and 1e-10, in the order they are reported.
var_smoothing_trials = [1e-5, 1e-6, 1e-7, 5e-7, 1e-8, 1.5e-8, 1e-9, 1e-10]
for var_smoothing in var_smoothing_trials:
  # Running totals over the seeded trials for this smoothing value.
  train_total, test_total = 0, 0
  for trial in range(NUM_TRIALS):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=trial)
    clf = GaussianNB(var_smoothing=var_smoothing)
    clf.fit(X_train, y_train)

    train_total += clf.score(X_train, y_train)
    test_total += clf.score(X_test, y_test)

  # One averaged row per smoothing value.
  table.append([var_smoothing, train_total / NUM_TRIALS, test_total / NUM_TRIALS])

print(tabulate(table, headers=headers))

Naive Bayes - Improved

  var_smoothing    Avg Training Accuracy    Avg Test Accuracy
---------------  -----------------------  -------------------
        1e-05                   0.553393             0.55119
        1e-06                   0.628036             0.626548
        1e-07                   0.717649             0.719048
        5e-07                   0.658036             0.65631
        1e-08                   0.745774             0.741071
        1.5e-08                 0.745476             0.743452
        1e-09                   0.724821             0.723214
        1e-10                   0.694464             0.693214
