In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from tqdm.notebook import tqdm
import time
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Execute the preprocessing notebook inside this kernel's namespace.
# NOTE(review): later cells use pd, ipd, tracks, features_all, feature_sets and
# pre_process without defining them in this notebook, so they are presumably
# provided by this run — confirm against pre_proc.ipynb.
%run pre_proc.ipynb

Not enough Echonest features: (13129, 767)
19922 training examples, 2505 validation examples, 2573 testing examples
Top genres (16): ['Blues', 'Classical', 'Country', 'Easy Listening', 'Electronic', 'Experimental', 'Folk', 'Hip-Hop', 'Instrumental', 'International', 'Jazz', 'Old-Time / Historic', 'Pop', 'Rock', 'Soul-RnB', 'Spoken']
All genres (151): [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 25, 26, 27, 30, 31, 32, 33, 36, 37, 38, 41, 42, 43, 45, 46, 47, 49, 53, 58, 63, 64, 65, 66, 70, 71, 74, 76, 77, 79, 81, 83, 85, 86, 88, 89, 90, 92, 94, 97, 98, 100, 101, 102, 103, 107, 109, 111, 113, 117, 118, 125, 130, 137, 138, 166, 167, 169, 171, 172, 174, 177, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 214, 224, 232, 236, 240, 247, 250, 267, 286, 296, 297, 311, 314, 322, 337, 359, 360, 361, 362, 374, 378, 400, 401, 404, 428, 439, 440, 441, 442, 443, 456, 468, 491, 495, 502, 504, 514, 524, 538, 539, 542, 580, 602, 619, 651, 659, 695, 741, 763, 80

In [9]:
# Shared seed for every estimator whose fitting involves randomness, so that
# repeated runs of the benchmark are not perturbed by estimator-internal RNG.
RANDOM_STATE = 42

# Candidate classifiers, keyed by the short name that run_models uses as the
# column label in the score table.
models = {
    'LR': LogisticRegression(solver='saga', random_state=RANDOM_STATE),
    'SVClinear': LinearSVC(random_state=RANDOM_STATE),
    'SVCrbf': SVC(kernel='rbf'),
    'kNN': KNeighborsClassifier(n_neighbors=200),
    'MLP': MLPClassifier(random_state=RANDOM_STATE, max_iter=200),
    'DT': DecisionTreeClassifier(max_depth=5, random_state=RANDOM_STATE),
    'NB': GaussianNB(),
    'GBC': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'LDA': LinearDiscriminantAnalysis(),
    'QDA': QuadraticDiscriminantAnalysis(),
}

In [5]:
def run_models(models, features):
    """Fit every model on every feature set and collect the test-split scores.

    Parameters
    ----------
    models : dict
        Maps a short model name to an estimator exposing ``fit`` and ``score``.
        Each estimator is re-fitted in place for every feature set.
    features : dict
        Maps a feature-set name to the selection passed on to ``pre_process``.

    Returns
    -------
    pandas.DataFrame
        One row per feature set. The first column, ``'dim'``, holds the number
        of columns of the training matrix; the remaining columns hold
        ``clf.score(X_test, y_test)`` for each model, in ``models`` order.
    """
    # list.insert() mutates in place and returns None, so build the column
    # list by concatenation instead.
    columns = ['dim'] + list(models)
    scores = pd.DataFrame(columns=columns, index=list(features))
    # Iterate over the argument rather than the notebook-global feature_sets.
    for fset_name, fset in tqdm(features.items(), desc='features'):
        # NOTE(review): tracks and features_all are notebook globals,
        # presumably defined by pre_proc.ipynb — confirm. The validation split
        # is returned but not used here.
        y_train, y_val, y_test, X_train, X_val, X_test = pre_process(tracks, features_all, fset, False)
        scores.loc[fset_name, 'dim'] = X_train.shape[1]
        for clf_name, clf in models.items():
            clf.fit(X_train, y_train)
            scores.loc[fset_name, clf_name] = clf.score(X_test, y_test)
    return scores

def format_scores(scores):
    def highlight(s):
        is_max = s == max(s[1:])
        return ['background-color: yellow' if v else '' for v in is_max]
    scores = scores.style.apply(highlight, axis=1)
    return scores.format('{:.2%}', subset=pd.IndexSlice[:, scores.columns[1]:])

In [8]:
# Fit and score every model on every feature set, then render the styled table.
scores = run_models(models, feature_sets)

ipd.display(format_scores(scores))

features:   0%|          | 0/18 [00:00<?, ?it/s]

Unnamed: 0,dim,LDA,QDA
chroma_cens,84.0,38.24%,24.64%
chroma_cqt,84.0,39.76%,3.58%
chroma_stft,84.0,43.53%,5.64%
mfcc,140.0,57.68%,48.39%
rmse,7.0,36.57%,15.04%
spectral_bandwidth,7.0,39.84%,34.16%
spectral_centroid,7.0,43.02%,36.11%
spectral_contrast,49.0,48.93%,41.78%
spectral_rolloff,7.0,41.51%,28.53%
tonnetz,42.0,38.98%,23.05%
