In [376]:
import ast
import json
import pickle

import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.metrics import classification_report, confusion_matrix

In [148]:
def load_model(model_path, features_path, scaler_path=None, pca_path=None):
    """
    Load a pickled classifier together with its companion artifacts.

    model_path: path to the pickled, pre-fitted model
    features_path: path to the pickled feature order used by the model
    scaler_path: optional path to a pickled scaler; falsy means "no scaler"
    pca_path: optional path to a pickled PCA transform; falsy means "no PCA"

    Returns a dict with the keys "model", "features", "scaler" and "pca";
    the last two are None when their path was not given.

    NOTE: pickle.load can execute arbitrary code. Only point this at
    pickle files you produced yourself or otherwise trust.
    """
    def _load_pickle(path):
        # The context manager closes the handle deterministically; the
        # previous bare open() calls left every file open.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    return {
        "model":    _load_pickle(model_path),
        "features": _load_pickle(features_path),
        "scaler":   _load_pickle(scaler_path) if scaler_path else None,
        "pca":      _load_pickle(pca_path) if pca_path else None
    }

# Main multi-genre classifier.
random_forest_classifier = load_model(
    model_path="./models/RF_model_final.pickle",
    features_path="./models/RF_features_order.pickle",
    scaler_path="./models/RF_scaler.pickle",
)

# The two SVM models below share one feature order and one scaler; only the
# fitted model file differs.
svm_fp_jazz_classifier = load_model(
    model_path="./models/SVM_fp_jazz_model_final.pickle",
    features_path="./models/SVM_features_order.pickle",
    scaler_path="./models/SVM_scaler.pickle",
)

svm_fn_jazz_classifier = load_model(
    model_path="./models/SVM_fn_jazz_model_final.pickle",
    features_path="./models/SVM_features_order.pickle",
    scaler_path="./models/SVM_scaler.pickle",
)

In [330]:
def predict(
    classifier,
    batch,
    cleaned_data=False,
    scaled=False,
    pca=False
):
    """
    Run a loaded classifier over a batch of rows.

    classifier: dict with the keys "model", "features", "scaler" and "pca"
                (pre-fitted model, features order and optional
                transformations)
    batch: dataframe with the raw rows to predict, or -- when
           cleaned_data is True -- an already cleaned feature matrix
           that is handed to the transformations/model as is
    cleaned_data: True when `batch` has already been through
                  clean_new_data and must not be cleaned again
    scaled: True when `batch` is already scaled (skips the scaler)
    pca: True when `batch` is already PCA-projected (skips the PCA)

    Returns a 4-tuple: (predictions, class probabilities,
    class log-probabilities, the feature matrix given to the model).

    Raises TypeError when the batch still has to be cleaned but is not a
    dataframe.
    """
    if cleaned_data:
        features_matrix = batch
    else:
        # The previous `assert type(batch), pd.DataFrame` could never fail
        # (the second operand is just the assertion message). The check is
        # only meaningful on this path: pre-cleaned batches may be arrays.
        if not isinstance(batch, pd.DataFrame):
            raise TypeError(
                f"batch must be a pandas DataFrame when cleaned_data is False, "
                f"got {type(batch).__name__}"
            )
        features_matrix = clean_new_data(batch, features=classifier["features"])

    # Compare against None instead of relying on truthiness, which for
    # objects defining __len__/__bool__ is not the same as "present".
    scaler = classifier["scaler"]
    if not scaled and scaler is not None:
        features_matrix = scaler.transform(features_matrix)

    reducer = classifier["pca"]
    if not pca and reducer is not None:
        features_matrix = reducer.transform(features_matrix)

    model = classifier["model"]
    return (
        model.predict(features_matrix),
        model.predict_proba(features_matrix),
        model.predict_log_proba(features_matrix),
        features_matrix,
    )

In [331]:
# Sample one from new songs
jazz = pd.read_csv("jazz.csv")

# Fixed seed: without it every run draws different rows, so the reports
# computed from this sample cannot be reproduced after a kernel restart.
jazz_sample = jazz.sample(200, random_state=42)

In [332]:
# `results` is the 4-tuple returned by predict(): (predicted labels,
# class probabilities, class log-probabilities, feature matrix fed to the
# model) for the sampled rows, using the Random Forest classifier.
results = predict(random_forest_classifier, jazz_sample)

  return np.log(proba)


In [374]:
# Column of the Random Forest probability matrix that belongs to "Jazz".
# Looked up from the fitted model instead of being hard-coded, so it stays
# correct if the model is retrained with a different set or order of genres.
jazz_index = list(random_forest_classifier["model"].classes_).index("Jazz")

#False Negatives Parameters
# A non-Jazz prediction is re-examined when its own probability is at most
# `others_max_proba_false_negative` and the Jazz probability is at least
# `jazz_min_proba_false_negative`.
others_max_proba_false_negative = 0.6 # Can be changed
jazz_min_proba_false_negative = 0.2 # Can be changed

#False Positives Parameters
# A Jazz prediction is re-examined when the Jazz probability is at most
# `jazz_max_proba_false_positive` and the top probability is at least
# `others_min_proba_false_positive`.
jazz_max_proba_false_positive = 0.6 # Can be changed
others_min_proba_false_positive = 0.2 # Can be changed

# Truthy value makes log() print the decision trace.
verbose = 0

# Rows cleaned with the SVM feature order, as a NumPy matrix, so each row can
# be handed to the SVM classifiers one at a time.
jazz_cleaned = clean_new_data(jazz_sample, features=svm_fn_jazz_classifier["features"]).to_numpy()

# Running row counter, only used in the log output.
i = 0

corrected_predictions = [] # Fill it with the new (if any change) genre.

# Second-opinion pass: every Random Forest prediction that looks like a
# possible Jazz false negative / false positive is re-checked with the two
# SVM classifiers, and the (possibly corrected) genre is collected in
# `corrected_predictions`.
#
# NOTE(review): the SVM prediction (`svm_*_pred[0]`, a one-element result) is
# used both as a boolean (`"Jazz" if svm_*_pred[0] else None`) and compared
# with the string "Jazz". Those two uses imply different label encodings
# (0/1 vs. strings) -- confirm which one the SVM models were trained with.
# Index 1 of the SVM probabilities is treated as the Jazz probability.
for predicted_genre, predicted_probas, predicted_log_probas, sample in zip(results[0], results[1], results[2], jazz_cleaned):
    predicted_genre_proba, jazz_proba = (max(predicted_probas), predicted_probas[jazz_index])
    new_pred = None
    # possible false negatives
    if predicted_genre != "Jazz" and \
       predicted_genre_proba <= others_max_proba_false_negative and \
       jazz_proba >= jazz_min_proba_false_negative:
        jazz_votes = 0

        log(("Possible false negative"), verbose)
        log((i, predicted_genre, predicted_genre_proba, jazz_proba), verbose)
        log(("-" * 80), verbose)

        svm_fn_pred = predict(svm_fn_jazz_classifier, sample.reshape(1, -1), cleaned_data=True, scaled=False)
        svm_fn_prob = dict(zip(svm_fn_jazz_classifier["model"].classes_, svm_fn_pred[1][0]))
        # High Certainty of FN model
        if max(svm_fn_pred[1][0]) > 0.9:
            new_pred = "Jazz" if svm_fn_pred[0] else None
            log(("By FN Certainty:: fn::", "Jazz - " if svm_fn_pred[0] else "No Jazz - ", f"Probas: {svm_fn_prob}"), verbose)
        else:
            # Cast a vote in favour of Jazz if prob and pred are contradictory
            if svm_fn_pred[1][0][1] > 0.5 and svm_fn_pred[0] != "Jazz":
                jazz_votes += 1

            svm_fp_pred = predict(svm_fp_jazz_classifier, sample.reshape(1, -1), cleaned_data=True, scaled=False)
            svm_fp_prob = dict(zip(svm_fp_jazz_classifier["model"].classes_, svm_fp_pred[1][0]))
            # High Certainty of FP model
            if max(svm_fp_pred[1][0]) > 0.9:
                new_pred = "Jazz" if svm_fp_pred[0] else None
                log(("By FP Certainty:: fp::Jazz - " if svm_fp_pred[0] else "fp::No Jazz - ", f"Probas: {svm_fp_prob}"), verbose)
            # Both votes more % to be jazz
            elif svm_fp_pred[1][0][1] >= 0.5 and jazz_votes:
                new_pred = "Jazz"
                log(("By votes :: fp::Jazz - ", f"Probas: {svm_fp_prob}"), verbose)
            # Combined probas are more likely to be Jazz
            elif np.mean([svm_fp_pred[1][0][1], svm_fn_pred[1][0][1]]) > 0.5:
                new_pred = "Jazz"
                log(("By Mean :: fp:: Jazz - ", f"Probas: {svm_fp_prob}"), verbose)
            # Not enough certainty to revoke False Negativeness
            else:
                log(("By Lack Info :: fp:: No Jazz - ", f"Probas: {svm_fp_prob}"), verbose)

    # possible false positives
    # Mirror of the branch above with the roles of the two SVMs swapped
    # (FP model first, FN model second). Still not DRY.
    #
    # NOTE(review): `new_pred` can only become "Jazz" or stay None here, and
    # `predicted_genre` is already "Jazz", so this branch never changes the
    # final label. Rejecting a false positive would need an alternative genre
    # (e.g. the runner-up class) -- confirm the intended behaviour.
    elif predicted_genre == "Jazz" and \
         jazz_proba <= jazz_max_proba_false_positive and \
         predicted_genre_proba >= others_min_proba_false_positive:

        log("Possible false positive", verbose)
        log((i, predicted_genre, np.partition(predicted_probas, -2)[-2], jazz_proba), verbose)
        log(("-" * 80), verbose)

        svm_fp_pred = predict(svm_fp_jazz_classifier, sample.reshape(1, -1), cleaned_data=True, scaled=False)
        svm_fp_prob = dict(zip(svm_fp_jazz_classifier["model"].classes_, svm_fp_pred[1][0]))

        jazz_votes = 0
        # High Certainty of FP model
        if max(svm_fp_pred[1][0]) > 0.9:
            new_pred = "Jazz" if svm_fp_pred[0] else None
            log(("By FP Certainty:: fp::", "Jazz - " if svm_fp_pred[0] else "No Jazz - ", f"Probas: {svm_fp_prob}"), verbose)
        else:
            # Cast a vote in favour of Jazz if prob and pred are contradictory.
            # Uses the FP model computed just above; the previous code read
            # `svm_fn_pred` here before it was computed for this sample
            # (stale value from an earlier row, or NameError on first use).
            if svm_fp_pred[1][0][1] > 0.5 and svm_fp_pred[0] != "Jazz":
                jazz_votes += 1

            svm_fn_pred = predict(svm_fn_jazz_classifier, sample.reshape(1, -1), cleaned_data=True, scaled=False)
            svm_fn_prob = dict(zip(svm_fn_jazz_classifier["model"].classes_, svm_fn_pred[1][0]))

            # High Certainty of FN model
            if max(svm_fn_pred[1][0]) > 0.9:
                new_pred = "Jazz" if svm_fn_pred[0] else None
                log(("By FN Certainty:: fn::Jazz - " if svm_fn_pred[0] else "fn::No Jazz - ", f"Probas: {svm_fn_prob}"), verbose)
            # Both votes more % to be jazz
            elif svm_fn_pred[1][0][1] >= 0.5 and jazz_votes:
                new_pred = "Jazz"
                log(("By votes :: fn::Jazz - ", f"Probas: {svm_fn_prob}"), verbose)
            # Combined probas are more likely to be Jazz
            # (mean of BOTH models; it used to average the FN value with itself)
            elif np.mean([svm_fn_pred[1][0][1], svm_fp_pred[1][0][1]]) > 0.5:
                new_pred = "Jazz"
                log(("By Mean :: fn:: Jazz - ", f"Probas: {svm_fn_prob}"), verbose)
            # Not enough certainty to revoke False Positiveness
            else:
                log(("By Lack Info :: fn:: No Jazz - ", f"Probas: {svm_fn_prob}"), verbose)


    log(("=" * 80), verbose)
    i = i + 1
    # Keep the Random Forest genre unless the second opinion produced one.
    corrected_predictions.append(new_pred if new_pred else predicted_genre)
 



In [381]:

#len(list(results[0])) - list(corrected_predictions).count("Jazz") 

# Every sampled row was read from jazz.csv, so the expected label is "Jazz"
# for all of them.
y_test = ["Jazz"] * len(results[0])

# First report: raw Random Forest predictions; second: after the correction pass.
for predictions in (results[0], corrected_predictions):
    print(classification_report(y_test, predictions))

#confusion_matrix

              precision    recall  f1-score   support

       Blues       0.00      0.00      0.00         0
        Jazz       1.00      0.97      0.98       200

    accuracy                           0.97       200
   macro avg       0.50      0.48      0.49       200
weighted avg       1.00      0.97      0.98       200

              precision    recall  f1-score   support

       Blues       0.00      0.00      0.00         0
        Jazz       1.00      0.98      0.99       200

    accuracy                           0.98       200
   macro avg       0.50      0.49      0.50       200
weighted avg       1.00      0.98      0.99       200



In [368]:
def clean_new_data(data, genre=None, features=[]):
    """
    Expand the JSON-encoded `librosa_features` column into one column per
    feature and return only the requested columns.

    data: dataframe with a `librosa_features` column (one doubly
          JSON-encoded string per row, parsed by clean_new_data_row) and a
          `duration_ms` column
    genre: optional label; when truthy it is written to a `genre_top`
           column on every row
    features: column names to select from the cleaned frame (only read,
              never modified)

    Returns the cleaned dataframe restricted to `features`.
    """
    librosa_info = {}

    # Iterate over the values directly: the row label was never used, and
    # Series.iteritems() was removed in pandas 2.0.
    for librosa_features in data.librosa_features:
        # clean_new_data_row appends this row's values to the accumulator and
        # returns it, so rebinding replaces the former redundant update().
        librosa_info = clean_new_data_row(librosa_features, librosa_info)

    librosa_info_df = pd.DataFrame(librosa_info)

    # Both frames are re-indexed from 0 so the column-wise concat lines rows
    # up by position (each reset_index() also adds an "index" column).
    cleaned_data = pd.concat([data.reset_index(), librosa_info_df.reset_index()], axis=1)
    cleaned_data['duration'] = data.reset_index().duration_ms
    if genre:
        cleaned_data['genre_top'] = genre

    return cleaned_data[features]

def clean_new_data_row(librosa_features, librosa_info=None):
    """
    Parse one row of doubly JSON-encoded librosa features and append its
    values to a column-oriented accumulator.

    librosa_features: JSON string whose decoded value is itself a JSON
                      object mapping "(name, statistic, number)" tuple
                      reprs to values
    librosa_info: dict of column name -> list of values to append to; it is
                  modified in place. When omitted, a fresh dict is created
                  for this call.

    Features whose key contains "ske" or "spectral_centroid" are skipped.
    Column names are "<name>_<first 3 chars of statistic>_<number without
    leading zeros>".

    Returns the accumulator dict.
    """
    # A `{}` default would be created once and shared by every call that
    # omits the argument, leaking values from one call into the next.
    if librosa_info is None:
        librosa_info = {}

    for feature, value in json.loads(json.loads(librosa_features)).items():
        if "ske" in feature or "spectral_centroid" in feature:
            continue
        # The key is the string repr of a tuple. literal_eval parses Python
        # literals only, so -- unlike eval -- it cannot run code that might
        # be embedded in the data file.
        feature = ast.literal_eval(feature)
        feature_name = f'{feature[0]}_{feature[1][0:3]}_{feature[2].lstrip("0")}'
        librosa_info.setdefault(feature_name, []).append(value)
    return librosa_info

def log(message, verbose=0):
    """Print `message` when `verbose` is truthy; otherwise do nothing."""
    if not verbose:
        return
    print(message)