In [10]:
# The goal of this notebook is to lay the foundation
# for two types of supervised machine learning classifiers.
from collections import Counter
from scipy.sparse import csr_matrix, hstack
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

import ast
import pandas as pd
import numpy as np

Create Helper functions for Supervised Machine Learning

In [3]:
## This pipeline starts from the point we have a pandas dataframe containing song attributes and genre
## step 1: break the data into training and test data
## step 2: generate a classifier using the training data
## step 3: test the performance using the test data

In [1]:
## Step 0: Filter Out Unneeded Cols
def condition_raw_data(raw_df):
    """
    Consume a dataframe

    Return a DF with the musical attributes and genre columns only
    """
    # rename artist genres to be called genre
    raw_df.rename(columns={'artist_genres': 'genre'}, inplace=True)
    # features and y val cols.
    # this is the original feature set before optimizing.
    required_cols = ['danceability', 'energy', 'key', 'loudness', 'mode', 
                'speechiness', 'acousticness', 'instrumentalness', 
                'liveness', 'valence', 'tempo', 'time_signature',
                'duration_ms', 'genre'
       ]
    # update key features based on feature importance
    required_cols = ['danceability', 'energy', 'key', 'loudness', 'mode', 
                'speechiness', 'acousticness', 'instrumentalness', 
                'liveness', 'valence', 'tempo',
                'duration_ms', 'genre'
       ]
    # remove unneccessary cols
    raw_df = raw_df[required_cols]
    return raw_df


In [6]:
## Step 1: Break the data into a training and a test set
def generate_train_test(tracks_df, random_val=42, split_ratio=0.8):
    """
    inputs:
        a dataframe containing song attributes and genre.
        random val for repeatability
        split_ratio = decimal pct of samples to use for training.
    returns:
        two dataframes train_df and test_df
    """
    # step 1 shuffle the df
    temp_df = tracks_df.sample(random_state=random_val, frac=1.0)
    temp_df.reset_index(inplace=True, drop=True)
    # establish a number to split the frame at.
    num_train_samples = int(split_ratio*len(tracks_df))
    # split the DF into two sets train and test
    return np.split(temp_df, [num_train_samples])


In [7]:
# Step 2: train a classifier using the training set
def train_logistic_regression(train_df, random_val=42, data_type='spotify'):
    """
    inputs:
        A dataframe of training data
        A random value for repeatability

    returns:
        a trained classifier.
    """
    X = train_df[[col for col in train_df.columns if col != 'genre']]
    y = train_df['genre']
    # Step 2 create the classifier
    clf = LogisticRegression(random_state=random_val, solver = 'newton-cg', multi_class='multinomial', max_iter=1000)
    # fit the classifier
    return clf.fit(X, y)

In [331]:
# Step 3: train a random forest classifier
def train_random_forest(train_df, random_val=42):
    """
    inputs:
        A dataframe of training data
        A random value for repeatability

    returns:
        a trained classifier.
    """
    # Step 1 create X and y
    X = train_df[[col for col in train_df.columns if col != 'genre']]
    y = train_df['genre']
    # Step 2 create the classifier
    clf = RandomForestClassifier(random_state=random_val, n_estimators=1000, max_features='sqrt', criterion='entropy', max_depth=30)
    # fit the classifier
    return clf.fit(X, y)

In [11]:
# Step 4: test the classifiers
def test_clf_model(clf, test_df):
    """
    inputs:
        testing set
        clf
    returns:
        f1 score
    """
    # step 1 create an X_test and y_test
    X_test = test_df[[col for col in test_df.columns if col != 'genre']]
    y_test = test_df['genre']
    # step 2 predict the genre for the test set
    y_pred = clf.predict(X_test)
    # step 3 calculate the average F1 score for all classes
    return f1_score(y_test, y_pred, average='macro')

In [12]:
# Create a dummy classifier for comparison
def create_dummy(train_df, random_val=42, data_type='spotify'):
    """
    Train a uniform dummy classifier for performance evaluation
    """
    X = train_df[[col for col in train_df.columns if col != 'genre']]
    y = train_df['genre']
    random_clf = DummyClassifier(strategy='uniform', random_state=random_val)
    random_clf.fit(X, y)
    return random_clf

Implement Model Functions and Evaluate Performance

In [332]:
# supervised training using draw_spotify_data_v3
# development cell for evaluating performance with raw api data.
raw_df = pd.read_csv('../raw_spotify_data/pure_genre_data.csv')
clean_df = condition_raw_data(raw_df)
train, test = generate_train_test(clean_df, 42, 0.8)
lr_clf = train_logistic_regression(train, 42)
rf_clf = train_random_forest(train)
dum_clf = create_dummy(train)
lr_score = test_clf_model(lr_clf, test)
rf_score = test_clf_model(rf_clf, test)
dum_score = test_clf_model(dum_clf, test)
print(f'random forest: {rf_score}')
print(f'logistic regression: {lr_score}')
print(f'dummy classifier: {dum_score}')

random forest: 0.731493576362299
logistic regression: 0.661349113003719
Support Vector Classifier: 0.6741944436861882
dummy classifier: 0.1348860045008973


Grid Search for Random Forest Hyper Parameter Tuning

In [315]:
# start grid search for model tuning
temp_clf = RandomForestClassifier(random_state=42, n_jobs=-1)
param_grid = {
    'max_depth' : [30, 50],
    #'min_samples_leaf' : [1],
    'n_estimators' : [1000, 1500, 2000],
    'criterion' : ['gini', 'entropy'],
    #'min_samples_split' : [2, 4, 10],
    #'min_weight_fraction_leaf' : [0.0, 0.05],
    'max_features' : [2, 'sqrt', 4, 5],
    #'max_samples' : [0.9999]
}
gSearch = GridSearchCV(temp_clf, param_grid=param_grid, scoring='accuracy', n_jobs=-1, verbose=1, cv=2)

In [316]:
raw_df = pd.read_csv('../raw_spotify_data/pure_genre_data.csv')
clean_df = condition_raw_data(raw_df)
train, test = generate_train_test(clean_df, 42, 0.8)
X_train = train[[col for col in train.columns if col != 'genre']]
y_train = train['genre']
gSearch.fit(X_train, y_train)

Fitting 2 folds for each of 36 candidates, totalling 72 fits


GridSearchCV(cv=2, estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
             n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [30, 50], 'max_features': ['sqrt', 4, 5],
                         'n_estimators': [1000, 1500, 2000]},
             scoring='accuracy', verbose=1)

Grid Search Best Model and Feature Importance for Track Attributes

In [333]:
# find feature importance
gSearch.best_score_
my_feature_importance = gSearch.best_estimator_.feature_importances_
features = [col for col in train.columns if col != 'genre']
# print out feature importances for feature optimization
sorted(list(zip(features, my_feature_importance)), key = lambda x: -x[1])
#gSearch.best_estimator_
#gSearch.best_score_

0.71375

Failure Analysis - Track Attributes Random Forest 
Failure analysis is conducted by calculating the mean track for each genre.
The euclidean distance is used to show that tracks that are incorrectly classified
are on average further away from the mean track than correctly classified tracks

In [363]:
# create mean song for each genre
raw_df = pd.read_csv('../raw_spotify_data/pure_genre_data.csv')
clean_df = condition_raw_data(raw_df)
train, test = generate_train_test(clean_df, 42, 0.8)
# step 1 group by genre
mean_track_df = train.groupby('genre').mean().reset_index()

# start failure analysis for random forest
raw_df = pd.read_csv('../raw_spotify_data/pure_genre_data.csv')
clean_df = condition_raw_data(raw_df)
train, test = generate_train_test(clean_df, 42, 0.8)
# train models
rf_clf = train_random_forest(train)
X_test = test[[col for col in test.columns if col != 'genre']]
y_test = test['genre']
# predict the genres using the X_test feature values.
y_pred_rf = rf_clf.predict(X_test)

# create a failure df to store the predicted genre, true genre
failure_df = pd.DataFrame()
failure_df['true_y'] = y_test
failure_df['pred_y'] = y_pred_rf
# pull in feature values for distance calculation.
failure_df[[col for col in test.columns if col != 'genre']] = X_test

In [403]:
genres_to_collect = ('alt-rock','classical', 'country',
                       'edm', 'heavy-metal',  'hip-hop',
                       'latin')
required_cols = ['danceability', 'energy', 'key', 'loudness', 'mode', 
                'speechiness', 'acousticness', 'instrumentalness', 
                'liveness', 'valence', 'tempo',
                'duration_ms'
            ]
# assign a results column to show if the model predicted the genre correctly
failure_df['results'] = (failure_df['pred_y'] == failure_df['true_y'])
for genre in genres_to_collect:
    # filter by unique genre
    tempDF = failure_df[failure_df['true_y']==genre]
    tempMean = mean_track_df[mean_track_df['genre']==genre]
    # calcualte genre between mean track and every track with that genre
    temp_sim = euclidean_distances(tempMean[required_cols], tempDF[required_cols])
    sim_results = list(zip(temp_sim.flatten(), tempDF['results']))
    # filter by correctly and incorrectly classifications
    failures = [dis for (dis, status) in sim_results if status == False]
    passes = [dis for (dis, status) in sim_results if status == True]
    # evaluate if the mean distance is greater for failures or for passes
    print([(np.mean(failures) > np.mean(passes)), genre])
    print(len(failures), len(passes))
# print the length of number of passes and fails in the entire dataset
len(failure_df[failure_df['results']==True]), len(failure_df[failure_df['results']==False])      


[True, 'alt-rock']
94 92
[False, 'classical']
0 185
[True, 'country']
55 152
[True, 'edm']
75 133
[True, 'heavy-metal']
33 170
[False, 'hip-hop']
61 149
[True, 'latin']
58 143


(1024, 376)