In [88]:
# The goal of this notebook is to lay the foundation
# for several supervised machine learning classifiers
# (logistic regression, random forest, and a support vector machine).
from collections import Counter
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

import ast
import pandas as pd
import numpy as np



In [13]:
## This pipeline starts once we have a pandas dataframe containing song attributes and genre
## step 1: split the data into training and test sets
## step 2: train a classifier on the training data
## step 3: evaluate its performance on the test data

In [73]:
## Step 0: Filter Out Unneeded Cols
def condition_raw_data(raw_df):
    """
    Reduce a raw track dataframe to the musical attributes and genre columns.

    An 'artist_genres' column, when present, is exposed under the name
    'genre'. The input dataframe is NOT modified; a new dataframe is returned.

    inputs:
        raw_df: dataframe holding every column named in required_cols
            (with the genre stored as either 'artist_genres' or 'genre').
    returns:
        a new, independent dataframe containing only the required columns,
        in the order they are listed below.
    raises:
        KeyError (from pandas) if a required column is missing after the rename.
    """
    # features and y val cols.
    required_cols = ['danceability', 'energy', 'key', 'loudness', 'mode',
                     'speechiness', 'acousticness', 'instrumentalness',
                     'liveness', 'valence', 'tempo', 'time_signature',
                     'duration_ms', 'genre'
                     ]
    # rename artist genres to be called genre; rename() without inplace
    # returns a new frame, so the caller's dataframe keeps its columns.
    renamed_df = raw_df.rename(columns={'artist_genres': 'genre'})
    # remove unnecessary cols; .copy() hands back an independent frame so
    # later column assignments do not trigger chained-assignment warnings.
    return renamed_df[required_cols].copy()


In [66]:
## Step N: Create a function that will clean the genres.

def _split_genre_tokens(genres):
    """Flatten genre labels into their space-separated words, keeping order."""
    return [token for genre in genres for token in genre.split(' ')]


def calculate_popular_genre(track_df, max_num_genres = 10):
    """
    Collapse each track's collection of genre labels into one genre word.

    Every label is split on spaces (to help fold sub categories into their
    major genre) and the words are counted across all tracks. Each track is
    then assigned the word with the highest overall count among its own
    words; on a tie the word that appears first wins. Finally only tracks
    whose assigned genre is among the `max_num_genres` most frequent
    assigned genres are kept.

    inputs:
        track_df: dataframe with a 'genre' column whose values are sized
            collections of genre strings.
        max_num_genres: how many of the most frequent genres to keep
            (None keeps all of them).
    returns:
        a new dataframe with a single genre string per row. Rows with no
        genres are dropped. The input dataframe is not modified. If no row
        has a genre, an empty dataframe is returned.
    """
    # remove rows missing genre; copy so the assignment below works on an
    # independent frame rather than on a slice of the caller's data.
    has_genre = track_df['genre'].apply(len) > 0
    labelled_df = track_df.loc[has_genre].copy()
    if labelled_df.empty:
        # nothing to score; avoids indexing into an empty most_common() result
        return labelled_df

    # tokenise once and reuse the result for both counting and scoring
    tokens_per_track = [_split_genre_tokens(genres) for genres in labelled_df['genre']]
    token_counts = Counter(token for tokens in tokens_per_track for token in tokens)

    # max() returns the first maximal item, so ties resolve to the
    # earliest word in the track's own genre list.
    labelled_df['genre'] = [
        max(tokens, key=token_counts.__getitem__) for tokens in tokens_per_track
    ]

    # filter out low frequency genres
    included_genres = [
        genre for genre, _ in Counter(labelled_df['genre']).most_common(max_num_genres)
    ]
    return labelled_df[labelled_df['genre'].isin(included_genres)]



In [14]:
## Step 1: Break the data into a training and a test set
def generate_train_test(tracks_df, random_val=42, split_ratio=0.8):
    """
    Shuffle a dataframe and split it into a training and a test portion.

    inputs:
        tracks_df: a dataframe containing song attributes and genre.
        random_val: seed for the shuffle, for repeatability.
        split_ratio: decimal pct of samples to use for training.
    returns:
        a list of two dataframes [train_df, test_df]; the first holds the
        first int(split_ratio * len(tracks_df)) shuffled rows, the second
        holds the remainder.
    """
    # step 1 shuffle the df
    shuffled_df = tracks_df.sample(frac=1.0, random_state=random_val)
    # establish a number to split the frame at.
    num_train_samples = int(split_ratio * len(shuffled_df))
    # split the DF into two sets train and test. Positional slicing is used
    # instead of np.split, which relies on the deprecated DataFrame.swapaxes.
    return [shuffled_df.iloc[:num_train_samples], shuffled_df.iloc[num_train_samples:]]


In [15]:
# Step 2: train a classifier using the training set

def train_logistic_regression(train_df, random_val=42):
    """
    Fit a logistic regression genre classifier.

    inputs:
        train_df: training dataframe; the 'genre' column is the label and
            every other column is used as a feature.
        random_val: random state passed to the classifier for repeatability.

    returns:
        whatever the classifier's fit() call returns (the trained classifier).
    """
    # labels first, then everything that is not the label
    labels = train_df['genre']
    features = train_df.drop(columns='genre')

    # lbfgs solver with a generous iteration budget
    model = LogisticRegression(
        random_state=random_val,
        solver='lbfgs',
        multi_class='auto',
        max_iter=5500,
    )

    return model.fit(features, labels)

In [101]:
# Step 3: train a random forest classifier
def train_random_forest(train_df, random_val=42):
    """
    Fit a random forest genre classifier.

    inputs:
        train_df: training dataframe; the 'genre' column is the label and
            every other column is used as a feature.
        random_val: random state passed to the classifier for repeatability.

    returns:
        whatever the classifier's fit() call returns (the trained classifier).
    """
    # labels first, then everything that is not the label
    labels = train_df['genre']
    features = train_df.drop(columns='genre')

    # 500 trees, each limited to a depth of 50
    forest = RandomForestClassifier(
        n_estimators=500,
        max_depth=50,
        random_state=random_val,
    )

    return forest.fit(features, labels)

In [94]:
# implement a support vector machine to assist with the high dimensionality of the data

# Step 3 (continued): train an SVC classifier
def train_svm(train_df, random_val=42):
    """
    Fit a support vector classifier on standardized features.

    inputs:
        train_df: training dataframe; the 'genre' column is the label and
            every other column is used as a feature.
        random_val: random state passed to the SVC for repeatability.

    returns:
        the trained pipeline (StandardScaler followed by SVC), as returned
        by its fit() call.
    """
    # Step 1 create X and y
    X = train_df[[col for col in train_df.columns if col != 'genre']]
    y = train_df['genre']

    # Step 2 create the classifier; features are scaled before the SVC
    clf = make_pipeline(StandardScaler(), SVC(random_state=random_val))

    # fit the classifier exactly once -- a second fit only repeats the
    # (expensive) training and yields the same model.
    return clf.fit(X, y)

In [17]:
# Step 4: test the classifiers

def test_regression_model(clf, test_df):
    """
    inputs:
        testing set
        clf
    returns:
        f1 score
    """
    # step 1 create an X_test and y_test
    X_test = test_df[[col for col in test_df.columns if col != 'genre']]
    y_test = test_df['genre']

    # step 2 predict the genre for the test set
    y_pred = clf.predict(X_test)

    # step 3 calculate the average F1 score for all classes
    return f1_score(y_test, y_pred, average='macro')

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=659c715d-e2b5-478e-9116-4d32a5174810' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>

In [18]:
# Create a dummy classifier for comparison

def create_dummy(train_df, random_val=42):
    """
    Build the baseline used for performance evaluation: a dummy classifier
    with the 'uniform' strategy, fitted on the training data.

    inputs:
        train_df: training dataframe; the 'genre' column is the label and
            every other column is used as a feature.
        random_val: random state passed to the classifier for repeatability.
    returns:
        the fitted dummy classifier
    """
    labels = train_df['genre']
    features = train_df.drop(columns='genre')

    baseline_clf = DummyClassifier(random_state=random_val, strategy='uniform')
    baseline_clf.fit(features, labels)
    return baseline_clf

In [95]:
# Development cell: run every pipeline function end to end on the dev sample.
raw_df = pd.read_csv('dev_data_2.csv')
train, test = generate_train_test(raw_df, random_val=42, split_ratio=0.8)

# Fit each model on the training split.
lr_clf = train_logistic_regression(train, random_val=42)
rf_clf = train_random_forest(train)
svc_clf = train_svm(train)
dum_clf = create_dummy(train)

# Score the models on the held-out split, in the same order they were fitted.
lr_score, rf_score, svc_score, dum_score = (
    test_regression_model(fitted_clf, test)
    for fitted_clf in (lr_clf, rf_clf, svc_clf, dum_clf)
)

# Report one macro F1 score per line.
print(
    f'random forest: {rf_score}',
    f'logistic regression: {lr_score}',
    f'Support Vector Classifier: {svc_score}',
    f'dummy classifier: {dum_score}',
    sep='\n',
)

random forest: 0.6093159323510513
logistic regression: 0.0962496823110987
Support Vector Classifier: 0.5515528034650814
dummy classifier: 0.06997919095663456


In [102]:
# Development cell: evaluate performance with the raw api data.
# 'artist_genres' is parsed from its string form into a Python literal on load.
raw_df = pd.read_csv(
    '../raw_spotify_data/raw_spotify_data.csv',
    converters={"artist_genres": ast.literal_eval},
)
raw_df = condition_raw_data(raw_df)
clean_df = calculate_popular_genre(raw_df, max_num_genres=5)
train, test = generate_train_test(clean_df, random_val=42, split_ratio=0.8)

# Fit each model on the training split.
lr_clf = train_logistic_regression(train, random_val=42)
rf_clf = train_random_forest(train)
svc_clf = train_svm(train)
dum_clf = create_dummy(train)

# Score the models on the held-out split, in the same order they were fitted.
lr_score, rf_score, svc_score, dum_score = (
    test_regression_model(fitted_clf, test)
    for fitted_clf in (lr_clf, rf_clf, svc_clf, dum_clf)
)

# Report one macro F1 score per line.
print(
    f'random forest: {rf_score}',
    f'logistic regression: {lr_score}',
    f'Support Vector Classifier: {svc_score}',
    f'dummy classifier: {dum_score}',
    sep='\n',
)

random forest: 0.5941967941967942
logistic regression: 0.14463276836158193
Support Vector Classifier: 0.5400701895013771
dummy classifier: 0.13917133026572154


In [135]:
# Supervised training and evaluation using 7_unique_genre_attributes.csv.
# The genre column is used as loaded; only the column filtering step is applied.
raw_df = pd.read_csv('../raw_spotify_data/7_unique_genre_attributes.csv')
clean_df = condition_raw_data(raw_df)
train, test = generate_train_test(clean_df, random_val=42, split_ratio=0.8)

# Fit each model on the training split.
lr_clf = train_logistic_regression(train, random_val=42)
rf_clf = train_random_forest(train)
svc_clf = train_svm(train)
dum_clf = create_dummy(train)

# Score the models on the held-out split, in the same order they were fitted.
lr_score, rf_score, svc_score, dum_score = (
    test_regression_model(fitted_clf, test)
    for fitted_clf in (lr_clf, rf_clf, svc_clf, dum_clf)
)

# Report one macro F1 score per line.
print(
    f'random forest: {rf_score}',
    f'logistic regression: {lr_score}',
    f'Support Vector Classifier: {svc_score}',
    f'dummy classifier: {dum_score}',
    sep='\n',
)

random forest: 0.7521442906638766
logistic regression: 0.20933813317701042
Support Vector Classifier: 0.6829017916533447
dummy classifier: 0.11962428117585011


In [108]:
# Load the three datasets side by side so their sizes can be compared.
dev_df = pd.read_csv('dev_data_2.csv')

# v1: raw api data, filtered to the required columns and reduced to the
# 5 most frequent genres.
v1_df = calculate_popular_genre(
    condition_raw_data(
        pd.read_csv(
            '../raw_spotify_data/raw_spotify_data.csv',
            converters={"artist_genres": ast.literal_eval},
        )
    ),
    max_num_genres=5,
)

# v2: track/artist attributes, filtered to the required columns only.
v2_df = condition_raw_data(
    pd.read_csv('../raw_spotify_data/track_artist_attributes.csv')
)

In [109]:
# (rows, columns) of the dev, v1 and v2 datasets, in that order
tuple(frame.shape for frame in (dev_df, v1_df, v2_df))

((1500, 14), (564, 14), (8947, 14))