In [None]:
# The goal of this notebook is to lay the foundation
# for two types of supervised machine learning classifiers.
from collections import Counter
from scipy.sparse import csr_matrix, hstack
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import warnings
warnings.filterwarnings("ignore")

import ast
import pandas as pd
import numpy as np



In [None]:
## This pipeline starts from the point where we have a pandas dataframe containing song attributes and genre.
## step 1: break the data into training and test sets
## step 2: train a classifier using the training data
## step 3: evaluate its performance using the test data

In [None]:
## Step 0: Filter Out Unneeded Cols
def condition_raw_data(raw_df):
    """
    Keep only the musical attribute columns and the genre label.

    inputs:
        raw_df: dataframe containing every column listed in ``required_cols``
            below. An 'artist_genres' column, if present, is renamed to
            'genre'.
    returns:
        a new dataframe holding the attribute columns followed by 'genre'.
        The caller's dataframe is not modified.
    raises:
        KeyError if a required column is missing after the rename.
    """
    # features and y val cols.
    required_cols = ['danceability', 'energy', 'key', 'loudness', 'mode',
                     'speechiness', 'acousticness', 'instrumentalness',
                     'liveness', 'valence', 'tempo', 'time_signature',
                     'duration_ms', 'genre']
    # rename artist genres to be called genre; done on a new frame so the
    # dataframe passed in by the caller keeps its original column names
    renamed_df = raw_df.rename(columns={'artist_genres': 'genre'})
    # remove unnecessary cols; the explicit copy makes the result an independent
    # frame, so later column assignments on it do not write to a slice
    return renamed_df[required_cols].copy()


In [None]:
## Step N: Create a function that will clean the genres.

def calculate_popular_genre(track_df, max_num_genres = 10):
    """
    Collapse each track's genre collection into one popular genre word.

    Every genre string is split on spaces and the resulting words are counted
    across all tracks. Each track is then labelled with its word that has the
    highest overall count (the first such word wins a tie). Finally only the
    tracks whose label is among the ``max_num_genres`` most frequent labels
    are kept.

    inputs:
        track_df: dataframe with a 'genre' column whose entries are sized
            collections of genre strings (e.g. lists or tuples).
        max_num_genres: number of most frequent labels to keep.
    returns:
        a new dataframe whose 'genre' column holds a single word per row.
        The result is empty when no row has a genre or when no label is
        selected. The caller's dataframe is not modified.
    """
    # remove rows missing genre; copy so the label assignment below writes to
    # our own frame rather than to a slice of the caller's dataframe
    has_genre = track_df['genre'].map(len).astype(bool)
    labelled_df = track_df.loc[has_genre, :].copy()
    if labelled_df.empty:
        # nothing to rank: there is no label to pick, so return the empty frame
        return labelled_df

    # split by space to help eliminate sub categories of major genres
    row_words = [
        [word for genre in genres for word in genre.split(' ')]
        for genres in labelled_df['genre']
    ]
    # how often each word occurs over the whole data set
    genre_counter = Counter(word for words in row_words for word in words)

    # max() returns the first word with the highest count, which matches a
    # left-to-right scan that only replaces the best word on a strictly
    # greater score
    top_genres = [max(words, key=genre_counter.__getitem__) for words in row_words]
    labelled_df['genre'] = top_genres

    # filter out low frequency genres
    included_genres = [
        genre for genre, _ in Counter(top_genres).most_common(max_num_genres)
    ]
    return labelled_df[labelled_df['genre'].isin(included_genres)]



In [None]:
## Step 1: Break the data into a training and a test set
def generate_train_test(tracks_df, random_val=42, split_ratio=0.8):
    """
    Shuffle a dataframe and split it into a training and a test part.

    inputs:
        tracks_df: a dataframe containing song attributes and genre.
        random_val: seed for the shuffle, for repeatability.
        split_ratio: decimal pct of samples to use for training.
    returns:
        a two-item list ``[train_df, test_df]``. The first
        ``int(split_ratio * len(tracks_df))`` shuffled rows form the training
        set and the remainder the test set. Both are independent copies and
        keep their original index labels.
    """
    # step 1 shuffle the df
    shuffled_df = tracks_df.sample(frac=1.0, random_state=random_val)
    # establish a number to split the frame at.
    num_train_samples = int(split_ratio * len(shuffled_df))
    # split by position with iloc: calling np.split on a DataFrame goes through
    # DataFrame.swapaxes, which recent pandas releases deprecate
    train_df = shuffled_df.iloc[:num_train_samples].copy()
    test_df = shuffled_df.iloc[num_train_samples:].copy()
    return [train_df, test_df]


In [None]:
# Step 2: train a classifier using the training set

def train_logistic_regression(train_df, random_val=42, data_type='spotify'):
    """
    Fit a multinomial logistic regression genre classifier.

    inputs:
        train_df: training dataframe; 'genre' is the label and every other
            column is used as a feature.
        random_val: seed passed to the classifier for repeatability.
        data_type: accepted for call compatibility; not used in the body.

    returns:
        the fitted LogisticRegression classifier.
    """
    # everything except the label column is a feature
    feature_cols = [name for name in train_df.columns if name != 'genre']
    features = train_df[feature_cols]
    labels = train_df['genre']

    model = LogisticRegression(
        random_state=random_val,
        solver='newton-cg',
        multi_class='multinomial',
        max_iter=1000,
    )
    model.fit(features, labels)
    return model

In [None]:
# Step 3: train a random forest classifier
def train_random_forest(train_df, random_val=42):
    """
    Fit a 1000-tree random forest genre classifier.

    inputs:
        train_df: training dataframe; 'genre' is the label and every other
            column is used as a feature.
        random_val: seed passed to the classifier for repeatability.

    returns:
        the fitted RandomForestClassifier.
    """
    # everything except the label column is a feature
    feature_cols = [name for name in train_df.columns if name != 'genre']
    features = train_df[feature_cols]
    labels = train_df['genre']

    forest = RandomForestClassifier(random_state=random_val, n_estimators=1000)
    forest.fit(features, labels)
    return forest

In [None]:
# implement a support vector machine to assist with the high dimensionality of the data

# Step 3: train an SVC classifier
def train_svm(train_df, random_val=42):
    """
    Fit a support vector classifier on standardised features.

    inputs:
        train_df: training dataframe; 'genre' is the label and every other
            column is used as a feature.
        random_val: seed passed to the SVC for repeatability.

    returns:
        the fitted pipeline (StandardScaler followed by SVC).
    """
    # Step 1 create X and y
    X = train_df[[col for col in train_df.columns if col != 'genre']]
    y = train_df['genre']

    # Step 2 create the classifier; the scaler is part of the pipeline, so
    # the same scaling is applied automatically at predict time
    clf = make_pipeline(StandardScaler(), SVC(random_state=random_val))

    # fit the classifier once -- fit() returns the fitted pipeline itself
    return clf.fit(X, y)

In [None]:
# Step 4: test the classifiers

def test_clf_model(clf, test_df):
    """
    Score a fitted classifier on the test set.

    inputs:
        clf: fitted classifier exposing ``predict``.
        test_df: test dataframe; 'genre' is the label and every other column
            is used as a feature.
    returns:
        the macro-averaged F1 score over all classes.
    """
    feature_cols = [name for name in test_df.columns if name != 'genre']
    # predict the genre of every test row from its features
    predicted = clf.predict(test_df[feature_cols])
    # macro average: every class contributes equally to the final score
    return f1_score(test_df['genre'], predicted, average='macro')

In [None]:
# Create a dummy classifier for comparison

def create_dummy(train_df, random_val=42, data_type='spotify'):
    """
    Fit a baseline classifier that uses the 'uniform' dummy strategy.

    inputs:
        train_df: training dataframe; 'genre' is the label and every other
            column is used as a feature.
        random_val: seed passed to the dummy classifier for repeatability.
        data_type: accepted for call compatibility; not used in the body.
    returns:
        the fitted DummyClassifier, used as a reference point when judging
        the real models.
    """
    feature_cols = [name for name in train_df.columns if name != 'genre']
    baseline = DummyClassifier(strategy='uniform', random_state=random_val)
    # fit() returns the fitted estimator itself
    return baseline.fit(train_df[feature_cols], train_df['genre'])

In [None]:
# Development cell: run the whole pipeline end to end on the dev data set.
raw_df = pd.read_csv('dev_data_2.csv')
train, test = generate_train_test(raw_df, random_val=42, split_ratio=0.8)

# Fit each model on the training split and score it on the held-out split.
lr_clf = train_logistic_regression(train, random_val=42)
lr_score = test_clf_model(lr_clf, test)

rf_clf = train_random_forest(train)
rf_score = test_clf_model(rf_clf, test)

svc_clf = train_svm(train)
svc_score = test_clf_model(svc_clf, test)

# The dummy classifier is the baseline the real models are compared against.
dum_clf = create_dummy(train)
dum_score = test_clf_model(dum_clf, test)

# Report the macro F1 score of every model.
print(f'random forest: {rf_score}')
print(f'logistic regression: {lr_score}')
print(f'Support Vector Classifier: {svc_score}')
print(f'dummy classifier: {dum_score}')

random forest: 0.6520960462960702
logistic regression: 0.5041775923096872
Support Vector Classifier: 0.5514640837980559
dummy classifier: 0.06997919095663456


In [None]:
# Development cell: evaluate performance with raw API data.
# The artist_genres column is stored as text, so it is parsed back into
# Python objects while the file is read.
raw_df = pd.read_csv(
    '../raw_spotify_data/raw_spotify_data.csv',
    converters={"artist_genres": ast.literal_eval},
)
raw_df = condition_raw_data(raw_df)
# Reduce every track to one label and keep the 5 most frequent labels.
clean_df = calculate_popular_genre(raw_df, max_num_genres=5)
train, test = generate_train_test(clean_df, random_val=42, split_ratio=0.8)

# Fit each model on the training split and score it on the held-out split.
lr_clf = train_logistic_regression(train, random_val=42)
lr_score = test_clf_model(lr_clf, test)

rf_clf = train_random_forest(train)
rf_score = test_clf_model(rf_clf, test)

svc_clf = train_svm(train)
svc_score = test_clf_model(svc_clf, test)

# The dummy classifier is the baseline the real models are compared against.
dum_clf = create_dummy(train)
dum_score = test_clf_model(dum_clf, test)

# Report the macro F1 score of every model.
print(f'random forest: {rf_score}')
print(f'logistic regression: {lr_score}')
print(f'Support Vector Classifier: {svc_score}')
print(f'dummy classifier: {dum_score}')

random forest: 0.5776054082903398
logistic regression: 0.2864425318271472
Support Vector Classifier: 0.5119731146046936
dummy classifier: 0.13917133026572154


In [None]:
# Supervised training on the pure-genre data set (pure_genre_data.csv).
raw_df = pd.read_csv('../raw_spotify_data/pure_genre_data.csv')
clean_df = condition_raw_data(raw_df)
# Optional experiment: drop individual genres before splitting.
#clean_df = clean_df[clean_df['genre']!='edm']
#clean_df = clean_df[clean_df['genre']!='classical']
train, test = generate_train_test(clean_df, random_val=42, split_ratio=0.8)

# Fit each model on the training split and score it on the held-out split.
lr_clf = train_logistic_regression(train, random_val=42)
lr_score = test_clf_model(lr_clf, test)

rf_clf = train_random_forest(train)
rf_score = test_clf_model(rf_clf, test)

svc_clf = train_svm(train)
svc_score = test_clf_model(svc_clf, test)

# The dummy classifier is the baseline the real models are compared against.
dum_clf = create_dummy(train)
dum_score = test_clf_model(dum_clf, test)

# Report the macro F1 score of every model.
print(f'random forest: {rf_score}')
print(f'logistic regression: {lr_score}')
print(f'Support Vector Classifier: {svc_score}')
print(f'dummy classifier: {dum_score}')

random forest: 0.7321047458016763
logistic regression: 0.6586439495238977
Support Vector Classifier: 0.6809962004780671
dummy classifier: 0.1348860045008973


In [None]:
# PCA
# Goal: apply PCA to the data set and evaluate the impact on the F1 score.
def apply_pca(train_df, test_df, n_dim = 2):
    """
    Standardise the features and project them onto principal components.

    The scaler and the PCA model are both fitted on the training data only
    and then applied to the test data.

    inputs:
        train_df: training dataframe; 'genre' is the label and every other
            column is used as a feature.
        test_df: test dataframe with the same columns as train_df.
        n_dim: number of principal components to keep.
    returns:
        (pca_train_df, pca_test_df): new dataframes with one column per
        component (named 0..n_dim-1) plus 'genre', indexed 0..n-1.
        The input dataframes are not modified.
    """
    X_cols = [col for col in train_df.columns if col != 'genre']

    # fit the scaler on the training data and reuse it for the test data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(train_df[X_cols])
    X_test = scaler.transform(test_df[X_cols])

    # only use the training data to fit the pca model
    pca = PCA(n_components=n_dim)
    pca.fit(X_train)

    # The projected frames get a fresh 0..n-1 index while the inputs keep
    # their own index labels, so the labels are attached by position
    # (to_numpy) instead of by index alignment. This replaces resetting the
    # callers' indexes in place.
    pca_train_df = pd.DataFrame(pca.transform(X_train))
    pca_train_df['genre'] = train_df['genre'].to_numpy()

    pca_test_df = pd.DataFrame(pca.transform(X_test))
    pca_test_df['genre'] = test_df['genre'].to_numpy()
    return pca_train_df, pca_test_df


In [None]:
# Apply the PCA function: same pipeline as before, but on 2 principal components.
raw_df = pd.read_csv('../raw_spotify_data/pure_genre_data.csv')
clean_df = condition_raw_data(raw_df)
train, test = generate_train_test(clean_df, random_val=42, split_ratio=0.8)
# Replace the attribute columns of both splits with their PCA projection.
train, test = apply_pca(train, test, n_dim=2)

# Fit each model on the training split and score it on the held-out split.
lr_clf = train_logistic_regression(train, random_val=42)
lr_score = test_clf_model(lr_clf, test)

rf_clf = train_random_forest(train)
rf_score = test_clf_model(rf_clf, test)

svc_clf = train_svm(train)
svc_score = test_clf_model(svc_clf, test)

# The dummy classifier is the baseline the real models are compared against.
dum_clf = create_dummy(train)
dum_score = test_clf_model(dum_clf, test)

# Report the macro F1 score of every model.
print(f'random forest: {rf_score}')
print(f'logistic regression: {lr_score}')
print(f'Support Vector Classifier: {svc_score}')
print(f'dummy classifier: {dum_score}')



random forest: 0.45408163189558187
logistic regression: 0.4240574171251283
Support Vector Classifier: 0.4379227806628839
dummy classifier: 0.1348860045008973


In [None]:
# apply a TFIDF vectorizer to the lyrics.
def train_tfidf_vectorizer(train, test, random_val = 42):
    """
    Turn the raw lyrics of a train and a test set into TF-IDF features.

    inputs:
        train: training dataframe with 'lyric_raw' and 'genre' columns.
        test: test dataframe with 'lyric_raw' and 'genre' columns.
        random_val: accepted for call compatibility; not used in the body.
    returns:
        X_train, y_train, X_test, y_test where the X values are the TF-IDF
        matrices produced by the vectorizer and the y values are the genre
        labels as arrays. Neither input dataframe is modified.
    """
    # missing lyrics become a single space; fillna without inplace returns a
    # new Series, so the callers' frames are left untouched
    train_lyrics = train['lyric_raw'].fillna(value=' ')
    test_lyrics = test['lyric_raw'].fillna(value=' ')

    # min_df=100: ignore terms that appear in fewer than 100 training lyrics
    vectorizer = TfidfVectorizer(min_df=100, stop_words='english')
    # the vectorizer is fitted on the training lyrics only, then applied to
    # the test lyrics
    X_train = vectorizer.fit_transform(train_lyrics)
    X_test = vectorizer.transform(test_lyrics)

    y_train = train['genre'].values
    y_test = test['genre'].values
    return X_train, y_train, X_test, y_test

def lyrics_random_forest(X_train, y_train, random_state=42):
    """
    Fit a 200-tree random forest on the given features and labels.

    inputs:
        X_train: training feature matrix.
        y_train: training labels.
        random_state: seed passed to the classifier for repeatability.
    returns:
        the fitted RandomForestClassifier.
    """
    forest = RandomForestClassifier(n_estimators=200, random_state=random_state)
    # fit() returns the fitted estimator itself
    return forest.fit(X_train, y_train)

def lyrics_logistic_reg(X_train, y_train, random_val=42):
    """
    Fit a multinomial logistic regression on the given features and labels.

    inputs:
        X_train: training feature matrix.
        y_train: training labels.
        random_val: seed passed to the classifier for repeatability.
    returns:
        the fitted LogisticRegression classifier.
    """
    model = LogisticRegression(
        random_state=random_val,
        solver='newton-cg',
        multi_class='multinomial',
        max_iter=1000,
    )
    # fit() returns the fitted estimator itself
    return model.fit(X_train, y_train)
def evaluate_lyrics_clf(clf, X_test, y_test):
    """
    Score a fitted classifier on test features and labels.

    inputs:
        clf: fitted classifier exposing ``predict``.
        X_test: test feature matrix.
        y_test: true test labels.
    returns:
        the macro-averaged F1 score.
    """
    # macro average: every class contributes equally to the final score
    return f1_score(y_test, clf.predict(X_test), average='macro')


In [None]:
# Lyrics-only experiment: classify genre from TF-IDF features of the raw lyrics.
lyrics_df = pd.read_csv('../raw_spotify_data/pure_genre_data_w_clean_lyrics.csv')
# keep only the lyric text and the label
lyrics_df = lyrics_df[['lyric_raw', 'genre']]
# default seed (42) and 80/20 split
train, test = generate_train_test(lyrics_df)
X_train, y_train, X_test, y_test = train_tfidf_vectorizer(train, test)
lyrics_forest_clf = lyrics_random_forest(X_train, y_train)
# last expression of the cell: the macro F1 score is displayed as the cell output
evaluate_lyrics_clf(lyrics_forest_clf, X_test, y_test)

0.6398899537539541

In [None]:
# the goal of this cell is to train a model using lyrics AND using song attributes
def lyric_attribute_train_test(all_df):
    """
    Build train/test matrices that combine lyric TF-IDF features with the
    remaining dataframe columns.

    inputs:
        all_df: dataframe with 'lyric_raw', 'genre' and further columns.
    returns:
        X_train, y_train, X_test, y_test where each X is the TF-IDF matrix
        stacked side by side with the values of every column that is neither
        excluded below nor 'lyric_raw' / 'genre'.
    """
    # columns to throw away before modelling
    excluded_cols = ('artist_name_y', 'track_name_y', 'artist_name_x',
                     'artist_id', 'track_name_x', 'track_id', 'uri',
                     'track_href', 'analysis_url', 'type', 'lyric_clean')
    kept_cols = [name for name in all_df.columns if name not in excluded_cols]
    train, test = generate_train_test(all_df[kept_cols])

    # get vectorized lyrics:
    X_train, y_train, X_test, y_test = train_tfidf_vectorizer(train, test)

    # raw lyrics and the label are not part of the stacked attribute block
    non_attribute_cols = ['lyric_raw', 'genre']
    train_attrs = train.drop(columns=non_attribute_cols).values
    test_attrs = test.drop(columns=non_attribute_cols).values

    # put the TF-IDF columns and the attribute columns side by side
    X_train = hstack([X_train, train_attrs])
    X_test = hstack([X_test, test_attrs])
    return X_train, y_train, X_test, y_test




In [None]:
# Combined experiment: classify genre from lyric TF-IDF features plus the other columns.
all_df = pd.read_csv('../raw_spotify_data/pure_genre_data_w_clean_lyrics.csv')
X_train, y_train, X_test, y_test = lyric_attribute_train_test(all_df)
lyrics_forest_clf = lyrics_random_forest(X_train, y_train)
# last expression of the cell: the macro F1 score is displayed as the cell output
evaluate_lyrics_clf(lyrics_forest_clf, X_test, y_test)

0.817749878924545

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=659c715d-e2b5-478e-9116-4d32a5174810' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>