In [6]:

from collections import Counter
from scipy.sparse import csr_matrix, hstack
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

Term Frequency–Inverse Document Frequency (TF-IDF) Vectorizer Implementation

In [17]:
# apply a TFIDF vectorizer to the lyrics.
def train_tfidf_vectorizer(train, test, random_val = 42):
    """
    Fit a TF-IDF vectorizer on the training lyrics and apply it to both splits.

    inputs:
        train, test = dataframes with a 'lyric_raw' text column and a
            'genre' label column.
        random_val = not used by this function; kept so existing calls that
            pass it keep working.
    returns:
        X_train, y_train, X_test, y_test
        X_* are the vectorizer outputs for the lyrics, y_* are the 'genre'
        columns as arrays.
    """
    # fillna() without inplace returns a new Series, so neither input frame is
    # modified; missing lyrics become a single space in both splits.
    train_lyrics = train['lyric_raw'].fillna(value=' ')
    test_lyrics = test['lyric_raw'].fillna(value=' ')
    vectorizer = TfidfVectorizer(min_df=50, stop_words='english', ngram_range=(1,2))
    # the vocabulary and document frequencies are learned from train only;
    # test is transformed with the already-fitted vectorizer.
    X_train = vectorizer.fit_transform(train_lyrics)
    X_test = vectorizer.transform(test_lyrics)
    y_train = train['genre'].values
    y_test = test['genre'].values
    return X_train, y_train, X_test, y_test

def lyrics_random_forest(X_train, y_train, random_state=42):
    """
    Fit a random forest classifier on the given features and labels.

    inputs:
        X_train, y_train = training features and labels
        random_state = seed passed to the classifier for repeatability
    returns:
        the fitted RandomForestClassifier
    """
    forest_settings = dict(
        n_estimators=1000,
        max_depth=125,
        max_features=30,
        random_state=random_state,
    )
    forest = RandomForestClassifier(**forest_settings)
    forest.fit(X_train, y_train)
    return forest

def lyrics_logistic_reg(X_train, y_train, random_val=42):
    """
    Fit a multinomial logistic regression classifier.

    inputs:
        X_train, y_train = training features and labels
        random_val = seed passed to the classifier as random_state
    returns:
        the fitted LogisticRegression
    """
    logreg_settings = dict(
        solver='newton-cg',
        multi_class='multinomial',
        max_iter=1000,
        random_state=random_val,
    )
    logreg = LogisticRegression(**logreg_settings)
    logreg.fit(X_train, y_train)
    return logreg
def evaluate_lyrics_clf(clf, X_test, y_test):
    """
    Score a fitted classifier on held-out data.

    inputs:
        clf = fitted classifier exposing predict()
        X_test, y_test = test features and true labels
    returns:
        macro-averaged F1 score of clf's predictions against y_test
    """
    return f1_score(y_test, clf.predict(X_test), average='macro')

def generate_train_test(tracks_df, random_val=42, split_ratio=0.8):
    """
    Shuffle a dataframe and split it row-wise into a train and a test set.

    inputs:
        a dataframe containing song attributes and genre.
        random val for repeatability
        split_ratio = decimal pct of samples to use for training.
    returns:
        a list of two dataframes [train_df, test_df]; the input dataframe is
        not modified.
    """
    # step 1 shuffle the df and relabel the rows 0..n-1 in shuffled order
    shuffled_df = tracks_df.sample(frac=1.0, random_state=random_val).reset_index(drop=True)
    # establish a number to split the frame at.
    num_train_samples = int(split_ratio * len(shuffled_df))
    # split the DF into two sets train and test.
    # Positional slicing is used instead of np.split: calling np.split on a
    # DataFrame goes through DataFrame.swapaxes, which recent pandas releases
    # deprecate.
    train_df = shuffled_df.iloc[:num_train_samples]
    test_df = shuffled_df.iloc[num_train_samples:]
    return [train_df, test_df]


Evaluate Random Forest Performance on TF-IDF Lyric Vectors

In [18]:
# Load the dataset, keeping only the raw lyric text and the genre label.
lyrics_df = pd.read_csv(
    '../raw_spotify_data/pure_genre_data_w_clean_lyrics.csv'
)[['lyric_raw', 'genre']]
# (disabled) optional filters that would exclude individual genres
#lyrics_df = lyrics_df[lyrics_df['genre']!='classical']
#lyrics_df = lyrics_df[lyrics_df['genre']!='edm']
# Shuffle/split, then vectorize the lyrics of both splits.
train, test = generate_train_test(lyrics_df)
X_train, y_train, X_test, y_test = train_tfidf_vectorizer(train, test)
# Fit the forest on the lyric vectors; the macro F1 on the test split is the cell output.
lyrics_forest_clf = lyrics_random_forest(X_train, y_train)
evaluate_lyrics_clf(lyrics_forest_clf, X_test, y_test)

0.6373242811123655

Train a Model with Track Features and Lyrical Content

In [19]:
# the goal of this cell is to train a model using lyrics AND using song attributes
def lyric_attribute_train_test(all_df):
    """
    Build train/test features that combine lyric vectors with the remaining
    dataframe columns.

    inputs:
        all_df = dataframe holding 'lyric_raw', 'genre' and further columns
    returns:
        X_train, y_train, X_test, y_test
        X_* are the lyric vectors with the leftover columns stacked on the
        right; y_* are the genre labels.
    """
    # identifier / metadata columns that are never used as features
    dropped_cols = ['artist_name_y', 'track_name_y','artist_name_x', 'artist_id',
                    'track_name_x','track_id', 'uri', 'track_href', 'analysis_url',
                    'type', 'lyric_clean']
    kept_cols = [name for name in all_df.columns if name not in dropped_cols]
    train, test = generate_train_test(all_df[kept_cols])
    # vectorize the lyrics and pull out the genre labels
    lyric_train, y_train, lyric_test, y_test = train_tfidf_vectorizer(train, test)
    # whatever is left once the text and the label are removed becomes the
    # extra feature columns
    non_feature_cols = ['lyric_raw', 'genre']
    attr_train = train.drop(columns=non_feature_cols)
    attr_test = test.drop(columns=non_feature_cols)
    # append the attribute columns to the right of the lyric vectors
    X_train = hstack([lyric_train, attr_train.values])
    X_test = hstack([lyric_test, attr_test.values])
    return X_train, y_train, X_test, y_test

Evaluate Model Performance with Both Track Features and Lyric Vectors

In [20]:
# Reload the full CSV so every column is available, not just lyrics and genre.
all_df = pd.read_csv('../raw_spotify_data/pure_genre_data_w_clean_lyrics.csv')
# (disabled) optional filter that would exclude the 'alt-rock' genre
#all_df = all_df[all_df['genre']!='alt-rock']
# Build features that stack the lyric vectors with the remaining columns.
X_train, y_train, X_test, y_test = lyric_attribute_train_test(all_df)
# Note: this rebinds lyrics_forest_clf to a forest trained on the combined features.
lyrics_forest_clf = lyrics_random_forest(X_train, y_train)
# Last expression of the cell: macro-averaged F1 on the test split is shown as output.
evaluate_lyrics_clf(lyrics_forest_clf, X_test, y_test)

0.8205925516196133

Hyperparameter Tuning for the TF-IDF Random Forest

In [21]:
# start grid search for model tuning for lyrics too.
# Only tree depth and split criterion vary; forest size and max_features are fixed.
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[100, 125, 150, 200],
    max_features=[30],
    n_estimators=[1000],
)
temp_clf = RandomForestClassifier(n_jobs=-1, random_state=42)
# 2-fold cross-validation, candidates ranked by accuracy
gSearch = GridSearchCV(
    estimator=temp_clf,
    param_grid=param_grid,
    cv=2,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)

In [22]:
# Load the dataset, keeping only the raw lyric text and the genre label.
lyrics_df = pd.read_csv(
    '../raw_spotify_data/pure_genre_data_w_clean_lyrics.csv'
)[['lyric_raw', 'genre']]
# Same shuffle/split and vectorization as the lyrics-only evaluation.
train, test = generate_train_test(lyrics_df)
X_train, y_train, X_test, y_test = train_tfidf_vectorizer(train, test)
# Run the search on the training split only; the fitted search object is the cell output.
gSearch.fit(X_train, y_train)

Fitting 2 folds for each of 8 candidates, totalling 16 fits


GridSearchCV(cv=2, estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
             n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [100, 125, 150, 200],
                         'max_features': [30], 'n_estimators': [1000]},
             scoring='accuracy', verbose=1)

In [23]:
# extract best parameters for random forest
# Only the last expression of a cell is displayed, so the best hyperparameter
# combination found by the grid search is the single expression here.
gSearch.best_params_

{'criterion': 'gini',
 'max_depth': 125,
 'max_features': 30,
 'n_estimators': 1000}

Failure Analysis of the Random Forest Trained on Track Features and Lyrics

In [24]:
# Reload the full dataset and rebuild the lyrics + track-attribute split.
all_df = pd.read_csv('../raw_spotify_data/pure_genre_data_w_clean_lyrics.csv')
# (disabled) optional filter that would exclude the 'alt-rock' genre
#all_df = all_df[all_df['genre']!='alt-rock']
X_train, y_train, X_test, y_test = lyric_attribute_train_test(all_df)
# train the model, then predict the held-out split
rf_clf = lyrics_random_forest(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)
# one row per test sample: the true genre next to the predicted genre
failure_df = pd.DataFrame({'true_y': y_test, 'pred_y': y_pred_rf})

In [25]:
genres_to_collect = (
    'alt-rock', 'classical', 'country', 'edm',
    'heavy-metal', 'hip-hop', 'latin',
)
required_cols = [
    'danceability', 'energy', 'key', 'loudness', 'mode',
    'speechiness', 'acousticness', 'instrumentalness',
    'liveness', 'valence', 'tempo',
    'duration_ms',
]
# flag each test sample with whether the model predicted its genre correctly
failure_df['results'] = failure_df['pred_y'].eq(failure_df['true_y'])
for genre in genres_to_collect:
    # outcomes for the samples whose true genre is this one
    genre_results = failure_df.loc[failure_df['true_y'].eq(genre), 'results']
    num_passes = int(genre_results.sum())
    num_failures = len(genre_results) - num_passes
    # print the genre, then its failure count followed by its pass count
    print([genre])
    print(num_failures, num_passes)
# overall (correct, incorrect) counts are shown as the cell output
total_passes = int(failure_df['results'].sum())
total_passes, len(failure_df) - total_passes

['alt-rock']
81 105
['classical']
0 191
['country']
25 178
['edm']
68 143
['heavy-metal']
28 174
['hip-hop']
25 184
['latin']
21 187


(1162, 248)