# Lyrics genre multi-label classification

This notebook:
* Uses the dataset generated by **helgi-05-scraped-lyrics-v2-preprocessing.ipynb**
* Performs **multi**-label classification on the available genres

In [1]:
import random
import ast

import numpy as np
import pandas as pd

# One seed for both global RNGs (Python's `random` and NumPy's legacy global
# state). `random_state` is also reused below as a model hyper-parameter value.
random_state = 1111
random.seed(random_state)
np.random.seed(random_state)

import sklearn.ensemble
import sklearn.linear_model
import sklearn.multiclass
import sklearn.naive_bayes
import sklearn.utils

import seaborn as sns
from matplotlib import pyplot as plt

import sys
sys.path.append("../")

from libs.processing import (
    Lemmatizer,
    Stemmer,
    StopWords,
    fit_vectorizer,
    random_search,
    choose_random_params
)



## Load and prepare data

In [2]:
# Location of the preprocessed lyrics, relative to this notebook's directory
# (same "../" convention as the `sys.path` entry used to import `libs`), so the
# notebook is not tied to one machine's absolute drive path.
# NOTE(review): assumes the notebook sits one level below the repository root,
# next to `data/` — adjust DATA_PATH if the layout differs.
DATA_PATH = '../data/scraped-lyrics-v2-preprocessed.csv'

df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,artist,song,lyrics,genres,category
0,12 Stones,3 Leaf Loser,"Biting the hand that feeds you, lying to the v...","['Hard Rock', 'Rock']",Hard Rock
1,12 Stones,Adrenaline,My heart is beating faster can't control these...,"['Hard Rock', 'Rock']",Hard Rock
2,12 Stones,Anthem For The Underdog,You say you know just who I am\nBut you can't ...,"['Hard Rock', 'Rock']",Hard Rock
3,12 Stones,Anthem For The Underdog (Picture Perfect Sessi...,You say you know just who I am\nBut you can't ...,"['Hard Rock', 'Rock']",Hard Rock
4,12 Stones,Arms Of A Stranger,"I came home early to see you,\nCouldn't wait t...","['Hard Rock', 'Rock']",Hard Rock
...,...,...,...,...,...
58714,Ängie,Spun,"Avoid eye contact, you want it, I know it's a ...",['Indie'],Indie
58715,Ängie,Talk to Me Nice,Imma fuck you up\nImma fuck you up\nImma fuck ...,['Indie'],Indie
58716,Ängie,Two Together,I don't even think that you're my type\nYou're...,['Indie'],Indie
58717,Ängie,Venus In Furs,"Shiny, shiny, shiny boots of leather\nWhiplash...",['Indie'],Indie


In [3]:
# The CSV stores each song's genres as the string repr of a list
# (e.g. "['Hard Rock', 'Rock']"); parse it back into a real list.
# Values that are already lists are passed through so the cell can be re-run
# safely: `ast.literal_eval` raises when handed a list instead of a string.
df['genres'] = df['genres'].apply(
    lambda value: value if isinstance(value, list) else ast.literal_eval(value)
)

In [4]:
data_ratio = 1 # How much of the data to use
splits = [0.7, 0.85] # Cumulative boundaries: train ends at 70%, validation at 85%, test is the rest

# Shuffle row *positions* rather than index labels, because the rows are
# selected positionally below. With the default RangeIndex created by
# `pd.read_csv` positions and labels coincide, so the resulting order is the
# same as before.
# NOTE(review): the shuffle seed (1234) differs from `random_state`; it is kept
# as-is so the train/valid/test split does not change.
indices = sklearn.utils.shuffle(np.arange(len(df)), random_state=1234)
n_data_used = int(data_ratio*len(indices))

# Single positional selection instead of one `df.iloc[i]` call per song.
# Column 0 holds the lyrics, column 1 the list of genres.
all_data = df[['lyrics', 'genres']].iloc[indices[:n_data_used]].to_numpy(dtype=object)
n = len(all_data)

# Compute the two split boundaries once instead of repeating int(splits[k]*n).
train_end, valid_end = (int(boundary*n) for boundary in splits)
data = {
    'X_train_raw': all_data[:train_end, 0],
    'y_train': all_data[:train_end, 1],
    'X_valid_raw': all_data[train_end:valid_end, 0],
    'y_valid': all_data[train_end:valid_end, 1],
    'X_test_raw': all_data[valid_end:, 0],
    'y_test': all_data[valid_end:, 1]
}

# The three partitions must cover every selected song exactly once.
assert len(data['X_train_raw']) + len(data['X_valid_raw']) + len(data['X_test_raw']) == n

print(f'Ratio of data used: {data_ratio}*{len(indices)} = {n_data_used}')

Ratio of data used: 1*58719 = 58719


## Validation set hyper-parameter search across models

In [5]:
# Candidate values handed to `random_search`: the 'data' entry lists text
# pre-processing options, the 'model' entry lists one set of hyper-parameter
# candidates per model name (the names must match the keys of `models` below).
search_params = {
    'data': {
        'tokenizer': [Lemmatizer, Stemmer],
        'stop_words': [StopWords('nltk_english'), StopWords('None', [])],
        # NOTE(review): if `fit_vectorizer` forwards min_df to a scikit-learn
        # vectorizer, it is a *document* frequency and 0 presumably selects the
        # same vocabulary as 1 — confirm, then consider dropping the duplicate.
        'min_df': [0, 1, 2, 3] # Minimum token frequency
    },
    'model': {
        'logistic_regression': {
            # NOTE(review): SGDClassifier only uses eta0 with the 'constant',
            # 'invscaling' or 'adaptive' learning_rate schedules; unless
            # `random_search` sets one of them, these values have no effect — confirm.
            'eta0': [1e-3, 1e-2, 1e-1], # learning rate
            'alpha': [1e-3, 1e-2, 1e-1], # regularization
            'max_iter': np.arange(start=1, stop=10), # epochs
            # Logistic-regression loss. scikit-learn 1.1 renamed it to
            # 'log_loss' and 1.3 removed 'log'; update when upgrading.
            'loss': ['log'],
            'random_state': [random_state]
        },
        'naive_bayes': {
            'alpha': np.arange(start=0.1, stop=1.1, step=0.1)
        },
        'random_forest': {
            'n_estimators': np.arange(start=10, stop=1000, step=10),
            'max_depth': np.arange(start=1, stop=10, step=2),
            'random_state': [random_state]
        }
    }
}

# Estimator class for each model name. `sklearn.ensemble` is a submodule that
# must be imported explicitly (see the imports cell) for the last entry to
# resolve on a fresh kernel.
models = {
    'logistic_regression': sklearn.linear_model.SGDClassifier,
    'naive_bayes': sklearn.naive_bayes.MultinomialNB,
    'random_forest': sklearn.ensemble.RandomForestClassifier
}

# Fail early if a model was added without a search space (or vice versa).
assert models.keys() == search_params['model'].keys()

In [6]:
# Run the random search. According to the progress log printed by this cell,
# it evaluates 4 data-set variations and, for each, 4 sampled configurations
# of every model in `models`.
data_sets, results = random_search(
    models,
    data,
    search_params,
    n_datasets=4,
    n_models=4,
    multi_label=True,
)

Data set variation 1/4
	Fitting vectorizer...
	logistic_regression 1/4
	logistic_regression 2/4
	logistic_regression 3/4
	logistic_regression 4/4
	naive_bayes 1/4
	naive_bayes 2/4
	naive_bayes 3/4
	naive_bayes 4/4
	random_forest 1/4
	random_forest 2/4
	random_forest 3/4
	random_forest 4/4
Data set variation 2/4
	Fitting vectorizer...
	logistic_regression 1/4
	logistic_regression 2/4
	logistic_regression 3/4
	logistic_regression 4/4
	naive_bayes 1/4
	naive_bayes 2/4
	naive_bayes 3/4
	naive_bayes 4/4
	random_forest 1/4
	random_forest 2/4
	random_forest 3/4
	random_forest 4/4
Data set variation 3/4
	Fitting vectorizer...
	logistic_regression 1/4
	logistic_regression 2/4
	logistic_regression 3/4
	logistic_regression 4/4
	naive_bayes 1/4
	naive_bayes 2/4
	naive_bayes 3/4
	naive_bayes 4/4
	random_forest 1/4
	random_forest 2/4
	random_forest 3/4
	random_forest 4/4
Data set variation 4/4
	Fitting vectorizer...
	logistic_regression 1/4
	logistic_regression 2/4
	logistic_regression 3/4
	logistic

In [7]:
# For every data-set variation, print its pre-processing parameters followed
# by the hyper-parameters and validation accuracies of each model run.
for i, dataset_params in enumerate(data_sets):
    printable_params = {name: str(value) for name, value in dataset_params.items()}
    print(f'\nDataset variation {i+1}: {printable_params}')
    for model_name, runs in results[i].items():
        print(f'\t{model_name}:')
        for j in range(len(runs)):
            run = runs[j]
            # Only the fields relevant for model selection are shown.
            summary = {
                'model_params': run['model_params'],
                'valid_accuracy': run['valid_accuracy']
            }
            print(f'\t\t{summary}')



Dataset variation 1: {'tokenizer': "<class 'libs.processing.Lemmatizer'>", 'stop_words': 'None', 'min_df': '1'}
	logistic_regression:
		{'model_params': {'eta0': 0.001, 'alpha': 0.1, 'max_iter': 3, 'loss': 'log', 'random_state': 1111}, 'valid_accuracy': {'TOP@1': 0.5353088101725704, 'TOP@2': 0.7100363306085377, 'TOP@3': 0.8005222524977293}}
		{'model_params': {'eta0': 0.1, 'alpha': 0.001, 'max_iter': 4, 'loss': 'log', 'random_state': 1111}, 'valid_accuracy': {'TOP@1': 0.4510672116257947, 'TOP@2': 0.6068346957311535, 'TOP@3': 0.7006130790190735}}
		{'model_params': {'eta0': 0.1, 'alpha': 0.001, 'max_iter': 5, 'loss': 'log', 'random_state': 1111}, 'valid_accuracy': {'TOP@1': 0.37454586739327883, 'TOP@2': 0.5375794732061762, 'TOP@3': 0.6424841053587648}}
		{'model_params': {'eta0': 0.01, 'alpha': 0.001, 'max_iter': 8, 'loss': 'log', 'random_state': 1111}, 'valid_accuracy': {'TOP@1': 0.473433242506812, 'TOP@2': 0.6380563124432335, 'TOP@3': 0.7338782924613987}}
	naive_bayes:
		{'model_para

## Test set results

In [9]:
# Position, within each model's list of runs, of the configuration whose test
# accuracies are reported for every data-set variation.
# NOTE(review): presumably chosen by hand from the validation results printed
# earlier — confirm, and consider deriving it from 'valid_accuracy' instead.
best_models = dict(
    logistic_regression=0,
    naive_bayes=3,
    random_forest=3,
)

for model_name in best_models:
    model_index = best_models[model_name]
    print(f'Best {model_name} test sets accuracies:')
    for i in range(len(results)):
        test_accuracy = results[i][model_name][model_index]["test_accuracy"]
        print(f'\tDataset {i+1}: {test_accuracy}')

Best logistic_regression test sets accuracies:
	Dataset 1: {'TOP@1': 0.5401907356948229, 'TOP@2': 0.7184377838328792, 'TOP@3': 0.8038147138964578}
	Dataset 2: {'TOP@1': 0.5331516802906449, 'TOP@2': 0.7158265213442325, 'TOP@3': 0.7993869209809265}
	Dataset 3: {'TOP@1': 0.5322434150772025, 'TOP@2': 0.7124205267938238, 'TOP@3': 0.7958673932788374}
	Dataset 4: {'TOP@1': 0.5403042688465032, 'TOP@2': 0.717983651226158, 'TOP@3': 0.8038147138964578}
Best naive_bayes test sets accuracies:
	Dataset 1: {'TOP@1': 0.553928247048138, 'TOP@2': 0.721503178928247, 'TOP@3': 0.8005222524977293}
	Dataset 2: {'TOP@1': 0.5342870118074478, 'TOP@2': 0.6980018165304269, 'TOP@3': 0.7838328792007266}
	Dataset 3: {'TOP@1': 0.5374659400544959, 'TOP@2': 0.7059491371480472, 'TOP@3': 0.7881471389645777}
	Dataset 4: {'TOP@1': 0.5216848319709355, 'TOP@2': 0.6939146230699365, 'TOP@3': 0.7805404178019982}
Best random_forest test sets accuracies:
	Dataset 1: {'TOP@1': 0.4725249772933697, 'TOP@2': 0.6907356948228883, 'TOP@