# Genre classification using lyrics

In [1]:
import random
import copy

import numpy as np
import pandas as pd

import nltk
import nltk.stem
import nltk.corpus
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

import sklearn
import sklearn.naive_bayes
import sklearn.ensemble
import sklearn.metrics
import sklearn.feature_extraction.text
import sklearn.utils
import sklearn.utils.testing
import sklearn.exceptions

random_state = 1111
np.random.seed(random_state)
random.seed(random_state)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Helgi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Helgi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Helgi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Data exploration

In [2]:
dataframe = pd.read_csv('E:\\Repos\\comp550-final-project\\data\\scraped-lyrics-v1.csv')
dataframe

Unnamed: 0,artist,song,lyrics,genre
0,Disperse,Tether,Warm-hearted\nDirections\nWe're off the map\nI...,Alternative Rock
1,Disperse,Foreword,[Instrumental],Alternative Rock
2,Disperse,Touching The Golden Cloud,Hear in this garden\nHear in this space\nImmer...,Alternative Rock
3,Disperse,Neon,Hello dear stranger\nI've got so much to tell ...,Alternative Rock
4,Disperse,Kites,Still a headache\nFrom last night\nIt was vali...,Alternative Rock
...,...,...,...,...
79995,Flyleaf,Broken Wings,"Thank you for being such a friend to me\nOh, I...",Rock
79996,Flyleaf,Swept Away,The evil fell from your pretty mouth\nWrapped ...,Rock
79997,Flyleaf,Call You Out,How can you act like you know\nWhen all you kn...,Rock
79998,Flyleaf,Beautiful Bride,Unified diversity\nFunctioning as one body\nEv...,Rock


Songs per genre available:

In [3]:
dataframe.groupby('genre').count().lyrics

genre
Alternative Rock    8000
Country             8000
Hard Rock           8000
Heavy Metal         8000
Hip-Hop             8000
Indie               8000
Pop                 8000
R&B                 8000
Rock                8000
Soul                8000
Name: lyrics, dtype: int64

## Helper functions

In [4]:
class Lemmatizer:
    def __init__(self):
        self.normalizer = nltk.stem.WordNetLemmatizer()
        self.tag_prefix_dict = {
            'J': nltk.corpus.wordnet.ADJ,
            'N': nltk.corpus.wordnet.NOUN,
            'V': nltk.corpus.wordnet.VERB,
            'R': nltk.corpus.wordnet.ADV
        }
    
    def __call__(self, document):
        tokens = nltk.word_tokenize(document)
        return [
            self.normalizer.lemmatize(token, pos=self.get_tag_class(tag))
            for token, tag in nltk.pos_tag(tokens)
        ]
    
    def get_tag_class(self, tag):
        prefix = tag[0].upper()
        return self.tag_prefix_dict.get(prefix, nltk.corpus.wordnet.NOUN)

class Stemmer:
    def __init__(self):
        self.normalizer = nltk.stem.PorterStemmer()
    
    def __call__(self, document):
        return [
            self.normalizer.stem(token)
            for token in nltk.word_tokenize(document)
        ]

def fit_vectorizer(X_data, tokenizer, stop_words, min_df):    
    vectorizer = sklearn.feature_extraction.text.CountVectorizer(
        tokenizer=tokenizer,
        stop_words=stop_words,
        min_df=min_df
    )
    vectorizer.fit_transform(X_data)
    return vectorizer

@sklearn.utils.testing.ignore_warnings(category=sklearn.exceptions.ConvergenceWarning)
def random_search(models, search_params, n_datasets=1, n_models=1):
    data_sets = []
    results = [{} for i in range(n_datasets)]

    data_set_variations = [
        choose_random_params(search_params['data'])
        for i in range(n_datasets)
    ]
    model_variations = {
        model_name: [choose_random_params(search_params['model'][model_name]) for i in range(n_models)]
        for model_name in search_params['model'].keys()
    }
    
    for i in range(n_datasets):
        print(f'Data set variation {i+1}/{n_datasets}')

        data_params = data_set_variations[i]
        data_sets.append(data_params)

        print('\tFitting vectorizer...')
        vectorizer = fit_vectorizer(X_train_raw, **data_params)

        X_train = vectorizer.transform(X_train_raw)
        X_valid = vectorizer.transform(X_valid_raw)
        X_test = vectorizer.transform(X_test_raw)

        for model_name, model_class in models.items():
            for j in range(n_models):
                print(f'\t{model_name} {j+1}/{n_models}')

                model_params = model_variations[model_name][j]

                model = model_class(**model_params)

                model.fit(X_train, y_train)
                valid_predictions = model.predict(X_valid)
                test_predictions = model.predict(X_test)
                
                valid_accuracy = sklearn.metrics.accuracy_score(y_valid, valid_predictions)
                
                # This number is only looked at once at the very end when the best models have been chosen based on validation accuracy
                test_accuracy = sklearn.metrics.accuracy_score(y_test, test_predictions)
                test_confusion_matrix = sklearn.metrics.confusion_matrix(y_test, test_predictions)

                if results[i].get(model_name, None) is None:
                    results[i][model_name] = []

                results[i][model_name].append({
                    'model_params': model_params,
                    'valid_accuracy': valid_accuracy,
                    'test_accuracy': test_accuracy,
                    'test_confusion_matrix': test_confusion_matrix
                })
    
    return data_sets, results

def choose_random_params(parameters):
    return {
        name: np.random.choice(values)
        for name, values in parameters.items()
    }


## Prepare model data

In [5]:
splits = [0.7, 0.85]

genres = dataframe.genre.unique()
genre_ratio = 1 # How many genres to use
song_ratio = 1 # How many songs per genre to use

all_data = []
for genre in genres[ :int(genre_ratio*len(genres))]:
    lyrics = dataframe[dataframe.genre == genre].lyrics
    all_data += [(lyric, genre) for lyric in lyrics[ : int(song_ratio*len(lyrics))]]

all_data = sklearn.utils.shuffle(np.array(all_data), random_state=random_state)

n = all_data.shape[0]

X_train_raw, y_train = all_data[:int(splits[0]*n), 0], all_data[:int(splits[0]*n), 1]
X_valid_raw, y_valid = all_data[int(splits[0]*n):int(splits[1]*n), 0], all_data[int(splits[0]*n): int(splits[1]*n), 1]
X_test_raw, y_test = all_data[int(splits[1]*n):, 0], all_data[int(splits[1]*n):, 1]

print(f'genre_ratio: {genre_ratio}')
print(f'song_ratio: {song_ratio}')
print(f'Genres used: {genres[ :int(genre_ratio*len(genres))]}')
print(f'Ratio of data used: {n}/{len(dataframe)} = {(n/len(dataframe)):.2f}')

genre_ratio: 1
song_ratio: 1
Genres used: ['Alternative Rock' 'Country' 'Hard Rock' 'Soul' 'Heavy Metal' 'Hip-Hop'
 'Indie' 'Pop' 'R&B' 'Rock']
Ratio of data used: 80000/80000 = 1.00


## Validation set hyper-parameter search across models

In [6]:
search_params = {
    'data': {
        'tokenizer': [Lemmatizer(), Stemmer()],
        'stop_words': ['english', None],
        'min_df': [1, 2, 3] # Minimum token frequency
    },
    'model': {
        'logistic_regression': {
            'eta0': [1e-3, 1e-2, 1e-1], # learning rate
            'alpha': [1e-3, 1e-2, 1e-1], # regularization
            'max_iter': np.arange(start=1, stop=5), # epochs
            'random_state': [random_state]
        },
        'linear_support_vector_machine': {
            'kernel': ['linear'],
            'max_iter': np.arange(start=1, stop=5), # epochs
            'C': [1e-3, 1e-2, 1e-1], # L2 regularization
            'random_state': [random_state]
        },
        'naive_bayes': {
            'alpha': np.arange(start=0.1, stop=1.1, step=0.1)
        },
        'random_forest': {
            'n_estimators': np.arange(start=10, stop=1000, step=10),
            'max_depth': np.append(np.array(None), np.arange(start=1, stop=5, step=2)),
            'random_state': [random_state]
        }
    }
}

models = {
    'logistic_regression': sklearn.linear_model.SGDClassifier,
    #'linear_support_vector_machine': sklearn.svm.SVC,
    'naive_bayes': sklearn.naive_bayes.MultinomialNB,
    #'random_forest': sklearn.ensemble.RandomForestClassifier
}

data_sets, results = random_search(models, search_params, n_datasets=2, n_models=3)

for i, params in enumerate(data_sets):
    print(f'\nDataset variation {i+1}: {params}')
    model_results = results[i]
    for model_name, params in model_results.items():
        print(f'\t{model_name}:')
        for j in range(len(params)):
            print('\t\t{}'.format({
                    'model_params': params[j]['model_params'],
                    'valid_accuracy': params[j]['valid_accuracy']
            }))


Data set variation 1/2
	Fitting vectorizer...
	logistic_regression 1/3
	logistic_regression 2/3
	logistic_regression 3/3
	naive_bayes 1/3
	naive_bayes 2/3
	naive_bayes 3/3
Data set variation 2/2
	Fitting vectorizer...


  'stop_words.' % sorted(inconsistent))


	logistic_regression 1/3
	logistic_regression 2/3
	logistic_regression 3/3
	naive_bayes 1/3
	naive_bayes 2/3
	naive_bayes 3/3

Dataset variation 1: {'tokenizer': <__main__.Lemmatizer object at 0x000001F9D22B9308>, 'stop_words': None, 'min_df': 2}
	logistic_regression:
		{'model_params': {'eta0': 0.001, 'alpha': 0.001, 'max_iter': 3, 'random_state': 1111}, 'valid_accuracy': 0.27791666666666665}
		{'model_params': {'eta0': 0.001, 'alpha': 0.1, 'max_iter': 1, 'random_state': 1111}, 'valid_accuracy': 0.30391666666666667}
		{'model_params': {'eta0': 0.1, 'alpha': 0.001, 'max_iter': 3, 'random_state': 1111}, 'valid_accuracy': 0.27791666666666665}
	naive_bayes:
		{'model_params': {'alpha': 0.4}, 'valid_accuracy': 0.3631666666666667}
		{'model_params': {'alpha': 0.5}, 'valid_accuracy': 0.36283333333333334}
		{'model_params': {'alpha': 0.9}, 'valid_accuracy': 0.3605}

Dataset variation 2: {'tokenizer': <__main__.Stemmer object at 0x000001F9D193B488>, 'stop_words': 'english', 'min_df': 3}
	logis

## Test set results

In [15]:
best_models = {
    'logistic_regression': 1,
    'naive_bayes': 0
}

for model_name, model_index in best_models.items():
    print(f'Best {model_name} test sets accuracies:')
    for i in range(len(results)):
        print(f'\tDataset {i+1}: {results[i][model_name][model_index]["test_accuracy"]:.3f}')

random_predictions = np.random.choice(genres, size=y_test.shape[0])
random_accuracy = sklearn.metrics.accuracy_score(y_test, random_predictions)
print(f'Random classifier test set accuracy: {random_accuracy:.3f}', end='\n\n')

print(f'Genres: {genres}', end='\n\n')
print('Best model (Naive Bayes) confusion matrix:')
print(results[0]['naive_bayes'][0]['test_confusion_matrix'])


Best logistic_regression test sets accuracies:
	Dataset 1: 0.305
	Dataset 2: 0.317
Best naive_bayes test sets accuracies:
	Dataset 1: 0.363
	Dataset 2: 0.367
Random classifier test set accuracy: 0.099

Genres: ['Alternative Rock' 'Country' 'Hard Rock' 'Soul' 'Heavy Metal' 'Hip-Hop'
 'Indie' 'Pop' 'R&B' 'Rock']

Best model (Naive Bayes) confusion matrix:
[[448  67  47  99  29 265  93  21  61  63]
 [ 90 731  22  47  12  75  67  13  18 102]
 [206  84 186 287  24  81  78  29  64 145]
 [129  28  81 760  17  32  24   9  43  48]
 [ 39   9   7  11 719  28 156 181   3  38]
 [344 103  32 113  21 370 117  18  62  84]
 [231  60  27  51  87 149 359 123  24 123]
 [ 85  76  17  34 188  61 203 213  16 291]
 [322 128  78 180  18 189  78  26  82 109]
 [111 149  26  45  51  94  92 111  25 488]]
