# COMP550 - Final Project
---

## Table of contents
[1. Imports](#imports)  
[2. Import & Cleaning data and Exploratory Data Analysis](#imports-clean)  
[3. Preprocessing steps](#preprocessing)  
[4. Naïve majority model](#naive-model)   
[5. Logistic Regression](#log-reg)  
[6. Naïve Bayes](#naive-bayes)  
[7. Support Vector Machine](#SVM)  
[8. Sequential model - LSTM](#LSTM)  

# 1. Imports  <a class="anchor" id="imports"></a>

In [None]:
# Set to True when the notebook runs on Google Colab: langdetect is then
# installed and the data folder is read from the mounted Google Drive.
IN_GOOGLE_COLAB = False
# Folder holding the CSV / model files loaded by the cells below.
root_path = 'data/'
if IN_GOOGLE_COLAB:
    !pip install langdetect
    from google.colab import drive
    # Mount Google Drive so that the project data folder is reachable.
    drive.mount('/content/drive')
    root_path = 'drive/My Drive/COMP550-Project/data/'

In [None]:
import pandas as pd
import numpy as np
import string
import time
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from langdetect import detect
from scipy.stats import uniform
import warnings
import multiprocessing
import collections
import random
import re
cores = multiprocessing.cpu_count()


# nltk imports
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

# sklearn imports
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC

# gensim imports
import gensim
from gensim.models import Word2Vec
import gensim.downloader as api

if not IN_GOOGLE_COLAB:
    # pytorch imports
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    import torch.optim as optim
    from torch.utils import data
    from torch.utils.data import DataLoader, TensorDataset
    from torch.autograd import Variable
    torch.manual_seed(1)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# 2. Import & Cleaning data and Exploratory Data Analysis   <a class="anchor" id="imports-clean"></a>

**NOTE**: Detecting the language of all the songs takes a long time (about 15 minutes). To avoid this step, we directly load the already-cleaned English songs, which are later split into 3 sets: training, validation and test.

In [None]:
# When True, the language-detection cell below reads the cleaned CSV
# instead of running langdetect on every song.
USE_CLEANED_DATA = True
cleaned_data_path = root_path + "cleaned_data.csv"
data_path = root_path + "lyrics.csv"
# The raw dataset is always loaded: the null-cleaning cell below uses it.
data_raw = pd.read_csv(data_path)
print(len(data_raw), "songs in the dataset")
print(data_raw.head())

The dataset has the following columns:
- **index** (int): index of the song in the dataset
- **song** (string): name of the song
- **year** (float) -> (int): release year
- **artist** (string): artist of the song
- **genre** (string): the genre, this is the label we want to predict
- **lyrics** (string): the lyrics of the song. This is the data we will use to predict the genre. We need to preprocess this data.

If we remove the null elements, we are left with **265701** songs. We convert the year from float to int.

In [None]:
# Drop every song that has at least one missing field.
# (The former `data_raw[pd.notnull(data_raw)]` mask was a no-op: it only wrote
# NaN back into cells that were already NaN, so it has been removed.)
data_all = data_raw.dropna(how='any', axis=0)
# Store `year` and `index` in the smallest integer dtype that fits.
data_all['year'] = pd.to_numeric(data_all['year'], downcast='integer')
data_all['index'] = pd.to_numeric(data_all['index'], downcast='integer')
# Renumber the rows 0..n-1 after the drop.
data_all = data_all.reset_index(drop=True)
data_all

We keep only English songs, using the `langdetect` library. There are **237,363 English songs** in the previous 265,701 songs.

In [None]:
if not USE_CLEANED_DATA:
    def _is_english(text):
        """Return True when langdetect classifies `text` as English."""
        try:
            return detect(text) == 'en'
        except Exception:
            # Any detection failure counts as "not English". `Exception` is
            # caught instead of a bare `except:` so that Ctrl-C can still
            # interrupt this long-running loop.
            return False

    # Boolean mask, one entry per song, in the row order of `data_all`.
    en_songs = [_is_english(song) for song in data_all['lyrics']]
    # The result of reset_index is assigned; it used to be discarded, which
    # left the filtered frame with gaps in its index.
    data_en = data_all[en_songs].reset_index(drop=True)
else:
    # Skip language detection and load the previously cleaned songs.
    data_en = pd.read_csv(cleaned_data_path)

data_en.head()

In [None]:
data_en['genre'].unique()

We remove songs where the labels are "Other" or "Not Available". This reduces the number of songs from 237363 to **215,825 songs**.

In [None]:
# Genre of every song before filtering.
labels = data_en['genre'].tolist()
# True for songs whose genre carries no usable label.
unlabelled = data_en['genre'].isin(['Not Available', 'Other'])
keep_song = (~unlabelled).tolist()
# Keep the labelled songs only and renumber the rows.
data_en = data_en[keep_song].reset_index(drop=True)
data_en['genre'].value_counts()

Above is the number of songs in each genre category. **46.5% of the songs are Rock songs**. 

# 3. Preprocessing steps <a class="anchor" id="preprocessing"></a>
The lyrics need to be cleaned before we can use them.
- remove \n line breaks
- remove punctuation
- lowercase the lyrics
- remove verse and chorus indications that are under the form [verse x]
- remove tokens that have a null length

Because this step is long, we saved the preprocessed dataset in the csv `final_lyrics_dataframe.csv`. There are 5 columns: genre (int), lyrics (whole lyrics sequence, preprocessed without removing stopwords, this will be used in the LSTM), no_stop_words_lyrics (preprocessed lyrics with no stopwords), lyrics_stemmed (stemmed lyrics), lyrics_lemmad (lemmatized lyrics).

In [None]:
# When True, the preprocessed and balanced dataframe is read from disk and the
# preprocessing / balancing / stemming cells below are skipped.
LOAD_FINAL_LYRICS_DATAFRAME = True
if LOAD_FINAL_LYRICS_DATAFRAME:
    df_balanced_final = pd.read_csv(root_path + "final_lyrics_dataframe.csv")

# replace line breaks, removes punctuation, set everything to lowercase
# removes word if length <= 2, [verse X] or [chorus y] indication
# remove stopwords
# Genre name -> integer label; the position in the list is the label.
GENRE_TO_INT = {
    genre: code
    for code, genre in enumerate(
        ['Pop', 'Hip-Hop', 'Rock', 'Metal', 'Country', 'Jazz', 'Electronic', 'Folk', 'R&B', 'Indie']
    )
}
# Integer label -> genre name (inverse of GENRE_TO_INT).
INT_TO_GENRE = {code: genre for genre, code in GENRE_TO_INT.items()}

if not IN_GOOGLE_COLAB:
    def my_preprocessor(song, remove_stopwords=True):
        """Clean raw lyrics and return them as one space-separated string.

        Steps, in order:
          1. drop bracketed section markers such as ``[Verse 1]`` / ``[Chorus]``
          2. drop ASCII punctuation and lowercase the text
          3. drop every token that contains a digit
          4. strip the remaining non a-z characters from each token
          5. drop tokens shorter than 3 characters
          6. optionally drop English stop words (nltk list)

        Args:
            song: raw lyrics (str).
            remove_stopwords: when True, nltk English stop words are removed.

        Returns:
            The cleaned lyrics; tokens are separated by exactly one space and
            there are no empty tokens.
        """
        # Section markers must go BEFORE punctuation is stripped: '[' and ']'
        # are punctuation themselves, so testing for them afterwards (as was
        # done previously) could never match. The pattern stays on one line so
        # that a stray '[' cannot swallow several lines of lyrics.
        song = re.sub(r'\[[^\]\n]*\]', ' ', song)
        song = song.replace('\n', ' ')
        song = song.translate(str.maketrans('', '', string.punctuation))
        song = song.lower()

        song_token = []
        for w in song.split():
            if any(c.isdigit() for c in w):
                continue
            # Strip non-letters before the length test so that the test applies
            # to the token actually emitted (no empty or too-short leftovers).
            w = re.sub(r'[^a-z]+', '', w)
            if len(w) >= 3:
                song_token.append(w)

        if remove_stopwords:
            # The stop-word list is read once and cached on the function
            # instead of being reloaded for every song.
            stop_words = getattr(my_preprocessor, '_stop_words', None)
            if stop_words is None:
                stop_words = frozenset(stopwords.words('english'))
                my_preprocessor._stop_words = stop_words
            song_token = [w for w in song_token if w not in stop_words]

        return ' '.join(song_token)

    # tokenizer without any token transformation
    def my_tokenizer(song):
        """Split `song` on single space characters and return the pieces."""
        return song.split(' ')

    # tokenizer that applies Porter stemming
    def my_tokenizer_stem(song):
        """Split `song` on single spaces and return the Porter stem of each piece."""
        porter = PorterStemmer()
        return list(map(porter.stem, song.split(' ')))

    # tokenizer that applies WordNet lemmatization
    def my_tokenizer_lemma(song):
        """Return the WordNet lemma of every non-empty space-separated token.

        Each token is POS-tagged first. The first letter of its tag selects the
        WordNet part of speech (N, V, R, J); any other tag falls back to noun.
        """
        words = list(filter(None, song.split(' ')))
        tag_to_wordnet = {
            'N': wordnet.NOUN,
            'V': wordnet.VERB,
            'R': wordnet.ADV,
            'J': wordnet.ADJ,
        }
        lemmatizer = WordNetLemmatizer()
        lemmas = []
        for word, tag in pos_tag(words):
            wordnet_pos = tag_to_wordnet.get(tag[0], wordnet.NOUN)
            lemmas.append(lemmatizer.lemmatize(word, wordnet_pos))
        return lemmas

In [None]:
if not LOAD_FINAL_LYRICS_DATAFRAME:
    # preprocess the songs: once keeping stop words (LSTM input), once without
    raw_lyrics = data_en['lyrics'].to_numpy()
    LSTM_lyrics = [my_preprocessor(song, remove_stopwords=False) for song in raw_lyrics]
    no_stopwords_lyrics = [my_preprocessor(song, remove_stopwords=True) for song in raw_lyrics]
    genres_int = [GENRE_TO_INT[genre] for genre in data_en['genre']]

    # only keep songs with more than 20 words
    # NOTE: the test used to be `len(song) > 20`, which counted characters of
    # the preprocessed string rather than words. Re-running this cell can
    # therefore select a different set of songs than the saved CSV contains.
    min_words = 20
    keep_song_LSTM = []
    keep_song_no_stopwords = []
    keep_genre = []
    for song, song_no_stopwords, genre in zip(LSTM_lyrics, no_stopwords_lyrics, genres_int):
        if len(song.split()) > min_words:
            keep_song_LSTM.append(song)
            keep_song_no_stopwords.append(song_no_stopwords)
            keep_genre.append(genre)

    df_final = pd.DataFrame.from_dict({'lyrics': keep_song_LSTM, 'no_stop_words_lyrics': keep_song_no_stopwords, 'genre': keep_genre})

### Balance dataset
Keep only 1778 songs per genre, totalling 17,780 songs

In [None]:
if not LOAD_FINAL_LYRICS_DATAFRAME:
    # Keep only 1778 songs per genre, totalling 17,780 songs

    def get_balanced(df, max_per_genre=1778, random_state=43):
        """Return a shuffled copy of `df` with at most `max_per_genre` rows per genre.

        Args:
            df: dataframe with a 'genre' column.
            max_per_genre: maximum number of songs kept for each genre
                (default 1778, the value used for the saved dataset).
            random_state: seed used for both shuffles (default 43).

        Returns:
            A new dataframe with a fresh 0..n-1 index. Genres with fewer than
            `max_per_genre` songs keep all of their songs.
        """
        # Shuffle first so that the songs kept for each genre are a random sample.
        df_mix = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
        # 0-based rank of each row inside its genre, in shuffled order; keeping
        # rank < max_per_genre keeps the first `max_per_genre` songs per genre.
        rank_in_genre = df_mix.groupby('genre').cumcount()
        df_balanced = df_mix[rank_in_genre < max_per_genre].reset_index(drop=True)
        # Shuffle again so the balanced set is not ordered by the selection pass.
        return df_balanced.sample(frac=1, random_state=random_state).reset_index(drop=True)

    df_balanced_final = get_balanced(df_final)

### Stem and lemmatize songs

In [None]:
if not LOAD_FINAL_LYRICS_DATAFRAME:
    # Stemmed and lemmatized variants are both derived from the stop-word-free lyrics.
    df_balanced_final['lyrics_stemmed'] = df_balanced_final['no_stop_words_lyrics'].map(
        lambda song: ' '.join(my_tokenizer_stem(song))
    )
    df_balanced_final['lyrics_lemmad'] = df_balanced_final['no_stop_words_lyrics'].map(
        lambda song: ' '.join(my_tokenizer_lemma(song))
    )
    # Persist the result so that later runs can skip the preprocessing cells.
    df_balanced_final.to_csv(root_path+"final_lyrics_dataframe.csv", index=False)
df_balanced_final

#### We split the data in training, validation and test sets
The English songs are split into 3 sets: 90% for training, 5% for validation and 5% for testing.

In [None]:
# First split: 90% training, 10% held out, stratified on the genre.
lyrics_train, lyrics_valid, labels_train, labels_valid = train_test_split(df_balanced_final, df_balanced_final['genre'], test_size=0.1, shuffle=True, random_state=43, stratify=df_balanced_final['genre'])
# Second split: the held-out 10% is halved into test and validation sets.
lyrics_test, lyrics_valid, labels_test, labels_valid = train_test_split(lyrics_valid, labels_valid, test_size=0.5, shuffle=True, random_state=43, stratify=labels_valid)

# Training set: one numpy array per text representation, plus the labels.
train_lyrics_preprocessed = lyrics_train['no_stop_words_lyrics'].to_numpy()
train_lyrics = lyrics_train['lyrics'].to_numpy()
train_lyrics_stemmed = lyrics_train['lyrics_stemmed'].to_numpy()
train_lyrics_lemmad = lyrics_train['lyrics_lemmad'].to_numpy()
train_labels = labels_train.to_numpy()

# Validation set.
valid_lyrics_preprocessed = lyrics_valid['no_stop_words_lyrics'].to_numpy()
valid_lyrics = lyrics_valid['lyrics'].to_numpy()
valid_lyrics_stemmed = lyrics_valid['lyrics_stemmed'].to_numpy()
valid_lyrics_lemmad = lyrics_valid['lyrics_lemmad'].to_numpy()
valid_labels = labels_valid.to_numpy()

# Test set.
test_lyrics_preprocessed = lyrics_test['no_stop_words_lyrics'].to_numpy()
test_lyrics = lyrics_test['lyrics'].to_numpy()
test_lyrics_stemmed = lyrics_test['lyrics_stemmed'].to_numpy()
test_lyrics_lemmad = lyrics_test['lyrics_lemmad'].to_numpy()
test_labels = labels_test.to_numpy()

print("Training set length:", len(lyrics_train))
print("Validation set length:", len(lyrics_valid))
print("Test set length:", len(lyrics_test))

# 4. Naïve Majority Model  <a class="anchor" id="naive-model"></a>
In this naïve majority model, we guess that all the songs have the genre 'Rock', which is the genre that has the majority of songs. This is a first baseline model, that we can use to compare the results of logistic regression, naive bayes, ...

In [None]:
print(classification_report(labels_train, [2]*len(lyrics_train), target_names=list(GENRE_TO_INT.keys())))

The accuracy of our baseline model is **10%** (the dataset is balanced over 10 genres).

# 5. Logistic Regression  <a class="anchor" id="log-reg"></a>
We test different forms of the vectorised data: stemmed, lemmatized and no token transformation. The step to vectorize the data is quite long so we decide to test different hyperparameters of a model AFTER the vectorization is performed.


#### Best model with k-fold cross validation
Best accuracy with the following model : **41.0%**   
with: stemming, TD-IDF vectorization, regularization strength {C: 2.2, max_df: 0.5, max_features: 25000, ngram_range: bigram, norm='l2'}.  
On test set: **40.38%**

### Grid search

In [None]:
# Pipeline: text vectorizer followed by a logistic regression classifier.
# Set GRID_SEARCH_ON to True to actually run the (slow) search below.
GRID_SEARCH_ON = False
pipeline = Pipeline(steps=[
    # ('vect', CountVectorizer()),
    ('vect', TfidfVectorizer()),
    ('clf', LogisticRegression(multi_class='auto', solver='lbfgs', penalty='l2', max_iter=100)),
])

# Candidate values, keyed by "<pipeline step>__<parameter name>".
parameters = dict(
    vect__max_df=[0.8],
    vect__max_features=[210000],
    vect__ngram_range=[(1, 2)],
    vect__norm=['l2'],
    clf__C=[2.5, 2.6, 2.7, 2.8, 2.9],
)

# 5-fold cross-validated search over the vectorizer and classifier parameters
if GRID_SEARCH_ON:
    grid_search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1, verbose=1)
    start = time.time()
    grid_search.fit(train_lyrics_lemmad, train_labels)
    end = time.time()
    print("done in %0.3fs" % (end - start))
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    # Report only the parameters that were part of the search, alphabetically.
    for param_name in sorted(parameters):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

#### Results grid search 
The different hyperparameters and ranges we are testing:
- **tokenizer**: [my_tokenizer, my_tokenizer_stem, my_tokenizer_lemma].
- **max_df**: range(0.3, 1)
- **max_features**: range(10000, 200000)
- **ngram_range**: unigrams, bigrams. bigram models tend to have the best results.
- **C**: range(0.01, 3). The regularization strength is the most important parameter to finetune. A value around 0.1 increases the accuracy up to 10% compared to a bad choice of strength. When TF-IDF is on, the strength needs to be around 2.
- **TFIDF**: on or off (depends on which vectorizer we use). When turned on, the accuracy is higher.
- **norm**: when TFIDF=on defines the unit norm of each row.

There are a lot of different possible combinations. Here is the methodology for grid search.
0. preprocessing = none
1. Try out 3 different values for each hyperparameter (min, max, middle) and see which parameters modify the most the accuracy. For example the tokenizer doesn't change the accuracy that much, but the regularization strength affects a lot the accuracy.
2. For each hyperparameter that doesn't have a big impact, choose the value that gives the highest accuracy. If there is no trend (for example the hyperparameter sometimes gives better results with a certain value and other times a worse result), take the value that has the smallest computation time.
3. The value of regularization strength is the most important hyperparameter to determine. A value around 0.1 is a good choice.
4. Little by little, trim the ranges of the hyperparameter choices, taking each time the one that affects the most the accuracy.
5. Repeat from 0 for preprocessing = stemming, lemmatization
6. Repeat from 0 for TFIDF = on

**TFIDF=off**  
Best with no tokenization modification: **37.3%** {C: 0,07, max_df: 0,7, max_features: 100000, ngram_range: bigram}  
Best with stemming: **38.9%** {C: 0.1, max_df: 0.7, max_features: 150000, ngram_range: bigram}  
Best with lemmatization: **38.6%** {C: 0.14, max_df: 0.7, max_features: 150000, ngram_range: bigram}

**TFIDF=on**  
Best with no tokenization modification:  **40.5%** {C: 2.6, max_df: 0.5, max_features: 210000, ngram_range: bigram, norm='l2'}  
Best with stemming: **41.0%** {C: 2.2, max_df: 0.5, max_features: 25000, ngram_range: bigram, norm='l2'}  
Best with lemmatization:  **40.1%** {C: 2.8, max_df: 0.8, max_features: 210000, ngram_range: bigram, norm='l2'}

In [None]:
# grid_search_df = pd.DataFrame.from_dict(grid_search.cv_results_)
# grid_search_df.to_csv(root_path+"result_reglog_tfidf_preprocessed_6.csv", sep=';', decimal=',')

#### We compute the accuracy of the best model on the validation set

With balanced dataset  
Accuracy on training set: 87.61%  
Accuracy on validation set: 40.38%

In [None]:
# #train_lyrics_preprocessed, train_lyrics_stemmed, train_lyrics_lemmad, train_labels
# vectorizer = TfidfVectorizer(max_df=0.8, max_features=210000, ngram_range=(1, 2), norm='l2')
# classifier = LogisticRegression(multi_class='auto', solver='lbfgs', penalty='l2', C=2.8, max_iter=1000)
# lyrics_train_vec = vectorizer.fit_transform(train_lyrics_preprocessed)
# lyrics_valid_vec = vectorizer.transform(test_lyrics_preprocessed)
# classifier.fit(lyrics_train_vec, train_labels)
# print("Accuracy on training set:", accuracy_score(train_labels, classifier.predict(lyrics_train_vec)))
# print("Accuracy on test set:", accuracy_score(test_labels, classifier.predict(lyrics_valid_vec)))

# 6. Naïve Bayes Model  <a class="anchor" id="naive-bayes"></a>
We test different forms of the vectorised data: stemmed, lemmatized and no token transformation. The step to vectorize the data is quite long so we decide to test different hyperparameters of a model AFTER the vectorization is performed.

#### Best model with k-fold cross validation
Best accuracy with TFIDF and lemmatization on training set: **80.82%**  
On validation set: **39.7%**.

In [None]:
# Pipeline: text vectorizer followed by a multinomial naive Bayes classifier.
# Set GRID_SEARCH_ON to True to actually run the (slow) search below.
GRID_SEARCH_ON = False
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    #('vect', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

# Candidate values, keyed by "<pipeline step>__<parameter name>".
parameters = dict(
    vect__max_df=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    vect__max_features=[100000, 150000, 200000],
    vect__ngram_range=[(1, 1), (1, 2)],
    #vect__norm=['l2'], #not a parameter for CountVectorizer()
    clf__fit_prior=[True],
    clf__alpha=[1],
)

# 5-fold cross-validated search over the vectorizer and classifier parameters
if GRID_SEARCH_ON:
    grid_search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1, verbose=1)
    start = time.time()
    grid_search.fit(train_lyrics_lemmad, train_labels)
    end = time.time()
    print("done in %0.3fs" % (end - start))
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    # Report only the parameters that were part of the search, alphabetically.
    for param_name in sorted(parameters):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

#### Results grid search 
The different hyperparameters and ranges we are testing:
- **tokenizer**: [my_tokenizer, my_tokenizer_stem, my_tokenizer_lemma]. 
- **max_df**: range(0.3, 1). This value is pretty much 0.2 across the board for all the following experiments.
- **max_features**: range(10000, 200000). For TFIDF = off, this value is in the upper end of the range. For TFIDF = on, this value is typically much lower (see results)
- **ngram_range**: unigrams, unigrams and bigrams, bigrams. (1,2) is the best performing parameter in the following tests.
- **alpha**: float - controls smoothing; 0 is no smoothing, 1 is Laplace smoothing. 
- **fit_prior**: bool - whether to learn class priors. All test run best with TRUE.
- **TFIDF**: on or off (depends on which vectorizer we use).
- **norm**: when TFIDF=on defines the unit norm of each row.

There are a lot of different possible combinations. Here is the methodology for grid search.
0. preprocessing = none
1. Try out 3 different values for each hyperparameter (min, max, middle) and see which parameters modify the most the accuracy. For example the tokenizer doesn't change the accuracy that much, but the regularization strength affects a lot the accuracy.
2. For each hyperparameter that doesn't have a big impact, choose the value that gives the highest accuracy. If there is no trend (for example the hyperparameter sometimes gives better results with a certain value and other times a worse result), take the value that has the smallest computation time.
3. The value of regularization strength is the most important hyperparameter to determine. A value around 0.1 is a good choice.
4. Little by little, trim the ranges of the hyperparameter choices, taking each time the one that affects the most the accuracy.
5. Repeat from 0 for preprocessing = stemming, lemmatization
6. Repeat from 0 for TFIDF = on

Initial parameters run as: parameters = { 
    'vect__max_df': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'vect__max_features': [10000,100000,200000],
    'vect__ngram_range': [(1,1),(1,2),(2,2)],
    'vect__norm': ['l2','l1'], #not a parameter for CountVectorizer()
    'clf__fit_prior': [True,False],
    'clf__alpha': [0,0.5,1],
From here we refine our selection criteria per tokenized set.

**TFIDF=off**  
Best with no tokenization modification: **39.5%** {'clf__alpha': 1, 'clf__fit_prior': True, 'vect__max_df': 0.2, 'vect__max_features': 150000, 'vect__ngram_range': (1, 2)}  
Best with stemming: **38.8%**  {clf__alpha: 1,	clf__fit_prior: True, vect__max_df: 0.2, vect__max_features: 200000, vect__ngram_range: (1, 2)}  
Best with lemmatization: **39.1%** {'clf__alpha': 1, 'clf__fit_prior': True, 'vect__max_df': 0.3, 'vect__max_features': 150000, 'vect__ngram_range': (1, 2)}  

**TFIDF=on**  
Best with no tokenization modification: **39.6%** {'clf__alpha': 0.1, 'clf__fit_prior': True, 'vect__max_df': 0.3, 'vect__max_features': 25000, 'vect__ngram_range': (1, 2), 'vect__norm': 'l2'}  
Best with stemming: **39.4%** {'clf__alpha': 0.5, 'clf__fit_prior': True, 'vect__max_df': 0.6, 'vect__max_features': 25000, 'vect__ngram_range': (1, 2), 'vect__norm': 'l2'}  
Best with lemmatization: **39.7%** {'clf__alpha': 0.3, 'clf__fit_prior': True, 'vect__max_df': 0.3, 'vect__max_features': 25000, 'vect__ngram_range': (1, 2), 'vect__norm': 'l2'}

In [None]:
# grid_search_df = pd.DataFrame.from_dict(grid_search.cv_results_)
# grid_search_df.to_csv(root_path+"result_tuned_lemmad.csv", sep=';', decimal=',')

#### We compute the accuracy of the best model on the validation set

with balanced dataset  
Accuracy on training set: 80.37%  
Accuracy on test set: 40.26%

In [None]:
# #train_lyrics_preprocessed, train_lyrics_stemmed, train_lyrics_lemmad, train_labels
# vectorizer = TfidfVectorizer(max_df=0.2, max_features=7500, ngram_range=(1, 2), norm='l2')#add best parameters
# classifier = MultinomialNB(alpha=0.1,fit_prior=True)#add best parameters
# lyrics_train_vec = vectorizer.fit_transform(train_lyrics_stemmed)
# lyrics_test_vec = vectorizer.transform(test_lyrics_stemmed)
# classifier.fit(lyrics_train_vec, train_labels)
# print("Accuracy on training set:", accuracy_score(train_labels, classifier.predict(lyrics_train_vec)))
# print("Accuracy on test set:", accuracy_score(test_labels, classifier.predict(lyrics_test_vec)))

# 7. Support Vector Machine  <a class="anchor" id="SVM"></a>
We test different forms of the vectorised data: stemmed, lemmatized and no token transformation. The step to vectorize the data is quite long so we decide to test different hyperparameters of a model AFTER the vectorization is performed.

SVMs are 2 class classifiers. With the LinearSVC there is multiclass support according to a one-vs-the-rest scheme.

#### Best model with k-fold cross validation
Best accuracy: with TFIDF and lemmatization, for linear kernel.  
Accuracy on training set: **98.53%**  
Accuracy on validation set: **40.7%**

In [None]:
[i/10 for i in range(1, 10)]
# [i*1000 for i in range(150, 250, 30)]

In [None]:
# Define a pipeline combining a text feature extractor with a linear SVM.
# Set GRID_SEARCH_ON to True to actually run the (slow) search below.
GRID_SEARCH_ON = False
pipeline = Pipeline([
    # ('vect', CountVectorizer()),
    ('vect', TfidfVectorizer()),    
    ('clf', LinearSVC()),
    # ('clf', SVC()),    
])

# Candidate values, keyed by "<pipeline step>__<parameter name>".
parameters = { 
    'vect__max_df': [0.5],
    # 'vect__max_features': [600000, 700000, 800000,],
    'vect__ngram_range': [(1,2)],
    'clf__C': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    # 'clf__kernel': ['poly'], # 'rbf', 'sigmoid'
    # 'clf__gamma': ['scale', 'auto']    
}

# find the best parameters for both the feature extraction and the classifier
if GRID_SEARCH_ON:
    grid_search = GridSearchCV(pipeline, parameters, cv=2, n_jobs=-1, verbose=1)
    start = time.time()
    # Fit on the stemmed training lyrics. This used to reference the undefined
    # name `lyrics_stemmed`, which raised NameError as soon as the search ran.
    grid_search.fit(train_lyrics_stemmed, train_labels)
    end = time.time()
    print("done in %0.3fs" % (end - start))
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

In [None]:
# grid_search_df = pd.DataFrame.from_dict(grid_search.cv_results_)
# grid_search_df.to_csv(root_path+"result_SVC_tfidf_stemmed_1.csv", sep=';', decimal=',')

#### Results grid search 
The different hyperparameters and ranges we are testing:
- **tokenizer**: [my_tokenizer, my_tokenizer_stem, my_tokenizer_lemma]. 
- **max_df**: [0.2, 0.5, 0.8]
- **max_features**: [10000, 50000, 100000, 150000, 200000, 250000]
- **ngram_range**: unigrams, unigrams and bigrams.
- **C**: [0.5, 1, 2].
- **TFIDF**: on or off (depends on which vectorizer we use).
- **norm**: when TFIDF=on defines the unit norm of each row.

Initial parameters run as: parameters = { 
    'vect__max_df': [0.2, 0.5, 0.8],
    'vect__max_features': [10000, 50000, 100000, 200000],
    'vect__ngram_range': [(1, 1), (1,2)],
    'clf__C': [0.5, 1, 2],
From here we refine our selection criteria per tokenized set.

**TFIDF=off**  
Best with no tokenization modification: **39.1%** {max_df=0.5, ngram: bigram, C=0.0009} 
Best with stemming: **39,7%** {max_df=0.5, ngram: bigram, C=0.0009}  
Best with lemmatization: **39.2%** {max_df=0.5, ngram: bigram, C=0.0009}  

**TFIDF=on**  
Best with no tokenization modification: **40.3%** {max_df=0.4, ngram: bigram, C=0.3}  
Best with stemming: **40.7%** {max_df=0.3, ngram: bigram, C=0.4}  
Best with lemmatization: **40.4%** {max_df=0.6, ngram: bigram, C=0.305}  

With balanced data:  
accuracy on training set: 98.53%,
on test set: 41.5%    
40,26

In [None]:
# start = time.time()
# vectorizer = TfidfVectorizer(max_df=0.6, ngram_range=(1, 2), norm='l2')#add best parameters
# classifier = LinearSVC(C=0.305) #add best parameters
# lyrics_train_vec = vectorizer.fit_transform(train_lyrics)
# lyrics_test_vec = vectorizer.transform(test_lyrics)
# classifier.fit(lyrics_train_vec, train_labels)
# print("Accuracy on training set:", accuracy_score(train_labels, classifier.predict(lyrics_train_vec)))
# print("Accuracy on test set:", accuracy_score(test_labels, classifier.predict(lyrics_test_vec)))
# print("Done in {:03.2f} seconds".format(time.time()-start))

# 8. Sequential model - LSTM  <a class="anchor" id="LSTM"></a>

## Parameters:


In [None]:
#train_lyrics_preprocessed, train_lyrics_stemmed, train_lyrics_lemmad, train_labels

# Settings shared by the word-embedding and LSTM cells.
params = dict(
    TRAIN_SIZE=len(train_labels),
    VALID_SIZE=len(valid_labels),
    TEST_SIZE=len(test_labels),

    # Hyperparameters
    WORD_VEC_SIZE=100,  # Size of the word embedding vector WARNING: if WORD_VEC_SIZE!=100 the word embedding has to be retrained
    USE_WORD2VEC='none',  # 'none', 'cbow', 'skip_gram'
    VOCAB_SIZE=30000,  # number of words in vocabulary, -1 if use all the vocab
    MAX_WORDS=200,  # max number of words in song
    PADDING_START=False,
)
# Show the size of the three splits.
for size_key in ('TRAIN_SIZE', 'VALID_SIZE', 'TEST_SIZE'):
    print(params[size_key])

## 8.1 Word embedding

We create word embeddings with Word2Vec, from the gensim package. There are two possible models: continuous bag of words and Skip Gram. The models are trained on the whole data (the 215,824 preprocessed english songs). To avoid re-computing the models, they are saved in the files `model_CBOW_215824_en_songs.model` and `model_Skip_Gram_215824_en_songs.model`.
Helpful resources : 
- https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec
- https://www.geeksforgeeks.org/python-word-embedding-using-word2vec/
- https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial#Training-the-model

**WARNING**: the saved models are computed with the following parameters: `gensim.models.Word2Vec(data, min_count=1, size=WORD_VEC_SIZE, window=5, sg=0, workers=cores-1)` and `gensim.models.Word2Vec(data, min_count=1, size=WORD_VEC_SIZE, window=10, sg=1, workers=cores-1)`. If `WORD_VEC_SIZE` is **not** equal to 100, the models have to be retrained.

In [None]:
def get_word2vec_models(params):
    """Return the (CBOW, Skip-Gram) Word2Vec models.

    The saved models are loaded from `root_path` when
    ``params['WORD_VEC_SIZE'] == 100``. Otherwise both models are retrained on
    all the lyrics in `data_en` (preprocessed with stop words kept).

    Args:
        params: settings dict; only 'WORD_VEC_SIZE' is read.

    Returns:
        Tuple ``(model_CBOW, model_Skip_Gram)``.
    """
    if params['WORD_VEC_SIZE'] == 100:
        model_CBOW = Word2Vec.load(root_path+'model_CBOW_215824_en_songs_.model')
        model_Skip_Gram = Word2Vec.load(root_path+'model_Skip_Gram_215824_en_songs_.model')
        return model_CBOW, model_Skip_Gram

    print("Computing word2vec")
    all_lyrics = [my_preprocessor(song, remove_stopwords=False) for song in data_en['lyrics'].to_numpy()]
    data = [song.split() for song in all_lyrics]
    # Leave one core free, but never ask for fewer than one worker thread
    # (`cores - 1` is 0 on a single-core machine).
    workers = max(1, cores - 1)
    # Create CBOW model with gensim https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec
    model_CBOW = gensim.models.Word2Vec(data, min_count=1, size=params['WORD_VEC_SIZE'], window=5, sg=0, workers=workers)
    # Create Skip Gram model
    model_Skip_Gram = gensim.models.Word2Vec(data, min_count=1, size=params['WORD_VEC_SIZE'], window=10, sg=1, workers=workers)
    # Save models for later
    # model_CBOW.save(root_path+'model_CBOW_215824_en_songs__.model')
    # model_Skip_Gram.save(root_path+'model_Skip_Gram_215824_en_songs__.model')
    print("Done")
    return model_CBOW, model_Skip_Gram

model_CBOW, model_Skip_Gram = get_word2vec_models(params)

# Both models are expected to have the same vocabulary size.
assert len(model_CBOW.wv.vocab)==len(model_Skip_Gram.wv.vocab)
print("There are", len(model_CBOW.wv.vocab), "words in the vocabulary.")
# Checking that the learning makes sense
print("Cosine similarity between 'woman' and 'girl' - CBOW : ", model_CBOW.wv.similarity('woman', 'girl'), "'woman' and 'pasta' - CBOW : ", model_CBOW.wv.similarity('woman', 'pasta'))
# This line reports the Skip-Gram model; it was previously mislabelled "CBOW".
print("Cosine similarity between 'woman' and 'girl' - Skip Gram : ", model_Skip_Gram.wv.similarity('woman', 'girl'), "'woman' and 'pasta' - Skip Gram : ", model_Skip_Gram.wv.similarity('woman', 'pasta'))
# Analogy check "king - man + woman". The previous query listed 'woman' as
# both positive and negative, so the two cancelled out and it was not an
# analogy test at all.
print("{}: {:.4f}".format(*model_CBOW.wv.most_similar(positive=['woman', 'king'], negative=['man'])[0]))
print("{}: {:.4f}".format(*model_Skip_Gram.wv.most_similar(positive=['woman', 'king'], negative=['man'])[0]))

### Keeping most common words
We want to keep the `VOCAB_SIZE` most common words (for example **30,000** like in the paper). There are **247,779** embedded words in the learnt model, some are probably misspelt.
`WORD_TO_INT` associates each word in the vocabulary with a unique value, starting from 1. The value 0 is reserved for the padding. `MOST_FREQUENT_WORDS` lists the most frequent words in the songs.

In [None]:
# Load word occurance dictionary

def get_most_frequent_words(model, params):
    """Rank the words by number of occurrences and build word -> integer maps.

    The occurrence counts are read from ``word_occurrences.npy`` in `root_path`
    when that file exists. Otherwise they are recomputed from `data_en` (lyrics
    preprocessed with stop words kept) and saved to that file.

    Args:
        model: unused; kept so that existing callers keep working.
        params: settings dict; 'VOCAB_SIZE' is the number of most frequent
            words to keep, or -1 to keep them all.

    Returns:
        Tuple ``(MOST_FREQUENT_WORDS, WORD_TO_INT, WORD_TO_INT_ALL)``:
        the kept words by decreasing frequency, their 1-based ranks, and the
        1-based ranks of every counted word.
    """
    import os

    occurrences_path = root_path + 'word_occurrences.npy'
    if os.path.exists(occurrences_path):
        # NOTE(review): allow_pickle unpickles arbitrary objects; only load
        # files produced by this notebook.
        word_occurrences = np.load(occurrences_path, allow_pickle=True).item()
    else:
        # This branch used to be unreachable behind a hard-coded flag; it now
        # runs whenever the cache file is missing.
        counts = collections.Counter()
        for song in data_en['lyrics'].to_numpy():
            counts.update(my_preprocessor(song, remove_stopwords=False).split())
        # Saved as a plain dict so the file format matches existing caches.
        word_occurrences = dict(counts)
        np.save(occurrences_path, word_occurrences)

    # Most frequent first; the sort is stable, so ties keep the dict order.
    ranked = sorted(word_occurrences.items(), key=lambda item: item[1], reverse=True)
    wo_sorted_list = [word for word, _ in ranked]
    MOST_FREQUENT_WORDS = wo_sorted_list
    if params['VOCAB_SIZE'] != -1:
        MOST_FREQUENT_WORDS = wo_sorted_list[:params['VOCAB_SIZE']]
    # Ranks start at 1 (the +1 offset on the enumerate index).
    WORD_TO_INT = {word: i + 1 for i, word in enumerate(MOST_FREQUENT_WORDS)}
    WORD_TO_INT_ALL = {word: i + 1 for i, word in enumerate(wo_sorted_list)}
    return MOST_FREQUENT_WORDS, WORD_TO_INT, WORD_TO_INT_ALL

MOST_FREQUENT_WORDS, WORD_TO_INT, WORD_TO_INT_ALL = get_most_frequent_words(model_CBOW, params)

### Song embedding
We convert each song into a vector of integers.

If Word2Vec is not used, we simply give a different integer to each word in the vocabulary. The integers range from 1 to `VOCAB_SIZE`, with `VOCAB_SIZE+1` reserved for out-of-vocabulary words (*UNK*) and 0 reserved for padding.

If Word2Vec is used, words outside of the vocabulary (*UNK*) get the index `len(model_CBOW.wv.vocab)`, and `len(model_CBOW.wv.vocab)+1` is left for padding.

In [None]:
def embed_songs_idx_balanced(songs, model):
    """Convert each song into an array of Word2Vec vocabulary indices.

    A word keeps its position in ``model.wv.index2word`` only when it is
    also a key of the module-level ``WORD_TO_INT``; every other word
    (including words missing from the Word2Vec vocabulary) is mapped to the
    UNK index ``len(model.wv.index2word)``.

    Parameters
    ----------
    songs : iterable of str, whitespace-separated lyrics.
    model : object exposing ``model.wv.index2word`` (list of tokens).

    Returns
    -------
    numpy.ndarray of dtype object with one 1-D int array per song (songs
    may have different lengths).
    """
    word2index = {token: token_index for token_index, token in enumerate(model.wv.index2word)}
    unk_idx = len(word2index)

    # Build each song in one go instead of growing an array with np.append.
    rows = [
        np.array(
            [word2index.get(word, unk_idx) if word in WORD_TO_INT else unk_idx
             for word in song.split()],
            dtype=int,
        )
        for song in songs
    ]

    # Fill an explicit object array: np.array() on ragged rows is rejected by
    # recent NumPy releases, and element-wise assignment keeps one row per song.
    embedded_songs_ = np.empty(len(rows), dtype=object)
    for i, row in enumerate(rows):
        embedded_songs_[i] = row
    return embedded_songs_
# Select the plain integer encoding ('none' = no Word2Vec) for the calls below.
params['USE_WORD2VEC']='none'

def get_song_idx_embedding(params, train_lyrics_, valid_lyrics_, test_lyrics_):
    """Return the train/valid/test songs encoded as integer index sequences.

    ``params['USE_WORD2VEC']`` selects the encoding:

    * ``'cbow'`` / ``'skip_gram'``: the pre-computed index arrays are loaded
      from ``root_path`` (``CBOW_...`` / ``SG_...`` files). The lyrics
      arguments are NOT used in this case.
      NOTE(review): these files look like cached outputs of
      ``embed_songs_idx_balanced`` -- confirm before regenerating them.
    * anything else: each word is looked up in the module-level
      ``WORD_TO_INT``; unknown words get the UNK index.

    Returns
    -------
    (train, valid, test) : loaded arrays, or lists of lists of int.
    """
    file_prefix = {'cbow': 'CBOW', 'skip_gram': 'SG'}.get(params['USE_WORD2VEC'])
    if file_prefix is not None:
        embed_song_idxs_train, embed_song_idxs_valid, embed_song_idxs_test = (
            np.load(root_path + file_prefix + '_embed_song_idxs_' + split + '.npy', allow_pickle=True)
            for split in ('train', 'valid', 'test')
        )
        return embed_song_idxs_train, embed_song_idxs_valid, embed_song_idxs_test

    # UNK sits right after the last vocabulary index. With VOCAB_SIZE == -1
    # (whole vocabulary) VOCAB_SIZE+1 would be 0 and collide with padding,
    # so fall back to the actual vocabulary length.
    vocab_size = params['VOCAB_SIZE']
    if vocab_size == -1:
        vocab_size = len(WORD_TO_INT)
    unk_idx = vocab_size + 1

    def encode(lyrics):
        return [[WORD_TO_INT.get(w, unk_idx) for w in song.split()] for song in lyrics]

    return encode(train_lyrics_), encode(valid_lyrics_), encode(test_lyrics_)

# Encode the train / validation / test lyrics as integer index sequences.
embed_song_idxs_train, embed_song_idxs_valid, embed_song_idxs_test = get_song_idx_embedding(params, train_lyrics, valid_lyrics, test_lyrics)

### Pad data for songs to be same length
Some songs are shorter or longer than others. We truncate the long songs and pad the shorter ones to `MAX_WORDS` tokens. The padding index is 0 when Word2Vec is not used and `len(model_CBOW.wv.vocab)+1` otherwise.

In [None]:
# Distribution of the number of tokens per training song: histogram first,
# then the summary statistics as the cell output.
song_lengths = list(map(len, embed_song_idxs_train))
pd.Series(song_lengths).hist()
plt.show()
pd.Series(song_lengths).describe()

In [None]:
def pad_line(song, params):
    '''Return `song` truncated or padded to exactly params['MAX_WORDS'] entries.

    The filler value is 0 when params['USE_WORD2VEC'] is 'none', otherwise
    len(model_CBOW.wv.vocab)+1 (read from the module-level CBOW model).
    Padding goes in front of the song when params['PADDING_START'] is truthy
    and after it otherwise.
    '''
    max_words = params['MAX_WORDS']
    padding_int = 0 if params['USE_WORD2VEC'] == 'none' else len(model_CBOW.wv.vocab)+1

    size = min(max_words, len(song))
    kept = song[:size]
    filler = np.full((max_words - size,), padding_int)

    pieces = (filler, kept) if params['PADDING_START'] else (kept, filler)
    return np.concatenate(pieces)

def get_padded_data(params, embed_song_idxs_train_, embed_song_idxs_valid_, embed_song_idxs_test_):
    """Pad/truncate every song of the three splits into fixed-size int matrices.

    Each returned matrix has params['MAX_WORDS'] columns and as many rows as
    the matching params['TRAIN_SIZE'] / ['VALID_SIZE'] / ['TEST_SIZE'] entry;
    row i holds pad_line(song_i, params). Rows without a song stay at 0.
    """
    def _pad_split(songs, n_rows):
        # One zero-initialised matrix per split, filled row by row.
        padded = np.zeros((n_rows, params['MAX_WORDS']), dtype = int)
        for row, song in enumerate(songs):
            padded[row,:] = pad_line(song, params)
        return padded

    train_data_padded = _pad_split(embed_song_idxs_train_, params['TRAIN_SIZE'])
    valid_data_padded = _pad_split(embed_song_idxs_valid_, params['VALID_SIZE'])
    test_data_padded = _pad_split(embed_song_idxs_test_, params['TEST_SIZE'])
    return train_data_padded, valid_data_padded, test_data_padded

# Pad/truncate every split to params['MAX_WORDS'] tokens per song.
train_data_padded, valid_data_padded, test_data_padded = get_padded_data(params, embed_song_idxs_train, embed_song_idxs_valid, embed_song_idxs_test)

In [None]:
# Materialise the padded training songs as dense word vectors:
# train_x[i, j] is the CBOW vector of the j-th token of song i.
# NOTE(review): the original cell read an undefined `_data_padded`; it is
# taken to mean `train_data_padded` because the result is named train_x --
# confirm.
dict_length = len(model_CBOW.wv.index2word)

# One shared random vector for every index outside the Word2Vec vocabulary
# (UNK and padding). Defined before the loop that uses it.
padding_word = np.array([np.random.rand((100))])

# Pre-allocate the result instead of growing it with np.append/np.concatenate,
# which copied the whole array once per song.
train_x = np.empty((len(train_data_padded), train_data_padded.shape[1], 100))
for i, song in enumerate(train_data_padded):
    if i%100 == 0:
        print(i)
    for j, word in enumerate(song):
        if word < dict_length:
            train_x[i, j] = model_CBOW.wv[model_CBOW.wv.index2word[word]]
        else:
            train_x[i, j] = padding_word[0]
train_x.shape

## LSTM Model

In [None]:
# Cell output: show the word-vector container of the CBOW model.
model_CBOW.wv

## Create PyTorch dataset

In [None]:
class SongsDataset(data.Dataset):
    """Pairs each encoded song with its label and serves them as tensors.

    `dataset[i]` and `labels[i]` are converted to int64 tensors and moved to
    the module-level `device` every time an item is requested.
    """

    def __init__(self, dataset, labels):
        self.dataset, self.labels = dataset, labels

    def __len__(self):
        # The number of labels defines the dataset size.
        return len(self.labels)

    def __getitem__(self, index):
        sample = torch.tensor(self.dataset[index], dtype=torch.long).to(device)
        target = torch.tensor(self.labels[index], dtype=torch.long).to(device)
        return sample, target

In [None]:
class GenreLSTM(nn.Module):
    """Embedding -> (bi)LSTM -> dropout -> genre scores.

    Two read-out heads are available:

    * ``linear_last=False``: max-pool the LSTM outputs over the sequence
      axis, then apply ``fc``.
    * ``linear_last=True``: apply ``fc`` at every time step, flatten and
      apply ``lasthidden``. The input must then contain exactly
      ``song_size`` tokens per song.

    Parameters
    ----------
    n_vocab, n_embed : size of the embedding table (ignored in favour of the
        shape of ``word_embeddings`` when pre-trained vectors are given).
    n_hidden, n_layers, bidirectional : LSTM configuration.
    output_size : number of genres.
    batch_size : stored only; ``forward`` uses the actual batch size.
    word_embeddings : numpy array of pre-trained vectors, used (and kept
        trainable) when ``use_word2vec != 'none'``.
    drop_p : dropout probability.
    use_word2vec : ``'none'`` for a randomly initialised embedding.
    song_size : number of tokens per song (needed by ``lasthidden``).
    linear_last : selects the read-out head, see above.
    """

    def __init__(self, n_vocab, n_embed, n_hidden, n_layers, bidirectional, output_size, batch_size, word_embeddings, drop_p, use_word2vec, song_size, linear_last):
        super(GenreLSTM, self).__init__()
        self.n_hidden = n_hidden
        self.batch_size = batch_size
        self.n_directions = 2 if bidirectional else 1
        self.n_layers = n_layers
        self.song_size = song_size
        self.output_size = output_size
        self.linear_last = linear_last

        self.word_embeddings = nn.Embedding(n_vocab, n_embed)
        if use_word2vec != 'none':
            # Start from the pre-trained vectors and fine-tune them. The
            # Parameter itself carries requires_grad; no extra flag is needed.
            self.word_embeddings.weight = nn.Parameter(torch.from_numpy(word_embeddings), requires_grad=True)

        # nn.LSTM applies dropout only between stacked layers; passing a
        # non-zero value with a single layer has no effect besides a warning.
        lstm_dropout = drop_p if n_layers > 1 else 0.0
        self.lstm = nn.LSTM(n_embed, n_hidden, n_layers, batch_first=True, bidirectional=bidirectional, dropout=lstm_dropout)
        self.dropout = nn.Dropout(drop_p)
        self.maxpool = nn.AdaptiveMaxPool2d((1, self.n_directions*self.n_hidden))
        self.fc = nn.Linear(self.n_directions*n_hidden, output_size)
        self.lasthidden = nn.Linear(song_size*output_size, output_size)

    def forward(self, sentence):
        """Return unnormalised genre scores of shape (batch_size, output_size)."""
        batch_size = sentence.size(0)
        # (batch_size, seq_length)
        out = self.word_embeddings(sentence)
        # (batch_size, seq_length, n_embed)
        out, _ = self.lstm(out)
        # (batch_size, seq_length, n_directions*n_hidden)
        out = self.dropout(out)
        if not self.linear_last:
            # Max over the sequence axis.
            out = self.maxpool(out)
            out = out.squeeze(1)
            # (batch_size, n_directions*n_hidden)
        out = self.fc(out)
        # (batch_size, n_output) or, with linear_last, (batch_size, seq_length, n_output)
        if self.linear_last:
            out = out.view(batch_size, -1)
            out = self.lasthidden(out)
        return out

In [None]:
def _run_epoch(model, loader, loss_function, optimizer=None):
    """Run one pass over `loader`; train when `optimizer` is given, else evaluate.

    Returns (mean loss per sample, accuracy) as Python floats.
    """
    training = optimizer is not None
    # Dropout must be active while training and disabled while evaluating.
    model.train(training)

    n_correct = 0
    n_samples = 0
    loss_sum = 0.0
    with torch.set_grad_enabled(training):
        for local_batch, local_labels in loader:
            # Transfer to GPU
            local_batch, local_labels = local_batch.to(device), local_labels.to(device)

            if training:
                model.zero_grad()
            genre_scores = model(local_batch)
            loss = loss_function(genre_scores, local_labels)
            if training:
                loss.backward()
                optimizer.step()

            predicted = genre_scores.argmax(dim=1)
            batch_len = len(local_labels)
            n_correct += (predicted == local_labels).sum().item()
            n_samples += batch_len
            # CrossEntropyLoss returns the batch mean: weight it by the batch
            # size so that dividing by n_samples gives a true per-sample mean.
            loss_sum += loss.item() * batch_len

    return loss_sum / n_samples, n_correct / n_samples


def run_LSTM(p, word_embedding, train_loader, valid_loader):
    """Train a GenreLSTM and keep the checkpoint with the best validation accuracy.

    Parameters
    ----------
    p : dict of hyper-parameters (VOCAB_SIZE, WORD_VEC_SIZE, N_HIDDEN,
        N_LAYERS, BIDIRECTIONAL, N_GENRES, BATCH_SIZE, DROPOUT, USE_WORD2VEC,
        MAX_WORDS, LINEAR_LAST, N_EPOCHS).
        NOTE(review): p['LR'] is not used; Adam runs with lr=0.0005.
    word_embedding : pre-trained vectors forwarded to GenreLSTM.
    train_loader, valid_loader : iterables of (batch, labels).

    Returns
    -------
    (train_loss_, valid_loss_, train_acc_, valid_acc_, elapsed_seconds);
    the four lists hold one float per epoch.

    Side effect: writes the best state dict to "models/bidirectional.pth".
    """
    import os

    model = GenreLSTM(p['VOCAB_SIZE']+2, p['WORD_VEC_SIZE'], p['N_HIDDEN'], p['N_LAYERS'], p['BIDIRECTIONAL'], p['N_GENRES'], p['BATCH_SIZE'], word_embedding, p['DROPOUT'], p['USE_WORD2VEC'], p['MAX_WORDS'], p['LINEAR_LAST'])
    model = model.to(device)
    model.double()
    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.0005)

    checkpoint_path = "models/bidirectional.pth"
    # torch.save does not create missing directories.
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

    start = time.time()
    train_loss_ = []
    valid_loss_ = []
    train_acc_ = []
    valid_acc_ = []
    best_valid_acc = 0

    for epoch in range(p['N_EPOCHS']):
        print(epoch, time.time() - start)

        # Training
        train_loss, train_acc = _run_epoch(model, train_loader, loss_function, optimizer)
        train_loss_.append(train_loss)
        train_acc_.append(train_acc)

        # Validation (no gradient, dropout off)
        valid_loss, valid_acc = _run_epoch(model, valid_loader, loss_function)
        valid_loss_.append(valid_loss)

        if valid_acc > best_valid_acc:
            best_valid_acc = valid_acc
            torch.save(model.state_dict(), checkpoint_path)
        valid_acc_.append(valid_acc)
        # The "Testing" figures printed here are computed on valid_loader.
        print('[Epoch: %3d/%3d] Training Loss: %.6f, Testing Loss: %.6f, Training Acc: %.3f, Testing Acc: %.3f'
              % (epoch, p['N_EPOCHS'], train_loss_[epoch], valid_loss_[epoch], train_acc_[epoch], valid_acc_[epoch]))

    return train_loss_, valid_loss_, train_acc_, valid_acc_, time.time() - start


In [None]:
def run_LSTM_with_param(p):
    """Prepare the data for hyper-parameter set `p` and train one LSTM on it.

    Steps: load the Word2Vec models, build the embedding matrix (Word2Vec
    vectors followed by two random rows for UNK and padding), rebuild the
    vocabulary, encode and pad the songs, then call run_LSTM.

    Returns whatever run_LSTM returns:
    (train_loss_, valid_loss_, train_acc_, valid_acc_, elapsed_seconds).
    """
    # get_song_idx_embedding reads the module-level WORD_TO_INT, so the
    # vocabulary rebuilt for `p` has to be published there; binding it to
    # locals would silently reuse the vocabulary of an earlier cell.
    global MOST_FREQUENT_WORDS, WORD_TO_INT, WORD_TO_INT_ALL

    WORD_EMBEDDING = []
    model_CBOW, model_Skip_Gram = get_word2vec_models(p)
    if p['USE_WORD2VEC'] != 'none':
        model = model_CBOW if p['USE_WORD2VEC'] == 'cbow' else model_Skip_Gram
        WORD_EMBEDDING = np.concatenate((model.wv[model.wv.index2word], [np.random.rand((p['WORD_VEC_SIZE']))], [np.random.rand((p['WORD_VEC_SIZE']))]))
        # astype returns a new array; keep the result.
        WORD_EMBEDDING = WORD_EMBEDDING.astype(np.double)

    MOST_FREQUENT_WORDS, WORD_TO_INT, WORD_TO_INT_ALL = get_most_frequent_words(model_CBOW, p)
    embed_song_idxs_train, embed_song_idxs_valid, embed_song_idxs_test = get_song_idx_embedding(p, train_lyrics, valid_lyrics, test_lyrics)
    train_data_padded, valid_data_padded, test_data_padded = get_padded_data(p, embed_song_idxs_train, embed_song_idxs_valid, embed_song_idxs_test)

    train_dataset = SongsDataset(train_data_padded, train_labels)
    train_loader = DataLoader(train_dataset, batch_size=p['BATCH_SIZE'], shuffle=True)
    valid_dataset = SongsDataset(valid_data_padded, valid_labels)
    valid_loader = DataLoader(valid_dataset, batch_size=p['BATCH_SIZE'], shuffle=True)
    return run_LSTM(p, WORD_EMBEDDING, train_loader, valid_loader)
    

In [None]:
# Hyper-parameter sets to train. NOTE: from here on `params` is a LIST of
# dicts (earlier cells index it as a single dict).
#
# Ablations run previously. All of them share the first ten entries of the
# active set below; only the listed keys differ from this baseline:
#   PADDING_START False, MAX_WORDS 200, VOCAB_SIZE 30000, USE_WORD2VEC 'cbow',
#   N_LAYERS 1, BIDIRECTIONAL False, LINEAR_LAST False
#
#    1. baseline
#    2. PADDING_START True
#    3. MAX_WORDS 50
#    4. MAX_WORDS 600
#    5. VOCAB_SIZE len(model_CBOW.wv.index2word)
#    6. USE_WORD2VEC 'none'
#    7. USE_WORD2VEC 'skip_gram'
#    8. N_LAYERS 2
#    9. BIDIRECTIONAL True
#   10. LINEAR_LAST True
#   11. VOCAB_SIZE len(model_CBOW.wv.index2word), N_LAYERS 2, BIDIRECTIONAL True, LINEAR_LAST True
#   12. MAX_WORDS 600, VOCAB_SIZE len(model_CBOW.wv.index2word), BIDIRECTIONAL True
#   13. (active) VOCAB_SIZE len(model_CBOW.wv.index2word), BIDIRECTIONAL True
params = [
    {
        'TRAIN_SIZE': len(train_labels),
        'VALID_SIZE': len(valid_labels),
        'TEST_SIZE': len(test_labels),
        'N_EPOCHS': 30,
        'BATCH_SIZE': 50,
        'N_GENRES': 10,
        'N_HIDDEN': 32,
        'WORD_VEC_SIZE': 100,
        'LR': 0.01,
        'DROPOUT': 0.5,
        'PADDING_START': False,
        'MAX_WORDS': 200,
        'VOCAB_SIZE': len(model_CBOW.wv.index2word),
        'USE_WORD2VEC': 'cbow',
        'N_LAYERS': 1,
        'BIDIRECTIONAL': True,
        'LINEAR_LAST': False,
    },
]


# Train one model per parameter set and keep its curves, keyed by position.
dict_results_LSTM = {}
for i, run_params in enumerate(params):
    print('running', i)
    train_loss_, test_loss_, train_acc_, test_acc_, time_run = run_LSTM_with_param(run_params)
    dict_results_LSTM[i] = {
        'train_loss_': train_loss_,
        'test_loss_': test_loss_,
        'train_acc_': train_acc_,
        'test_acc_': test_acc_,
        'time': time_run,
    }


In [None]:
# Cell output: per-run loss/accuracy curves and run time collected above.
dict_results_LSTM

### For our best models, we get the test accuracy

Once we have fixed the hyperparameters, we run the models on the test set.

In [None]:
def get_test_accuracy(parameters, path_to_model):
    """Load a saved GenreLSTM and report its accuracy on the test set.

    Parameters
    ----------
    parameters : dict with the hyper-parameters the checkpoint was trained
        with (same keys as for run_LSTM).
    path_to_model : path of the state dict written by torch.save.

    Returns
    -------
    float : fraction of correctly classified test songs (also printed).

    The songs come from the module-level `test_data_padded` / `test_labels`.
    NOTE(review): these must have been padded with the same parameters --
    verify before trusting the figure.
    """
    WORD_EMBEDDING = []
    model_CBOW, model_Skip_Gram = get_word2vec_models(parameters)
    if parameters['USE_WORD2VEC'] != 'none':
        model = model_CBOW if parameters['USE_WORD2VEC'] == 'cbow' else model_Skip_Gram
        WORD_EMBEDDING = np.concatenate((model.wv[model.wv.index2word], [np.random.rand((parameters['WORD_VEC_SIZE']))], [np.random.rand((parameters['WORD_VEC_SIZE']))]))
        WORD_EMBEDDING = WORD_EMBEDDING.astype(np.double)

    model = GenreLSTM(parameters['VOCAB_SIZE']+2, parameters['WORD_VEC_SIZE'], parameters['N_HIDDEN'], parameters['N_LAYERS'], parameters['BIDIRECTIONAL'], parameters['N_GENRES'], parameters['BATCH_SIZE'], WORD_EMBEDDING, parameters['DROPOUT'], parameters['USE_WORD2VEC'], parameters['MAX_WORDS'], parameters['LINEAR_LAST'])
    model = model.to(device)
    # Switch to double BEFORE loading so the saved double-precision weights
    # are not rounded through float32.
    model.double()
    model.load_state_dict(torch.load(path_to_model, map_location=device))
    model.eval()

    test_dataset = SongsDataset(test_data_padded, test_labels)
    test_loader = DataLoader(test_dataset, batch_size=parameters['BATCH_SIZE'], shuffle=False)

    n_correct = 0
    n_samples = 0
    with torch.no_grad():
        # Evaluate on the TEST split (the previous version iterated over the
        # validation loader, i.e. the data used for model selection).
        for local_batch, local_labels in test_loader:
            # Transfer to GPU
            local_batch, local_labels = local_batch.to(device), local_labels.to(device)

            genre_scores = model(local_batch)
            predicted = genre_scores.argmax(dim=1)
            n_correct += (predicted == local_labels).sum().item()
            n_samples += len(local_labels)

    accuracy = n_correct / n_samples
    print("Accuracy:", accuracy)
    return accuracy



In [None]:
# Hyper-parameters of the bidirectional model whose checkpoint is evaluated.
parameters_bidirectional = {
    'TRAIN_SIZE': len(train_labels),
    'VALID_SIZE': len(valid_labels),
    'TEST_SIZE': len(test_labels),
    'N_EPOCHS': 30,
    'BATCH_SIZE': 50,
    'N_GENRES': 10,
    'N_HIDDEN': 32,
    'WORD_VEC_SIZE': 100,
    'LR': 0.01,
    'DROPOUT': 0.5,
    'PADDING_START': False,
    'MAX_WORDS': 200,
    'VOCAB_SIZE': 30000,
    'USE_WORD2VEC': 'cbow',
    'N_LAYERS': 1,
    'BIDIRECTIONAL': True,
    'LINEAR_LAST': False,
}
get_test_accuracy(parameters_bidirectional, "models/bidirectional.pth")

# Try with keras

Here is the code that was used in the reference paper (LSTM).

In [None]:
# from keras.models import Sequential
# from keras.callbacks import ModelCheckpoint, EarlyStopping
# from keras.layers import Embedding, LSTM, Dropout, GlobalMaxPooling1D
# from keras.layers import Dense, Bidirectional, GRU
# from keras import optimizers

# ## DEFINE PARAMETERS
# params = {
#     'TRAIN_SIZE': len(train_labels),
#     'VALID_SIZE': len(valid_labels),
#     'TEST_SIZE': len(test_labels),    

#     # Hyperparameters
#     'N_EPOCHS': 20,
#     'BATCH_SIZE': 50,
#     'N_GENRES': 10,
#     'N_HIDDEN': 32,
#     'WORD_VEC_SIZE': 100, # Size of the word embedding vector WARNING: if WORD_VEC_SIZE!=100 the word embedding has to be retrained
#     'LR': 0.01,
#     'DROPOUT': 0.5,
    
#     # LSTM
#     'PADDING_START': True,
#     'MAX_WORDS': 200, # max number of words in song
#     'VOCAB_SIZE': len(model_CBOW.wv.index2word), # number of words in vocabulary, -1 if use all the vocab. Does not take into account the padding and UNK
#     'USE_WORD2VEC': 'cbow', # 'none', 'cbow', 'skip_gram'
#     'N_LAYERS': 1, #1 or 2
#     'BIDIRECTIONAL': False,
#     'LINEAR_LAST': True
# }

# WORD_EMBEDDING=[]
# model_CBOW, model_Skip_Gram = get_word2vec_models(params)
# if params['USE_WORD2VEC'] != 'none':
#     if params['USE_WORD2VEC'] == 'cbow':
#         model=model_CBOW
#     else:
#         model=model_Skip_Gram
#     WORD_EMBEDDING=np.concatenate((model.wv[model.wv.index2word], [np.random.rand((params['WORD_VEC_SIZE']))], [np.random.rand((params['WORD_VEC_SIZE']))]))
#     WORD_EMBEDDING.astype(np.double)
# len(WORD_EMBEDDING)

# MOST_FREQUENT_WORDS, WORD_TO_INT, WORD_TO_INT_ALL = get_most_frequent_words(model_CBOW, params)
# embed_song_idxs_train, embed_song_idxs_valid, embed_song_idxs_test = get_song_idx_embedding(params, train_lyrics, valid_lyrics, test_lyrics)
# train_data_padded, valid_data_padded, test_data_padded = get_padded_data(params, embed_song_idxs_train, embed_song_idxs_valid, embed_song_idxs_test)
# MAX_WORDS = params['MAX_WORDS']
# vocab_size = params['VOCAB_SIZE'] + 2
# learning_rate = .01
# training_epochs = 20
# batch_size = 64
# embed_size = params['WORD_VEC_SIZE']
# dropout = params['DROPOUT']
# n_hidden = params['N_HIDDEN'] # number of hidden states in LSTM
# print(np.shape(train_data_padded))
# print(np.shape(WORD_EMBEDDING))
# print('MAX_WORDS', MAX_WORDS, '; vocab_size', vocab_size, '; learning_rate', learning_rate, '; training_epochs', training_epochs, '; batch_size', batch_size, '; embed_size', embed_size, '; dropout', dropout, '; n_hidden', n_hidden)

# num_genres = 10
# num_classes = num_genres


# model = Sequential()
# model.add(Embedding(vocab_size, embed_size, input_length=MAX_WORDS, trainable=True)) # weights=[WORD_EMBEDDING]
# model.add(LSTM(n_hidden, activation='sigmoid', return_sequences=True))
# # model.add(Bidirectional(GRU(n_hidden, return_sequences=True, init='he_normal', inner_init='he_normal', inner_activation='sigmoid'), name='bidirect_word'))
# model.add(Dropout(dropout))
# model.add(GlobalMaxPooling1D())
# model.add(Dense(num_genres, activation='softmax'))

# optimizer = optimizers.RMSprop(lr=learning_rate)
# model.compile(loss='sparse_categorical_crossentropy',
#               optimizer='rmsprop',
#               metrics=['acc'])

# print("model fitting - Baseline LSTM")
# print(model.summary())
# earlystopping = EarlyStopping(monitor='val_loss', patience=3)
# # checkpointer = ModelCheckpoint(filepath='results/lstm/lstmbest.hdf5',verbose=1,save_best_only=True)
# hist = model.fit(train_data_padded, train_labels, validation_data=(valid_data_padded, valid_labels),
#           nb_epoch=training_epochs, batch_size=batch_size, callbacks=[earlystopping])
# print(hist.history)
# # model.save('results/lstm/lstm.h5')

# # evals = model.evaluate(test_data, test_labels)
# # print("Test accuracy:", evals)


# Keras - BERT

In [None]:
# #Installing keras-bert and keras adapter
# !pip install -q keras-bert keras-rectified-adam
# !wget -q https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
# !unzip -o uncased_L-12_H-768_A-12.zip

# # Parameters
# SEQ_LEN = 128
# BATCH_SIZE = 128
# EPOCHS = 30
# LR = 1e-4

# # Pretrained model path
# import os

# pretrained_path = 'uncased_L-12_H-768_A-12'
# config_path = os.path.join(pretrained_path, 'bert_config.json')
# checkpoint_path = os.path.join(pretrained_path, 'bert_model.ckpt')
# vocab_path = os.path.join(pretrained_path, 'vocab.txt')

# # TF_KERAS must be added to environment variables in order to use TPU
# os.environ['TF_KERAS'] = '1'

# # Initialize TPU strategy
# import tensorflow as tf
# from keras_bert import get_custom_objects
# TPU_WORKER = 'grpc://' + os.environ['COLAB_TPU_ADDR']
# resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_WORKER)
# tf.contrib.distribute.initialize_tpu_system(resolver)
# strategy = tf.contrib.distribute.TPUStrategy(resolver)

# # load bert model
# import codecs
# from keras_bert import load_trained_model_from_checkpoint
# token_dict = {}
# with codecs.open(vocab_path, 'r', 'utf8') as reader:
#     for line in reader:
#         token = line.strip()
#         token_dict[token] = len(token_dict)
# with strategy.scope():
#     model = load_trained_model_from_checkpoint(
#         config_path,
#         checkpoint_path,
#         training=True,
#         trainable=True,
#         seq_len=SEQ_LEN,
#     )
# model.summary()

# import os
# import numpy as np
# from tqdm import tqdm
# from keras_bert import Tokenizer
# import pandas as pd
# import tensorflow as tf
# from google.colab import files
# import json

# train_data = np.load('train_data_colab_bert.npy', allow_pickle=True)
# train_labels = np.load('train_labels_colab_bert.npy', allow_pickle=True)
# valid_data = np.load('valid_data_colab_bert.npy', allow_pickle=True)
# valid_labels = np.load('valid_labels_colab_bert.npy', allow_pickle=True)
# train_x = [train_data, np.zeros_like(train_data)]
# test_x = [valid_data, np.zeros_like(valid_data)]
# train_y = train_labels
# test_y = valid_labels

# # Build Custom Model
# from tensorflow.python import keras
# from keras_radam import RAdam

# with strategy.scope():
#     inputs = model.inputs[:2]
#     dense = model.get_layer('NSP-Dense').output
#     outputs = keras.layers.Dense(units=20, activation='softmax')(dense)
#     model = keras.models.Model(inputs, outputs)
#     model.compile(
#         RAdam(lr=LR),
#         loss='sparse_categorical_crossentropy',
#         metrics=['sparse_categorical_accuracy'],
#     )

# #  Initialize Variables
# import tensorflow as tf
# import tensorflow.keras.backend as K
# sess = K.get_session()
# uninitialized_variables = set([i.decode('ascii') for i in sess.run(tf.report_uninitialized_variables())])
# init_op = tf.variables_initializer(
#     [v for v in tf.global_variables() if v.name.split(':')[0] in uninitialized_variables]
# )
# sess.run(init_op)

# # Fit
# model.fit(
#     train_x,
#     train_y,
#     epochs=EPOCHS,
#     batch_size=BATCH_SIZE,
#     validation_data=(test_x, test_y)
# )

# # @title Predict
# predicts = model.predict(test_x, verbose=True).argmax(axis=-1)

# # @title Accuracy
# print(np.sum(test_y == predicts) / test_y.shape[0])

In [None]:
# train_data = np.load('train_data_colab_bert.npy', allow_pickle=True)
# train_labels = np.load('train_labels_colab_bert.npy', allow_pickle=True)
# valid_data = np.load('valid_data_colab_bert.npy', allow_pickle=True)
# valid_labels = np.load('valid_labels_colab_bert.npy', allow_pickle=True)
# train_x = [train_data, np.zeros_like(train_data)]
# test_x = [valid_data, np.zeros_like(valid_data)]
# train_y = train_labels
# test_y = valid_labels

# from google.colab import files
# import json

# f=open("LSTM_BERT.txt","w+")

# f.write(json.dumps(hist.history))
# f.write("predicts")
# predicts = model.predict(test_x, verbose=True).argmax(axis=-1)
# f.write(str(predicts))
# f.write("accuracy")
# f.write(str(np.sum(test_y == predicts) / test_y.shape[0]))
# f.close()

# time.sleep(300)
# files.download('LSTM_BERT.txt')

# Feed forward neural network

In [None]:
# from __future__ import unicode_literals, print_function, division
# from io import open 
# import glob 
# import os 
# import pandas as pd 
# from nltk.corpus import stopwords, words 
# import re 
# from nltk import tokenize
# import numpy as np 
# from sklearn.utils import shuffle 
# from sklearn.metrics import roc_auc_score, roc_curve, average_precision_score, accuracy_score, auc, f1_score, recall_score

# import nltk
# nltk.download('words')

# os.environ['KMP_DUPLICATE_LIB_OK']='True'

# def balance(df): 
    
#     lengths = [] 

#     pop = df.loc[df['genre'] == 'Pop']
#     rock = df.loc[df['genre'] == 'Rock']
#     hh = df.loc[df['genre'] == 'Hip-Hop']
#     metal = df.loc[df['genre'] == 'Metal']
#     country = df.loc[df['genre'] == 'Country']
#     elec = df.loc[df['genre'] == 'Electronic']
#     folk = df.loc[df['genre'] == 'Folk']
#     rb = df.loc[df['genre'] == 'R&B']
#     indie = df.loc[df['genre'] == 'Indie']

#     genre_list = ['Pop', 'Rock', 'Hip-Hop', 'Metal', 'Country', 'Electronic', 'Folk', 'R&B', 'Indie']
#     print(len(pop))

#     for c in genre_list: 
#         subset = df.loc[df['genre'] == c]
#         lengths.append(len(subset))
    
#     print(lengths)
#     bNum = min(lengths)
#     # bNum = 1000
#     pop = shuffle(pop, random_state = 2)
#     rock = shuffle(rock, random_state = 2)
#     hh = shuffle(hh, random_state = 2)
#     metal = shuffle(metal, random_state= 2)
#     country = shuffle(country, random_state = 2)
#     elec = shuffle(elec, random_state = 2)
#     folk = shuffle(folk, random_state= 2)
#     rb = shuffle(rb, random_state = 2)
#     indie = shuffle(indie, random_state = 2)

#     pop = pop.iloc[:bNum]
#     rock = rock.iloc[:bNum]
#     hh = hh.iloc[:bNum]
#     metal = metal.iloc[:bNum]
#     country = country.iloc[:bNum]
#     elec = elec.iloc[:bNum]
#     folk = folk.iloc[:bNum]
#     rb = rb.iloc[:bNum]
#     indie = indie.iloc[:bNum]

#     df2 = pd.concat([pop, rock, hh, metal, country, elec, folk, rb, indie], axis=0)
#     lengths = []
#     for c in genre_list: 
#         subset = df2.loc[df['genre'] == c]['lyrics']
#         lengths.append(len(subset))
#     print(lengths)
  
#     return df2

# def remove_unknown(row):
#     genre = row['genre']
    
#     if genre == 'Not Available': return None 
#     if genre == 'Other': return None


# #LOAD DATA

# df = pd.read_csv('./lyrics.csv')
# df = df[df.lyrics != 'Instrumental']
# # df['genre'] = df.apply(remove_unknown, axis=1)
# df.dropna(axis = 0, how="any", inplace=True)
# #df = balance(df)


# REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
# BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
# STOPWORDS = set(stopwords.words('english'))
# ENGLISH = set(nltk.corpus.words.words())


# def write_to_txt(df, name): 
#     lyrics = df.loc[df['genre'] == name]['lyrics']
#     np_lyrics = lyrics.to_numpy(dtype='str')
#     full_text = ""
#     with open('./lyrics/'+name+'_lyrics.txt', "w+") as f: 
#         for row in range(len(np_lyrics)):
#             try: 
#                 l = np_lyrics[row].replace('\n', ' ').lower()
#                 l = REPLACE_BY_SPACE_RE.sub(' ', l) 
#                 l = BAD_SYMBOLS_RE.sub('', l)
#                 l = ' '.join(word for word in l.split() if word not in STOPWORDS)
#                 f.write(l + '\n')
#             except: print(l)
#         print('done loading', name)   
# genre_list = ['Pop', 'Rock', 'Hip-Hop', 'Metal', 'Country', 'Electronic', 'Folk', 'R&B', 'Indie']

# for g in genre_list: 
#     if not os.path.exists('./lyrics/'+g+"_lyrics.txt"): 
#         write_to_txt(df, g)

# def findFiles(path): return glob.glob(path)

# print(findFiles('lyrics/*.txt'))

# import unicodedata
# import string 

# all_letters = string.ascii_letters + " .,;'"
# n_letters = len(all_letters)

# #Turn a Unicode string into plain ASCII: drop combining marks (category 'Mn') and any character not in all_letters
# def unicodeToAscii(s): 
#     return ''.join(
#         c for c in unicodedata.normalize('NFD', s)
#         if unicodedata.category(c) != 'Mn'
#         and c in all_letters
#     )
# #print(unicodeToAscii('Ślusàrski'))

# #Build the category_lines dictionary: a list of lyric lines per category (one category per lyrics/*.txt file)
# category_lines = {} 
# all_categories = [] 

# #Read a file and split into lines 
# def readLines(filename): 
#     lines = open(filename, encoding="utf-8").read().strip().split('\n')
#     return [unicodeToAscii(line) for line in lines]

# for filename in findFiles('lyrics/*.txt'): 
#     category = os.path.splitext(os.path.basename(filename))[0]
#     all_categories.append(category)
#     lines = readLines(filename)
#     category_lines[category] = lines
# n_categories = len(all_categories)

# print("EVALUTATING LYRICS FOR", all_categories)

# #CONVERT DATA TO TENSOR
# import torch 
# def letterToIndex(letter): 
#     return all_letters.find(letter)

# def lineToTensor(line): #one hot encoding
#     tensor = torch.zeros(len(line), 1, n_letters)
#     for li, letter in enumerate(line): 
#         tensor[li][0][letterToIndex(letter)] = 1 
#     return tensor 

# print(lineToTensor(category_lines['Hip-Hop_lyrics'][:1]).size())

# #BUILD NETWORK 
# import torch.nn as nn 

# class FF(nn.Module): 
#     def __init__(self, input_size, hidden_size, output_size):
#         super(FF, self).__init__()

#         self.hidden_size = hidden_size

#         self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
#         self.i2o = nn.Linear(input_size + hidden_size, output_size)
#         self.softmax = nn.LogSoftmax(dim=1)

#     def forward(self, input, hidden):
#         combined = torch.cat((input, hidden), 1)
#         hidden = self.i2h(combined)
#         output = self.i2o(combined)
#         output = self.softmax(output)
#         return output, hidden

#     def initHidden(self):
#         return torch.zeros(1, self.hidden_size)

# n_hidden = 128
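# # NOTE(review): the only class defined in this cell is FF, not RNN; rename one of them before re-enabling this code
# rnn = RNN(n_letters, n_hidden, n_categories)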

# print("net param", rnn)

# #TESTING RNN
# # input = lineToTensor(category_lines['hh_lyrics'][:1])
# # hidden = torch.zeros(1, n_hidden)
# # output, next_hidden = rnn(input[0], hidden)
# # print(output)

# #TRAINING RNN
# def categoryFromOutput(output): 
#     top_n, top_i = output.topk(1)
#     category_i = top_i[0].item() 
#     return all_categories[category_i], category_i

# # print(categoryFromOutput(output))

# import random 

# def randomChoice(l): 
#     return l[random.randint(0, len(l) - 1)]

# def randomTrainingExample(): 
#     category = randomChoice(all_categories)
#     line = randomChoice(category_lines[category])
#     category_tensor = torch.tensor([all_categories.index(category)], dtype=torch.long)
#     line_tensor = lineToTensor(line)
#     return category, line, category_tensor, line_tensor
# criterion = nn.NLLLoss()

# #Each loop of training will: 
#     #Create input and target tensors 
#     #Create zeroed initial hidden state 
#     #Read each letter in and keep hidden state for next letter 
#     #Compare final output to target 
#     #Back propagate 
#     #Return output and loss 

# learning_rate = 0.01

# def train(category_tensor, line_tensor): 
#     hidden = rnn.initHidden()
#     rnn.zero_grad() 
#     for i in range(line_tensor.size()[0]): 
        
#         output, hidden = rnn(line_tensor[i], hidden)

    
#     loss = criterion(output, category_tensor)
#     loss.backward() 
    
#     for p in rnn.parameters(): 
#         p.data.add_(-learning_rate, p.grad.data)

#     return output,loss.item() 

# import time 
# import math 

# n_iters = 1000 #100000
# print_every = 50 #5000
# plot_every = 10 #1000 

# current_loss = 0 
# all_losses = [] 

# def timeSince(since): 
#     now = time.time() 
#     s = now - since 
#     m = math.floor(s/60)
#     s -= m * 60 
#     return '%dm %ds' % (m, s)
# start = time.time() 
# print('STARTING TRAINING')
# y_pred, y_true = [] , [] 
# for iter in range(1, n_iters+1): 
#     category, line, category_tensor, line_tensor = randomTrainingExample()
#     try: 
#         output, loss = train(category_tensor, line_tensor)
#         current_loss += loss 

#         guess, guess_i = categoryFromOutput(output)
#         y_pred.append(guess)
#         y_true.append(category)
#         if iter % print_every == 0:
#             guess, guess_i = categoryFromOutput(output)
#             correct = '✓' if guess == category else '✗ (%s)' % category
#             print('%d %d%% (%s) %.4f %s / %s %s' % (iter, iter / n_iters * 100, timeSince(start), loss, line[:20], guess, correct))
#             print('accuracy', accuracy_score(y_true, y_pred))
#         # Add current loss avg to list of losses
#         if iter % plot_every == 0:
#             all_losses.append(current_loss / plot_every)
#             current_loss = 0
#     except: print(category, line)

# #PLOT TRAINING LOSS 
# import matplotlib.pyplot as plt
# import matplotlib.ticker as ticker

# plt.figure()
# plt.plot(all_losses)
# plt.show()

# print("EVALUATING PERFORMANCE")

# #EVALUATE PERFORMANCE 
# confusion = torch.zeros(n_categories, n_categories)
# n_confusion = 1000 #10000

# def evaluate(line_tensor): 
#     hidden = rnn.initHidden()
#     for i in range(line_tensor.size()[0]): 
#         output, hidden = rnn(line_tensor[i], hidden)
#     return output 

# for i in range(n_confusion): 
#     category, line, category_tensor, line_tensor = randomTrainingExample()
#     output = evaluate(line_tensor)
#     guess, guess_i = categoryFromOutput(output)
#     category_i = all_categories.index(category)
#     confusion[category_i][guess_i] += 1

# for i in range(n_categories): 
#     confusion[i] = confusion[i]/ confusion[i].sum() 

# # Set up plot
# fig = plt.figure()
# ax = fig.add_subplot(111)
# cax = ax.matshow(confusion.numpy())
# fig.colorbar(cax)

# # Set up axes
# ax.set_xticklabels([''] + all_categories, rotation=90)
# ax.set_yticklabels([''] + all_categories)

# # Force label at every tick
# ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
# ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

# # sphinx_gallery_thumbnail_number = 2
# plt.show()