# COMP550 - Final Project
---

Links:
- https://www.kaggle.com/ficklemaverick/lyrics-generator
- https://www.kaggle.com/danofer/music-lyrics-clean-export

## Table of contents
[1. Imports](#imports)  
[2. Import & Cleaning data and Exploratory Data Analysis](#imports-clean)  
[3. Preprocessing steps](#preprocessing)  
[4. Naïve majority model](#naive-model)   
[5. Logistic Regression](#log-reg)  
[6. Naïve Bayes](#naive-bayes)  
[7. Support Vector Machine](#SVM)  
[8. Sequential model - LSTM](#LSTM)  

# 1. Imports  <a class="anchor" id="imports"></a>

In [None]:
# Set to True when running on Google Colab, so that the data folder is read
# from the mounted Google Drive instead of the local 'data/' folder.
IN_GOOGLE_COLAB = False
# Folder that holds the CSV / model files; every path below is built from it.
root_path = 'data/'
if IN_GOOGLE_COLAB:
    # Install langdetect in the Colab runtime (IPython shell escape).
    !pip install langdetect
    from google.colab import drive
    # Mount Google Drive and point root_path at the project data folder on it.
    drive.mount('/content/drive')
    root_path = 'drive/My Drive/COMP550-Project/data/'

In [None]:
import pandas as pd
import numpy as np
import string
import time
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from langdetect import detect
from scipy.stats import uniform
import warnings
import multiprocessing
cores = multiprocessing.cpu_count()


# nltk imports
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

# sklearn imports
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC

# gensim imports
import gensim
from gensim.models import Word2Vec
import gensim.downloader as api

# pytorch imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils import data
from torch.utils.data import DataLoader, TensorDataset
from torch.autograd import Variable

# 2. Import & Cleaning data and Exploratory Data Analysis   <a class="anchor" id="imports-clean"></a>

**NOTE**: Detecting the language of all the songs takes a long time (about 15 minutes). To avoid this step, we directly import the preprocessed English songs, which are split into 3 sets: training, validation and test.

In [None]:
# When True, the English-only songs are later read from cleaned_data.csv
# instead of re-running the language detection.
USE_CLEANED_DATA = True
cleaned_data_path = root_path + "cleaned_data.csv"
data_path = root_path + "lyrics.csv"
# The raw dataset is loaded regardless of USE_CLEANED_DATA.
data_raw = pd.read_csv(data_path)
print(len(data_raw), "songs in the dataset")
print(data_raw.head())

The dataset has the following columns:
- **index** (int): index of the song in the dataset
- **song** (string): name of the song
- **year** (float) -> (int): release year
- **artist** (string): artist of the song
- **genre** (string): the genre, this is the label we want to predict
- **lyrics** (string): the lyrics of the song. This is the data we will use to predict the genre. We need to preprocess this data.

If we remove the null elements, we are left with **265701** songs. We convert the year from float to int.

In [None]:
# Drop every row that has at least one missing field, store the 'year' and
# 'index' columns as compact integers, and renumber the remaining rows.
data_all = (
    data_raw[pd.notnull(data_raw)]
    .dropna(how='any', axis=0)
    .assign(
        year=lambda frame: pd.to_numeric(frame['year'], downcast='integer'),
        index=lambda frame: pd.to_numeric(frame['index'], downcast='integer'),
    )
    .reset_index(drop=True)
)
data_all

We keep only English songs, using the `langdetect` library. There are **237,363 English songs** in the previous 265,701 songs.

In [None]:
# Build data_en, the English-only subset of the songs.
# Either run langdetect on every song (slow) or load the cached result.
if not USE_CLEANED_DATA:
    en_songs = []
    for song in data_all['lyrics']:
        try:
            en_songs.append(detect(song) == 'en')
        except Exception:
            # Best effort: a song whose language cannot be detected is
            # treated as non-English. `Exception` (rather than a bare
            # `except`) lets KeyboardInterrupt stop this long loop.
            en_songs.append(False)
    # reset_index returns a new frame, so its result must be kept.
    data_en = data_all[en_songs].reset_index(drop=True)
else:
    data_en = pd.read_csv(cleaned_data_path)

data_en.head()

In [None]:
# List the distinct genre labels present in the English songs.
data_en['genre'].unique()

We remove songs where the labels are "Other" or "Not Available". This reduces the number of songs from 237363 to **215,825 songs**.

In [None]:
# Discard the songs whose genre is not informative, then count songs per genre.
labels = data_en['genre'].tolist()
excluded_genres = ['Not Available', 'Other']
keep_song = ~data_en['genre'].isin(excluded_genres)
data_en = data_en[keep_song].reset_index(drop=True)
data_en['genre'].value_counts()

Above is the number of songs in each genre category. **46.5% of the songs are Rock songs**. 

#### We split the data in training, validation and test sets
The English songs are split into 3 sets.

In [None]:
# Integer encoding of the genres, and the inverse mapping derived from it.
GENRE_TO_INT = {
    'Pop': 0, 'Hip-Hop': 1, 'Rock': 2, 'Metal': 3, 'Country': 4,
    'Jazz': 5, 'Electronic': 6, 'Folk': 7, 'R&B': 8, 'Indie': 9,
}
INT_TO_GENRE = {code: genre for genre, code in GENRE_TO_INT.items()}

lyrics = data_en['lyrics'].tolist()
labels = np.array([GENRE_TO_INT[genre] for genre in data_en['genre']])

# Stratified 90/10 splits on the full dataset: first test, then validation.
lyrics_train, lyrics_test, labels_train, labels_test = train_test_split(
    lyrics, labels,
    test_size=0.1, shuffle=True, random_state=43, stratify=labels)
lyrics_train, lyrics_valid, labels_train, labels_valid = train_test_split(
    lyrics_train, labels_train,
    test_size=0.1, shuffle=True, random_state=43, stratify=labels_train)

# Smaller dataset for quick training (it replaces the splits computed above):
# lyrics_selected and labels_selected contain the 25,000 songs that we consider in our project
# lyrics_train and labels_train contain the 20,000 songs of the training set (80%)
# lyrics_valid and labels_valid contain the 2,500 songs of the validation set (10%)
# lyrics_test and labels_test contain the 2,500 songs of the test set (10%)
tmp_1, lyrics_selected, tmp_2, labels_selected = train_test_split(
    lyrics_train, labels_train,
    test_size=25000, shuffle=True, random_state=43, stratify=labels_train)
lyrics_train, lyrics_selected_2, labels_train, labels_selected_2 = train_test_split(
    lyrics_selected, labels_selected,
    test_size=5000, shuffle=True, random_state=43, stratify=labels_selected)
lyrics_valid, lyrics_test, labels_valid, labels_test = train_test_split(
    lyrics_selected_2, labels_selected_2,
    test_size=2500, shuffle=True, random_state=43, stratify=labels_selected_2)

print("Training set length:", len(lyrics_train))
print("Validation set length:", len(lyrics_valid))
print("Test set length:", len(lyrics_test))

In [None]:
# Display the first raw (not yet preprocessed) song of the training set.
lyrics_train[0]

# 3. Preprocessing steps <a class="anchor" id="preprocessing"></a>
The lyrics need to be cleaned before we can use them.
- remove \n line breaks
- remove punctuation
- lowercase the lyrics
- remove verse and chorus indications that are under the form [verse x]
- remove tokens that have a null length

In [None]:
# Cleans a raw song: drops [verse x] / [chorus] markers, line breaks and
# punctuation, lowercases, then removes short tokens, tokens with digits
# and stopwords.
def my_preprocessor(song):
    """Return the cleaned version of a raw song as one space-separated string.

    Steps, in order:
      1. remove bracketed section markers such as ``[verse 1]`` or ``[chorus]``
      2. replace line breaks by spaces, strip punctuation, lowercase
      3. drop tokens shorter than 3 characters, tokens containing a digit,
         English stopwords and tokens containing the stray ``\\x9d`` character

    Args:
        song: raw lyrics (str).

    Returns:
        The remaining tokens joined by single spaces.
    """
    import re  # local import so that this cell does not depend on a new top-level import

    # The markers must be removed BEFORE punctuation is stripped: '[' and ']'
    # are part of string.punctuation, so a bracket test done afterwards can
    # never match and words such as "verse"/"chorus" would leak into the data.
    # The pattern does not cross line breaks, so an unbalanced '[' cannot
    # swallow the following lines.
    song = re.sub(r'\[[^\]\n]*\]', ' ', song)
    song = song.replace('\n', ' ')
    song = song.translate(str.maketrans('', '', string.punctuation))
    song = song.lower()
    stop_words = set(stopwords.words('english'))
    kept_tokens = [
        w for w in song.split(' ')
        if len(w) >= 3
        and not any(c.isdigit() for c in w)
        and w not in stop_words
        and u'\x9d' not in w
    ]
    return ' '.join(kept_tokens)

# Plain tokenizer: no stemming, no lemmatization.
def my_tokenizer(song):
    """Split a preprocessed song on single spaces and return the token list."""
    return song.split(' ')

# Tokenizer that also stems every token with the Porter stemmer.
def my_tokenizer_stem(song):
    """Split a preprocessed song on single spaces and return the stemmed tokens."""
    words = song.split(' ')
    porter = PorterStemmer()
    return list(map(porter.stem, words))

# Tokenizer that also lemmatizes every token according to its part of speech.
def my_tokenizer_lemma(song):
    """Split a preprocessed song on single spaces and return the lemmatized tokens.

    Each token is POS-tagged; the first letter of the tag selects the WordNet
    category used for lemmatization (noun when the tag is not N/V/R/J).
    """
    tagged_tokens = pos_tag(song.split(' '))
    wordnet_category = {
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'R': wordnet.ADV,
        'J': wordnet.ADJ,
    }
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token, tag in tagged_tokens:
        category = wordnet_category.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(token, category))
    return lemmas

# Compare the three tokenizers on the first ten tokens of one training song.
sample_song = my_preprocessor(lyrics_train[2])
for tokenize in (my_tokenizer, my_tokenizer_stem, my_tokenizer_lemma):
    print(tokenize(sample_song)[:10])

#### Create csvs for train, valid and test data
In the rest of the jupyter notebook we directly import the preprocessed data.  
There are 3 csvs:
- `train_data.csv` containing 20,000 preprocessed english songs
- `valid_data.csv` containing 2,500 preprocessed english songs
- `test_data.csv` containing 2,500 preprocessed english songs

In [None]:
# The commented-out code below preprocesses each split (plain, stemmed and
# lemmatized lyrics) and writes it to a CSV file; it is kept as a record of how
# such files can be produced. Note that it writes *_2.csv files, whereas the
# files read at the end of this cell have no "_2" suffix.
# tuples_to_preprocess = [
#     (lyrics_train, labels_train, "train_data_2.csv"),
#     (lyrics_valid, labels_valid, "valid_data_2.csv"),
#     (lyrics_test, labels_test, "test_data_2.csv"),      
# ]
# for lyrics, labels, csv_name in tuples_to_preprocess:
#     lyrics_preprocessed = [my_preprocessor(song) for song in lyrics]
#     lyrics_stemmed = [' '.join(my_tokenizer_stem(song)) for song in lyrics_preprocessed]
#     lyrics_lemmad = [' '.join(my_tokenizer_lemma(song)) for song in lyrics_preprocessed]
#     tmp_df = pd.DataFrame.from_dict({
#         "lyrics": lyrics_preprocessed, "lyrics_stemmed": lyrics_stemmed, "lyrics_lemmad": lyrics_lemmad, "genre": labels
#     })
#     tmp_df.to_csv(root_path+csv_name, index=False)

# Same idea for the whole English dataset (plain preprocessing only).
# data_en_all_preprocessed = [my_preprocessor(song) for song in data_en['lyrics']]
# data_en_all_df = pd.DataFrame.from_dict({"lyrics": data_en_all_preprocessed, "genre": np.array([GENRE_TO_INT[genre] for genre in data_en['genre'].tolist()])})
# data_en_all_df.to_csv(root_path+"data_en_all_data.csv", index=False)

# Load the preprocessed splits and the preprocessed full dataset from disk.
train_df = pd.read_csv(root_path + "train_data.csv")
valid_df = pd.read_csv(root_path + "valid_data.csv")
test_df = pd.read_csv(root_path + "test_data.csv")
data_en_all_df = pd.read_csv(root_path + "data_en_all_data.csv")

# 4. Naïve Majority Model  <a class="anchor" id="naive-model"></a>
In this naïve majority model, we guess that all the songs have the genre 'Rock', which is the genre that has the majority of songs. This is a first baseline model, that we can use to compare the results of logistic regression, naive bayes, ...

In [None]:
# Majority-class baseline: predict 'Rock' for every song of the training set.
majority_predictions = [GENRE_TO_INT['Rock']] * len(train_df['lyrics'])
print(classification_report(train_df['genre'], majority_predictions, target_names=list(GENRE_TO_INT)))

The accuracy of our baseline model is **47%**.

# 5. Logistic Regression  <a class="anchor" id="log-reg"></a>
We test different forms of the vectorised data: stemmed, lemmatized and no token transformation. The step to vectorize the data is quite long so we decide to test different hyperparameters of a model AFTER the vectorization is performed.

##### With no hyperparameter tuning:
Accuracy on training set: 94.74%  
Accuracy on validation set: 54.52%  

#### Best model with k-fold cross validation
Best accuracy with the following model: **58,6%** — no stemming or lemmatization, TF-IDF vectorization, {regularization strength C: 0.1, max_df: 0.7, max_features: 150000, ngram_range: bigram}.  
On development set: **59,52%**


### Grid search
With a small dataset (10,000 songs) we grid search on the hyperparameters.

In [None]:
# Convenience aliases for the three versions of the training lyrics
# (plain, stemmed, lemmatized) and for the training labels.
train_lyrics = train_df['lyrics']
lyrics_stemmed = train_df['lyrics_stemmed']
lyrics_lemmad = train_df['lyrics_lemmad']
train_labels = train_df['genre']
# Alias of the plain (neither stemmed nor lemmatized) lyrics.
lyrics_preprocessed = train_lyrics

In [None]:
# Pipeline: a text feature extractor followed by a logistic-regression classifier.
# The (slow) grid search only runs when GRID_SEARCH_ON is True.
GRID_SEARCH_ON = False
pipeline = Pipeline(steps=[
    ('vect', TfidfVectorizer()),  # use CountVectorizer() here for TFIDF=off
    ('clf', LogisticRegression(multi_class='auto', solver='lbfgs', penalty='l2', max_iter=100)),
])

# Hyperparameter grid; the keys follow the '<step>__<parameter>' convention.
parameters = dict(
    vect__max_df=[0.8],
    vect__max_features=[210000],
    vect__ngram_range=[(1, 2)],
    vect__norm=['l2'],
    clf__C=[2.5, 2.6, 2.7, 2.8, 2.9],
)

# Search the best parameters for both the feature extraction and the classifier.
if GRID_SEARCH_ON:
    grid_search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1, verbose=1)
    start = time.time()
    grid_search.fit(lyrics_lemmad, train_labels)
    end = time.time()
    print("done in %0.3fs" % (end - start))
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

#### Results grid search 
The different hyperparameters and ranges we are testing:
- **tokenizer**: [my_tokenizer, my_tokenizer_stem, my_tokenizer_lemma].
- **max_df**: range(0.3, 1)
- **max_features**: range(10000, 200000)
- **ngram_range**: unigrams, bigrams. Unigram and bigram models tend to have the best results.
- **C**: range(0.01, 3). The regularization strength is the most important parameter to finetune. A value around 0.1 increases the accuracy up to 10% compared to a bad choice of strength. When TF-IDF is on, the strength needs to be around 2.
- **TFIDF**: on or off (depends on which vectorizer we use). When turned on, the accuracy is higher.
- **norm**: when TFIDF=on defines the unit norm of each row.

There are a lot of different possible combinations. Here is the methodology for grid search.
0. preprocessing = none
1. Try out 3 different values for each hyperparameter (min, max, middle) and see which parameters modify the most the accuracy. For example the tokenizer doesn't change the accuracy that much, but the regularization strength affects a lot the accuracy.
2. For each hyperparameter that doesn't have a big impact, choose the value that gives the highest accuracy. If there is no trend (for example, the hyperparameter sometimes gives better results with a certain value and other times worse results), take the value that has the smallest computation time.
3. The value of regularization strength is the most important hyperparameter to determine. A value around 0.1 is a good choice.
4. Little by little, trim the ranges of the hyperparameter choices, taking each time the one that affects the most the accuracy.
5. Repeat from 0 for preprocessing = stemming, lemmatization
6. Repeat from 0 for TFIDF = on

**TFIDF=off**  
Best with no tokenization modification: **55,7%** {C: 0,07, max_df: 0,7, max_features: 100000, ngram_range: bigram}  
Best with stemming: **55,9%** {C: 0.1, max_df: 0.7, max_features: 150000, ngram_range: bigram}  
Best with lemmatization: **55,5%** {C: 0.14, max_df: 0.7, max_features: 150000, ngram_range: bigram}

**TFIDF=on**  
Best with no tokenization modification:  **58,6%** {C: 2.6, max_df: 0.5, max_features: 210000, ngram_range: bigram, norm='l2'}  
Best with stemming: **58,2%** {C: 2.2, max_df: 0.5, max_features: 25000, ngram_range: bigram, norm='l2'}  
Best with lemmatization:  **58,5%** {C: 2.8, max_df: 0.8, max_features: 210000, ngram_range: bigram, norm='l2'}  

In [None]:
# grid_search_df = pd.DataFrame.from_dict(grid_search.cv_results_)
# grid_search_df.to_csv(root_path+"result_reglog_tfidf_preprocessed_6.csv", sep=';', decimal=',')

#### We compute the accuracy of the best model on the validation set

In [None]:
# vectorizer = TfidfVectorizer(max_df=0.8, max_features=210000, ngram_range=(1, 2), norm='l2')
# classifier = LogisticRegression(multi_class='auto', solver='lbfgs', penalty='l2', C=2.8, max_iter=1000)
# lyrics_train_vec = vectorizer.fit_transform(train_df['lyrics'])
# lyrics_valid_vec = vectorizer.transform(valid_df['lyrics'])
# classifier.fit(lyrics_train_vec, train_df['genre'])
# print("Accuracy on training set:", accuracy_score(train_df['genre'], classifier.predict(lyrics_train_vec)))
# print("Accuracy on validation set:", accuracy_score(valid_df['genre'], classifier.predict(lyrics_valid_vec)))

# 6. Naïve Bayes Model  <a class="anchor" id="naive-bayes"></a>
We test different forms of the vectorised data: stemmed, lemmatized and no token transformation. The step to vectorize the data is quite long so we decide to test different hyperparameters of a model AFTER the vectorization is performed.

#### With no hyperparameter tuning:
Accuracy on training set: 68.43%  
Accuracy on validation set: 58.56%  

#### Best model with k-fold cross validation
Best accuracy with TFIDF and stemming on training set: **63,91%**  
On validation set: **59,76%**.

In [None]:
# Pipeline: a text feature extractor followed by a multinomial Naive Bayes classifier.
# The (slow) grid search only runs when GRID_SEARCH_ON is True.
GRID_SEARCH_ON = False
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),  # use TfidfVectorizer() here for TFIDF=on
    ('clf', MultinomialNB()),
])

# Hyperparameter grid; the keys follow the '<step>__<parameter>' convention.
# 'vect__norm' is left out because it is not a parameter of CountVectorizer().
parameters = dict(
    vect__max_df=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    vect__max_features=[100000, 150000, 200000],
    vect__ngram_range=[(1, 1), (1, 2)],
    clf__fit_prior=[True],
    clf__alpha=[1],
)

# Search the best parameters for both the feature extraction and the classifier.
if GRID_SEARCH_ON:
    grid_search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1, verbose=1)
    start = time.time()
    grid_search.fit(lyrics_lemmad, train_labels)
    end = time.time()
    print("done in %0.3fs" % (end - start))
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

#### Results grid search 
The different hyperparameters and ranges we are testing:
- **tokenizer**: [my_tokenizer, my_tokenizer_stem, my_tokenizer_lemma]. 
- **max_df**: range(0.3, 1). This value is pretty much 0.2 across the board for all the following experiments.
- **max_features**: range(10000, 200000). For TFIDF = off, this value is in the upper end of the range. For TFIDF = on, this value is typically much lower (see results)
- **ngram_range**: unigrams, unigrams and bigrams, bigrams. (1,2) is the best performing parameter in the following tests.
- **alpha**: float - controls smoothing; 0 is no smoothing, 1 is Laplace smoothing. 
- **fit_prior**: bool - whether to learn class priors. All test run best with TRUE.
- **TFIDF**: on or off (depends on which vectorizer we use).
- **norm**: when TFIDF=on defines the unit norm of each row.

There are a lot of different possible combinations. Here is the methodology for grid search.
0. preprocessing = none
1. Try out 3 different values for each hyperparameter (min, max, middle) and see which parameters modify the most the accuracy. For example the tokenizer doesn't change the accuracy that much, but the regularization strength affects a lot the accuracy.
2. For each hyperparameter that doesn't have a big impact, choose the value that gives the highest accuracy. If there is no trend (for example, the hyperparameter sometimes gives better results with a certain value and other times worse results), take the value that has the smallest computation time.
3. The value of regularization strength is the most important hyperparameter to determine. A value around 0.1 is a good choice.
4. Little by little, trim the ranges of the hyperparameter choices, taking each time the one that affects the most the accuracy.
5. Repeat from 0 for preprocessing = stemming, lemmatization
6. Repeat from 0 for TFIDF = on

Initial parameters run as:
```python
parameters = { 
    'vect__max_df': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'vect__max_features': [10000,100000,200000],
    'vect__ngram_range': [(1,1),(1,2),(2,2)],
    'vect__norm': ['l2','l1'], #not a parameter for CountVectorizer()
    'clf__fit_prior': [True,False],
    'clf__alpha': [0,0.5,1],
}
```
From here we refine our selection criteria per tokenized set.

**TFIDF=off**

Best with no tokenization modification: **57.12%** {'clf__alpha': 1, 'clf__fit_prior': True, 'vect__max_df': 0.2, 'vect__max_features': 150000, 'vect__ngram_range': (1, 2)}

Best with stemming: **56.76%**  {clf__alpha: 1,	clf__fit_prior: True, vect__max_df: 0.2, vect__max_features: 200000, vect__ngram_range: (1, 2)}

Best with lemmatization: **56.92%** {'clf__alpha': 1, 'clf__fit_prior': True, 'vect__max_df': 0.3, 'vect__max_features': 150000, 'vect__ngram_range': (1, 2)}

**TFIDF=on**

Best with no tokenization modification: **58.20%** {'clf__alpha': 0.1, 'clf__fit_prior': True, 'vect__max_df': 0.2, 'vect__max_features': 10000, 'vect__ngram_range': (1, 2), 'vect__norm': 'l2'}

Best with stemming: **58.36%** {'clf__alpha': 0.1, 'clf__fit_prior': True, 'vect__max_df': 0.2, 'vect__max_features': 7500, 'vect__ngram_range': (1, 2), 'vect__norm': 'l2'}

Best with lemmatization: **58.25%** {'clf__alpha': 0.1, 'clf__fit_prior': True, 'vect__max_df': 0.2, 'vect__max_features': 7500, 'vect__ngram_range': (1, 2), 'vect__norm': 'l2'}

In [None]:
# grid_search_df = pd.DataFrame.from_dict(grid_search.cv_results_)
# grid_search_df.to_csv(root_path+"result_tuned_lemmad.csv", sep=';', decimal=',')

#### We compute the accuracy of the best model on the validation set

In [None]:
# vectorizer = TfidfVectorizer(max_df=0.2, max_features=7500, ngram_range=(1, 2), norm='l2')#add best parameters
# classifier = MultinomialNB(alpha=0.1,fit_prior=True)#add best parameters
# train_lyrics_stemmed = [' '.join(my_tokenizer_stem(song)) for song in train_df['lyrics']]
# valid_lyrics_stemmed = [' '.join(my_tokenizer_stem(song)) for song in valid_df['lyrics']]
# lyrics_train_vec = vectorizer.fit_transform(train_lyrics_stemmed)
# lyrics_valid_vec = vectorizer.transform(valid_lyrics_stemmed)
# classifier.fit(lyrics_train_vec, train_df['genre'])
# print("Accuracy on training set:", accuracy_score(train_df['genre'], classifier.predict(lyrics_train_vec)))
# print("Accuracy on validation set:", accuracy_score(valid_df['genre'], classifier.predict(lyrics_valid_vec)))

# 7. Support Vector Machine  <a class="anchor" id="SVM"></a>
We test different forms of the vectorised data: stemmed, lemmatized and no token transformation. The step to vectorize the data is quite long so we decide to test different hyperparameters of a model AFTER the vectorization is performed.

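SVMs are 2-class classifiers. With the LinearSVC there is multiclass support according to a one-vs-the-rest scheme. We also try out other kernels such as the polynomial kernel (the RBF and sigmoid kernels are listed as alternatives in the grid-search cell).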

#### With no hyperparameter tuning:
Accuracy on training set: 99.45%  
Accuracy on validation set: 47.56%  

#### Best model with k-fold cross validation
Best accuracy: with TFIDF and lemmatization, for linear kernel.  
Accuracy on training set: **93,17%**  
Accuracy on validation set: **59,56%**

In [None]:
# Scratch cell: generates the values 0.1, 0.2, ..., 0.9 (candidate max_df values).
[i/10 for i in range(1, 10)]
# [i*1000 for i in range(150, 250, 30)]

In [None]:
# Pipeline: a text feature extractor followed by a kernel SVM classifier.
# The (slow) grid search only runs when GRID_SEARCH_ON is True.
GRID_SEARCH_ON = False
pipeline = Pipeline([
    # ('vect', CountVectorizer()),
    ('vect', TfidfVectorizer()),
    # ('clf', LinearSVC()),
    ('clf', SVC()),
])

# Hyperparameter grid; the keys follow the '<step>__<parameter>' convention.
parameters = {
    # max_df has to be a float to be read as a proportion of documents:
    # the integer 1 would mean "ignore every term that appears in more than
    # ONE document", which discards almost the whole vocabulary.
    'vect__max_df': [0.5, 1.0],
    # 'vect__max_features': [600000, 700000, 800000,],
    'vect__ngram_range': [(1,2)],
    'clf__C': [0.01, 0.1, 1, 2, 3],
    'clf__kernel': ['poly'], # 'rbf', 'sigmoid'
    'clf__gamma': ['scale', 'auto']
}

# find the best parameters for both the feature extraction and the classifier
if GRID_SEARCH_ON:
    # Only 2 folds here (5 for the other models).
    grid_search = GridSearchCV(pipeline, parameters, cv=2, n_jobs=-1, verbose=1)
    start = time.time()
    grid_search.fit(lyrics_stemmed, train_labels)
    end = time.time()
    print("done in %0.3fs" % (end - start))
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

In [None]:
# grid_search_df = pd.DataFrame.from_dict(grid_search.cv_results_)
# grid_search_df.to_csv(root_path+"result_SVC_tfidf_stemmed_1.csv", sep=';', decimal=',')

#### Results grid search 
The different hyperparameters and ranges we are testing:
- **tokenizer**: [my_tokenizer, my_tokenizer_stem, my_tokenizer_lemma]. 
- **max_df**: [0.2, 0.5, 0.8]
- **max_features**: [10000, 50000, 100000, 150000, 200000, 250000]
- **ngram_range**: unigrams, unigrams and bigrams.
- **C**: [0.5, 1, 2].
- **TFIDF**: on or off (depends on which vectorizer we use).
- **norm**: when TFIDF=on defines the unit norm of each row.

Initial parameters run as:
```python
parameters = { 
    'vect__max_df': [0.2, 0.5, 0.8],
    'vect__max_features': [10000, 50000, 100000, 200000],
    'vect__ngram_range': [(1, 1), (1,2)],
    'clf__C': [0.5, 1, 2],
}
```
From here we refine our selection criteria per tokenized set.

**TFIDF=off**  
Best with no tokenization modification: **58,56%** {max_df=1, ngram: bigram, C=0.0006}  
Best with stemming: **57,4%** {max_df=0.5, max_features=100000, ngram: bigram, C=0.001}  
Best with lemmatization: **57,3%** {max_df=0.5, ngram: bigram, C=0.0009}  

**TFIDF=on**  
Best with no tokenization modification: **58,2%** {max_df=0.4, ngram: bigram, C=0.3}  
Best with stemming: **58,4%** {max_df=0.3, ngram: bigram, C=0.4}  
Best with lemmatization: **58,5%** {max_df=0.6, ngram: bigram, C=0.305}  

In [None]:
# start = time.time()
# # vectorizer = TfidfVectorizer(max_df=0.6, ngram_range=(1, 2), norm='l2')#add best parameters
# vectorizer = TfidfVectorizer(ngram_range=(1, 2))#add best parameters
# # classifier = LinearSVC(C=0.305) #add best parameters
# classifier = SVC() #add best parameters
# # train_lyrics_lemmad = [' '.join(my_tokenizer_lemma(song)) for song in train_df['lyrics']]
# train_lyrics_lemmad = train_df['lyrics']
# valid_lyrics_lemmad = valid_df['lyrics']
# # valid_lyrics_lemmad = [' '.join(my_tokenizer_lemma(song)) for song in valid_df['lyrics']]
# lyrics_train_vec = vectorizer.fit_transform(train_lyrics_lemmad)
# lyrics_valid_vec = vectorizer.transform(valid_lyrics_lemmad)
# classifier.fit(lyrics_train_vec, train_df['genre'])
# print("Accuracy on training set:", accuracy_score(train_df['genre'], classifier.predict(lyrics_train_vec)))
# print("Accuracy on validation set:", accuracy_score(valid_df['genre'], classifier.predict(lyrics_valid_vec)))
# print("Done in {:03.2f} seconds".format(time.time()-start))

# 8. Sequential model - LSTM  <a class="anchor" id="LSTM"></a>

## TODO:
- change var names, functions, ...

In [None]:
# Number of songs in the training / validation sets used for the LSTM.
TRAIN_SIZE = 20000 # in paper 395,722 not 20,000
# NOTE(review): the original comment read "in paper 2500 not 49,776", which
# looks inverted since 2,500 is the value used here — confirm against the paper.
VALID_SIZE = 2500 # in paper 49,776 not 2,500

VOCAB_SIZE = 30000 # and 1 for unknown, and 1 for mask
WORD_VEC_SIZE = 100  # dimension of the word embeddings
MAX_WORDS = 200 # max number of words in song

# Fix the PyTorch seed and use the first GPU when one is available.
torch.manual_seed(1)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## 8.1 Word embedding

We create word embeddings with Word2Vec, from the gensim package. There are two possible models: continuous bag of words and Skip Gram. The models are trained on the whole data (the 215,824 preprocessed english songs). To avoid re-computing the models, they are saved in the files `model_CBOW_215824_en_songs.model` and `model_Skip_Gram_215824_en_songs.model`.
Helpful resources : 
- https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec
- https://www.geeksforgeeks.org/python-word-embedding-using-word2vec/
- https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial#Training-the-model

In [None]:
# Get the two Word2Vec models (CBOW and Skip Gram): load them from disk when
# LOAD_SAVED_W2V_MODELS is True, otherwise train them on all the preprocessed
# English songs.
LOAD_SAVED_W2V_MODELS = True
if LOAD_SAVED_W2V_MODELS:
    model_CBOW = Word2Vec.load(root_path+'model_CBOW_215824_en_songs.model')
    model_Skip_Gram = Word2Vec.load(root_path+'model_Skip_Gram_215824_en_songs.model')
else:
    lyrics_en_all = data_en_all_df['lyrics']
    # One token list per song. Named `sentences` rather than `data` so that the
    # `data` module imported from torch.utils is not overwritten.
    sentences = [song.split() for song in lyrics_en_all]
    # Leave one core free, but never ask gensim for 0 worker threads.
    n_workers = max(1, cores - 1)

    # Create CBOW model with gensim https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec
    # Parameters: min_count=Ignores all words with total frequency lower than this.
    start_CBOW=time.time()
    model_CBOW = gensim.models.Word2Vec(sentences, min_count=5, size=WORD_VEC_SIZE, window=5, sg=0, workers=n_workers)
    end_CBOW=time.time()
    print("CBOW done in", end_CBOW-start_CBOW)
    # Create Skip Gram model
    start_SG=time.time()
    model_Skip_Gram = gensim.models.Word2Vec(sentences, min_count=5, size=WORD_VEC_SIZE, window=10, sg=1, workers=n_workers)
    end_SG=time.time()
    print("Skip Gram done in:", end_SG-start_SG)

    # Save models for later
    # model_CBOW.save(root_path+'model_CBOW_215824_en_songs.model')
    # model_Skip_Gram.save(root_path+'model_Skip_Gram_215824_en_songs.model')


# Sanity checks on the embeddings: a related pair and an unrelated pair.
print("Cosine similarity between 'love' and 'girl' - CBOW : ", model_CBOW.wv.similarity('love', 'girl'))
print("Cosine similarity between 'love' and 'pasta' - CBOW : ", model_CBOW.wv.similarity('love', 'pasta')) 
print("Cosine similarity between 'love' and 'girl' - Skip Gram : ", model_Skip_Gram.wv.similarity('love', 'girl'))
print("Cosine similarity between 'love' and 'pasta' - Skip Gram : ", model_Skip_Gram.wv.similarity('love', 'pasta'))
# NOTE(review): 'woman' is both added and subtracted, so this query reduces to
# "most similar to 'queen'". If a word analogy was intended, the usual form is
# positive=['woman', 'king'], negative=['man'] — confirm.
print("{}: {:.4f}".format(*model_CBOW.wv.most_similar(positive=['woman', 'queen'], negative=['woman'])[0]))

### Keeping most common words
We want to keep the vocab_size most common words (for example **30,000** like in the paper). There are **66,401** embedded words.

In [None]:
# Word -> number of occurrences, for every word known to the CBOW model.
w2c = {word: entry.count for word, entry in model_CBOW.wv.vocab.items()}
# Same mapping, ordered from the most to the least frequent word.
w2cSorted = dict(sorted(w2c.items(), key=lambda pair: pair[1], reverse=True))
w2cSortedList = list(w2cSorted)
MOST_FREQUENT_WORDS = w2cSortedList[:VOCAB_SIZE]
print(MOST_FREQUENT_WORDS[:10])

# Word indices start at 1; index 0 is left free.
WORD_TO_INT = {word: idx for idx, word in enumerate(MOST_FREQUENT_WORDS, start=1)}
INT_TO_WORD = {idx: word for word, idx in WORD_TO_INT.items()}

We transform each song so that each word becomes the index of that word in the list `MOST_FREQUENT_WORDS`. The index n°**30001** corresponds to the *UNK* words and the index n°**0** corresponds to the pad (used later on so that all songs are `MAX_WORDS` = 200 words long).

In [None]:
# When True, the word indices of the songs are loaded from disk instead of
# being recomputed.
LOAD_EMBEDDING_IDXS = True

def get_embedded_song_idx(song):
    """Map the words of one tokenized song to their vocabulary indices.

    Args:
        song: iterable of words (str).

    Returns:
        1-D integer numpy array with one index per word: WORD_TO_INT[word]
        when the word is in the vocabulary, VOCAB_SIZE + 1 otherwise.
    """
    unknown_idx = VOCAB_SIZE + 1
    # Built in one pass: np.append inside a loop copies the whole array at
    # every word, which is quadratic in the length of the song.
    return np.array([WORD_TO_INT.get(word, unknown_idx) for word in song], dtype=int)

def embed_songs_idx(songs):
    """Map every tokenized song to its array of vocabulary indices.

    Args:
        songs: iterable of tokenized songs (each an iterable of words).

    Returns:
        1-D numpy array of dtype object holding one integer array per song.
        An explicit object array is used because the songs have different
        lengths and recent NumPy versions refuse to build an array from
        ragged sequences implicitly.
    """
    songs = list(songs)
    embedded = np.empty(len(songs), dtype=object)
    for position, song in enumerate(songs):
        embedded[position] = get_embedded_song_idx(song)
    return embedded

# Word indices of the training and validation songs: loaded from disk, or
# recomputed from the preprocessed lyrics.
if LOAD_EMBEDDING_IDXS:
    # allow_pickle=True is needed to load these saved arrays; only use it on
    # files produced by this project, since unpickling untrusted files is unsafe.
    embed_song_idxs_train = np.load(root_path + 'CBOW_embedded_train_idxs.npy', allow_pickle=True)
    embed_song_idxs_valid = np.load(root_path + 'CBOW_embedded_valid_idxs.npy', allow_pickle=True)
else:
    embed_song_idxs_train = embed_songs_idx([song.split() for song in train_df['lyrics']])
    # np.save(root_path + 'CBOW_embedded_train_idxs', embed_song_idxs_train)
    # Same variable name as in the loading branch, so that the cells below
    # work whichever branch was taken.
    embed_song_idxs_valid = embed_songs_idx([song.split() for song in valid_df['lyrics']])
    # np.save(root_path + 'CBOW_embedded_valid_idxs', embed_song_idxs_valid)

### Pad data for songs to be same length
Some songs are shorter or longer than others. We truncate the long songs and pad the shorter ones.

In [None]:
# Distribution of the song lengths (in words) over the training set.
reviews_len = list(map(len, embed_song_idxs_train))
song_lengths = pd.Series(reviews_len)
song_lengths.hist()
plt.show()
song_lengths.describe()

In [None]:
def pad_line(song, max_words=None):
    """Pad or truncate a song's index sequence to a fixed length.

    Songs longer than the target length are cut at the end; shorter songs are
    right-padded with 0.

    Args:
        song: 1-D sequence of integer word indices.
        max_words: target length; defaults to the global ``MAX_WORDS``.

    Returns:
        1-D integer ``np.ndarray`` of length ``max_words``.
    """
    if max_words is None:
        max_words = MAX_WORDS
    kept = min(max_words, len(song))
    # Start from an integer array of zeros so the word indices stay integers
    # (concatenating with np.zeros() would silently upcast them to float64).
    new_line = np.zeros(max_words, dtype=int)
    new_line[:kept] = song[:kept]
    return new_line

# Training split: one fixed-length row of word indices per song, plus labels.
train_labels = train_df['genre'].to_numpy()
train_data = np.zeros((TRAIN_SIZE, MAX_WORDS), dtype=int)
for row, indexed_song in enumerate(embed_song_idxs_train):
    train_data[row, :] = pad_line(indexed_song)

# Validation (dev) split, built the same way.
valid_labels = valid_df['genre'].to_numpy()
valid_data = np.zeros((VALID_SIZE, MAX_WORDS), dtype=int)
for row, indexed_song in enumerate(embed_song_idxs_valid):
    valid_data[row, :] = pad_line(indexed_song)

## LSTM Model - WIP

Useful resources:
- https://towardsdatascience.com/sentiment-analysis-using-lstm-step-by-step-50d074f09948
- https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html
- https://towardsdatascience.com/moving-from-keras-to-pytorch-f0d4fff4ce79
- model.eval(): https://discuss.pytorch.org/t/model-eval-vs-with-torch-no-grad/19615

Other:
- https://mlwhiz.com/blog/2018/12/17/text_classification/
- https://towardsdatascience.com/moving-from-keras-to-pytorch-f0d4fff4ce79
- https://towardsdatascience.com/sentiment-analysis-using-lstm-step-by-step-50d074f09948
- https://github.com/samarth-agrawal-86/sentiment-analysis-pytorch/blob/master/sentiment_model_class.py
- https://github.com/lukysummer/Movie-Review-Sentiment-Analysis-LSTM-Pytorch
- https://github.com/lukysummer/Movie-Review-Sentiment-Analysis-LSTM-Pytorch/blob/master/sentiment_analysis_LSTM.py

#### Create PyTorch dataset

In [None]:
# Reminder of constants that are presumably defined earlier in the notebook:
# TRAIN_SIZE = 20000
# VALID_SIZE = 2500
# VOCAB_SIZE = 30000 # and 1 for unknown, and 1 for mask
# WORD_VEC_SIZE = 100
# MAX_WORDS = 200 # max number of words in song


## Hyper-parameters of the LSTM experiment
N_EPOCHS = 200    # number of passes over the training set
BATCH_SIZE = 200  # songs per mini-batch
N_HIDDEN = 32     # hidden size handed to the LSTM
N_GENRES = 10     # size of the classifier output
LR = 0.001        # learning rate handed to the optimizer

# Embedding matrix with VOCAB_SIZE + 2 rows of 100 floats:
#   row 0                -> all-zero vector for the padding index,
#   rows 1..VOCAB_SIZE   -> CBOW vectors of the most frequent words,
#   row VOCAB_SIZE + 1   -> vector of the next most frequent word; this row is
#                           the one looked up for out-of-vocabulary words.
WORD_EMBEDDING = np.vstack((
    np.zeros((1, 100), dtype=np.float32),
    model_CBOW.wv[w2cSortedList[:VOCAB_SIZE + 1]],
))

In [None]:
class SongsDataset(data.Dataset):
    """Dataset pairing each row of word indices with its genre label.

    Items are returned as ``torch.long`` tensors already moved to the global
    ``device``.
    """

    def __init__(self, dataset, labels):
        self.dataset, self.labels = dataset, labels

    def __len__(self):
        # One sample per label.
        return len(self.labels)

    def __getitem__(self, index):
        features = torch.tensor(self.dataset[index], dtype=torch.long).to(device)
        target = torch.tensor(self.labels[index], dtype=torch.long).to(device)
        return features, target

# Wrap the padded index matrices in datasets and batch them.
# DataLoader is reached through the imported `torch.utils.data` module
# (`data`), the same module SongsDataset takes its base class from.
train_dataset = SongsDataset(train_data, train_labels)
train_loader = data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_dataset = SongsDataset(valid_data, valid_labels)
valid_loader = data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [None]:
class GenreLSTM(nn.Module):
    """LSTM classifier over sequences of word indices.

    Word indices are looked up in a frozen embedding table initialised from
    ``word_embeddings``, fed through an LSTM, and the output of the last time
    step is projected to ``output_size`` raw scores (logits).

    Args:
        n_vocab: number of rows of the embedding table.
        n_embed: embedding dimension (LSTM input size).
        n_hidden: LSTM hidden size.
        n_layers: number of stacked LSTM layers.
        n_directions: 2 for a bidirectional LSTM, anything else is
            unidirectional.
        output_size: number of output scores.
        batch_size: stored on the instance; not used by ``forward``.
        word_embeddings: numpy array holding the pre-trained embedding matrix.
        drop_p: dropout probability applied to the LSTM output (and between
            LSTM layers when ``n_layers > 1``).
    """

    def __init__(self, n_vocab, n_embed, n_hidden, n_layers, n_directions, output_size, batch_size, word_embeddings, drop_p=0.5):
        super(GenreLSTM, self).__init__()
        self.n_hidden = n_hidden
        self.batch_size = batch_size
        self.n_directions = n_directions
        self.n_layers = n_layers

        self.word_embeddings = nn.Embedding(n_vocab, n_embed)
        # Replace the random initialisation with the pre-trained vectors and
        # freeze them. .float() is a no-op for float32 input and prevents a
        # dtype mismatch with the LSTM weights for float64 input.
        pretrained = torch.from_numpy(word_embeddings).float()
        self.word_embeddings.weight = nn.Parameter(pretrained, requires_grad=False)
        # nn.LSTM only applies dropout between stacked layers; passing a
        # non-zero value with a single layer has no effect and emits a warning.
        lstm_dropout = drop_p if n_layers > 1 else 0.0
        self.lstm = nn.LSTM(n_embed, n_hidden, n_layers, dropout=lstm_dropout,
                            batch_first=True, bidirectional=n_directions == 2)
        self.dropout = nn.Dropout(drop_p)
        self.fc = nn.Linear(n_directions*n_hidden, output_size)

    def forward(self, sentence):
        """Return the raw class scores for a batch of index sequences.

        Args:
            sentence: integer tensor of shape (batch_size, seq_length).

        Returns:
            Tensor of shape (batch_size, output_size) with unnormalised scores.
        """
        # (batch_size, seq_length)
        out = self.word_embeddings(sentence)
        # (batch_size, seq_length, n_embed)
        out, _ = self.lstm(out)
        # (batch_size, seq_length, n_directions*n_hidden)
        out = self.dropout(out)
        # NOTE(review): only the last time step is classified; if sequences are
        # right-padded that step is a padding position - confirm this is intended.
        out = self.fc(out[:, -1])
        # (batch_size, n_output)
        return out

In [None]:
# Build a single-layer, unidirectional LSTM classifier and its optimisation setup.
model = GenreLSTM(VOCAB_SIZE+2, WORD_VEC_SIZE, N_HIDDEN, 1, 1, N_GENRES, BATCH_SIZE, WORD_EMBEDDING)
model = model.to(device)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=LR)


start = time.time()
# Per-epoch history (plain Python floats): mean per-sample loss and accuracy.
train_loss_ = []
test_loss_ = []
train_acc_ = []
test_acc_ = []

for epoch in range(N_EPOCHS):
    print(epoch, time.time() - start)

    # ---- Training ----
    model.train()  # enable dropout
    total_acc = 0
    total_loss = 0.0
    total = 0
    for local_batch, local_labels in train_loader:
        # Transfer to GPU
        local_batch, local_labels = local_batch.to(device), local_labels.to(device)

        optimizer.zero_grad()
        genre_scores = model(local_batch)
        loss = loss_function(genre_scores, local_labels)
        loss.backward()
        optimizer.step()

        # Accumulate as Python numbers so the history holds no tensors.
        n_samples = len(local_labels)
        predicted = genre_scores.detach().argmax(dim=1)
        total_acc += (predicted == local_labels).sum().item()
        total += n_samples
        # The loss is a batch mean: weight it by the batch size so that
        # dividing by the sample count yields the mean per-sample loss.
        total_loss += loss.item() * n_samples

    train_loss_.append(total_loss / total)
    train_acc_.append(total_acc / total)

    # ---- Validation ----
    model.eval()  # disable dropout so the evaluation is deterministic
    total_acc = 0
    total_loss = 0.0
    total = 0
    with torch.no_grad():
        for local_batch, local_labels in valid_loader:
            # Transfer to GPU
            local_batch, local_labels = local_batch.to(device), local_labels.to(device)

            genre_scores = model(local_batch)
            loss = loss_function(genre_scores, local_labels)

            n_samples = len(local_labels)
            predicted = genre_scores.argmax(dim=1)
            total_acc += (predicted == local_labels).sum().item()
            total += n_samples
            total_loss += loss.item() * n_samples

    test_loss_.append(total_loss / total)
    test_acc_.append(total_acc / total)
    print('[Epoch: %3d/%3d] Training Loss: %.6f, Testing Loss: %.6f, Training Acc: %.3f, Testing Acc: %.3f'
          % (epoch, N_EPOCHS, train_loss_[epoch], test_loss_[epoch], train_acc_[epoch], test_acc_[epoch]))
        


In [None]:
# Token count of every training song: histogram first, then summary statistics.
reviews_len = list(map(len, embed_song_idxs_train))
song_length_series = pd.Series(reviews_len)
song_length_series.hist()
plt.show()
song_length_series.describe()

### Pytorch tutorial

In [None]:
# Keras baseline LSTM (embedding -> LSTM -> dropout -> global max pooling -> softmax).
# NOTE(review): this cell relies on Keras names (Sequential, Embedding, LSTM,
# Dropout, GlobalMaxPooling1D, Dense, optimizers, EarlyStopping, ModelCheckpoint)
# and on variables (vocab_size, embed_size, embedding_matrix, n_hidden, dropout,
# num_genres, learning_rate, training_epochs, batch_size, dev_data, dev_labels,
# test_data, test_labels) that are not defined in the visible part of the notebook.
print(np.shape(train_data))
model = Sequential()
model.add(Embedding(vocab_size + 2, embed_size, weights=[embedding_matrix], input_length=MAX_WORDS, trainable=True))
model.add(LSTM(n_hidden, activation='sigmoid', return_sequences=True))
model.add(Dropout(dropout))
model.add(GlobalMaxPooling1D())
model.add(Dense(num_genres, activation='softmax'))

optimizer = optimizers.RMSprop(lr=learning_rate)
# Pass the configured optimizer object; the string 'rmsprop' would silently
# discard the learning rate set above.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=optimizer,
              metrics=['acc'])

print("model fitting - Baseline LSTM")
# summary() prints the table itself; wrapping it in print() would add "None".
model.summary()
earlystopping = EarlyStopping(monitor='val_loss', patience=3)
checkpointer = ModelCheckpoint(filepath='results/lstm/lstmbest.hdf5',verbose=1,save_best_only=True)
# NOTE(review): `nb_epoch` is the Keras 1 spelling; recent Keras versions
# expect `epochs` - confirm against the installed version.
hist = model.fit(train_data, train_labels, validation_data=(dev_data, dev_labels),
          nb_epoch=training_epochs, batch_size=batch_size, callbacks=[checkpointer, earlystopping])
print(hist.history)
model.save('results/lstm/lstm.h5')

evals = model.evaluate(test_data, test_labels)
print("Test accuracy:", evals)
