In [64]:
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if len(device_name) > 0:
    print("Found GPU at: {}".format(device_name))
else:
    device_name = "/device:CPU:0"
    print("No GPU, using {}.".format(device_name))
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

No GPU, using /device:CPU:0.


In [65]:
# data processing
import pandas as pd
import re
import spacy
import string
import numpy as np

# nltk
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

# data splitting
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

# hyperparameter tuning
from sklearn.model_selection import GridSearchCV
import keras_tuner as kt

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# vectorization
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# tensorflow deep learning 
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, Flatten, SpatialDropout1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# RandomForest classifier
from sklearn.ensemble import RandomForestClassifier

# ignore warnings 
import warnings
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

[nltk_data] Downloading package wordnet to /Users/holys/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/holys/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/holys/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/holys/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [66]:
#Create a function to transform valence sentiment labels
def create_genre_label(genre):
   if genre == 'blues':
     return 1
   elif genre == 'country':
     return 2
   elif genre == 'jazz':
     return 3
   elif genre == 'pop':
     return 4
   elif genre == 'reggae':
     return 5
   else:
     return 0

# def create_sentiment_from_valence(valence):
#    if valence < 0.5:
#      return 0
#    else:
#      return 1

In [67]:
df = pd.read_csv('lyrics_all_combined.csv')

In [68]:
df.drop(columns=['Unnamed: 0'], axis=1, inplace=True)

In [69]:
df['lyrics'] = df['lyrics'].apply(str)

# Convert all string of lyrics to lowercase.
df['lyrics'] = df['lyrics'].str.lower()

In [70]:
# Tokenize text from lyrics. 
df['tokenized'] = df['lyrics'].apply(word_tokenize)

In [71]:
# All punctuations
punc_marks = list(string.punctuation)
# Remove all punctuations.
df['tokenized'] = df['tokenized'].apply(lambda x: [word for word in x if word not in punc_marks])

In [72]:
df['genre'].value_counts()

jazz       2000
pop        2000
country    2000
blues      2000
rock       2000
reggae     1978
Name: genre, dtype: int64

In [73]:
# All stopwords of nltk
stopwords = nltk.corpus.stopwords.words('english')

# Customized list of stop words.
stopwords.extend(["'m", "'s", "'d", "hi", "im", "wa", "n't",'wan','na','u','gon' ,'ahah','ayo',"'get", "'ll", "'re", "'ve", "get", "still", "mmm", "ooh", "oooh", "yah", "yeh","mmm", "hmm","i'm"])

# Remove the stop words from the dataset and save the result to new column. 
df['cleaned_stopwords'] = df['tokenized'].apply(lambda x: [word for word in x if word not in stopwords])

In [74]:
# Function to attain part of speech of words.
def determine_wordnet_speech(word_tag):
    if word_tag.startswith('J'):
        return wordnet.ADJ
    elif word_tag.startswith('V'):
        return wordnet.VERB
    elif word_tag.startswith('N'):
        return wordnet.NOUN
    elif word_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
# Add part of speech tags and save the result to new column.
df['pos_tags'] = df['cleaned_stopwords'].apply(nltk.tag.pos_tag)

# Function to get part of speech in WordNet format.
df['pos_tags'] = df['pos_tags'].apply(lambda x: [(word, determine_wordnet_speech(pos_tag)) for (word, pos_tag) in x])

# Lemmatize words and save the result to new column.
word_lemmatizer = WordNetLemmatizer()
df['lyrics_lemmatized'] = df['pos_tags'].apply(lambda x: [word_lemmatizer.lemmatize(word, tag) for word, tag in x])

# Convert list to string datatype. 
df['lyrics_cleaned'] = [' '.join(map(str,l)) for l in df['lyrics_lemmatized']]

# Check few rows.
df.head(3)

Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,valence,tokenized,cleaned_stopwords,pos_tags,lyrics_lemmatized,lyrics_cleaned
0,santana,wham!,1978,blues,cold chamber smoke kush gettin higher plane sw...,198,0.857791,"[cold, chamber, smoke, kush, gettin, higher, p...","[cold, chamber, smoke, kush, gettin, higher, p...","[(cold, a), (chamber, n), (smoke, v), (kush, n...","[cold, chamber, smoke, kush, gettin, high, pla...",cold chamber smoke kush gettin high plane swan...
1,marvin sease,show me what you got,1991,blues,public service announcement weezy baby best ra...,198,0.79493,"[public, service, announcement, weezy, baby, b...","[public, service, announcement, weezy, baby, b...","[(public, a), (service, n), (announcement, n),...","[public, service, announcement, weezy, baby, b...",public service announcement weezy baby best ra...
2,the robert cray band,1040 blues,1993,blues,gotta rapper today forget fuck smokin brain ce...,198,0.613561,"[got, ta, rapper, today, forget, fuck, smokin,...","[got, ta, rapper, today, forget, fuck, smokin,...","[(got, v), (ta, a), (rapper, n), (today, n), (...","[get, ta, rapper, today, forget, fuck, smokin,...",get ta rapper today forget fuck smokin brain c...


In [75]:
# Transform valence to sentiment labels
df['genre'] = df['genre'].apply(create_genre_label)

In [76]:
df['genre'].unique()

array([1, 2, 3, 4, 5, 0])

In [77]:
# Extract labels 
y = df['genre']
# Extract independent variables
X = df['lyrics_cleaned']
# Split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify = y)
# Create 3 cv for cross-validation 
cv = StratifiedKFold(n_splits=3, random_state=8, shuffle=True).split(X_train, y_train)

In [78]:
# Instantiate CountVectorizer
vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,2), min_df=0.2, max_df=0.7, max_features=100)
# Learn a vocabulary dictionary of all tokens in the training set.
vectorizer.fit(X_train)
# Transform training set and testing set to document-term matrix.
X_train_count_vectorized = vectorizer.transform(X_train)
X_test_count_vectorized = vectorizer.transform(X_test)

In [80]:
# Instantiate TfidfTransformer
tfidf_transformer = TfidfTransformer()
# Learn vocabulary and idf from training set.
tfidf_transformer.fit(X_train_count_vectorized)
# Transform a train and set count matrix to a tf-idf representation.
X_train_tfidf = tfidf_transformer.transform(X_train_count_vectorized)
X_test_tfidf = tfidf_transformer.transform(X_test_count_vectorized)

In [34]:
rf1 = RandomForestClassifier(random_state=8, n_estimators=50).fit(X_train_tfidf, y_train)

In [35]:
np.mean([estimator.tree_.max_depth for estimator in rf1.estimators_])

47.58

In [36]:
hyperparams_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [40, 55, 60],
    'min_samples_split': [2, 4, 8]
    }
hyperparams_grid

{'n_estimators': [50, 100, 150],
 'max_depth': [40, 55, 60],
 'min_samples_split': [2, 4, 8]}

In [37]:
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=8), hyperparams_grid, cv=3, verbose=2, scoring='accuracy')
grid_search_rf.fit(X_train_tfidf, y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] END .max_depth=40, min_samples_split=2, n_estimators=50; total time=   1.9s
[CV] END .max_depth=40, min_samples_split=2, n_estimators=50; total time=   1.9s
[CV] END .max_depth=40, min_samples_split=2, n_estimators=50; total time=   1.9s
[CV] END max_depth=40, min_samples_split=2, n_estimators=100; total time=   3.8s
[CV] END max_depth=40, min_samples_split=2, n_estimators=100; total time=   3.8s
[CV] END max_depth=40, min_samples_split=2, n_estimators=100; total time=   3.8s
[CV] END max_depth=40, min_samples_split=2, n_estimators=150; total time=   5.6s
[CV] END max_depth=40, min_samples_split=2, n_estimators=150; total time=   5.7s
[CV] END max_depth=40, min_samples_split=2, n_estimators=150; total time=   5.7s
[CV] END .max_depth=40, min_samples_split=4, n_estimators=50; total time=   2.0s
[CV] END .max_depth=40, min_samples_split=4, n_estimators=50; total time=   1.8s
[CV] END .max_depth=40, min_samples_split=4, n_e

In [38]:
best_params = grid_search_rf.best_params_
best_params

{'max_depth': 40, 'min_samples_split': 8, 'n_estimators': 150}

In [39]:
grid_search_rf.best_score_

0.2521432402924937

In [40]:
rf_best = RandomForestClassifier(random_state=8, max_depth=best_params['max_depth'], min_samples_split=best_params['min_samples_split'], n_estimators=best_params['n_estimators']).fit(X_train_tfidf, y_train)

In [41]:
y_test_preds = rf_best.predict(X_test_tfidf)

In [45]:
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
accuracy_score(y_test_preds, y_test)
#ConfusionMatrixDisplay.from_estimator(rf_best, y_test_preds, y_test, cmap="Blues", normalize='true')
#plt.title("RFC with Count Vectorizer");

0.25242070116861437

In [46]:
# Calculate the length of our vocabulary
word_tokenizer = Tokenizer()

# Create a dict of word and index from the list of sentences. Required before texts_to_sequences
word_tokenizer.fit_on_texts(X)

vocab_length = len(word_tokenizer.word_index) + 1
vocab_length

40420

In [81]:
# Pad sequences
all_lyrics = X.values
longest_lyrics = max(all_lyrics, key=lambda sentence: len(word_tokenize(sentence)))
length_long_sentence = len(word_tokenize(longest_lyrics))

# texts_to_sequences: Transforms each text in texts to a sequence of integers (integers = index of word by fit_on_texts)
padded_lyrics = pad_sequences(
    word_tokenizer.texts_to_sequences(all_lyrics),
    length_long_sentence, 
    padding='post'
)

In [82]:
# Split dataset into training set and testing set
X_train, X_test, y_train, y_test = train_test_split(padded_lyrics, y, random_state=42, stratify = y)

In [83]:
print(y_test.unique())

[5 0 1 4 2 3]


In [87]:
def model_builder(hp):
  with tf.device(device_name):
    model = Sequential()
    # embedding
    hp_vector_size = hp.Int('vector_size', 
                            min_value=50, 
                            max_value=200, 
                            step=50)
    model.add(
        Embedding(input_dim=vocab_length,
                  output_dim=hp_vector_size,
                  input_length=length_long_sentence))

    # first lstm
    hp_lstm_units1 = hp.Int('lstm_units1', 
                            min_value=100, 
                            max_value=101, 
                            step=1)
    model.add(LSTM(hp_lstm_units1, return_sequences=True))

    # drop out
    hp_dropout_rate = hp.Float('dropout_rate', 
                               min_value=0.4, 
                               max_value=0.8, 
                               step=0.2)
    model.add(Dropout(hp_dropout_rate))

    # second lstm
    hp_lstm_units2 = hp.Int('lstm_units2', 
                            min_value=100, 
                            max_value=101, 
                            step=1)
    model.add(LSTM(hp_lstm_units2))

    # dense layer
    # model.add(Dense(units = 100, activation='relu'))
    model.add(Dense(units = 50, activation='relu'))
    model.add(Dense(units = 25, activation='relu'))
    model.add(Dense(6,activation='softmax'))
    
    # compile
    hp_learning_rate = hp.Choice('learning_rate', 
                                 values=[5e-1])
    model.compile(optimizer=Adam(learning_rate=hp_learning_rate),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model

In [88]:
tuner = kt.Hyperband(model_builder,
                     objective=kt.Objective("val_accuracy", direction="max"),
                     factor=3,
                     max_epochs=100,
                     directory="model_trials_1",
                     project_name="emotion_detector_1",
                     overwrite = True
                     )
                     
stop_early = EarlyStopping(monitor='loss', patience=5)

tuner.search(X_train, 
             y_train, 
             epochs=50,
             validation_data=(X_test, y_test), 
             callbacks=[stop_early]
             )

# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

Trial 48 Complete [00h 03m 14s]
val_accuracy: 0.16694490611553192

Best val_accuracy So Far: 0.16694490611553192
Total elapsed time: 02h 32m 35s
INFO:tensorflow:Oracle triggered exit


In [None]:
# prediction = model.predict(X_test)