# Applied Natural Language Processing - AT2 - HDInnovators
__Content:__

1. Import packages and create functions
2. Load and explore the dataset
3. Data preparation
4. Genres classification
  
  4.1. Bert
5. Sentiment analysis

  5.1. RandomForest 
  
  5.2. Neural network

  5.3 Bert

6. Topic Modelling

In [1]:
!pip install -q -U keras-tuner

## 1. Import packages and create function

In [2]:
# import tensorflow as tf
# device_name = tf.test.gpu_device_name()
# if len(device_name) > 0:
#     print("Found GPU at: {}".format(device_name))
# else:
#     device_name = "/device:CPU:0"
#     print("No GPU, using {}.".format(device_name))
# import locale
# def getpreferredencoding(do_setlocale = True):
#     return "UTF-8"
# locale.getpreferredencoding = getpreferredencoding

ModuleNotFoundError: No module named 'tensorflow'

In [4]:
# data processing
import pandas as pd
import re
import string
import numpy as np

# nltk
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

# cross-validation
from sklearn.model_selection import train_test_split

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# # drive
# from google.colab import drive
# drive.mount('/content/drive')

# tensorflow vectorization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import keras_tuner as kt

# tensorflow deep learning models
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# ignore warnings 
import warnings
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nguyenthao/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nguyenthao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nguyenthao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/nguyenthao/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


ModuleNotFoundError: No module named 'tensorflow'

In [5]:
# Create a function to transform valence sentiment labels
def create_sentiment_from_valence(valence):
   if valence >= 0 and valence < 1/3:
     return 0
   elif valence >= 1/3 and valence <  2/3:
     return 2
   else:
     return 1

## 2. Load and explore the dataset

__Load dataset__

In [6]:
# Loading dataset
df = pd.read_csv('./Dataset with 2k each genre.csv')

In [7]:
df

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,valence
0,13923,santana,wham!,1978,blues,cold chamber smoke kush gettin higher plane sw...,198,0.857791
1,14958,marvin sease,show me what you got,1991,blues,public service announcement weezy baby best ra...,198,0.794930
2,15045,the robert cray band,1040 blues,1993,blues,gotta rapper today forget fuck smokin brain ce...,198,0.613561
3,15691,carl sims,it's just a party,2004,blues,girls knock ghetto ballin real thugs unngghhhh...,198,0.705276
4,16322,rufus thomas,sixty minute man,2011,blues,diddle devil yabba double walk feel body feel ...,198,0.651690
...,...,...,...,...,...,...,...,...
11973,26756,newsboys,your love never fails,2011,rock,ohhh separate away fail know mistake cause fai...,61,0.550701
11974,26866,mumford & sons,where are you now,2012,rock,come hear walk city streets say word finally e...,61,0.101401
11975,26875,snow patrol,just say yes,2013,rock,run ways want stay okay pretend tell today han...,61,0.236397
11976,27024,starset,carnivore,2014,rock,life know inside beast grow wait chew rope cha...,61,0.025556


## 3. Data preparation

__Lowercase__

In [8]:
# Convert all data of lyrics to string type.
df['lyrics'] = df['lyrics'].apply(str)

# Convert all string of lyrics to lowercase.
df['lyrics'] = df['lyrics'].str.lower()

 __Tokenize__

In [9]:
# Tokenize text from lyrics. 
df['tokenized'] = df['lyrics'].apply(word_tokenize)

__Remove punctuations__

In [10]:
# All punctuations
punc_marks = list(string.punctuation)
# Remove all punctuations.
df['tokenized'] = df['tokenized'].apply(lambda x: [word for word in x if word not in punc_marks])

__Remove stopwords__

In [11]:
# All stopwords of nltk
stopwords = nltk.corpus.stopwords.words('english')

# Customized list of stop words.
stopwords.extend(["'m", "'s", "'d", "hi", "im", "wa", "n't", "'get", "'ll", "'re", "'ve", "get", "still", "mmm", "ooh", "oooh", "yah", "yeh","mmm", "hmm"])

# Remove the stop words from the dataset and save the result to new column. 
df['cleaned_stopwords'] = df['tokenized'].apply(lambda x: [word for word in x if word not in stopwords])

__Lemmatize__

In [12]:
# Function to attain part of speech of words.
def determine_wordnet_speech(word_tag):
    if word_tag.startswith('J'):
        return wordnet.ADJ
    elif word_tag.startswith('V'):
        return wordnet.VERB
    elif word_tag.startswith('N'):
        return wordnet.NOUN
    elif word_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
# Add part of speech tags and save the result to new column.
df['pos_tags'] = df['cleaned_stopwords'].apply(nltk.tag.pos_tag)

# Function to get part of speech in WordNet format.
df['pos_tags'] = df['pos_tags'].apply(lambda x: [(word, determine_wordnet_speech(pos_tag)) for (word, pos_tag) in x])

# Lemmatize words and save the result to new column.
word_lemmatizer = WordNetLemmatizer()
df['lyrics_lemmatized'] = df['pos_tags'].apply(lambda x: [word_lemmatizer.lemmatize(word, tag) for word, tag in x])

# Convert list to string datatype. 
df['lyrics_cleaned'] = [' '.join(map(str,l)) for l in df['lyrics_lemmatized']]

# Check few rows.
df.head(3)

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,valence,tokenized,cleaned_stopwords,pos_tags,lyrics_lemmatized,lyrics_cleaned
0,13923,santana,wham!,1978,blues,cold chamber smoke kush gettin higher plane sw...,198,0.857791,"[cold, chamber, smoke, kush, gettin, higher, p...","[cold, chamber, smoke, kush, gettin, higher, p...","[(cold, a), (chamber, n), (smoke, v), (kush, n...","[cold, chamber, smoke, kush, gettin, high, pla...",cold chamber smoke kush gettin high plane swan...
1,14958,marvin sease,show me what you got,1991,blues,public service announcement weezy baby best ra...,198,0.79493,"[public, service, announcement, weezy, baby, b...","[public, service, announcement, weezy, baby, b...","[(public, a), (service, n), (announcement, n),...","[public, service, announcement, weezy, baby, b...",public service announcement weezy baby best ra...
2,15045,the robert cray band,1040 blues,1993,blues,gotta rapper today forget fuck smokin brain ce...,198,0.613561,"[got, ta, rapper, today, forget, fuck, smokin,...","[got, ta, rapper, today, forget, fuck, smokin,...","[(got, v), (ta, a), (rapper, n), (today, n), (...","[get, ta, rapper, today, forget, fuck, smokin,...",get ta rapper today forget fuck smokin brain c...


__Encode label__

In [13]:
# Transform valence to sentiment labels
df['sentiment'] = df['valence'].apply(create_sentiment_from_valence)

__Data splitting__

In [14]:
# Extract labels 
y = df['sentiment']
# Extract independent variables
X = df['lyrics_cleaned']

## 5. Sentiment analysis 

### 5.1 RandomForest

### 5.2 LTSM

__Vectorization__

In [15]:
# Calculate the length of our vocabulary
word_tokenizer = Tokenizer()

# Create a dict of word and index from the list of sentences. Required before texts_to_sequences
word_tokenizer.fit_on_texts(X)

vocab_length = len(word_tokenizer.word_index) + 1
vocab_length

40422

In [16]:
# Pad sequences
all_lyrics = X.values
longest_lyrics = max(all_lyrics, key=lambda sentence: len(word_tokenize(sentence)))
length_long_sentence = len(word_tokenize(longest_lyrics))

# texts_to_sequences: Transforms each text in texts to a sequence of integers (integers = index of word by fit_on_texts)
padded_lyrics = pad_sequences(
    word_tokenizer.texts_to_sequences(all_lyrics),
    length_long_sentence, 
    padding='post'
)

In [17]:
# Split dataset into train set and test set
X_train, X_test, y_train, y_test = train_test_split(padded_lyrics, y, random_state=42, stratify = y)

__Modelling__

In [21]:
def model_builder(hp):
  with tf.device(device_name):
    model = Sequential()
    # embedding
    hp_vector_size = hp.Int('vector_size', 
                            min_value=50, 
                            max_value=200, 
                            step=50)
    model.add(
        Embedding(input_dim=vocab_length,
                  output_dim=hp_vector_size,
                  input_length=length_long_sentence))

    # first lstm
    hp_lstm_units1 = hp.Int('lstm_units1', 
                            min_value=256, 
                            max_value=320, 
                            step=32)
    model.add(LSTM(hp_lstm_units1, return_sequences=True))

    # drop out
    hp_dropout_rate = hp.Float('dropout_rate', 
                               min_value=0.4, 
                               max_value=0.8, 
                               step=0.2)
    model.add(Dropout(hp_dropout_rate))

    # second lstm
    hp_lstm_units2 = hp.Int('lstm_units2', 
                            min_value=128, 
                            max_value=256, 
                            step=32)
    model.add(LSTM(hp_lstm_units2))

    # dense layer
    model.add(Dense(units = 100, activation='relu'))
    model.add(Dense(units = 50, activation='relu'))
    model.add(Dense(units = 25, activation='relu'))
    model.add(Dense(3,activation='softmax'))
    
    # compile
    hp_learning_rate = hp.Choice('learning_rate', 
                                 values=[1e-3, 1e-2])
    model.compile(optimizer=Adam(learning_rate=hp_learning_rate),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model

In [None]:
tuner = kt.Hyperband(model_builder,
                     objective=kt.Objective("val_accuracy", direction="max"),
                     factor=3,
                     max_epochs=100,
                     directory="model_trials_1",
                     project_name="emotion_detector_1",
                     overwrite = True
                     )
                     
stop_early = EarlyStopping(monitor='val_loss', patience=5)

tuner.search(X_train, 
             y_train, 
             epochs=50,
             validation_data=(X_test, y_test), 
             callbacks=[stop_early]
             )

# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

Trial 22 Complete [00h 02m 27s]
val_accuracy: 0.4023372232913971

Best val_accuracy So Far: 0.4036727845668793
Total elapsed time: 00h 46m 18s

Search: Running Trial #23

Value             |Best Value So Far |Hyperparameter
150               |100               |vector_size
288               |288               |lstm_units1
0.4               |0.8               |dropout_rate
128               |128               |lstm_units2
0.01              |0.01              |learning_rate
2                 |2                 |tuner/epochs
0                 |0                 |tuner/initial_epoch
4                 |4                 |tuner/bracket
0                 |0                 |tuner/round

Epoch 1/2

In [19]:
# prediction = model.predict(X_test)

In [20]:
# prediction
# prediction2 = []
# for i in prediction:
#   max_value = max(i)
#   prediction2.append(list(i).index(max_value))