In [1]:
import numpy as np
import pandas as pd
import csv

In [2]:
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
from nltk.corpus import stopwords

In [4]:
# loading from file now that combined data file has been created
lyric_df = pd.read_csv('lyric_data_combined_nostopwords.csv',  usecols=range(1,3), header = 0, delimiter=",", quoting=csv.QUOTE_NONE, 
                       encoding='utf-8')
lyric_df.columns = ['genre', 'lyrics']

In [14]:
# now want to test classifying between just two distinct genres
lyric_df = lyric_df.loc[(lyric_df['genre'] == 'Metal') | (lyric_df['genre'] == 'R&B')].astype(str)

In [None]:
lyric_df.head

In [11]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [12]:
MAX_NB_WORDS = 5000
MAX_SEQ_LEN = 250
EMBEDDING_DIM = 100

In [16]:
tokenizer = Tokenizer(num_words = MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True) 
tokenizer.fit_on_texts(lyric_df['lyrics'].values)
word_index = tokenizer.word_index

print("Unique tokens = ", len(word_index))

Unique tokens =  53278


In [17]:
# making seqs more uniform, len at least
X = tokenizer.texts_to_sequences(lyric_df['lyrics'].values)
X = pad_sequences(X, maxlen = MAX_SEQ_LEN)
print('Data shape = ', X.shape)

Data shape =  (23217, 250)


In [18]:
# class labels to nums
y = pd.get_dummies(lyric_df['genre']).values
print("Label shape = ", y.shape)

Label shape =  (23217, 2)


In [19]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping
# train test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

In [20]:
# lstm model
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax')) # replace this num with final number of classes classifying between/in dataset used if this changes

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 8
batch_size = 64

history = model.fit(X_train, y_train, epochs = epochs, batch_size = batch_size, validation_split=0.5, 
                    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8


In [21]:
# saving trained model and tokenizer
import pickle
from pickle import dump
model.save('model_metal_rnb.h5')
dump(tokenizer, open('tokenizer_metal_rnb.pkl', 'wb'))

In [22]:
# testing model accuracy
accuracy = model.evaluate(X_test, y_test)
print("Loss: ", accuracy[0])
print("Accuracy: ", accuracy[1])

Loss:  0.3681314289569855
Accuracy:  0.8776916265487671
