In [12]:
import numpy as np
import pandas as pd
import csv

In [13]:
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
from nltk.corpus import stopwords

In [14]:
lyric_df = pd.read_csv('trainclean.csv',  usecols=range(0,5), header = 0, delimiter=",", quoting=csv.QUOTE_NONE, 
                       encoding='utf-8')
lyric_df.columns = ['artist', 'song', 'genre', 'lang', 'lyrics']

In [15]:
# multi class data prep
# 'Rock', 'Metal', 'Pop', 'Indie', 'R&B', 'Folk', 'Electronic', 'Jazz' are the classes to target, and only the english language lyrics
# as ided from runnin genres = [i for i in lyric_df['genre'].unique() if isinstance(i, str)]
lyrics_metal = lyric_df.loc[(lyric_df['genre'] == 'Metal') & (lyric_df['lang'] == 'en')]
lyrics_rb = lyric_df.loc[(lyric_df['genre'] == 'R&B') & (lyric_df['lang'] == 'en')]
lyrics_rock = lyric_df.loc[(lyric_df['genre'] == 'Rock') & (lyric_df['lang'] == 'en')]
lyrics_pop = lyric_df.loc[(lyric_df['genre'] == 'Pop') & (lyric_df['lang'] == 'en')]
lyrics_indie = lyric_df.loc[(lyric_df['genre'] == 'Indie') & (lyric_df['lang'] == 'en')]
lyrics_folk = lyric_df.loc[(lyric_df['genre'] == 'Folk') & (lyric_df['lang'] == 'en')]
lyrics_elec = lyric_df.loc[(lyric_df['genre'] == 'Electronic') & (lyric_df['lang'] == 'en')]
lyrics_jazz = lyric_df.loc[(lyric_df['genre'] == 'Jazz') & (lyric_df['lang'] == 'en')]

lyric_df = pd.concat([lyrics_metal, lyrics_rb, lyrics_rock, lyrics_pop, lyrics_indie, lyrics_folk, lyrics_elec, lyrics_jazz])

In [22]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [11]:
MAX_NB_WORDS = 5000
MAX_SEQ_LEN = 250
EMBEDDING_DIM = 100

In [20]:
tokenizer = Tokenizer(num_words = MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True) 
tokenizer.fit_on_texts(lyric_df['lyrics'].values) #filtered data made above
word_index = tokenizer.word_index

print("Unique tokens = ", len(word_index))

Unique tokens =  196618


In [24]:
# making seqs more uniform, len at least
X = tokenizer.texts_to_sequences(lyric_df['lyrics'].values)
X = pad_sequences(X, maxlen = MAX_SEQ_LEN)
print('Data shape = ', X.shape)

Data shape =  (246065, 250)


In [25]:
# class labels to nums
y = pd.get_dummies(lyric_df['genre']).values
print("Label shape = ", y.shape)

Label shape =  (246065, 8)


In [32]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping
# train test split - may later combine train test files if doing it this way, to get more data and still be able to test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state = 42)

In [34]:
# lstm model
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(8, activation='softmax')) # replace dis num with final number of classes classifying between/in dataset used if this changes

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 10
batch_size = 64

history = model.fit(X_train, y_train, epochs = epochs, batch_size = batch_size, validation_split=0.5, 
                    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10


In [35]:
# saving trained model and tokenizer
import pickle
from pickle import dump
model.save('multigenre_model.h5')
dump(tokenizer, open('multigenre_tokenizer.pkl', 'wb'))

In [37]:
# testing model accuracy
accuracy = model.evaluate(X_test, y_test)
print("Loss: ", accuracy[0])
print("Accuracy: ", accuracy[1])

Loss:  1.1617070436477661
Accuracy:  0.6082839369773865
