In [None]:
import numpy as np
import pandas as pd
import csv

In [None]:
import re
import nltk
# Fetch the NLTK 'stopwords' corpus; the stop-word removal cell further down
# reads it through nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords

In [None]:
# Read the first five comma-separated fields of the training file as UTF-8.
# csv.QUOTE_NONE tells the parser to treat quote characters as ordinary data.
lyric_df = pd.read_csv(
    'trainclean.csv',
    sep=",",
    header=0,
    usecols=range(5),
    quoting=csv.QUOTE_NONE,
    encoding='utf-8',
)
# Replace the header names from the file with the names used in this notebook.
lyric_df.columns = ['artist', 'song', 'genre', 'lang', 'lyrics']

In [None]:
# Multi-class data preparation.
# Keep only English-language lyrics from the eight target genres:
# Metal, R&B, Rock, Pop, Indie, Folk, Electronic and Jazz.
# (The genre names were identified by running
#  genres = [i for i in lyric_df['genre'].unique() if isinstance(i, str)].)
is_english = lyric_df['lang'] == 'en'
genre = lyric_df['genre']

lyrics_metal = lyric_df.loc[(genre == 'Metal') & is_english]
lyrics_rb = lyric_df.loc[(genre == 'R&B') & is_english]
lyrics_rock = lyric_df.loc[(genre == 'Rock') & is_english]
lyrics_pop = lyric_df.loc[(genre == 'Pop') & is_english]
lyrics_indie = lyric_df.loc[(genre == 'Indie') & is_english]
lyrics_folk = lyric_df.loc[(genre == 'Folk') & is_english]
lyrics_elec = lyric_df.loc[(genre == 'Electronic') & is_english]
lyrics_jazz = lyric_df.loc[(genre == 'Jazz') & is_english]

# Stack the per-genre subsets back into a single frame, one genre after another.
per_genre_frames = [
    lyrics_metal,
    lyrics_rb,
    lyrics_rock,
    lyrics_pop,
    lyrics_indie,
    lyrics_folk,
    lyrics_elec,
    lyrics_jazz,
]
lyric_df = pd.concat(per_genre_frames)

In [None]:
# Lower-case every lyric and drop English stop words, so the cleaned text can
# be saved as a new CSV and used for training.

stop_words = stopwords.words('english')
# Set membership is a constant-time lookup; testing every word of every lyric
# against the list form scans the whole list each time.
_stop_word_set = set(stop_words)


def _strip_stop_words(text):
    """Return `text` lower-cased, split on whitespace, with stop words removed.

    Assumes `text` is a str; a missing (NaN) lyric would raise AttributeError
    here, exactly as it did before this helper existed.
    """
    return ' '.join(word for word in text.lower().split() if word not in _stop_word_set)


lyric_df['lyrics'] = lyric_df['lyrics'].apply(_strip_stop_words)

In [None]:
# Persist the cleaned lyrics. Let pandas open the file itself so that it is
# overwritten on every run (append mode stacked a second header plus a full
# duplicate of the data each time the cell was re-executed), and write UTF-8
# explicitly to match the encoding the input was read with.
lyric_df.to_csv('lyric_data_nostopwords_lower.csv', encoding='utf-8')

In [None]:
# Preview the first rows; head must be *called*, otherwise the cell only shows
# the bound-method repr.
lyric_df.head()

<bound method NDFrame.head of                        artist  ...                                             lyrics
100              3 doors down  ...  A hundred days made older Since last time I sa...
101              3 doors down  ...  There's another world inside may never see The...
102              3 doors down  ...  I took walk around world To ease troubled mind...
103              3 doors down  ...  One kiss could best thing one lie could worst ...
104              3 doors down  ...  He spends nights California Watching stars big...
...                       ...  ...                                                ...
291031          george benson  ...  Strollin' park watching winter turn spring Wal...
291035          alexa lusader  ...  Oooooo Fly high Just let fly kite starry night...
291050                 g love  ...  This song coffee Y'all like Coffee? I like Hm ...
291095      delbert mcclinton  ...  I learned swim daddy threw river The army taug...
291099  cherry poppin da

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
MAX_NB_WORDS = 5_000   # vocabulary cap given to the Tokenizer and the Embedding input size
MAX_SEQ_LEN = 250      # length every encoded lyric is padded/truncated to
EMBEDDING_DIM = 100    # width of each learned word vector

In [None]:
# Characters passed to the Tokenizer as `filters`. The backslash is escaped
# explicitly: a bare '\]' in a normal string literal is an invalid escape
# sequence that newer Python versions warn about. The runtime value is unchanged.
TOKEN_FILTERS = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters=TOKEN_FILTERS, lower=True)
tokenizer.fit_on_texts(lyric_df['lyrics'].values)  # stop-word-filtered lyrics made above
word_index = tokenizer.word_index

# Number of entries in the tokenizer's word index.
print("Unique tokens = ", len(word_index))

Unique tokens =  196223


In [None]:
# Encode each lyric as a list of word indices, then pad/truncate every list to
# MAX_SEQ_LEN so the whole corpus forms one rectangular array.
encoded_lyrics = tokenizer.texts_to_sequences(lyric_df['lyrics'].values)
X = pad_sequences(encoded_lyrics, maxlen=MAX_SEQ_LEN)
print('Data shape = ', X.shape)

Data shape =  (246065, 250)


In [None]:
# One-hot encode the genre labels.
# The dtype is pinned because the get_dummies default changed from uint8 to
# bool in pandas 2.0; an explicit numeric dtype keeps the label matrix the same
# on every pandas version.
genre_dummies = pd.get_dummies(lyric_df['genre'], dtype=np.uint8)
# Column order of the one-hot matrix; needed to map a predicted class index
# back to a genre name.
genre_labels = list(genre_dummies.columns)
y = genre_dummies.values
print("Label shape = ", y.shape)

Label shape =  (246065, 8)


In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping
# Hold out 20% of the samples for testing, with a fixed seed so the split is
# repeatable. (If this approach stays, the separate train/test files may later
# be combined to get more data while still being able to test.)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# LSTM genre classifier: embedding -> spatial dropout -> LSTM -> softmax.

# Width of the output layer is taken from the one-hot label matrix, so the
# model follows the dataset if the set of genres changes; no hand-edited
# constant needs to be kept in sync.
num_classes = y_train.shape[1]

model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(num_classes, activation='softmax'))

# categorical_crossentropy matches the one-hot encoded targets.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 5
batch_size = 64

# NOTE(review): validation_split=0.5 sets aside half of the training split for
# validation, so only the other half is used to fit the weights - confirm this
# is intended.
# Training stops early if val_loss fails to improve by at least 0.0001 for
# three consecutive epochs.
history = model.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.5,
    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)],
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# saving trained model and tokenizer
import pickle
from pickle import dump
model.save('multigenre_model_nostopwords.h5')
# Use a context manager so the pickle file is flushed and closed
# deterministically; the bare open(...) handle was never closed.
with open('multigenre_tokenizer_nostopwords.pkl', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

In [None]:
# Score the trained model on the held-out test split. The result holds the
# loss first and the accuracy second.
accuracy = model.evaluate(X_test, y_test)
test_loss, test_accuracy = accuracy
print("Loss: ", test_loss)
print("Accuracy: ", test_accuracy)

Loss:  1.075553059577942
Accuracy:  0.6258102655410767
