In [1]:
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, Flatten
from tensorflow.keras.layers import Embedding
from keras.models import Sequential
from tensorflow.keras import layers, models, losses, optimizers

from keras.layers import Dense, Conv1D, MaxPooling1D, GlobalMaxPooling1D,LSTM,SpatialDropout1D
from keras.utils import pad_sequences

from keras.utils.np_utils import to_categorical
import pandas as pd
import numpy as np
import re

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

from gensim.models import Word2Vec
stopwords = stopwords.words('english')

In [2]:
df = pd.read_csv('data/labeled_lyrics_w_genres.csv')
df.shape

(145250, 7)

In [6]:
df_dropped = df[(df['genre'] == 'No_genre') | (df['genre'] == 'Non-Music')|(df['genre'] == 'Rap')  ].index
df.drop(df_dropped, inplace=True, axis='index')



In [7]:
df.shape

(96326, 7)

In [8]:
lemmatizer = WordNetLemmatizer()
# 1. function that makes all text lowercase.
def make_lowercase(test_string):
    return test_string.lower()

# 2. function that removes all punctuation. 
def remove_punc(test_string):
    test_string = re.sub(r'[^\w\s]', '', test_string)
    return test_string

# 3. function that removes all stopwords.
def remove_stopwords(test_string):
    # Break the sentence down into a list of words
    words = word_tokenize(test_string)
    
    # Make a list to append valid words into
    valid_words = []
    
    # Loop through all the words
    for word in words:
        
        # Check if word is not in stopwords. Stopwords was imported from nltk.corpus
        if word not in stopwords:
            
            # If word not in stopwords, append to our valid_words
            valid_words.append(word)

    # Join the list of words together into a string
    a_string = ' '.join(valid_words)

    return a_string

# 4. function to break words into their stem words
def stem_words(a_string):
    # Initalize our Stemmer
    porter = PorterStemmer()
    
    # Break the sentence down into a list of words
    words = word_tokenize(a_string)
    
    # Make a list to append valid words into
    valid_words = []

    # Loop through all the words
    for word in words:
        # Stem the word
        stemmed_word = porter.stem(word) #from nltk.stem import PorterStemmer
        
        # Append stemmed word to our valid_words
        valid_words.append(stemmed_word)
        
    # Join the list of words together into a string
    a_string = ' '.join(valid_words)

    return a_string 
def lem_words(a_string):
    # Initalize our Stemmer
    lemmatizer = WordNetLemmatizer()
    
    # Break the sentence down into a list of words
    words = word_tokenize(a_string)
    
    # Make a list to append valid words into
    valid_words = []

    # Loop through all the words
    for word in words:
        # Stem the word
        lemmed_word = lemmatizer.lemmatize(word) #from nltk.stem import PorterStemmer
        
        # Append stemmed word to our valid_words
        valid_words.append(lemmed_word)
        
    # Join the list of words together into a string
    a_string = ' '.join(valid_words)

    return a_string 

In [9]:
test = "crying and cry"
lem_words(test)

'cry and cry'

In [10]:
# Pipeline

def text_processing_pipeline(a_string):
    a_string = make_lowercase(a_string)
    a_string = remove_punc(a_string)
    a_string = remove_stopwords(a_string)
    a_string = lem_words(a_string)
    return a_string



In [11]:
df['clean_lyrics'] = df.seq.apply(text_processing_pipeline)

df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,artist,seq,song,label,genre,clean_lyrics
0,0,0,Elijah Blake,"No, no\r\nI ain't ever trapped out the bando\r...",Everyday,0.626,R&B,aint ever trapped bando oh lord dont get wrong...
1,1,1,Elijah Blake,"The drinks go down and smoke goes up, I feel m...",Live Till We Die,0.63,Pop,drink go smoke go feel got let go care get los...
2,2,2,Elijah Blake,She don't live on planet Earth no more\r\nShe ...,The Otherside,0.24,R&B,dont live planet earth found love venus thats ...
3,3,3,Elijah Blake,"Trippin' off that Grigio, mobbin', lights low\...",Pinot,0.536,R&B,trippin grigio mobbin light low trippin grigio...
4,4,4,Elijah Blake,"I see a midnight panther, so gallant and so br...",Shadows & Diamonds,0.371,R&B,see midnight panther gallant brave found found...


In [13]:
df.genre.value_counts()

Pop        57357
Rock       26756
Country     7440
R&B         4773
Name: genre, dtype: int64

In [14]:
def getNum(gen):
    if gen == 'Pop':
        return 3
    elif gen == 'Rock':
        return 2
    elif gen == 'Country':
        return 1
    else :
        return 0

In [15]:
df['genre'] = df.genre.apply(lambda x: getNum(x))

In [16]:
df = df.sample(frac=1)

X = df['clean_lyrics']

y = df['genre'].values

y = to_categorical( y )
X_text = X

In [17]:
# Limiting our tokenizers vocab size
max_words = 10000
 
    
# create the tokenizer
tokenizer = Tokenizer(num_words=max_words)


# Fit the tokenizer
tokenizer.fit_on_texts(X)


# Create the sequences for each sentence, basically turning each word into its index position
sequences = tokenizer.texts_to_sequences(X)


index_word = tokenizer.index_word


# # Limiting our sequencer to only include 500 words
max_length = 300


# # Convert the sequences to all be the same length of 500
X = pad_sequences(sequences, maxlen=max_length, padding='post')
print(X.shape)

(96326, 300)


In [18]:
# This creates the Neural Network
model = Sequential() 

# This embedding layer basically will automatically create the word2vec vectors based on your text data.
model.add( Embedding(max_words, 32, input_length=max_length) ) 





model.add(LSTM(50,dropout =0.2))


model.add(Dense(4, activation='softmax'))
optimizer = optimizers.Adam(lr=0.003)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy']) 

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 300, 32)           320000    
                                                                 
 lstm (LSTM)                 (None, 50)                16600     
                                                                 
 dense (Dense)               (None, 4)                 204       
                                                                 
Total params: 336,804
Trainable params: 336,804
Non-trainable params: 0
_________________________________________________________________


  super().__init__(name, **kwargs)


In [19]:
hist = model.fit(X, y, 
                 validation_split=0.2, 
                 epochs=30, batch_size=20)

Epoch 1/30
Epoch 2/30
Epoch 3/30

KeyboardInterrupt: 

In [69]:
# This creates the Neural Network
model = Sequential() 

# This embedding layer basically will automatically create the word2vec vectors based on your text data.
model.add( Embedding(max_words, 32, input_length=max_length) ) 




model.add(LSTM(50, return_sequences=True,dropout =0.2))
model.add(LSTM(50,dropout =0.2))
model.add(Dense(32))

model.add(Dense(5, activation='softmax'))
optimizer = optimizers.Adam(lr=0.003)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy']) 

model.summary()

Model: "sequential_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_13 (Embedding)    (None, 200, 32)           160000    
                                                                 
 lstm_32 (LSTM)              (None, 200, 50)           16600     
                                                                 
 lstm_33 (LSTM)              (None, 50)                20200     
                                                                 
 dense_20 (Dense)            (None, 32)                1632      
                                                                 
 dense_21 (Dense)            (None, 5)                 165       
                                                                 
Total params: 198,597
Trainable params: 198,597
Non-trainable params: 0
_________________________________________________________________


  super().__init__(name, **kwargs)


In [70]:
hist = model.fit(X, y, 
                 validation_split=0.2, 
                 epochs=30, batch_size=20)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30

KeyboardInterrupt: 

In [86]:
# This creates the Neural Network
model = Sequential() 

# This embedding layer basically will automatically create the word2vec vectors based on your text data.
model.add( Embedding(max_words, 100, input_length=max_length) ) 

model.add(SpatialDropout1D(0.2))


model.add(LSTM(100,dropout =0.2,recurrent_dropout=0.2))
model.add(Dense(5, activation='softmax'))

optimizer = optimizers.Adam(lr=0.003)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy']) 

model.summary()

Model: "sequential_21"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_21 (Embedding)    (None, 250, 100)          5000000   
                                                                 
 spatial_dropout1d_4 (Spatia  (None, 250, 100)         0         
 lDropout1D)                                                     
                                                                 
 lstm_41 (LSTM)              (None, 100)               80400     
                                                                 
 dense_30 (Dense)            (None, 5)                 505       
                                                                 
Total params: 5,080,905
Trainable params: 5,080,905
Non-trainable params: 0
_________________________________________________________________


In [87]:
hist = model.fit(X, y, 
                 validation_split=0.2, 
                 epochs=50, batch_size=64)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50

KeyboardInterrupt: 

In [17]:
# This creates the Neural Network
model = Sequential() 

# This embedding layer basically will automatically create the word2vec vectors based on your text data.
model.add( Embedding(max_words, 32, input_length=max_length) ) 



model.add(LSTM(50,return_sequences = True,dropout =0.2,recurrent_dropout=0.2))
model.add(LSTM(50,return_sequences = True,dropout =0.2,recurrent_dropout=0.2))
model.add(LSTM(50,return_sequences = True,dropout =0.2,recurrent_dropout=0.2))
model.add(LSTM(50,return_sequences = True,dropout =0.2,recurrent_dropout=0.2))
model.add(LSTM(50,return_sequences = True,dropout =0.2,recurrent_dropout=0.2))
model.add(LSTM(50,dropout =0.2,recurrent_dropout=0.2))
model.add(Dense(5, activation='softmax'))

optimizer = optimizers.Adam(lr=0.003)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy']) 

model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 300, 32)           320000    
                                                                 
 lstm_10 (LSTM)              (None, 300, 50)           16600     
                                                                 
 lstm_11 (LSTM)              (None, 300, 50)           20200     
                                                                 
 lstm_12 (LSTM)              (None, 300, 50)           20200     
                                                                 
 lstm_13 (LSTM)              (None, 300, 50)           20200     
                                                                 
 lstm_14 (LSTM)              (None, 300, 50)           20200     
                                                                 
 lstm_15 (LSTM)              (None, 50)               

In [None]:
hist = model.fit(X, y, 
                 validation_split=0.2, 
                 epochs=50, batch_size=64)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
 296/1279 [=====>........................] - ETA: 22:08 - loss: 1.1786 - accuracy: 0.5575