In [1]:
import numpy as np 
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
from nltk import word_tokenize

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# All Rock
rock1 = open('/content/drive/My Drive/Colab Notebooks/AllRock.txt', 'r').read()
rock = ''.join([i for i in rock1 if not i.isdigit()]).replace("\n", " ").lower().replace(".", "").replace("!", "").replace("?", "").replace(",", "").replace("", "").replace("'", "").replace(")", "").replace("(", "").split(' ')
# All Pop
pop1 = open('/content/drive/My Drive/Colab Notebooks/AllPop.txt', 'r').read()
pop = ''.join([i for i in pop1 if not i.isdigit()]).replace("\n", " ").lower().replace(".", "").replace("!", "").replace("?", "").replace(",", "").replace("", "").replace("'", "").replace(")", "").replace("(", "").split(' ')
# All Country
country1 = open('/content/drive/My Drive/Colab Notebooks/AllCountry.txt', 'r').read()
country = ''.join([i for i in country1 if not i.isdigit()]).replace("\n", " ").lower().replace(".", "").replace("!", "").replace("?", "").replace(",", "").replace("", "").replace("'", "").replace(")", "").replace("(", "").split(' ')
# All Rap
rap1 = open('/content/drive/My Drive/Colab Notebooks/AllLyrics.txt', 'r').read()
rap = ''.join([i for i in rap1 if not i.isdigit()]).replace("\n", " ").lower().replace(".", "").replace("!", "").replace("?", "").replace(",", "").replace("", "").replace("'", "").replace(")", "").replace("(", "").split(' ')

In [4]:
# create samples of 10 words each for each genre - this is our estimate length of line
SONG_LENGTH = 10
# Rock
Rock = [rock[i*SONG_LENGTH:(i+1)*SONG_LENGTH] for i in range(0,int(len(rock)/int(SONG_LENGTH)))]
# Country
Country = [country[i*SONG_LENGTH:(i+1)*SONG_LENGTH] for i in range(0,int(len(country)/int(SONG_LENGTH)))]
# Pop
Pop = [pop[i*SONG_LENGTH:(i+1)*SONG_LENGTH] for i in range(0,int(len(pop)/int(SONG_LENGTH)))]
# Rap
Rap = [rap[i*SONG_LENGTH:(i+1)*SONG_LENGTH] for i in range(0,int(len(rap)/int(SONG_LENGTH)))]

In [5]:
ds_rock = [' '.join(Rock[i]) for i in range(len(Rock))]
ds_country = [' '.join(Country[i]) for i in range(len(Country))]
ds_pop = [' '.join(Pop[i]) for i in range(len(Pop))]
ds_rap = [' '.join(Rap[i]) for i in range(len(Rap))]

In [6]:
# rock 0, country 1, pop 2, rap 3
ds_ro = []
genre = 0
for sample in ds_rock:
  ds_ro.append([genre, sample])

ds_co = []
genre = 1
for sample in ds_country:
  ds_co.append([genre, sample])

ds_po = []
genre = 2
for sample in ds_pop:
  ds_po.append([genre, sample])

ds_ra = []
genre = 3
for sample in ds_rap:
  ds_ra.append([genre, sample])

ds = ds_ro+ds_co+ds_po+ds_ra

ds = np.array(ds)
print('Genres: ', ds[:, 0])
print('Lyrics: ', ds[:, 1])

Genres:  ['0' '0' '0' ... '3' '3' '3']
Lyrics:  ['yesterday all my troubles seemed so far away now it'
 'looks as though theyre here to stay oh i believe'
 'in yesterday suddenly im not half the man i used' ...
 'kick me when im down but im up again scorchin'
 'hot forcin my way up in the door to kill'
 'the bullpoop like a matador keep your hands high what']


In [12]:
# max words we use
maxwords = 50000
# max words per lyric (should be 200 anyway as set up)
maxlyricwords = 200
# tokenize here
embedding_dimension = 100
tokenizer = Tokenizer(num_words=maxwords, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(ds[:, 1])
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 16926 unique tokens.


In [13]:
X = tokenizer.texts_to_sequences(ds[:, 1])
X = pad_sequences(X, maxlen=maxlyricwords)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (52549, 200)


In [14]:
Y = pd.get_dummies(ds[:, 0])
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (52549, 4)


In [15]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.10, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(47294, 200) (47294, 4)
(5255, 200) (5255, 4)


In [16]:
model = Sequential()
model.add(Embedding(maxwords, embedding_dimension, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(4, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 10
batch_size = 64

history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


In [17]:
accr = model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 0.712
  Accuracy: 0.759


In [18]:
lyric = ['Bumpin i meant for you call my ninja like']
seq = tokenizer.texts_to_sequences(lyric)
padded = pad_sequences(seq, maxlen=maxlyricwords)
pred = model.predict(padded)
labels = ['rock', 'country', 'pop', 'rap']
print(pred, labels[np.argmax(pred)])

[[1.9792731e-06 1.5739771e-06 4.8857914e-06 9.9999154e-01]] rap


In [19]:
def classify_string(input):
  lyric = [str(input)]
  seq = tokenizer.texts_to_sequences(lyric)
  padded = pad_sequences(seq, maxlen=maxlyricwords)
  pred = model.predict(padded)
  labels = ['rock', 'country', 'pop', 'rap']
  print(labels[np.argmax(pred)])

In [20]:
def classify_list(input):
  for lyric in input:
    classify_string(lyric)

In [21]:
# markov generated lyrics
lyrics = ['Bumpin i meant for you call my ninja like',
 'Biz dont take their baby mommas ninja frick you nasty boy you',
 'Shifty sticks and pray and flee the frick all of you',
 'Glocks but all ill die slow',
 'Wondering if im askin blunt sip champagne range rover been outside for',
 'And youre so take that crown two pounds you know',
 'Publishing i thought i get witcha can i could cop',
 'Miss the more cause you in the right one',
 'Onyx and them hoes i love',
 'Gat call me puff daddy biggie gots ta like',
 'Everything around me shit b***** in ya imma stay yappin when',
 'Hum all about fingers in the loot im',
 'Rollem up heard whos this yeah keep on top sky is',
 'Drunk of ninjaz from now drop to',
 'Declinin windin like flypaper neighbor slow down',
 'Expensive cars i tote my crew i only got enough heart',
 'Lame dudes whos next move but the drugs to spit phrases thatll',
 'Guy well its cool and your poop so hard to',
 'Clap wit my life in ma little nasty boy',
 'Dial you should too much better man played',
 'Lali like that you frick doin all mcs have']

classify_list(lyrics)

rap
rap
rap
rap
rap
rap
rap
rap
rap
rap
rap
rap
rap
rap
rap
rap
rap
rap
rap
rock
rap


In [22]:
# lstm generated lyrics
lyrics2 = ['in the veins hard to explain how i maintain', 
  'to put my back in the house so i can i wanna flaunt you thats right', 
  'with the grime of my ninja frick',
  'with the ds crept in blastin him you dont want to slit the clits alot',
  'used to lick the clits a lot of problems never be the beamer with the goldie sound',
  'like a steelo not my steelo oh no thats not my my steelo oh i steelo not my steelo oh no',
  'thats not my no steelo bust my no dough day but this sittin bodies not my']

classify_list(lyrics2)

rap
rap
rap
rap
rap
rap
rap


In [23]:
# something else for testing
denver = ['almost heaven west virginia',
'blue ridge mountains shenandoah river',
'life is old there older than the trees',
'younger than the mountains growin like a breeze',
'country roads take me home',
'to the place i belong',
'west virginia mountain mama',
'take me home country roads']

classify_list(denver)

pop
country
pop
country
pop
rock
country
pop
