In [3]:
import tensorflow as tf
# NOTE(review): these two calls look like debugging switches (eager execution
# of tf.functions and tf.data debug mode). Confirm they are still wanted before
# a full training run - presumably they make `model.fit` noticeably slower.
tf.config.run_functions_eagerly(True)
tf.data.experimental.enable_debug_mode()

from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, Flatten
from tensorflow.keras.layers import Embedding
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.layers import Dense, Input, Flatten, Dropout
import tensorflow_hub as hub
import tensorflow_text as text

from keras.utils import pad_sequences
from keras.utils.np_utils import to_categorical
import pandas as pd

from tqdm import tqdm
import numpy as np

In [4]:
import pandas as pd
import numpy as np

import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from nltk import word_tokenize

from sklearn.model_selection import train_test_split

from gensim.models import Word2Vec
# Rebinds the imported `stopwords` corpus reader to its English stop-word
# collection; from here on `stopwords` is that collection, not the NLTK reader.
stopwords = stopwords.words('english')


from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression as log 
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn import metrics    
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

In [5]:
# Load the labelled lyrics and discard the two leftover index columns.
df = pd.read_csv('labeled_lyrics_w_genres.csv')
df = df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])

# Row labels of entries without a usable genre; they are removed below.
unwanted_genre = df['genre'].isin(['No_genre', 'Non-Music'])
df_dropped = df[unwanted_genre].index
df = df.drop(index=df_dropped)

df['genre'].value_counts()

Pop        57357
Rock       26756
Country     7440
Rap         5959
R&B         4773
Name: genre, dtype: int64

In [6]:
# Number of rows kept per genre when building the balanced subset.
SAMPLES_PER_GENRE = 500


def first_rows_of_genre(frame, genre, n=SAMPLES_PER_GENRE):
    """Return the first `n` rows of `frame` whose 'genre' column equals `genre`.

    Rows are taken in their existing order (a positional head slice, not a
    random sample); fewer than `n` rows come back if the genre is rarer.
    """
    return frame[frame['genre'] == genre][0:n]


# Cap every genre at the same size so the classes are balanced.
df_pop = first_rows_of_genre(df, 'Pop')
df_rock = first_rows_of_genre(df, 'Rock')
df_country = first_rows_of_genre(df, 'Country')
df_rap = first_rows_of_genre(df, 'Rap')
df_r_b = first_rows_of_genre(df, 'R&B')

df_r_b.shape

(500, 5)

In [7]:
# Stack the per-genre subsets row-wise; original row labels are kept.
genre_frames = [df_pop, df_rock, df_country, df_rap, df_r_b]
df_balanced = pd.concat(genre_frames, axis='index')

df_balanced['genre'].value_counts()

Pop        500
Rock       500
Country    500
Rap        500
R&B        500
Name: genre, dtype: int64

In [18]:
# Remove numbers (and every other token that is not purely alphabetic)
def remove_numbers(input):
    """Tokenize `input` and keep only the tokens for which `str.isalpha()` holds.

    Returns the surviving tokens joined by single spaces.
    """
    alphabetic_tokens = []
    for token in word_tokenize(input):
        if token.isalpha():
            alphabetic_tokens.append(token)
    return ' '.join(alphabetic_tokens)

# 1. function that makes all text lowercase.
def make_lowercase(test_string):
    """Return `test_string` converted to lowercase via its `.lower()` method."""
    lowered = test_string.lower()
    return lowered

# 2. function that removes all punctuation.
def remove_punc(test_string):
    """Strip every character that is neither a word character nor whitespace.

    Underscores survive because the `\\w` class includes them.
    """
    return re.sub(r'[^\w\s]', '', test_string)

# 3. function that removes all stopwords.
def remove_sw(input):
    """Tokenize `input` and drop every token found in the module-level `stopwords`.

    Returns the remaining tokens joined by single spaces.
    """
    kept_tokens = filter(lambda token: token not in stopwords, word_tokenize(input))
    return ' '.join(kept_tokens)

# 4. function to break words into their stem words
def stem_words(input):
    """Tokenize `input`, stem each token with NLTK's Porter stemmer and re-join.

    Returns the stemmed tokens joined by single spaces.
    """
    # Imported here because the notebook's import cells never bring
    # PorterStemmer into scope; without this the call raises NameError.
    from nltk.stem import PorterStemmer

    stemming = PorterStemmer()
    tokenized_words = word_tokenize(input)

    stemmed_words = [stemming.stem(word) for word in tokenized_words]
    return ' '.join(stemmed_words)


def lemmatize_words(input):
    """Tokenize `input`, lemmatize each token with WordNetLemmatizer and re-join.

    Returns the lemmatized tokens joined by single spaces.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return ' '.join(map(lemmatize, word_tokenize(input)))

In [19]:
# Pipeline

def text_processing_pipeline(a_string):
    """Run the cleaning steps over `a_string` in a fixed order and return the result.

    Order: lowercase -> drop non-alphabetic tokens -> strip punctuation ->
    drop stopwords. Stemming (`stem_words`) is intentionally not applied.
    """
    cleaning_steps = (make_lowercase, remove_numbers, remove_punc, remove_sw)
    for step in cleaning_steps:
        a_string = step(a_string)
    return a_string

In [20]:
# Store the cleaned lyrics next to the raw text in a new `seq_clean` column.
df_balanced['seq_clean'] = df_balanced['seq'].apply(text_processing_pipeline)

In [8]:
# Spot-check five randomly drawn rows of the balanced frame.
df_balanced.sample(5)

Unnamed: 0,artist,seq,song,label,genre
4866,DJ Jazzy Jeff & the Fresh Prince,"Here's a little story 'bout a Friday night,\r\...",Code Red,0.755,Rap
10936,Ryan Adams,Well I went down to Houston and I stopped in S...,Oh My Sweet Carolina,0.306,Country
6922,Gym Class Heroes,This is not novelty\r\nThis is nothing delicat...,Biters Block,0.616,Rap
3088,LL Cool J,"I know a honey named Millie, raised out in Phi...",After School,0.793,Rap
4169,The Brand New Heavies,"Together we got love, we got peace\r\nTogether...",Day by Day,0.67,R&B


In [62]:
def create_dummies(df,column_name):
    """Return a copy of `df` with one indicator column per value of `column_name`.

    The new columns are named `<column_name>_<value>` and appended on the
    right; the original column is kept and `df` itself is not modified.
    """
    indicator_columns = pd.get_dummies(df[column_name], prefix=column_name)
    return pd.concat((df, indicator_columns), axis='columns')
# Append one `genre_<value>` indicator column per genre to the balanced frame.
df_balanced = create_dummies(df_balanced,"genre")

In [9]:
# Replace the genre names with the integer codes produced by LabelEncoder;
# the fitted encoder is kept so codes can be mapped back to names later.
label_encoder = LabelEncoder()
encoded_genre = label_encoder.fit_transform(df_balanced['genre'])
df_balanced['genre'] = encoded_genre

In [10]:
# Sanity check: row count per encoded genre value.
df_balanced.genre.value_counts()

1    500
4    500
0    500
3    500
2    500
Name: genre, dtype: int64

In [29]:
# Selects the column labels at positions 5 through 10 of df_balanced.
# NOTE(review): the recorded output is an empty Index, so this slice selected
# nothing when it ran - presumably the frame had only five columns then. Verify.
y_features = df_balanced.columns[5:11] 
y_features

Index([], dtype='object')

In [11]:
# Model input is the raw lyric text (`seq`); the target is the `genre` column.
X, y = df_balanced['seq'], df_balanced['genre']

In [12]:
# Display the target series.
y

1        1
7        1
8        1
9        1
10       1
        ..
18179    2
18185    2
18214    2
18217    2
18257    2
Name: genre, Length: 2500, dtype: int32

In [13]:
# Hold out 20% of the lyrics for testing. A fixed seed makes the split
# repeatable across kernel restarts, and stratifying on `y` keeps the genre
# proportions the same in both partitions.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=.80, random_state=SPLIT_SEED, stratify=y
)

In [14]:
# Number of training examples after the split.
X_train.shape

(2000,)

## BERT

In [15]:
# TF Hub handles for the uncased English BERT encoder and its preprocessing model.
encoder_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'
preprocess_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

In [16]:
# Load both TF Hub models from the `preprocess_url` / `encoder_url` handles
# instead of repeating the URL literals, so the pair is configured in one place.
bert_preprocess = hub.KerasLayer(preprocess_url)
bert_encoder = hub.KerasLayer(encoder_url)



In [17]:
# Confirm the encoder layer object was created.
bert_encoder

<tensorflow_hub.keras_layer.KerasLayer at 0x24a2aab53c0>

In [77]:
def get_sentense_embedding(sentenses):
    """Embed a batch of strings with BERT and return the `pooled_output` tensor.

    `sentenses` is passed through `bert_preprocess` and then `bert_encoder`;
    only the encoder's 'pooled_output' entry is returned.
    """
    encoder_inputs = bert_preprocess(sentenses)
    encoder_outputs = bert_encoder(encoder_inputs)
    return encoder_outputs['pooled_output']


# Smoke test on two short strings.
get_sentense_embedding(["500$ discount", 'I eat mango, sweet mango'])

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.74583364, -0.3479747 , -0.09725112, ...,  0.11525007,
        -0.68303424,  0.85167116],
       [-0.8676579 , -0.2502348 ,  0.42193842, ...,  0.25290197,
        -0.6358296 ,  0.91478306]], dtype=float32)>

In [86]:
# Embed a handful of probe phrases; `e[i]` is the vector for the i-th phrase.
probe_phrases = ["banana", "mango", 'television', "Elon Musk", "Tesla"]
e = get_sentense_embedding(probe_phrases)

In [87]:
from sklearn.metrics.pairwise import cosine_similarity
# Compare the embeddings at positions 3 and 4 of `e` ("Elon Musk" vs "Tesla").
cosine_similarity([e[3]], [e[4]])

array([[0.9801978]], dtype=float32)

# Define Model

In [18]:
# BERT Layers
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Neural Network Layers
l = Dropout(0.1, name="dropout")(outputs['pooled_output'])
l = Dense(1, activation='softmax', name="output")(l)

# Final model

model = tf.keras.Model(inputs=[text_input], outputs=[l])

In [19]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_mask': (Non  0           ['text[0][0]']                   
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128),                                                      
                                 'input_type_ids':                                                
                                (None, 128)}                                                  

In [20]:
model.compile(optimizer='adam',
             loss='binary_crossentropy',
             metrics=['accuracy'])

In [None]:
# Train for 10 epochs, holding out 20% of the training data for validation;
# the return value is kept in `history`.
history = model.fit(X_train, y_train, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
 3/50 [>.............................] - ETA: 29:20 - loss: -38.2153 - accuracy: 0.1667

In [1]:
# NOTE(review): recorded as In [1] with a NameError, i.e. this ran in a kernel
# where the train/test split cell had not been executed yet. Run the split
# cell first, or drop this leftover inspection cell.
y_train

NameError: name 'y_train' is not defined