## MSDI-MXM Lyric Analysis

In [None]:
import pandas as pd
import numpy as np
import tables
import h5py
import pickle

import math
import random

import os, sys, glob
from pathlib import Path

import keras
from keras import layers
from keras import Sequential
from keras.layers import Dense, Activation, Flatten, Conv1D, Dropout, MaxPooling1D
#from keras.optimizers import Adam

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import requests

def download_file_from_google_drive(id, destination):
    """Download a file from Google Drive by id and write it to a local path.

    The first request is sent with only the file id. If that response carries
    a cookie whose name starts with ``download_warning``, the request is
    repeated with the cookie value as the ``confirm`` parameter, and the
    second response is the one that gets saved.

    Args:
        id: Drive file id taken from the shareable link. The name shadows the
            ``id`` builtin but is kept so existing keyword callers still work.
        destination: Local path to write; opened in ``"wb"`` mode, so an
            existing file is overwritten.

    Raises:
        requests.HTTPError: If the response to be saved has a 4xx/5xx status.
            Without this check an error page would be written to
            ``destination`` and only fail later when the file is read.
    """
    def get_confirm_token(response):
        # Value of the first "download_warning*" cookie, or None if absent.
        for key, value in response.cookies.items():
            if key.startswith('download_warning'):
                return value

        return None

    def save_response_content(response, destination):
        # Stream the body to disk in fixed-size chunks instead of loading it whole.
        CHUNK_SIZE = 3276800

        with open(destination, "wb") as f:
            for chunk in response.iter_content(CHUNK_SIZE):
                if chunk: # filter out keep-alive new chunks
                    f.write(chunk)

    URL = "https://drive.google.com/u/0/uc?export=download"

    # The context manager closes the session (and its pooled connections)
    # even when a request or the file write raises.
    with requests.Session() as session:
        response = session.get(URL, params = { 'id' : id }, stream = True)
        token = get_confirm_token(response)

        if token:
            params = { 'id' : id, 'confirm' : token }
            response = session.get(URL, params = params, stream = True)

        # Fail loudly on HTTP errors rather than saving the error body.
        response.raise_for_status()
        save_response_content(response, destination)




In [None]:
# Fetch the pickled track shortlist from Google Drive.
# The file id is the path component after /d/ in the shareable link:
# https://drive.google.com/file/d/1wduOo4DLWGEAF44odjv6BQlsxYsO_0c6/view?usp=sharing
file_id = r'1wduOo4DLWGEAF44odjv6BQlsxYsO_0c6'
# Local path the download is written to.
destination = 'shortlisted_tracks_df.pkl' # presumably pickle protocol 4 -- TODO confirm
download_file_from_google_drive(file_id, destination)

In [None]:
# Load the track shortlist from its pickle file.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# and this file is fetched over the network earlier in this notebook -- only
# run this against a source you trust.
with open('shortlisted_tracks_df' +'.pkl', 'rb') as handle:
    shortlisted_tracks_df = pickle.load(handle)

In [None]:
shortlisted_tracks_df

Unnamed: 0_level_0,Genre
TrackId,Unnamed: 1_level_1
TREQVVI128F427C38E,Country
TRYCYSD128F148CF20,Country
TRLCZET12903D03F70,Country
TRJORPT128F42BAA8D,Country
TROBJYA128F42A2984,Country
...,...
TRQFZEO128E07930BC,Rock
TRNSWUN128F4247F87,Rock
TRLENOU128F4281E94,Rock
TRDTSQP128F421C665,Rock


In [None]:
# Parse the train split into [track_id, wordvec] pairs for shortlisted tracks.
#
# Lines starting with 'T' are treated as comma-separated track records:
# field 0 is the track id, field 1 is ignored here, and every remaining field
# is "index:count" with a 1-based word index. The line starting with '%' is
# kept (minus the leading '%') in msm_top_words_train.

# Build the id lookup once: a set gives O(1) membership per line, whereas
# `in index.values` scans the whole array for every line of the file.
shortlisted_track_ids = set(shortlisted_tracks_df.index)
mxm_wordvec_list = []
msm_top_words_train = None

# `with` guarantees the file is closed even if a line fails to parse.
with open("mxm_dataset_train.txt", "r") as mxm_file_train:
    for lines in mxm_file_train:
        if lines[0] == 'T':
            fields = lines.strip('\n').split(',')
            track_id = fields[0]
            if track_id in shortlisted_track_ids:
                wordvec = [0]*5000
                for values in fields[2:]:
                    word_index, occurance_count = values.split(':')
                    # Word indices in the file are 1-based; the vector is 0-based.
                    wordvec[int(word_index)-1] = int(occurance_count)
                mxm_wordvec_list.append([track_id, wordvec])
        elif lines[0] == '%':
            msm_top_words_train = lines[1:]

In [None]:
# Parse the test split the same way as the train split and append its
# [track_id, wordvec] pairs to the existing mxm_wordvec_list.
#
# Lines starting with 'T' are treated as comma-separated track records:
# field 0 is the track id, field 1 is ignored here, and every remaining field
# is "index:count" with a 1-based word index. The line starting with '%' is
# kept (minus the leading '%') in msm_top_words_test.

# Build the id lookup once: a set gives O(1) membership per line, whereas
# `in index.values` scans the whole array for every line of the file.
shortlisted_track_ids = set(shortlisted_tracks_df.index)
msm_top_words_test = None

# `with` guarantees the file is closed even if a line fails to parse.
with open("mxm_dataset_test.txt", "r") as mxm_file_test:
    for lines in mxm_file_test:
        if lines[0] == 'T':
            fields = lines.strip('\n').split(',')
            track_id = fields[0]
            if track_id in shortlisted_track_ids:
                wordvec = [0]*5000
                for values in fields[2:]:
                    word_index, occurance_count = values.split(':')
                    # Word indices in the file are 1-based; the vector is 0-based.
                    wordvec[int(word_index)-1] = int(occurance_count)
                mxm_wordvec_list.append([track_id, wordvec])
        elif lines[0] == '%':
            msm_top_words_test = lines[1:]

In [None]:
# Reset both captured '%' header lines to None.
# NOTE(review): presumably done to free memory since they are not needed
# further -- confirm nothing later in the notebook reads them.
msm_top_words_train = None
msm_top_words_test = None

In [None]:
# One row per parsed track: its id and its 5000-entry word-count list.
wordvec_df = pd.DataFrame(data=mxm_wordvec_list, columns=['TrackId', 'wordvec'])

In [None]:
wordvec_df

Unnamed: 0,TrackId,wordvec
0,TRAAHSY128F147BB5C,"[5, 5, 10, 4, 3, 4, 4, 3, 3, 1, 2, 5, 0, 1, 0,..."
1,TRAALDI128EF35F6DD,"[9, 3, 8, 7, 0, 0, 0, 3, 1, 0, 0, 0, 1, 1, 0, ..."
2,TRAAMES128F42AF068,"[10, 8, 1, 8, 7, 3, 2, 3, 3, 1, 6, 3, 1, 1, 0,..."
3,TRAAORZ128F421CB5C,"[21, 10, 13, 7, 4, 8, 3, 5, 5, 2, 0, 6, 1, 0, ..."
4,TRAAQCK128F92E8C33,"[29, 6, 30, 17, 8, 0, 7, 15, 7, 4, 8, 2, 5, 0,..."
...,...,...
4895,TRZRKXA128F1492976,"[0, 0, 0, 0, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0,..."
4896,TRZSEAS128F14534C5,"[12, 4, 5, 5, 2, 0, 12, 8, 23, 0, 2, 3, 1, 11,..."
4897,TRZTVNT12903CCE72A,"[0, 2, 14, 3, 8, 4, 8, 5, 2, 1, 0, 2, 0, 7, 0,..."
4898,TRZUUMK12903CC7903,"[13, 10, 11, 4, 1, 3, 0, 2, 4, 1, 4, 1, 2, 0, ..."


In [None]:
# Inner join on the TrackId index: keeps only shortlisted tracks that also
# have a parsed word vector, giving each row a Genre and a wordvec column.
lyric_df = shortlisted_tracks_df.join(wordvec_df.set_index('TrackId'), how='inner')

In [None]:
lyric_df

Unnamed: 0_level_0,Genre,wordvec
TrackId,Unnamed: 1_level_1,Unnamed: 2_level_1
TREQVVI128F427C38E,Country,"[8, 3, 7, 4, 9, 2, 3, 3, 2, 2, 7, 1, 0, 0, 7, ..."
TRYCYSD128F148CF20,Country,"[14, 4, 37, 0, 3, 0, 1, 0, 0, 3, 4, 0, 4, 3, 2..."
TRLCZET12903D03F70,Country,"[5, 17, 0, 6, 8, 14, 1, 0, 0, 8, 4, 1, 7, 0, 0..."
TRJORPT128F42BAA8D,Country,"[16, 3, 14, 7, 1, 6, 3, 2, 9, 0, 0, 4, 0, 3, 7..."
TROBJYA128F42A2984,Country,"[13, 5, 10, 8, 4, 2, 0, 1, 7, 1, 0, 1, 2, 2, 5..."
...,...,...
TRQFZEO128E07930BC,Rock,"[35, 7, 23, 10, 5, 1, 0, 9, 6, 0, 0, 4, 0, 5, ..."
TRNSWUN128F4247F87,Rock,"[8, 9, 0, 4, 3, 6, 0, 0, 0, 8, 1, 3, 0, 0, 2, ..."
TRLENOU128F4281E94,Rock,"[0, 0, 0, 0, 0, 24, 62, 0, 0, 0, 0, 0, 0, 0, 0..."
TRDTSQP128F421C665,Rock,"[19, 12, 0, 5, 7, 0, 9, 7, 0, 1, 0, 0, 2, 1, 1..."


In [None]:
# Per-genre train/test split: the first `train_sample_count` rows of each
# genre go to train, the next `test_sample_count` rows go to test.
genre_counts = 7
train_sample_count = 500
test_sample_count = 200

train_dfs = []
test_dfs = []

# Group by the Genre label instead of slicing fixed 700-row windows by
# position. Positional windows only line up with genres when every genre has
# exactly train_sample_count + test_sample_count contiguous rows; if the inner
# join dropped even one track, every later window would straddle two genres.
# sort=False keeps genres in order of first appearance and rows in their
# original order, so the result is unchanged when the data does line up.
# NOTE(review): genre_counts is still what later cells use for the one-hot
# width -- confirm lyric_df really contains exactly that many genres.
for _, genre_df in lyric_df.groupby('Genre', sort=False):
    train_dfs.append(genre_df.iloc[:train_sample_count])
    test_dfs.append(
        genre_df.iloc[train_sample_count:train_sample_count + test_sample_count]
    )

In [None]:
# Stack the per-genre slices into one train frame and one test frame.
train_df = pd.concat(train_dfs)
test_df = pd.concat(test_dfs)

In [None]:
train_df

Unnamed: 0_level_0,Genre,wordvec
TrackId,Unnamed: 1_level_1,Unnamed: 2_level_1
TREQVVI128F427C38E,Country,"[8, 3, 7, 4, 9, 2, 3, 3, 2, 2, 7, 1, 0, 0, 7, ..."
TRYCYSD128F148CF20,Country,"[14, 4, 37, 0, 3, 0, 1, 0, 0, 3, 4, 0, 4, 3, 2..."
TRLCZET12903D03F70,Country,"[5, 17, 0, 6, 8, 14, 1, 0, 0, 8, 4, 1, 7, 0, 0..."
TRJORPT128F42BAA8D,Country,"[16, 3, 14, 7, 1, 6, 3, 2, 9, 0, 0, 4, 0, 3, 7..."
TROBJYA128F42A2984,Country,"[13, 5, 10, 8, 4, 2, 0, 1, 7, 1, 0, 1, 2, 2, 5..."
...,...,...
TRBJTSX128F92C26A3,Rock,"[21, 26, 1, 6, 2, 0, 0, 3, 1, 1, 3, 2, 2, 0, 2..."
TRSWYEM128F424B404,Rock,"[20, 8, 0, 1, 9, 12, 3, 6, 3, 8, 14, 8, 6, 0, ..."
TRHEVIY128F4263512,Rock,"[11, 11, 3, 2, 1, 7, 7, 3, 6, 7, 1, 4, 0, 0, 0..."
TRDPCZP12903CDF9D5,Rock,"[6, 5, 1, 1, 6, 0, 3, 2, 1, 0, 1, 3, 1, 0, 3, ..."


In [None]:
test_df

Unnamed: 0_level_0,Genre,wordvec
TrackId,Unnamed: 1_level_1,Unnamed: 2_level_1
TRXIUWQ128F42BAA8B,Country,"[8, 4, 11, 1, 0, 4, 0, 4, 1, 4, 1, 0, 0, 6, 2,..."
TRBDMCC128F9358267,Country,"[23, 18, 3, 5, 12, 4, 12, 8, 1, 8, 7, 0, 4, 0,..."
TRJPVZR128F92E7198,Country,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, ..."
TRSCLDV128F92FE706,Country,"[3, 2, 1, 3, 2, 1, 2, 3, 2, 0, 2, 5, 1, 0, 1, ..."
TROZBYN128F92F87D6,Country,"[5, 5, 5, 3, 1, 0, 1, 1, 0, 0, 2, 2, 2, 1, 1, ..."
...,...,...
TRQFZEO128E07930BC,Rock,"[35, 7, 23, 10, 5, 1, 0, 9, 6, 0, 0, 4, 0, 5, ..."
TRNSWUN128F4247F87,Rock,"[8, 9, 0, 4, 3, 6, 0, 0, 0, 8, 1, 3, 0, 0, 2, ..."
TRLENOU128F4281E94,Rock,"[0, 0, 0, 0, 0, 24, 62, 0, 0, 0, 0, 0, 0, 0, 0..."
TRDTSQP128F421C665,Rock,"[19, 12, 0, 5, 7, 0, 9, 7, 0, 1, 0, 0, 2, 1, 1..."


In [None]:
# Encode Genre with ONE shared, sorted category list for both frames.
# Converting each frame with astype('category') on its own derives the codes
# from that frame's values only, so a genre missing from one frame would shift
# the integer codes and silently mislabel the one-hot targets built from them.
# With the same genres present in both frames this yields the same codes as
# before (categories sorted, unordered).
all_genres = pd.concat([train_df['Genre'], test_df['Genre']]).dropna().unique()
genre_dtype = pd.api.types.CategoricalDtype(categories=sorted(all_genres))
train_df['Genre'] = train_df['Genre'].astype(genre_dtype)
test_df['Genre'] = test_df['Genre'].astype(genre_dtype)

In [None]:
# Features: stack the per-track word-count lists into one array (one row per track).
x_train = np.array(train_df['wordvec'].tolist())
# Labels: one-hot encode the Genre category codes into genre_counts columns.
y_train = keras.utils.to_categorical(train_df['Genre'].cat.codes, genre_counts)

In [None]:
# Features: stack the per-track word-count lists into one array (one row per track).
x_test = np.array(test_df['wordvec'].tolist())
# Labels: one-hot encode the Genre category codes into genre_counts columns.
y_test = keras.utils.to_categorical(test_df['Genre'].cat.codes, genre_counts)

In [None]:
#Defining a function to save the objects as a pickle file
def save_obj(obj, name):
    """Pickle ``obj`` into the file ``name + '.pkl'``.

    Args:
        obj: Object to serialize.
        name: Output path without the ``.pkl`` suffix; the suffix is appended
            here and any existing file at that path is overwritten.
    """
    pickle_path = name + '.pkl'
    with open(pickle_path, mode='wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Persist the training arrays as x_train_lyric.pkl and y_train_lyric.pkl.
save_obj(x_train, 'x_train' + '_lyric')
save_obj(y_train, 'y_train' + '_lyric')

In [None]:
# Persist the test arrays as x_test_lyric.pkl and y_test_lyric.pkl.
save_obj(x_test, 'x_test' + '_lyric')
save_obj(y_test, 'y_test' + '_lyric')

In [None]:
# Fetch x_train_lyric.pkl from Google Drive.
# NOTE(review): this is the same path the earlier save_obj(x_train, ...) call
# writes, so running both replaces the locally computed file on disk.
# The file id is the path component after /d/ in the shareable link:
# https://drive.google.com/file/d/1Or8iZjqWFFY9je-tlU0cAyeCR8HOfL9p/view?usp=sharing
file_id = r'1Or8iZjqWFFY9je-tlU0cAyeCR8HOfL9p'
# Local path the download is written to.
destination = 'x_train_lyric.pkl'
download_file_from_google_drive(file_id, destination)

In [None]:
# Fetch x_test_lyric.pkl from Google Drive.
# NOTE(review): this is the same path the earlier save_obj(x_test, ...) call
# writes, so running both replaces the locally computed file on disk.
# The file id is the path component after /d/ in the shareable link:
# https://drive.google.com/file/d/1rUNA7qmL5c26LLjSy-vzuwvMBlj01EHt/view?usp=sharing
file_id = r'1rUNA7qmL5c26LLjSy-vzuwvMBlj01EHt'
# Local path the download is written to.
destination = 'x_test_lyric.pkl'
download_file_from_google_drive(file_id, destination)

In [None]:
# Fetch y_train_lyric.pkl from Google Drive.
# NOTE(review): this is the same path the earlier save_obj(y_train, ...) call
# writes, so running both replaces the locally computed file on disk.
# The file id is the path component after /d/ in the shareable link:
# https://drive.google.com/file/d/1ByZBQv7N47hHr2We2cbqCaceRlalqxVv/view?usp=sharing
file_id = r'1ByZBQv7N47hHr2We2cbqCaceRlalqxVv'
# Local path the download is written to.
destination = 'y_train_lyric.pkl'
download_file_from_google_drive(file_id, destination)

In [None]:
# Fetch y_test_lyric.pkl from Google Drive.
# NOTE(review): this is the same path the earlier save_obj(y_test, ...) call
# writes, so running both replaces the locally computed file on disk.
# The file id is the path component after /d/ in the shareable link:
# https://drive.google.com/file/d/1ylI-R1dy6_aFDK9EQyRbbxtvSJS4dCtn/view?usp=sharing
file_id = r'1ylI-R1dy6_aFDK9EQyRbbxtvSJS4dCtn'
# Local path the download is written to.
destination = 'y_test_lyric.pkl'
download_file_from_google_drive(file_id, destination)

In [None]:
# Reload the training features and one-hot labels from their pickle files.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# and these files may have been fetched over the network earlier in this
# notebook -- only run this against a source you trust.
with open('x_train' + '_lyric' +'.pkl', 'rb') as handle:
    x_train = pickle.load(handle)

with open('y_train' + '_lyric' +'.pkl', 'rb') as handle:
    y_train = pickle.load(handle)

In [None]:
# Reload the test features and one-hot labels from their pickle files.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# and these files may have been fetched over the network earlier in this
# notebook -- only run this against a source you trust.
with open('x_test' + '_lyric' +'.pkl', 'rb') as handle:
    x_test = pickle.load(handle)

with open('y_test' + '_lyric' +'.pkl', 'rb') as handle:
    y_test = pickle.load(handle)

In [None]:
x_train[0].shape

(5000,)

In [None]:
y_train.shape

(3500, 7)

In [None]:
# Collapse each one-hot row back to its integer class index.
y_train_t = list(np.argmax(y_train, axis=1))

In [None]:
# Collapse each one-hot row back to its integer class index.
y_test_t = list(np.argmax(y_test, axis=1))

In [None]:
min(y_train_t)

0

In [None]:
# Number of genre classes; same value as the earlier assignment in this file.
# NOTE(review): presumably repeated so the modelling cells can run after only
# reloading the pickled arrays -- confirm.
genre_counts = 7

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier: multinomial logistic regression on the raw word counts.
# max_iter is raised above the library default because the recorded run of
# this notebook stopped with "TOTAL NO. of ITERATIONS REACHED LIMIT", i.e. the
# solver was cut off before converging. This gives it more room; scaling the
# features is the other remedy that warning suggests.
model = LogisticRegression(multi_class='multinomial', max_iter=1000)



In [None]:
# Fit on the integer class labels (argmax of the one-hot rows), not the one-hot matrix.
model.fit(x_train, y_train_t)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(multi_class='multinomial')

In [None]:
# Predicted class label for every row of the test features.
y_test_pred = model.predict(x_test)

In [None]:
len(y_test_pred)

1400

In [None]:
sum(y_test_pred==y_test_t)

589

In [None]:
# Fraction of test predictions that match the true labels.
n_correct = sum(y_test_pred==y_test_t)
print('accuracy: ', n_correct/len(y_test_pred))

accuracy:  0.4207142857142857


In [None]:
import keras
from keras.layers import Dense, Activation, Flatten, Conv1D, Dropout, MaxPooling1D

# Fully connected classifier over the word-count vectors. This rebinds
# `model`, replacing whatever the name referred to before this cell.
#   Dense(64) -> Dense(64) -> Dropout(0.25) -> Dense(128) -> Dropout(0.5)
#   -> Dense(genre_counts, softmax)
model = Sequential()
# Input width is taken from the first training row.
model.add(Dense(64, input_shape=x_train[0].shape, activation='relu', kernel_initializer='he_uniform'))

model.add(Dense(64, activation='relu', kernel_initializer='he_uniform'))
model.add(Dropout(0.25))

model.add(Dense(128, activation='relu', kernel_initializer='he_uniform'))
model.add(Dropout(0.5))

# One softmax output per genre, matching the one-hot targets.
model.add(Dense(genre_counts, activation='softmax', kernel_initializer='he_uniform'))

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer='adam',
              metrics=['accuracy'])
# summary must be CALLED to print the layer table; the bare attribute only
# evaluates to the bound method object.
model.summary()

<bound method Model.summary of <keras.engine.sequential.Sequential object at 0x7f03db692810>>

In [None]:
# Early stopping watches the TRAINING loss with a patience of 4 epochs.
# Use the already-imported `keras` package: `tf` is never imported in this
# file, so `tf.keras...` raises NameError on a fresh run.
callback = keras.callbacks.EarlyStopping(monitor='loss', patience=4)
# NOTE(review): the test split is passed as validation data, so the reported
# validation metrics are computed on the same rows used for final testing.
history = model.fit(x_train, y_train, epochs=100, batch_size=16, callbacks=[callback], verbose=1, validation_data=(x_test, y_test))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78