In [10]:
import pandas as pd
import numpy as np

In [11]:
# Load the exported lyrics dataset and preview its first rows.
DATA_PATH = "Spotify Million Song Dataset_exported.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [12]:
# Drop the 'link' column; none of the cells that follow read it.
# errors='ignore' keeps this cell re-runnable: because the result is assigned
# back to `df`, a second execution would otherwise raise KeyError ('link' is
# already gone).
df = df.drop(columns=['link'], errors='ignore')

Removing line breaks and tabs

In [13]:
# Control characters to strip from the lyrics: newline, carriage return, tab.
indent = list("\n\r\t")
def remove_indents(text, indent):
    """Return `text` with every occurrence of each string in `indent` deleted.

    Args:
        text: The string to clean.
        indent: Iterable of substrings to remove.

    Returns:
        The cleaned string. Occurrences are deleted outright rather than
        replaced by whitespace, so two words separated only by a removed
        character end up joined.
    """
    stripped = text
    for unwanted in indent:
        stripped = stripped.replace(unwanted, '')
    return stripped

# Strip line breaks and tabs from every lyric; `indent` is forwarded to
# remove_indents as its second positional argument.
df['text'] = df['text'].apply(remove_indents, args=(indent,))

Making Lowercase

In [14]:
# Lowercase all lyrics so the later tokenization is case-insensitive.
df['text'] = df['text'].str.lower()

Removing possessive words and contractions (any word containing an apostrophe)

In [15]:
import re

def remove_words_with_apostrophes(input_string):
    """Delete every word that has an apostrophe between word characters.

    Matches such as "it's" or "don't" are removed entirely (the surrounding
    whitespace is left in place). A word that merely ends in an apostrophe,
    e.g. "walkin'", does not match and is kept.

    Args:
        input_string: The text to filter.

    Returns:
        The text with all matching words replaced by the empty string.
    """
    apostrophe_word = re.compile(r'\b\w+\'\w+\b')
    return apostrophe_word.sub('', input_string)

# Run the apostrophe-word filter over every lyric.
df['text'] = df['text'].apply(remove_words_with_apostrophes)

Removing Punctuation

In [16]:
import string
def remove_punctuation(input_string):
    """Return `input_string` with every character of `string.punctuation` removed.

    Only the ASCII punctuation listed in `string.punctuation` is affected;
    other characters (letters, digits, whitespace, non-ASCII symbols) are kept.

    Args:
        input_string: The text to clean.

    Returns:
        The text without ASCII punctuation characters.
    """
    # Mapping a code point to None tells str.translate to drop that character.
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return input_string.translate(drop_table)

# Strip ASCII punctuation from every lyric.
df['text'] = df['text'].apply(remove_punctuation)

In [17]:
# Spot-check the cleaned lyric stored under index label 0.
df.loc[0, 'text']

'look at her face  a wonderful face  and it means something special to me  look at the way that she smiles when she sees me  how lucky can one fellow be     just my kind of girl she makes me feel fine  who could ever believe that she could be mine   just my kind of girl without her  blue  and if she ever leaves me what could i do what could i do    and when we go for a walk in the park  and she holds me and squeezes my hand   go on walking for hours and talking  about all the things that we plan     just my kind of girl she makes me feel fine  who could ever believe that she could be mine   just my kind of girl without her  blue  and if she ever leaves me what could i do what could i do'

Tokenize the Lyrics

In [18]:
from tensorflow.keras.preprocessing.text import Tokenizer
# Fit the tokenizer on all cleaned lyrics, then convert each lyric into a
# list of integer token ids.
lyrics = df['text']
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lyrics)
sequences = tokenizer.texts_to_sequences(lyrics)

In [19]:
# Same spot-check as before tokenizing: the tokenizer cell does not assign to
# df['text'], so the lyric under index label 0 is shown unchanged.
df.loc[0, 'text']

'look at her face  a wonderful face  and it means something special to me  look at the way that she smiles when she sees me  how lucky can one fellow be     just my kind of girl she makes me feel fine  who could ever believe that she could be mine   just my kind of girl without her  blue  and if she ever leaves me what could i do what could i do    and when we go for a walk in the park  and she holds me and squeezes my hand   go on walking for hours and talking  about all the things that we plan     just my kind of girl she makes me feel fine  who could ever believe that she could be mine   just my kind of girl without her  blue  and if she ever leaves me what could i do what could i do'

Padding and sequencing

In [20]:
# Pad the sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Pad at the end ('post') so every sequence is as long as the longest lyric.
max_length = max(map(len, sequences))
padded_sequences = pad_sequences(sequences, padding='post', maxlen=max_length)

In [21]:
# Inspect the padded token-id sequence of the first lyric.
padded_sequences[0]

array([  117,    63,    61,   158,     6,   953,   158,     5,    10,
         768,   157,   855,     4,     7,   117,    63,     1,    60,
          13,    52,  1511,    28,    52,  1286,     7,    72,   794,
          32,    40,  3449,    17,    24,     8,   339,    11,   100,
          52,   303,     7,    76,   357,   106,    82,   124,   141,
          13,    52,    82,    17,   188,    24,     8,   339,    11,
         100,   196,    61,   220,     5,    37,    52,   124,   819,
           7,    31,    82,     2,    33,    31,    82,     2,    33,
           5,    28,    22,    41,    19,     6,   198,     9,     1,
        1239,     5,    52,  1493,     7,     5, 18393,     8,   194,
          41,    14,   460,    19,   911,     5,   504,   104,    15,
           1,   132,    13,    22,   918,    24,     8,   339,    11,
         100,    52,   303,     7,    76,   357,   106,    82,   124,
         141,    13,    52,    82,    17,   188,    24,     8,   339,
          11,   100,

Label Encoding (artist and song)

In [23]:
from sklearn.preprocessing import LabelEncoder
# Use one encoder per column. Re-fitting a single LabelEncoder on 'song' would
# overwrite its classes_, after which it could no longer describe the artist
# labels (class count, inverse_transform of predicted artist ids).
# `label_encoder` stays the artist encoder because the model cell sizes its
# output layer from label_encoder.classes_ and trains on the artist ids.
label_encoder = LabelEncoder()
df['artist_encoded'] = label_encoder.fit_transform(df['artist'])

# Separate encoder for song titles so both mappings remain available.
song_encoder = LabelEncoder()
df['song_encoded'] = song_encoder.fit_transform(df['song'])

Train Test Split

In [24]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the songs; the padded inputs and both label columns are
# split together so their rows stay aligned. The fixed seed keeps the split
# reproducible.
artist_labels = df['artist_encoded']
song_labels = df['song_encoded']
(X_train, X_test,
 y_train_artist, y_test_artist,
 y_train_song, y_test_song) = train_test_split(
    padded_sequences, artist_labels, song_labels,
    test_size=0.2, random_state=42,
)

LSTM Model creation 

In [25]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [26]:
# Model hyperparameters.
# embedding_dim: size of each word-embedding vector (Embedding output_dim).
# lstm_out: not referenced by the model-building cell shown here; kept in case
#           other cells rely on it.
embedding_dim = lstm_out = 300
# units: number of units in the LSTM layer.
units = 100

In [27]:
# Size the softmax layer from the training target itself (the artist ids)
# rather than from an encoder object, whose classes_ reflect whichever column
# it was fitted on last. LabelEncoder ids run from 0 to n_classes - 1, so the
# number of distinct ids equals the number of classes.
num_artists = df['artist_encoded'].nunique()

# +1 because token ids start at 1; id 0 is the padding value.
vocab_size = len(tokenizer.word_index) + 1

# NOTE(review): sequences are padded with padding='post' and the Embedding does
# not set mask_zero, so the LSTM reads the trailing padding last. Consider
# mask_zero=True (or pre-padding) if accuracy is poor.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
model.add(LSTM(units=units))
model.add(Dense(units=num_artists, activation='softmax'))

In [28]:
# Integer class ids are used as targets, hence the sparse categorical loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [29]:
from tensorflow.keras.callbacks import EarlyStopping
# Stop once validation loss has failed to improve for 3 consecutive epochs and
# roll the model back to its best epoch. Without restore_best_weights the model
# keeps the weights of the last (non-improving) epoch, and the test evaluation
# that follows would be run on those.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
Log_history = model.fit(
    X_train, y_train_artist,
    epochs=25, batch_size=64,
    validation_split=0.2,
    callbacks=[early_stopping],
)

Epoch 1/25
Epoch 2/25
Epoch 3/25

In [None]:
# Score the trained artist classifier on the held-out split and report the
# loss and accuracy it returns.
test_loss, test_accuracy = model.evaluate(x=X_test, y=y_test_artist)
print(f'Test loss: {test_loss}, Test accuracy: {test_accuracy}')