In [1]:
import numpy as np
import pandas as pd

In [2]:
df=pd.read_csv("Spotify Million Song Dataset_exported.csv")
df=df.drop(['link'],axis=1)
df.head()

Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante","Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...


In [3]:
df_new = pd.DataFrame(columns=['artist', 'song', 'text'])

In [4]:
indent = ["\n", "\r", "\t"]
def remove_indents(text, indent):
    for elements in indent:
        text = text.replace(elements, '')
    return text

df['text'] = df['text'].apply(lambda elements: remove_indents(elements, indent))

In [5]:
df['text'] = df['text'].str.lower()


In [6]:
import re
def remove_words_with_apostrophes(input_string):
    pattern = r'\b\w+\'\w+\b'
    result = re.sub(pattern, '', input_string)
    return result

df['text'] = df['text'].apply(lambda elements: remove_words_with_apostrophes(elements))

In [7]:
import string
def remove_punctuation(input_string):
    translation_table = str.maketrans('', '', string.punctuation)
    result = input_string.translate(translation_table)
    return result

df['text'] = df['text'].apply(lambda elements: remove_punctuation(elements))

In [8]:
len(df['text'][0])

694

In [27]:
df_new = pd.DataFrame(columns=['artist', 'song', 'text'])

# Create an empty list to store dictionaries
'''def split_text(text):
    length = len(text)
    split_length = length // 5
    return [text[i:i+split_length] for i in range(0, length, split_length)]'''

def split_text(text):
    words = text.split()
    num_words = len(words)
    split_length = num_words // 5
    return [' '.join(words[i:i+split_length]) for i in range(0, num_words, split_length)]

# Apply the function to the 'text' column and create a new dataframe
df_new = df.copy()
df_new['text'] = df_new['text'].apply(split_text)

# Explode the list of substrings into separate rows
df_new = df_new.explode('text')

# Reset index
df_new.reset_index(drop=True, inplace=True)

df_new


Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,look at her face a wonderful face and it means...
1,ABBA,Ahe's My Kind Of Girl,fellow be just my kind of girl she makes me fe...
2,ABBA,Ahe's My Kind Of Girl,and if she ever leaves me what could i do what...
3,ABBA,Ahe's My Kind Of Girl,squeezes my hand go on walking for hours and t...
4,ABBA,Ahe's My Kind Of Girl,ever believe that she could be mine just my ki...
...,...,...,...
334302,Zwan,Heartsong,write down these lines how can you say that i ...
334303,Zwan,Heartsong,must do come on hit the road up to my tricks i...
334304,Zwan,Heartsong,the same words to say the same things how can ...
334305,Zwan,Heartsong,to you what you must do how can you say that i...


In [28]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

stop_words = set(stopwords.words('english'))
ps = PorterStemmer()
lyrics = df_new['text'].apply(lambda x: ' '.join([ps.stem(word) for word in re.findall(r'\b\w+\b', x.lower()) if word not in stop_words]))

In [29]:
df_new['text'][0]

'look at her face a wonderful face and it means something special to me look at the way that she smiles when she sees me how lucky can one'

In [30]:
df_new['text'][1]

'fellow be just my kind of girl she makes me feel fine who could ever believe that she could be mine just my kind of girl without her blue'

In [31]:
df_new['text'][2]

'and if she ever leaves me what could i do what could i do and when we go for a walk in the park and she holds me and'

In [36]:
#Max length of a word
def max_word_length(string):
    words = string.split()
    max_length = 0
    for word in words:
        if len(word) > max_length:
            max_length = len(word)
    return max_length

# Test the function
max_length = 0
for i in range(334307):
    if max_word_length(df_new['text'][i])>max_length:
        max_length=max_word_length(df_new['text'][i])

max_length

104

In [33]:
#Max number of words in a lyric
max_no_words=len(df_new['text'][0])
for songs in df_new['text']:
    if len(songs)>max_no_words:
        max_no_words=len(songs)

max_no_words

862

In [37]:
print(max_length)
print(max_no_words)

104
862


In [38]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=max_no_words)
tokenizer.fit_on_texts(df_new['text'])
sequences = tokenizer.texts_to_sequences(df_new['text'])
X = pad_sequences(sequences, maxlen=max_length)

In [39]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

Y = label_encoder.fit_transform(df_new['song'])

In [40]:
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.models import Sequential

model = Sequential([
    Embedding(max_no_words, 32, input_length=max_length),
    Bidirectional(LSTM(32)),
    Dense(df_new['song'].nunique(), activation='softmax')
])

In [43]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [44]:
log_history = model.fit(X, Y, epochs=25, validation_split=0.2)

Epoch 1/25
1701/8358 [=====>........................] - ETA: 4:56 - loss: 10.6706 - accuracy: 5.5115e-04

In [None]:
input_snippet = df_new['text'][0]#"Look at her face, it's a wonderful face  \r\nAnd it means something special to me  \r\nLook at the way that she smiles when she sees me  \r\nHow lucky can one fellow be?"
input_sequence = tokenizer.texts_to_sequences([input_snippet])
input_sequence_padded = pad_sequences(input_sequence, maxlen=max_length)
predicted_index = np.argmax(model.predict(input_sequence_padded))
predicted_song = df_new['song'].unique()[predicted_index]

print(f'Predicted Song: {predicted_song}')