**IMPORTING THE LIBRARIES**

In [None]:
# STATS AND EDA
import numpy as np
import pandas as pd

In [4]:
# DATA VISUALIZATION
import plotly.express as px
import plotly.graph_objects as go
from wordcloud import WordCloud

In [42]:
# MODELLING
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import pickle
from keras.models import model_from_json

In [43]:
# PREDICTIONS
import pickle
from keras.models import model_from_json
from keras.preprocessing.sequence import pad_sequences
import numpy as np

**IMPORTING DATASET**

In [5]:
# Load the Spotify lyrics CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv("/kaggle/input/57651-spotify-songs/Spotify Million Song Dataset_exported.csv")

In [6]:
# Preview the first rows to check that the columns loaded as expected.
data.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [7]:
# Unpack the row and column counts from the frame's shape and report each on its own line.
rows, cols = data.shape[:2]
print(f'Rows: {rows}', f'Columns: {cols}', sep='\n')

Rows: 57650
Columns: 4


In [8]:
# Show column dtypes, non-null counts and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57650 entries, 0 to 57649
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   artist  57650 non-null  object
 1   song    57650 non-null  object
 2   link    57650 non-null  object
 3   text    57650 non-null  object
dtypes: object(4)
memory usage: 1.8+ MB


**Exploratory Data Analysis (EDA)**

In [9]:
# The EDA only looks at song titles and lyrics, so work on a separate frame
# without the artist and link columns; `data` itself is left untouched.
data_eda = data.drop(columns=['artist', 'link'])
data_eda.head()

Unnamed: 0,song,text
0,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \nAnd..."
1,"Andante, Andante","Take it easy with me, please \nTouch me gentl..."
2,As Good As New,I'll never know why I had to go \nWhy I had t...
3,Bang,Making somebody happy is a question of give an...
4,Bang-A-Boomerang,Making somebody happy is a question of give an...


In [10]:
# Word count per song: split the lyrics on whitespace and count the pieces.
data_eda['song_length'] = [len(lyrics.split()) for lyrics in data_eda['text']]
fig = px.histogram(
    data_eda,
    x='song_length',
    title='Distribution of Song Lengths',
)
fig.show()

In [11]:
# Concatenate every song's lyrics into one string and build the word cloud from it.
lyrics_text = ' '.join(data_eda['text'])
wordcloud = WordCloud(background_color='white', width=800, height=400).generate(lyrics_text)

# Show the rendered word-cloud bitmap as a Plotly image trace.
cloud_image = go.Image(z=wordcloud.to_array())
fig = go.Figure(cloud_image)
fig.update_layout(title_text='Word Cloud of Most Common Words in Lyrics')
fig.show()

In [12]:
# Character count of each song title.
data_eda['song_name_length'] = [len(title) for title in data_eda['song']]
fig = px.histogram(
    data_eda,
    x='song_name_length',
    title='Distribution of Song Name Lengths',
)
fig.show()

In [13]:
# Count every whitespace-separated token in the combined lyrics and keep the
# first ten entries of the count-sorted result.
token_counts = pd.Series(lyrics_text.split()).value_counts()
word_freq = token_counts[:10]

fig = px.bar(
    word_freq,
    x=word_freq.index,
    y=word_freq.values,
    title='Top 10 Most Common Words in Lyrics',
)
fig.show()

In [15]:
# Count how often each whitespace-separated token occurs across all lyrics.
# explode() flattens the per-song token lists straight into one long Series.
# The earlier split(expand=True).stack() produced the same tokens but first
# materialised a (songs x longest song) object matrix that is mostly padding.
FreqOfWords = data_eda['text'].str.split().explode().value_counts()
FreqOfWords_top300 = FreqOfWords[:300]

# Treemap of the 300 most frequent tokens; tile area is proportional to the count.
fig = px.treemap(FreqOfWords_top300, path=[FreqOfWords_top300.index], values=FreqOfWords_top300)
fig.update_layout(title_text='Frequency of Words in the Dataset',
                  title_x=0.5, title_font=dict(size=22)
                  )
fig.update_traces(textinfo="label+value")
fig.show()

In [16]:
# Number of distinct whitespace-separated tokens in each song's lyrics.
data_eda['unique_words'] = [len(set(lyrics.split())) for lyrics in data_eda['text']]

# update_layout() returns the figure it was called on, so the axis titles can be chained.
fig = px.histogram(
    data_eda,
    x='unique_words',
    title='Distribution of Unique Words in Lyrics',
).update_layout(xaxis_title='Number of Unique Words', yaxis_title='Frequency')
fig.show()

In [17]:
# Compare total word count with distinct word count, one point per song.
# update_layout() returns the figure it was called on, so the axis titles can be chained.
fig = px.scatter(
    data_eda,
    x='song_length',
    y='unique_words',
    title='Relationship between Song Length and Unique Words in Lyrics',
).update_layout(xaxis_title='Song Length', yaxis_title='Number of Unique Words')
fig.show()

**MODELLING**

In [19]:
# Independent copy of the raw frame for the modelling steps, so `data` stays untouched.
data_model = data.copy()

In [20]:
# Build the training corpus (one entry per lyric line of the first 1000 songs)
# and fit the word-level tokenizer on it.
tokenizer = Tokenizer()

songs = []
for row in data_model["text"][:1000]:
    # Collapse every run of whitespace to a single space and trim the ends.
    # The earlier replace("  ", "") deleted double spaces outright, which glued
    # together words separated by two spaces ("a  b" -> "ab") and let
    # whitespace-only lines slip past the empty-line filter.
    song = [" ".join(line.split()) for line in row.lower().split("\n")]
    song = [line for line in song if line]
    songs.append(song)

# Flatten the per-song line lists into one list of lines.
corpus = [line for song_lines in songs for line in song_lines]

tokenizer.fit_on_texts(corpus)
# +1 because the tokenizer's word indices start at 1; index 0 is reserved (used for padding).
total_words = len(tokenizer.word_index) + 1

In [22]:
# Expand each lyric line into its growing prefixes of length 2, 3, ..., len(line);
# every prefix becomes one training example.
input_sequences = [
    tokens[:end]
    for tokens in tokenizer.texts_to_sequences(corpus)
    for end in range(2, len(tokens) + 1)
]

# Left-pad all prefixes to the length of the longest one so they form a 2-D array.
max_sequence_len = max(len(sequence) for sequence in input_sequences)
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)

# The final token of each row is the prediction target; the rest is the model input.
xs = input_sequences[:, :-1]
labels = input_sequences[:, -1]

# One-hot encode the targets over the vocabulary.
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

In [23]:
# Next-word model: embedding -> bidirectional LSTM -> softmax over the vocabulary.
model = Sequential()
# Inputs are the padded prefixes without their final (target) token, hence max_sequence_len-1.
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(total_words, activation='softmax'))
# `learning_rate` is the supported argument name; the old `lr` alias is deprecated
# and newer Keras optimizers warn about it or reject it.
adam = Adam(learning_rate=0.01)

In [24]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 24, 100)           934900    
                                                                 
 bidirectional (Bidirection  (None, 300)               301200    
 al)                                                             
                                                                 
 dense (Dense)               (None, 9349)              2814049   
                                                                 
Total params: 4050149 (15.45 MB)
Trainable params: 4050149 (15.45 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [25]:
# The targets are one-hot vectors, so train with categorical cross-entropy and track accuracy.
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
# Early stopping is left disabled: it is configured to monitor val_loss, and the
# fit() call in this notebook is not given any validation data.
# earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=0, mode='auto')

In [26]:
# Train on all prefix/target pairs for 60 epochs; no validation data is passed.
# NOTE(review): the saved output for this cell shows a 30-epoch run — re-run to refresh it.
history = model.fit(xs, ys, epochs=60, verbose=1)

Epoch 1/30


I0000 00:00:1707892863.709931     135 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [28]:
# Persist the fitted tokenizer so the same word-to-index mapping can be reloaded for generation.
with open('tokenizer.pkl', mode='wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [29]:
# Write the network architecture to disk as JSON; the weights are saved separately.
model_json = model.to_json()
with open("model_architecture.json", mode="w") as architecture_file:
    architecture_file.write(model_json)

In [30]:
# Save the trained weights to an HDF5 file next to the architecture JSON.
model.save_weights("model_weights.h5")

In [31]:
# Record the vocabulary size (word count + 1) as plain text.
with open('vocabulary_size.txt', mode='w') as vocab_file:
    vocab_file.write(f'{total_words}')

In [39]:
# Line chart of the training loss recorded for each epoch by fit().
loss_trace = go.Scatter(
    x=history.epoch,
    y=history.history['loss'],
    mode='lines',
    name='Training Loss',
)
fig_loss = go.Figure(loss_trace)
fig_loss.update_layout(
    title='Training Loss Over Epochs',
    xaxis_title='Epoch',
    yaxis_title='Loss',
)
fig_loss.show()

In [40]:
# Line chart of the training accuracy recorded for each epoch by fit().
accuracy_trace = go.Scatter(
    x=history.epoch,
    y=history.history['accuracy'],
    mode='lines',
    name='Training Accuracy',
)
fig_accuracy = go.Figure(accuracy_trace)
fig_accuracy.update_layout(
    title='Training Accuracy Over Epochs',
    xaxis_title='Epoch',
    yaxis_title='Accuracy',
)

fig_accuracy.show()

**GENERATIONS**

In [34]:
# Greedy text generation: repeatedly predict the most likely next word and
# append it to the running text, which is then fed back in as the next prompt.
seed_text = "this is a good day"
next_words = 100

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    # Left-pad (or, once the text is long enough, keep only the most recent tokens)
    # so the input matches the length the model was built with.
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    # predict() yields one probability row for the single input; take its arg-max
    # as a plain int. verbose=0 avoids printing a progress bar per generated word.
    predicted = int(np.argmax(model.predict(token_list, verbose=0), axis=-1)[0])
    # index_word is the tokenizer's index -> word mapping, replacing a linear scan
    # over word_index. Index 0 is never assigned to a word, so fall back to "".
    output_word = tokenizer.index_word.get(predicted, "")
    seed_text += " " + output_word



In [35]:
# Show the prompt together with the words generated after it.
print(seed_text)

this is a good day give it now i'll get the end of now i feel i'm gonna do it along with my am barrel sleigh once in the air bells here till till you do the meatloaf song frolic do i do do he ibig now's along the touch of cars harshly me outside my mind and play it harshly up you up and you turn me parts the anger parts just use it surgical here here you now's me here surround me please you and the girls that you love me love you love you you me and you turn me in that


**IMPORTING MODEL EXTERNALLY AND GENERATING**

In [36]:
# Reload the tokenizer, architecture, weights and vocabulary size from disk,
# then generate text using only those restored objects.

# NOTE: pickle.load can execute arbitrary code while loading; only open a
# tokenizer.pkl that this notebook wrote itself.
with open('/kaggle/working/tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

with open("/kaggle/working/model_architecture.json", "r") as json_file:
    loaded_model_json = json_file.read()
model = model_from_json(loaded_model_json)

model.load_weights("/kaggle/working/model_weights.h5")

with open('/kaggle/working/vocabulary_size.txt', 'r') as f:
    total_words = int(f.read())

# Take the expected input length from the restored model instead of from
# max_sequence_len, which is only defined in a session that ran the training cells.
input_length = model.input_shape[1]

seed_text = "this is a good day"
next_words = 100
generated_text = seed_text

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    # Left-pad (or keep only the most recent tokens) to the model's input length.
    token_list = pad_sequences([token_list], maxlen=input_length, padding='pre')
    # Arg-max of the single probability row, as a plain int; verbose=0 avoids
    # printing a progress bar for every generated word.
    predicted = int(np.argmax(model.predict(token_list, verbose=0), axis=-1)[0])
    # index_word is the tokenizer's index -> word mapping, replacing a linear scan
    # over word_index. Index 0 is never assigned to a word, so fall back to "".
    output_word = tokenizer.index_word.get(predicted, "")
    seed_text += " " + output_word
    generated_text += " " + output_word

print(generated_text)


this is a good day give it now i'll get the end of now i feel i'm gonna do it along with my am barrel sleigh once in the air bells here till till you do the meatloaf song frolic do i do do he ibig now's along the touch of cars harshly me outside my mind and play it harshly up you up and you turn me parts the anger parts just use it surgical here here you now's me here surround me please you and the girls that you love me love you love you you me and you turn me in that
