<div style="background-color: #f0f8ff; padding: 15px; border-radius: 5px; border-left: 5px solid #4169e1;">
<h2 style="color: #4169e1;">Project Overview</h2>
<p style="color: #333;">In this project, I aim to develop a model capable of generating song lyrics using Recurrent Neural Networks (RNNs). The objective is to explore the creative potential of RNNs in text generation, specifically within the context of songwriting. Building a lyrics generator has long been on my list of projects, and I am excited to finally bring this idea to life by leveraging deep learning techniques to produce original and creative text.</p>
</div>

<div style="background-color: #e6ffe6; padding: 10px; border-radius: 5px; border-left: 5px solid #228B22;">
<h2 style="color: #228B22;">Importing Libraries</h2>
</div>

In [26]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.tokenize import word_tokenize

nltk.download("punkt")
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import pickle
import warnings

warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ansar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Load Dataset

In [11]:
# Read the raw songs table from spotify_songs.csv (path is relative to the
# notebook's working directory).
data = pd.read_csv("spotify_songs.csv")

# Preview the first rows to see which columns are available.
data.head()

Unnamed: 0,track_id,track_name,track_artist,lyrics,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,language
0,0017A6SJgTbfQVU2EtsPNo,Pangarap,Barbie's Cradle,Minsan pa Nang ako'y napalingon Hindi ko alam ...,41,1srJQ0njEQgd8w4XSqI4JQ,Trip,2001-01-01,Pinoy Classic Rock,37i9dQZF1DWYDQ8wBxd7xt,...,-10.068,1,0.0236,0.279,0.0117,0.0887,0.566,97.091,235440,tl
1,004s3t0ONYlzxII9PLgU6z,I Feel Alive,Steady Rollin,"The trees, are singing in the wind The sky blu...",28,3z04Lb9Dsilqw68SHt6jLB,Love & Loss,2017-11-21,Hard Rock Workout,3YouF0u7waJnolytf9JCXf,...,-4.739,1,0.0442,0.0117,0.00994,0.347,0.404,135.225,373512,en
2,00chLpzhgVjxs1zKC9UScL,Poison,Bell Biv DeVoe,"NA Yeah, Spyderman and Freeze in full effect U...",0,6oZ6brjB8x3GoeSYdwJdPc,Gold,2005-01-01,"Back in the day - R&B, New Jack Swing, Swingbe...",3a9y4eeCJRmG9p4YKfqYIx,...,-7.504,0,0.216,0.00432,0.00723,0.489,0.65,111.904,262467,en
3,00cqd6ZsSkLZqGMlQCR0Zo,Baby It's Cold Outside (feat. Christina Aguilera),CeeLo Green,I really can't stay Baby it's cold outside I'v...,41,3ssspRe42CXkhPxdc12xcp,CeeLo's Magic Moment,2012-10-29,Christmas Soul,6FZYc2BvF7tColxO8PBShV,...,-5.819,0,0.0341,0.689,0.0,0.0664,0.405,118.593,243067,en
4,00emjlCv9azBN0fzuuyLqy,Dumb Litty,KARD,Get up out of my business You don't keep me fr...,65,7h5X3xhh3peIK9Y0qI5hbK,KARD 2nd Digital Single ‘Dumb Litty’,2019-09-22,K-Party Dance Mix,37i9dQZF1DX4RDXswvP6Mj,...,-1.993,1,0.0409,0.037,0.0,0.138,0.24,130.018,193160,en


### To keep the model simple, I am going to drop unnecessary columns. I will use only the track_name, track_artist and lyrics columns.

In [12]:
# Keep only the title, artist and lyrics columns and give them readable names.
data = data.loc[:, ["track_name", "track_artist", "lyrics"]].set_axis(
    ["Song_Title", "Artist", "Lyrics"], axis="columns"
)

In [13]:
# Preview the reduced frame to confirm the column selection and renaming.
data.head()

Unnamed: 0,Song_Title,Artist,Lyrics
0,Pangarap,Barbie's Cradle,Minsan pa Nang ako'y napalingon Hindi ko alam ...
1,I Feel Alive,Steady Rollin,"The trees, are singing in the wind The sky blu..."
2,Poison,Bell Biv DeVoe,"NA Yeah, Spyderman and Freeze in full effect U..."
3,Baby It's Cold Outside (feat. Christina Aguilera),CeeLo Green,I really can't stay Baby it's cold outside I'v...
4,Dumb Litty,KARD,Get up out of my business You don't keep me fr...


In [14]:
# Count missing values per column.
data.isna().sum()

Song_Title      0
Artist          0
Lyrics        260
dtype: int64

In [15]:
# Treat missing lyrics as empty text, then discard every song whose lyrics are
# blank (empty or whitespace only) and renumber the rows from 0.
data = (
    data.assign(Lyrics=data["Lyrics"].fillna(""))
    .loc[lambda frame: frame["Lyrics"].str.strip().ne("")]
    .reset_index(drop=True)
)

In [16]:
# Re-count missing values per column after the blank-lyrics rows were dropped.
data.isna().sum()

Song_Title    0
Artist        0
Lyrics        0
dtype: int64

In [17]:
# Show how many songs each artist contributes to the data.
print("Artists in the data:\n", data["Artist"].value_counts())

Artists in the data:
 Artist
Queen                 123
Don Omar               74
David Guetta           73
Drake                  65
Guns N' Roses          63
                     ... 
Christina Grimmie       1
Luke Bryan              1
Magic City Hippies      1
Rocco Hunt              1
Steady Rollin           1
Name: count, Length: 5946, dtype: int64


In [18]:
# (rows, columns) of the frame after the songs without lyrics were removed.
data.shape

(18194, 3)

### Extracting more information on the songs such as:

* Number of characters
* Number of words
* Number of lines

In [19]:
# Add simple size statistics for each song's lyrics: characters, tokens, lines.
data["No_of_Characters"] = data["Lyrics"].str.len()

# Tokenize the Lyrics column directly. A row-wise DataFrame.apply(axis=1) would
# build a Series object for every row just to read this one field.
data["No_of_Words"] = data["Lyrics"].apply(lambda lyric: len(nltk.word_tokenize(lyric)))

# NOTE: in the recorded run this column is 1 for every song (mean 1, std 0),
# i.e. the lyrics contain no "\n" characters, so it carries no information for
# this dataset.
data["No_of_Lines"] = data["Lyrics"].str.split("\n").str.len()

# Summary statistics of the three new numeric columns.
data.describe()

Unnamed: 0,No_of_Characters,No_of_Words,No_of_Lines
count,18194.0,18194.0,18194.0
mean,2138.102067,510.558646,1.0
std,1713.122517,428.185625,0.0
min,4.0,1.0,1.0
25%,1164.0,269.0,1.0
50%,1693.0,400.0,1.0
75%,2572.0,617.75,1.0
max,27698.0,6748.0,1.0


### Removing very short and very long lyrics (keeping only lyrics with more than 20 and fewer than 2000 words).
#### Removing duplicates and empty lyrics.

In [20]:
# Keep songs whose lyrics have more than 20 and fewer than 2000
# whitespace-separated words, drop repeated lyrics (the first occurrence is
# kept), and renumber the rows from 0.
# The word count is computed once. No separate blank-lyrics filter is needed,
# because a lyric with more than 20 words cannot be blank.
data = (
    data.loc[
        lambda frame: frame["Lyrics"].str.split().str.len().between(20, 2000, inclusive="neither")
    ]
    .drop_duplicates(subset=["Lyrics"])
    .reset_index(drop=True)
)

### Data Preparation

In [21]:
    # Fit a Keras Tokenizer (default settings) on all remaining lyrics. Its
    # word_index is used later to encode the lyrics and to size the model.
    # NOTE: Tokenizer is already imported in the notebook's first code cell,
    # so this re-import is redundant.
    from tensorflow.keras.preprocessing.text import Tokenizer
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(data['Lyrics'])

In [22]:
# Build (prefix -> next word) training examples from the opening words of each song.
import random

MAX_SEQ_LEN = 40        # longest sequence: MAX_SEQ_LEN - 1 input tokens + 1 target token
SAMPLES_PER_SONG = 10   # cap on the number of training sequences drawn from one song
SAMPLING_SEED = 42      # fixed seed so the same sequences are drawn on every run

# Dedicated generator: reproducible sampling without touching the global
# `random` state used elsewhere.
sampler = random.Random(SAMPLING_SEED)

sequences = []
for lyric in data['Lyrics']:
    token_list = tokenizer.texts_to_sequences([lyric])[0]
    # Prefixes of length 2 .. min(len(token_list), MAX_SEQ_LEN); the last token
    # of each prefix becomes the prediction target.
    n_grams = [token_list[:end] for end in range(2, min(len(token_list), MAX_SEQ_LEN) + 1)]
    if len(n_grams) > SAMPLES_PER_SONG:
        n_grams = sampler.sample(n_grams, SAMPLES_PER_SONG)
    sequences.extend(n_grams)

# Left-pad every prefix to MAX_SEQ_LEN so the target is always the last column.
sequences = pad_sequences(sequences, maxlen=MAX_SEQ_LEN, padding='pre')
X, y = sequences[:, :-1], sequences[:, -1]

In [23]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Next-word model: embedding -> single LSTM layer -> softmax over the vocabulary.
# One extra slot on top of word_index for index 0, which pad_sequences uses as
# the padding value.
vocab_size = len(tokenizer.word_index) + 1

model = Sequential([
    # Explicit Input layer instead of Embedding(input_length=...): that argument
    # is deprecated in Keras 3. Declaring the shape here also builds the model
    # immediately, so summary() reports real output shapes and parameter counts.
    Input(shape=(MAX_SEQ_LEN - 1,)),
    Embedding(vocab_size, 50),
    LSTM(64),
    Dense(vocab_size, activation='softmax'),
])
# Targets in y are integer word indices, hence the sparse loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

### Model Training

In [27]:
# Train for at most 10 epochs, holding out 10% of the sequences for validation.
# In the recorded run val_loss stops improving at epoch 5 while the training
# loss keeps falling. Stop once val_loss has not improved for 2 consecutive
# epochs and roll back to the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
history = model.fit(
    X,
    y,
    epochs=10,
    validation_split=0.1,
    batch_size=128,
    callbacks=[early_stopping],
)

Epoch 1/10
[1m1108/1108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m631s[0m 567ms/step - accuracy: 0.0308 - loss: 8.0830 - val_accuracy: 0.0362 - val_loss: 7.0879
Epoch 2/10
[1m1108/1108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m650s[0m 587ms/step - accuracy: 0.0451 - loss: 6.6547 - val_accuracy: 0.0594 - val_loss: 6.8563
Epoch 3/10
[1m1108/1108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m665s[0m 600ms/step - accuracy: 0.0661 - loss: 6.2508 - val_accuracy: 0.0711 - val_loss: 6.7750
Epoch 4/10
[1m1108/1108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m662s[0m 597ms/step - accuracy: 0.0840 - loss: 6.0020 - val_accuracy: 0.0809 - val_loss: 6.7352
Epoch 5/10
[1m1108/1108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m657s[0m 593ms/step - accuracy: 0.0982 - loss: 5.7937 - val_accuracy: 0.0895 - val_loss: 6.7444
Epoch 6/10
[1m1108/1108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m674s[0m 608ms/step - accuracy: 0.1091 - loss: 5.6069 - val_accuracy: 0.0941 - val_loss:

### Lyrics Generation Function

In [24]:
import numpy as np

def generate_lyrics(seed_text, next_words=50, max_seq_len=None):
    """Extend ``seed_text`` by repeatedly appending the model's most likely next word.

    Uses the notebook-level ``tokenizer``, ``model`` and ``pad_sequences``.

    Args:
        seed_text: Text to start from; generated words are appended to it.
        next_words: Maximum number of words to append.
        max_seq_len: Full sequence length used when the training data was built
            (the model input is ``max_seq_len - 1`` tokens). Defaults to the
            notebook-level ``MAX_SEQ_LEN`` so that inference matches training;
            pass a value explicitly to override.

    Returns:
        ``seed_text`` followed by the generated words, separated by single
        spaces. Generation stops early if the predicted index has no word in
        the tokenizer (for example index 0, the padding value).
    """
    if max_seq_len is None:
        # Resolved at call time so the prompt is padded to the training length.
        max_seq_len = MAX_SEQ_LEN

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_seq_len - 1, padding='pre')
        predicted = model.predict(token_list, verbose=0)

        # Greedy choice: the single batch row's highest-probability index,
        # mapped back to its word with a direct dictionary lookup.
        predicted_index = int(np.argmax(predicted, axis=-1)[0])
        output_word = tokenizer.index_word.get(predicted_index, "")
        if output_word == "":
            break
        seed_text += " " + output_word
    return seed_text

### Saving Model

In [25]:
# Persist the network and the fitted tokenizer so both can be reloaded for deployment.
model.save('lyrics_model.h5')
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    tokenizer_file.write(pickle.dumps(tokenizer))

