<div style="background-color: #f0f8ff; padding: 15px; border-radius: 5px; border-left: 5px solid #4169e1;">
<h2 style="color: #4169e1;">Project Overview</h2>
<p style="color: #333;">In this project, I aim to develop a model capable of generating song lyrics using Recurrent Neural Networks (RNNs). The objective is to explore the creative potential of RNNs in text generation, specifically within the context of songwriting. Building a lyrics generator has long been on my list of projects, and I am excited to finally bring this idea to life by leveraging deep learning techniques to produce original and creative text.</p>
</div>

<div style="background-color: #e6ffe6; padding: 10px; border-radius: 5px; border-left: 5px solid #228B22;">
<h2 style="color: #228B22;">Importing Libraries</h2>
</div>

In [2]:
# importing the libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import string, os
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input, Embedding, Attention, LayerNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import random
import io
import warnings
warnings.filterwarnings("ignore")


# Load Dataset

In [3]:
# Location of the Spotify lyrics CSV. Set the LYRICS_DATA_PATH environment variable to
# run the notebook on another machine; when it is unset the original path is used.
DATA_PATH = os.environ.get(
    "LYRICS_DATA_PATH",
    r"C:\Users\ansar\OneDrive\Desktop\Lyrics_Generator-RNN\spotify_songs.csv",
)
data = pd.read_csv(DATA_PATH)

In [4]:
# Preview the first rows of the freshly loaded dataset
data.head()

Unnamed: 0,track_id,track_name,track_artist,lyrics,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,language
0,0017A6SJgTbfQVU2EtsPNo,Pangarap,Barbie's Cradle,Minsan pa Nang ako'y napalingon Hindi ko alam ...,41,1srJQ0njEQgd8w4XSqI4JQ,Trip,2001-01-01,Pinoy Classic Rock,37i9dQZF1DWYDQ8wBxd7xt,...,-10.068,1,0.0236,0.279,0.0117,0.0887,0.566,97.091,235440,tl
1,004s3t0ONYlzxII9PLgU6z,I Feel Alive,Steady Rollin,"The trees, are singing in the wind The sky blu...",28,3z04Lb9Dsilqw68SHt6jLB,Love & Loss,2017-11-21,Hard Rock Workout,3YouF0u7waJnolytf9JCXf,...,-4.739,1,0.0442,0.0117,0.00994,0.347,0.404,135.225,373512,en
2,00chLpzhgVjxs1zKC9UScL,Poison,Bell Biv DeVoe,"NA Yeah, Spyderman and Freeze in full effect U...",0,6oZ6brjB8x3GoeSYdwJdPc,Gold,2005-01-01,"Back in the day - R&B, New Jack Swing, Swingbe...",3a9y4eeCJRmG9p4YKfqYIx,...,-7.504,0,0.216,0.00432,0.00723,0.489,0.65,111.904,262467,en
3,00cqd6ZsSkLZqGMlQCR0Zo,Baby It's Cold Outside (feat. Christina Aguilera),CeeLo Green,I really can't stay Baby it's cold outside I'v...,41,3ssspRe42CXkhPxdc12xcp,CeeLo's Magic Moment,2012-10-29,Christmas Soul,6FZYc2BvF7tColxO8PBShV,...,-5.819,0,0.0341,0.689,0.0,0.0664,0.405,118.593,243067,en
4,00emjlCv9azBN0fzuuyLqy,Dumb Litty,KARD,Get up out of my business You don't keep me fr...,65,7h5X3xhh3peIK9Y0qI5hbK,KARD 2nd Digital Single ‘Dumb Litty’,2019-09-22,K-Party Dance Mix,37i9dQZF1DX4RDXswvP6Mj,...,-1.993,1,0.0409,0.037,0.0,0.138,0.24,130.018,193160,en


In [None]:
# NOTE(review): this cell has no execution count and repeats the column-selection cell
# that appears after the column listing further down. It looks like a leftover from
# out-of-order editing — confirm and keep only one copy.
# Keep only the required columns
data = data[['track_name', 'track_artist', 'lyrics']]

# Display the first few rows to verify
print("Shape of the new dataframe:", data.shape)
data.head()


In [5]:
# (rows, columns) of the DataFrame at this point in the notebook
data.shape

(18454, 25)

In [None]:
# NOTE(review): this cell has no execution count and repeats the rename cell further
# down. If it runs before the later cell that selects 'track_name', 'track_artist' and
# 'lyrics', those names no longer exist and that selection raises KeyError — confirm
# and remove this copy.
# Rename the columns
data = data.rename(columns={
    'track_name': 'Song_Title',
    'track_artist': 'Artist',
    'lyrics': 'Lyrics'
})

# Display the first few rows to verify the new column names
print("DataFrame with renamed columns:")
data.head()


In [6]:
# List every column name so the ones needed for lyric generation can be picked out
print(data.columns)

Index(['track_id', 'track_name', 'track_artist', 'lyrics', 'track_popularity',
       'track_album_id', 'track_album_name', 'track_album_release_date',
       'playlist_name', 'playlist_id', 'playlist_genre', 'playlist_subgenre',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration_ms', 'language'],
      dtype='object')


### To keep the model simple I am going to drop unnecessary columns. I will use the track_name, track_artist and lyrics columns

In [8]:
# Restrict the frame to the three columns used in the rest of the notebook
data = data.loc[:, ['track_name', 'track_artist', 'lyrics']]

# Report the reduced shape, then preview the first rows as the cell output
print("Shape of the new dataframe:", data.shape)
data.head()

Shape of the new dataframe: (18454, 3)


Unnamed: 0,track_name,track_artist,lyrics
0,Pangarap,Barbie's Cradle,Minsan pa Nang ako'y napalingon Hindi ko alam ...
1,I Feel Alive,Steady Rollin,"The trees, are singing in the wind The sky blu..."
2,Poison,Bell Biv DeVoe,"NA Yeah, Spyderman and Freeze in full effect U..."
3,Baby It's Cold Outside (feat. Christina Aguilera),CeeLo Green,I really can't stay Baby it's cold outside I'v...
4,Dumb Litty,KARD,Get up out of my business You don't keep me fr...


In [None]:
# NOTE(review): this cell has no execution count. It reads the 'Lyrics' column, which
# only exists after the rename cell, and its steps are repeated by the null-filling and
# feature-engineering cells further down — presumably a leftover; confirm and remove.
# First, let's fill NaN values with empty string
data['Lyrics'] = data['Lyrics'].fillna('')

# Now add columns for number of characters, words, and lines
data["No_of_Characters"] = data["Lyrics"].str.len()
# Empty lyrics are counted as 0 words without calling the tokenizer
data["No_of_Words"] = data["Lyrics"].apply(lambda x: len(nltk.word_tokenize(x)) if x != '' else 0)
# Counts newline-separated pieces; a string without '\n' counts as 1
data["No_of_Lines"] = data["Lyrics"].str.split('\n').apply(len)

# Display summary statistics
print("\nSummary Statistics:")
data[["No_of_Characters", "No_of_Words", "No_of_Lines"]].describe()


In [9]:
# Give the three remaining columns shorter, consistently capitalised names
data = data.rename(
    {'track_name': 'Song_Title', 'track_artist': 'Artist', 'lyrics': 'Lyrics'},
    axis='columns',
)

# Announce the change and preview the renamed frame as the cell output
print("DataFrame with renamed columns:")
data.head()

DataFrame with renamed columns:


Unnamed: 0,Song_Title,Artist,Lyrics
0,Pangarap,Barbie's Cradle,Minsan pa Nang ako'y napalingon Hindi ko alam ...
1,I Feel Alive,Steady Rollin,"The trees, are singing in the wind The sky blu..."
2,Poison,Bell Biv DeVoe,"NA Yeah, Spyderman and Freeze in full effect U..."
3,Baby It's Cold Outside (feat. Christina Aguilera),CeeLo Green,I really can't stay Baby it's cold outside I'v...
4,Dumb Litty,KARD,Get up out of my business You don't keep me fr...


## Exploratory Data Analysis

## Printing the names of Artists

In [10]:
# Number of songs per artist, most frequent first
print("Artists in the data:\n",data.Artist.value_counts())

Artists in the data:
 Artist
Queen                       125
David Guetta                 74
Don Omar                     74
Martin Garrix                71
Drake                        65
                           ... 
Yeah Yeah Yeahs               1
Black Pistol Fire             1
Jack Vallier                  1
Jeremy Jordan                 1
Ponderosa Twins Plus One      1
Name: count, Length: 6031, dtype: int64


In [None]:
# NOTE(review): this cell has no execution count and is identical to the text-analysis
# cell that appears after the null-value handling further down. It runs before missing
# lyrics are filled in that sequence, so it looks like a leftover — confirm and remove.
# Adding columns for text analysis
data["No_of_Characters"] = data["Lyrics"].str.len()
data["No_of_Words"] = data["Lyrics"].apply(lambda x: len(nltk.word_tokenize(x)))

# Fix line counting by properly handling newlines
# Counts only newline-separated pieces that contain non-whitespace text
data["No_of_Lines"] = data["Lyrics"].apply(lambda x: len([line for line in x.split('\n') if line.strip()]))

# Display basic statistics
print("Summary Statistics:")
stats = data[["No_of_Characters", "No_of_Words", "No_of_Lines"]].describe()
print(stats)

# Display additional useful information
print("\nAdditional Analysis:")
print(f"Number of songs with empty lyrics: {len(data[data['Lyrics'] == ''])}")
print(f"Number of songs with less than 50 words: {len(data[data['No_of_Words'] < 50])}")
print("\nSample of a typical song length (around median):")
# Index label of the song whose word count is closest to the median word count
median_idx = abs(data['No_of_Words'] - data['No_of_Words'].median()).idxmin()
print(f"Title: {data.loc[median_idx, 'Song_Title']}")
print(f"Artist: {data.loc[median_idx, 'Artist']}")
print(f"Words: {data.loc[median_idx, 'No_of_Words']}")
print(f"Lines: {data.loc[median_idx, 'No_of_Lines']}")


### Checking for null values

In [13]:
# Count missing values in each column
data.isnull().sum()

Song_Title      0
Artist          0
Lyrics        260
dtype: int64

In [14]:
# Fill null values with empty string

In [15]:
# Replace missing lyrics with '' so every entry in the column is a string
data['Lyrics'] = data['Lyrics'].fillna('')

In [16]:
# Re-count missing values to confirm none remain after filling
data.isnull().sum()

Song_Title    0
Artist        0
Lyrics        0
dtype: int64

### I will do a little feature engineering to extract more information on the songs such as:

* Number of characters
* Number of words
* Number of lines

In [None]:
# Per-song size features: character count, NLTK token count, and the number of
# newline-separated pieces in the lyrics
data["No_of_Characters"] = data["Lyrics"].map(len)
data["No_of_Words"] = data["Lyrics"].map(lambda lyric: len(nltk.word_tokenize(lyric)))
data["No_of_Lines"] = data["Lyrics"].map(lambda lyric: len(lyric.split('\n')))

# Summary statistics of the numeric columns, shown as the cell output
data.describe()

Unnamed: 0,No_of_Characters,No_of_Words,No_of_Lines
count,18454.0,18454.0,18454.0
mean,2107.978162,503.365341,1.0
std,1719.57611,429.395751,0.0
min,0.0,0.0,1.0
25%,1139.0,264.0,1.0
50%,1678.0,396.0,1.0
75%,2556.0,614.0,1.0
max,27698.0,6748.0,1.0


### *The average number of lines in each song is only 1, which is wrong because any song has more than one line. This suggests the line breaks aren't being properly captured in the dataset
* There's significant variation (std dev of 429 words)

* So I'll try to fix the line counting issue and add better analysis

In [20]:
# Text-size features for every song
data["No_of_Characters"] = data["Lyrics"].str.len()
data["No_of_Words"] = data["Lyrics"].map(lambda lyric: len(nltk.word_tokenize(lyric)))

# Count only non-blank newline-separated pieces. The summary printed below still shows a
# maximum of 1, i.e. the lyrics in this dataset contain no '\n' line breaks to count.
data["No_of_Lines"] = data["Lyrics"].map(
    lambda lyric: sum(1 for piece in lyric.split('\n') if piece.strip())
)

# Summary statistics of the three features
print("Summary Statistics:")
stats = data[["No_of_Characters", "No_of_Words", "No_of_Lines"]].describe()
print(stats)

# Extra sanity checks: empty and very short lyrics
print("\nAdditional Analysis:")
print(f"Number of songs with empty lyrics: {(data['Lyrics'] == '').sum()}")
print(f"Number of songs with less than 50 words: {(data['No_of_Words'] < 50).sum()}")

# Show the song whose word count lies closest to the median word count
print("\nSample of a typical song length (around median):")
median_idx = (data['No_of_Words'] - data['No_of_Words'].median()).abs().idxmin()
print(
    f"Title: {data.loc[median_idx, 'Song_Title']}",
    f"Artist: {data.loc[median_idx, 'Artist']}",
    f"Words: {data.loc[median_idx, 'No_of_Words']}",
    f"Lines: {data.loc[median_idx, 'No_of_Lines']}",
    sep="\n",
)

Summary Statistics:
       No_of_Characters   No_of_Words   No_of_Lines
count      18454.000000  18454.000000  18454.000000
mean        2107.978162    503.365341      0.985911
std         1719.576110    429.395751      0.117862
min            0.000000      0.000000      0.000000
25%         1139.000000    264.000000      1.000000
50%         1678.000000    396.000000      1.000000
75%         2556.000000    614.000000      1.000000
max        27698.000000   6748.000000      1.000000

Additional Analysis:
Number of songs with empty lyrics: 260
Number of songs with less than 50 words: 516

Sample of a typical song length (around median):
Title: Get the Party Started
Artist: P!nk
Words: 396
Lines: 1


In [None]:
# Text preprocessing function
def preprocess_text(text):
    """
    Normalise a piece of text for tokenisation.

    The text is lower-cased, every character other than a-z, 0-9, whitespace and
    the punctuation . , ! ? ' " - is dropped, each run of whitespace is collapsed
    into a single space, and leading/trailing whitespace is removed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    allowed_only = re.sub(r'[^a-z0-9\s.,!?\'"-]', '', text.lower())
    single_spaced = re.sub(r'\s+', ' ', allowed_only)
    return single_spaced.strip()

# Window length (in words) of each training example and the vocabulary size cap
# shared by the tokenizer, the one-hot targets and the model
MAX_SEQUENCE_LENGTH = 50
VOCAB_SIZE = 20000

# Clean every lyric with preprocess_text and keep the result in its own column
data['Processed_Lyrics'] = data['Lyrics'].map(preprocess_text)

# Fit the tokenizer on the cleaned lyrics; "<OOV>" stands in for unknown words
tokenizer = Tokenizer(oov_token="<OOV>", num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(data['Processed_Lyrics'])

# Lookup tables in both directions: word -> index and index -> word
word_to_index = tokenizer.word_index
index_to_word = {idx: word for word, idx in word_to_index.items()}

# Function to create input sequences
def create_sequences(texts, seq_length=MAX_SEQUENCE_LENGTH):
    """
    Slide a fixed-size window over each text to build next-word training pairs.

    Each text is split on whitespace. For every position where a full window of
    `seq_length` words is followed by at least one more word, the window (joined
    with single spaces) and the word right after it are recorded. Texts with at
    most `seq_length` words contribute nothing.

    Args:
        texts: Iterable of strings.
        seq_length: Number of words in each input window.

    Returns:
        A tuple (sequences, next_words) of two equally long lists of strings.
    """
    windows, followers = [], []

    for lyric in texts:
        tokens = lyric.split()
        for start in range(len(tokens) - seq_length):
            stop = start + seq_length
            windows.append(' '.join(tokens[start:stop]))
            followers.append(tokens[stop])

    return windows, followers

# Create training sequences
input_sequences, target_words = create_sequences(data['Processed_Lyrics'])

# Convert sequences to numerical form
encoded_inputs = tokenizer.texts_to_sequences(input_sequences)
encoded_targets = tokenizer.texts_to_sequences(target_words)

# The tokenizer strips/splits on punctuation, so a target "word" can encode to zero ids
# (e.g. "-") or to several (e.g. "well-known"). Drop pairs whose target has no id and use
# the first id otherwise, so every example has exactly one target class and the list
# passed to to_categorical is never ragged.
usable_pairs = [
    (input_ids, target_ids[0])
    for input_ids, target_ids in zip(encoded_inputs, encoded_targets)
    if target_ids
]

# pad_sequences also truncates windows that encode to more than MAX_SEQUENCE_LENGTH ids
X = pad_sequences([input_ids for input_ids, _ in usable_pairs], maxlen=MAX_SEQUENCE_LENGTH)

# NOTE(review): the one-hot target has len(X) * VOCAB_SIZE entries, which can be very
# large; integer targets with a sparse categorical loss would avoid this, but that
# requires changing the loss used when the model is compiled.
y = to_categorical([target_id for _, target_id in usable_pairs], num_classes=VOCAB_SIZE)

print(f"Number of training sequences: {len(X)}")
print(f"Sequence shape: {X.shape}")
print(f"Target shape: {y.shape}")


In [None]:
# Define the enhanced model architecture
def create_lyrics_generator_model(vocab_size, seq_length, embedding_dim=256):
    """
    Build and compile the next-word prediction network.

    Layers, in order: Embedding -> LSTM(512, returning the full sequence) ->
    Attention with the LSTM output passed as both inputs -> LayerNormalization ->
    LSTM(256) -> two Dense(relu) + Dropout(0.2) stages (512 then 256 units) ->
    Dense softmax over `vocab_size` classes.

    Args:
        vocab_size: Size of the embedding input and of the softmax output.
        seq_length: Number of token ids in each input sequence.
        embedding_dim: Width of the embedding vectors.

    Returns:
        A Keras Model compiled with Adam (learning rate 0.001),
        categorical cross-entropy loss and the accuracy metric.
    """
    token_ids = Input(shape=(seq_length,))
    embedded = Embedding(vocab_size, embedding_dim)(token_ids)

    # Sequence-returning LSTM whose output attends over itself
    recurrent_states = LSTM(512, return_sequences=True)(embedded)
    attended = Attention()([recurrent_states, recurrent_states])
    attended = LayerNormalization()(attended)

    # Second LSTM reduces the sequence to a single vector
    hidden = LSTM(256)(attended)

    # Fully connected head: each Dense stage is followed by dropout
    for units in (512, 256):
        hidden = Dense(units, activation='relu')(hidden)
        hidden = Dropout(0.2)(hidden)

    next_word_probs = Dense(vocab_size, activation='softmax')(hidden)

    generator = Model(inputs=token_ids, outputs=next_word_probs)
    generator.compile(
        loss='categorical_crossentropy',
        optimizer=Adam(learning_rate=0.001),
        metrics=['accuracy'],
    )
    return generator

# Build the network for the configured vocabulary/window size and print its layers
model = create_lyrics_generator_model(VOCAB_SIZE, MAX_SEQUENCE_LENGTH)
model.summary()

# Save the model to disk whenever validation accuracy improves
checkpoint = ModelCheckpoint(
    'lyrics_generator_model.h5',
    save_best_only=True,
    monitor='val_accuracy',
    verbose=1,
)

# Stop after 5 epochs without validation-accuracy improvement and restore the best weights
early_stopping = EarlyStopping(
    patience=5,
    monitor='val_accuracy',
    restore_best_weights=True,
)

# Fit on X/y, holding out 20% of the examples for validation
history = model.fit(
    X,
    y,
    validation_split=0.2,
    batch_size=64,
    epochs=50,
    callbacks=[checkpoint, early_stopping],
)


In [None]:
# Function to generate lyrics from seed text
def generate_lyrics(seed_text, next_words=50, temperature=0.7):
    """
    Generate lyrics from a seed text.

    Args:
        seed_text: Initial text to start generation
        next_words: Number of words to generate
        temperature: Controls randomness (lower = more conservative, higher = more creative).
            Must be greater than zero.

    Returns:
        The seed text followed by the generated words, separated by single spaces.

    Raises:
        ValueError: If `temperature` is not greater than zero.
    """
    if not temperature > 0:
        raise ValueError("temperature must be greater than 0")

    # The tokenizer already keeps the index -> word mapping, so no per-word scan
    # over word_index is needed.
    index_lookup = tokenizer.index_word
    generated_text = seed_text

    for _ in range(next_words):
        # Tokenize the text so far and keep/pad the last MAX_SEQUENCE_LENGTH ids
        token_list = tokenizer.texts_to_sequences([generated_text])[0]
        token_list = token_list[-MAX_SEQUENCE_LENGTH:]
        token_list = pad_sequences([token_list], maxlen=MAX_SEQUENCE_LENGTH)

        # Get model predictions
        predicted_probs = model.predict(token_list, verbose=0)[0]

        # Apply temperature in float64: the small epsilon avoids log(0), and the
        # renormalised float64 vector satisfies np.random.choice's sum-to-1 check,
        # which lower-precision model outputs can miss.
        scaled = np.log(np.asarray(predicted_probs, dtype=np.float64) + 1e-12) / temperature
        # Index 0 is the padding id and maps to no word, so never sample it
        scaled[0] = -np.inf
        scaled -= np.max(scaled)  # subtract the max for numerical stability
        sampling_probs = np.exp(scaled)
        sampling_probs /= sampling_probs.sum()

        # Sample from the predicted probabilities
        predicted_index = int(np.random.choice(len(sampling_probs), p=sampling_probs))

        # Convert index to word ("" if the index has no vocabulary entry)
        output_word = index_lookup.get(predicted_index, "")

        # Add the predicted word to the sequence
        generated_text += " " + output_word

    return generated_text

# Function to generate lyrics based on description
def generate_from_description(description, style_words=None, next_words=50, temperature=0.7):
    """
    Generate lyrics based on a description and optional style words.

    The description is cleaned with preprocess_text, the style words are appended
    to it, and the combined text is used as the seed for generate_lyrics.

    Args:
        description: Text description of the desired lyrics
        style_words: Optional style-related words to influence generation. May be any
            iterable of strings (list, tuple, generator) or a single string.
        next_words: Number of words to generate
        temperature: Controls randomness

    Returns:
        Whatever generate_lyrics returns for the combined seed text.
    """
    # Preprocess description
    seed_parts = [preprocess_text(description)]

    # Add style words if provided
    if style_words:
        if isinstance(style_words, str):
            # A bare string would otherwise be spread into single characters
            style_words = [style_words]
        seed_parts.extend(style_words)

    # Skip empty pieces so the seed never starts with or contains stray spaces
    seed_text = " ".join(part for part in seed_parts if part)

    # Generate lyrics
    return generate_lyrics(seed_text, next_words, temperature)

# Demo 1: continue a short seed phrase
print("Example 1 - Continuation-based generation:")
seed_text = "in the darkness of night"
generated_lyrics = generate_lyrics(seed_text, next_words=50)
print(f"Seed: {seed_text}", f"Generated: {generated_lyrics}\n", sep="\n")

# Demo 2: seed the generator from a description plus style words
print("Example 2 - Description-based generation:")
description = "a happy love song about summer days"
style_words = ["romantic", "upbeat", "sunny"]
generated_lyrics = generate_from_description(description, style_words, next_words=50)
print(f"Description: {description}", f"Generated: {generated_lyrics}", sep="\n")


In [None]:
def generate_lyrics_interactive():
    """
    Interactive function that allows users to generate lyrics either by:
    1. Providing initial words
    2. Describing the type of song they want

    The menu choice is validated before any further question is asked. Blank or
    non-numeric answers for the word count and temperature fall back to the
    defaults (50 and 0.7), as does a temperature that is not greater than zero.
    Results are printed; nothing is returned.
    """
    def _ask_number(prompt, cast, default):
        # Read one value; blank or unparsable input yields the default
        raw = input(prompt).strip()
        if not raw:
            return default
        try:
            return cast(raw)
        except ValueError:
            print(f"Invalid value '{raw}'; using default {default}.")
            return default

    print("Welcome to the Lyrics Generator!")
    print("\nHow would you like to generate lyrics?")
    print("1. Enter initial words (the model will continue from there)")
    print("2. Describe the type of song you want")

    choice = input("\nEnter your choice (1 or 2): ").strip()
    if choice not in ("1", "2"):
        # Reject a bad choice immediately instead of after two more prompts
        print("Invalid choice. Please enter 1 or 2.")
        return

    # Set generation parameters
    num_words = _ask_number("\nHow many words would you like to generate? (default: 50): ", int, 50)
    temperature = _ask_number("Enter temperature (0.1-1.0, default: 0.7): ", float, 0.7)
    if not temperature > 0:
        # Sampling divides by the temperature, so zero/negative/NaN cannot be used
        print("Temperature must be greater than 0; using default 0.7.")
        temperature = 0.7

    if choice == "1":
        # Continuation-based generation
        seed_text = input("\nEnter your initial words: ")
        if not seed_text:
            print("Error: Please provide some initial words.")
            return

        print("\nGenerating lyrics from your initial words...")
        generated_lyrics = generate_lyrics(seed_text, next_words=num_words, temperature=temperature)

        print("\nGenerated Lyrics:")
        print("-" * 50)
        print(generated_lyrics)
        print("-" * 50)
        return

    # Description-based generation (choice == "2")
    description = input("\nDescribe the type of song you want (e.g., 'a happy love song about summer'): ")
    if not description:
        print("Error: Please provide a description.")
        return

    use_style = input("\nWould you like to add style words? (yes/no): ").strip().lower()
    style_words = []

    if use_style in ("yes", "y"):
        print("\nEnter style words one by one (press Enter without text when done):")
        while True:
            word = input("Style word (or press Enter to finish): ")
            if not word:
                break
            style_words.append(word)

    print("\nGenerating lyrics based on your description...")
    generated_lyrics = generate_from_description(
        description,
        style_words=style_words if style_words else None,
        next_words=num_words,
        temperature=temperature
    )

    print("\nGenerated Lyrics:")
    print("-" * 50)
    print(f"Based on description: {description}")
    if style_words:
        print(f"Style words used: {', '.join(style_words)}")
    print("-" * 50)
    print(generated_lyrics)
    print("-" * 50)

# Example usage
print("To generate lyrics, run:")
print("generate_lyrics_interactive()")


In [None]:
# Print a short walkthrough of both generation modes, then start the interactive prompt
print(
    "Example usage scenarios:\n",
    "Scenario 1 - Continuation-based:",
    "1. Choose option 1",
    "2. Enter initial words like: 'in the moonlight we dance'",
    "3. Choose number of words (e.g., 50)",
    "4. Set temperature (e.g., 0.7 for balanced creativity)",
    "\nScenario 2 - Description-based:",
    "1. Choose option 2",
    "2. Enter description like: 'a romantic song about first love'",
    "3. Add style words like: romantic, gentle, sweet",
    "4. Choose number of words and temperature",
    "\nTry it now:",
    sep="\n",
)
generate_lyrics_interactive()
