In [None]:
import pandas as pd

In [None]:
# Load the chords-and-lyrics dataset from the CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer='chords_and_lyrics.csv')

In [None]:
# Show the first five rows of the raw dataset as a quick sanity check
print(data.head(n=5))

In [None]:
# Build the cleaned frame in a single chain:
#   1. keep only the rows whose 'lang' value is 'en',
#   2. drop the columns that are not needed for model training,
#   3. discard rows with a missing 'chords' or 'lyrics' value.
df_filtered = (
    data.loc[data['lang'] == 'en']
    .drop(columns=['Unnamed: 0', 'artist_id', 'followers', 'genres', 'popularity', 'name_e_chords', 'tabs'])
    .dropna(subset=['chords', 'lyrics'])
)

# Show the first rows of the cleaned dataset
print(df_filtered.head())


In [None]:
import ast

# Function to parse chord dictionary and get the chord progression
def parse_chords(chord_dict):
    """Turn the string form of a chord dictionary into one chord-progression string.

    Args:
        chord_dict: A string holding a Python dict literal whose values are
            chord strings.

    Returns:
        The dict values joined with single spaces, in the dict's own order.
        An empty string is returned when the input cannot be parsed as a
        literal, is not a mapping, or contains non-string values.
    """
    try:
        parsed = ast.literal_eval(chord_dict)
        # Extract chord progression
        return ' '.join(parsed.values())
    except (ValueError, SyntaxError, TypeError, AttributeError, MemoryError, RecursionError):
        # Only swallow failures caused by bad data (malformed literal, non-dict
        # result, non-string values). A bare `except:` would also hide
        # KeyboardInterrupt/SystemExit and make a long `.apply` uninterruptible.
        return ""

# Function to parse lyrics dictionary and get the lyrics
def parse_lyrics(lyric_dict):
    """Turn the string form of a lyrics dictionary into one lyrics string.

    Args:
        lyric_dict: A string holding a Python dict literal whose values are
            lyric strings.

    Returns:
        The dict values joined with single spaces, in the dict's own order.
        An empty string is returned when the input cannot be parsed as a
        literal, is not a mapping, or contains non-string values.
    """
    try:
        parsed = ast.literal_eval(lyric_dict)
        # Extract lyrics text
        return ' '.join(parsed.values())
    except (ValueError, SyntaxError, TypeError, AttributeError, MemoryError, RecursionError):
        # Only swallow failures caused by bad data (malformed literal, non-dict
        # result, non-string values). A bare `except:` would also hide
        # KeyboardInterrupt/SystemExit and make a long `.apply` uninterruptible.
        return ""

# Derive flat-text columns from the raw dictionary strings, one value per row
df_filtered['chord_progression'] = df_filtered['chords'].map(parse_chords)
df_filtered['lyrics_text'] = df_filtered['lyrics'].map(parse_lyrics)

# Show the parsed columns next to the song name
print(df_filtered.loc[:, ['song_name', 'chord_progression', 'lyrics_text']].head())


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer whose vocabulary is capped at 1000 terms
vectorizer = TfidfVectorizer(max_features=1000)

# Learn the vocabulary and weights from the parsed lyrics and encode them in one call
X_lyrics = vectorizer.fit_transform(raw_documents=df_filtered['lyrics_text'])

# Report the dimensions of the resulting matrix
print(X_lyrics.shape)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout

# Parameters
# Number of distinct whitespace-separated chord tokens across all progressions
num_chords = len({chord for progression in df_filtered['chord_progression'] for chord in progression.split()})
# Word count of the longest parsed lyric
max_lyrics_length = max(len(text.split()) for text in df_filtered['lyrics_text'])

# LSTM model, declared as a layer list:
#   Embedding -> LSTM (emits every timestep) -> Dropout -> softmax over num_chords
# NOTE(review): the Embedding is sized by the TF-IDF vocabulary and expects
# sequences of length max_lyrics_length, while the training cell passes the
# TF-IDF matrix X_lyrics and one-hot labels built from whole progression
# strings. These look incompatible - confirm before relying on this model.
model = Sequential([
    Embedding(input_dim=len(vectorizer.get_feature_names_out()), output_dim=100, input_length=max_lyrics_length),
    LSTM(128, return_sequences=True),
    Dropout(0.2),
    Dense(num_chords, activation='softmax'),
])

# Configure training: Adam optimizer, categorical cross-entropy, accuracy metric
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Print the layer-by-layer summary
model.summary()

In [None]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Give every distinct chord-progression string an integer class id
encoder = LabelEncoder().fit(df_filtered['chord_progression'])
y_chords = encoder.transform(df_filtered['chord_progression'])

# Expand the class ids into one-hot rows
y_chords_onehot = to_categorical(y_chords)

# Hold out 20% of the samples for validation (fixed seed for repeatability)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_lyrics,
    y_chords_onehot,
    test_size=0.2,
    random_state=42,
)

# Fit the model for 10 epochs, validating on the held-out split.
# NOTE(review): X_train is a slice of the TF-IDF matrix and y_train has one
# class per distinct progression string, whereas `model` starts with an
# Embedding layer and ends with a per-timestep Dense(num_chords). The shapes
# look incompatible - confirm this call can run as written.
model.fit(
    x=X_train,
    y=y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_test, y_test),
)

In [None]:
# Pull the raw columns out of the frame as arrays:
# `lyrics` is the model input, `chords` the prediction target
lyrics, chords = (df_filtered[column].values for column in ('lyrics', 'chords'))

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define tokenization function
def tokenize_lyrics(lyrics):
    """Break a lyrics string into a list of lines.

    The string is cut at every newline character. Nothing is stripped and
    empty lines are kept.
    """
    line_separator = "\n"
    lines = lyrics.split(line_separator)
    return lines

def tokenize_chords(chords):
    """Break a chords string into a list of lines.

    The string is cut at every newline character. Nothing is stripped and
    empty lines are kept.
    """
    line_separator = "\n"
    lines = chords.split(line_separator)
    return lines

# Split every song's lyrics and chords into lists of lines.
# NOTE(review): this reads the raw 'lyrics'/'chords' columns, which the parse
# functions treat as dict-literal strings; presumably they should contain
# newline characters for the split to produce more than one item - confirm.
tokenized_lyrics = list(map(tokenize_lyrics, df_filtered['lyrics'].tolist()))
tokenized_chords = list(map(tokenize_chords, df_filtered['chords'].tolist()))

# Show the tokenized form of the first song
print(tokenized_lyrics[0])
print(tokenized_chords[0])

In [None]:
# --- Lyrics: fit a tokenizer, encode to integer sequences, pad at the end ---
lyrics_tokenizer = Tokenizer()
lyrics_tokenizer.fit_on_texts(tokenized_lyrics)
lyrics_sequences = lyrics_tokenizer.texts_to_sequences(tokenized_lyrics)
max_lyrics_length = max(map(len, lyrics_sequences))  # longest encoded lyric
lyrics_sequences_padded = pad_sequences(lyrics_sequences, maxlen=max_lyrics_length, padding='post')

# --- Chords: same steps with a separate tokenizer ---
chords_tokenizer = Tokenizer()
chords_tokenizer.fit_on_texts(tokenized_chords)
chords_sequences = chords_tokenizer.texts_to_sequences(tokenized_chords)
max_chords_length = max(map(len, chords_sequences))  # longest encoded chord sequence
chords_sequences_padded = pad_sequences(chords_sequences, maxlen=max_chords_length, padding='post')

# Show the padded encoding of the first song
print(lyrics_sequences_padded[0])
print(chords_sequences_padded[0])

In [None]:
import numpy as np

# Ensure the model is compiled before fitting
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed
from tensorflow.keras.optimizers import Adam

# Define model parameters
vocab_size_lyrics = len(lyrics_tokenizer.word_index) + 1  # +1 for padding
vocab_size_chords = len(chords_tokenizer.word_index) + 1  # +1 for padding
embedding_dim = 100
hidden_units = 128

# Build the model: lyric token ids -> embedding -> two stacked LSTMs that both
# return every timestep -> a softmax over the chord vocabulary at each timestep.
# There is no separate decoder input; the network emits exactly one chord
# prediction per lyric timestep.
model = Sequential()
model.add(Embedding(input_dim=vocab_size_lyrics, output_dim=embedding_dim))
model.add(LSTM(hidden_units, return_sequences=True))
model.add(LSTM(hidden_units, return_sequences=True))
model.add(TimeDistributed(Dense(vocab_size_chords, activation='softmax')))

# Compile the model (must happen before fitting)
model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Explicitly build the model so the input shape is fixed up front
model.build(input_shape=(None, max_lyrics_length))  # (batch_size, max_lyrics_length)

# Kept for backward compatibility with later cells; not used for training here.
chords_input = chords_sequences_padded[:, :-1]

# Build the training target on the SAME time axis as the lyric input.
# The model outputs max_lyrics_length timesteps, so a target with
# max_chords_length - 1 steps (the previous shifted slice) does not match the
# prediction shape unless the two lengths happen to coincide. The one-step shift
# is also dropped: it belongs to teacher-forced decoders, and this model has no
# chord input to shift against.
# Chord ids are copied position by position; extra lyric steps stay 0 (padding)
# and extra chord steps are truncated.
n_songs, n_steps = lyrics_sequences_padded.shape
n_shared = min(n_steps, chords_sequences_padded.shape[1])
chords_output = np.zeros((n_songs, n_steps), dtype=chords_sequences_padded.dtype)
chords_output[:, :n_shared] = chords_sequences_padded[:, :n_shared]

# For sparse categorical crossentropy the target is an integer array of shape
# (samples, timesteps), so no one-hot encoding or extra dimension is needed.

# Train the model
model.fit(lyrics_sequences_padded, chords_output, epochs=10, batch_size=32, validation_split=0.2)


In [None]:
import numpy as np

# Kept for backward compatibility with later cells; not used for training here.
chords_input = chords_sequences_padded[:, :-1]

# Build the target on the same time axis as the lyric input. `model` emits one
# prediction per lyric timestep, so a target of length max_chords_length - 1
# (the previous shifted slice) does not match the prediction shape unless the
# two lengths happen to coincide. The one-step shift is dropped because the
# model receives no chord input to shift against.
# Extra lyric steps stay 0 (padding); extra chord steps are truncated.
n_songs, n_steps = lyrics_sequences_padded.shape
n_shared = min(n_steps, chords_sequences_padded.shape[1])
chords_output = np.zeros((n_songs, n_steps), dtype=chords_sequences_padded.dtype)
chords_output[:, :n_shared] = chords_sequences_padded[:, :n_shared]

# Each element is the index of a chord in the vocabulary; add a trailing axis so
# the target has shape (samples, timesteps, 1)
chords_output = np.expand_dims(chords_output, -1)

# Train the model
model.fit(lyrics_sequences_padded, chords_output, epochs=10, batch_size=32, validation_split=0.2)


In [None]:
# Pad sequences to ensure consistent input length.
# Uses `lyrics_sequences` (the integer-encoded lyrics produced by
# `lyrics_tokenizer`); the previously referenced `lyrics_seq` is never defined
# in this notebook and raised NameError on a fresh kernel.
max_lyrics_length = max(len(seq) for seq in lyrics_sequences)
lyrics_seq_padded = pad_sequences(lyrics_sequences, maxlen=max_lyrics_length)

In [None]:
# Pad the chord sequences to a common length.
# Uses `chords_sequences` (the integer-encoded chords produced by
# `chords_tokenizer`); the previously referenced `chords_seq` is never defined
# in this notebook and raised NameError on a fresh kernel.
max_chords_length = max(len(seq) for seq in chords_sequences)
chords_seq_padded = pad_sequences(chords_sequences, maxlen=max_chords_length)

In [None]:

# Define the number of unique chords (vocabulary size).
# Uses `chords_tokenizer` / `lyrics_tokenizer`, the tokenizers fitted earlier in
# this notebook; the previously referenced `tokenizer_chords` /
# `tokenizer_lyrics` are never defined and raised NameError on a fresh kernel.
vocab_size_chords = len(chords_tokenizer.word_index) + 1  # Including padding

# Build the model: embedding -> LSTM (all timesteps) -> dropout -> LSTM (final
# state only) -> softmax over the chord vocabulary, i.e. one prediction per song
model = Sequential()
model.add(Embedding(input_dim=len(lyrics_tokenizer.word_index) + 1, output_dim=128, input_length=max_lyrics_length))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dense(vocab_size_chords, activation='softmax'))  # Output layer should match vocab_size_chords

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()


In [None]:
# Fit on the padded lyrics with the padded chords as target; no validation split
# is configured here, so split the data and tune the batch size as needed.
# NOTE(review): `model` ends in LSTM(128) + Dense, i.e. one prediction per song,
# while the target has one column per chord timestep - presumably this only
# lines up when max_chords_length is 1; confirm.
model.fit(
    x=lyrics_seq_padded,
    y=chords_seq_padded,
    epochs=10,
    batch_size=32,
)